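# NOTE: imports reconstructed for these snippets. They assume the legacy
# awswrangler 0.x API (Redshift.generate_connection, session.pandas /
# session.spark); `session`, `bucket`, and `redshift_parameters` are pytest
# fixtures provided elsewhere in the test suite.
from decimal import Decimal

import pandas as pd
import pg8000
import pytest

from awswrangler import Redshift
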
def test_read_sql_redshift_pandas2(session, bucket, redshift_parameters):
    n: int = 1_000_000
    df = pd.DataFrame({"id": list(range(n)), "val": ["foo" if i % 2 == 0 else "boo" for i in range(n)]})
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/test_read_sql_redshift_pandas2/"
    session.pandas.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        preserve_index=True,
    )
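    # Sketch of the round-trip verification (legacy Pandas.read_sql_redshift
    # API and the temp_s3_path value are assumptions, not verbatim from the
    # original test): read the loaded table back and compare row counts.
    df2 = session.pandas.read_sql_redshift(
        sql="SELECT * FROM public.test",
        iam_role=redshift_parameters.get("RedshiftRole"),
        connection=con,
        temp_s3_path=f"s3://{bucket}/test_read_sql_redshift_pandas2_tmp/",
    )
    con.close()
    assert len(df.index) == len(df2.index)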
def test_connection_with_different_port_types(redshift_parameters):
    conn = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=str(redshift_parameters.get("RedshiftPort")),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    conn.close()
    conn = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=float(redshift_parameters.get("RedshiftPort")),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    conn.close()
path = f"data_samples/{sample_name}.csv"
if sample_name == "micro":
schema = "id SMALLINT, name STRING, value FLOAT, date DATE"
timestamp_format = "yyyy-MM-dd"
elif sample_name == "small":
schema = "id BIGINT, name STRING, date DATE"
timestamp_format = "dd-MM-yy"
elif sample_name == "nano":
schema = "id INTEGER, name STRING, value DOUBLE, date DATE, time TIMESTAMP"
timestamp_format = "yyyy-MM-dd"
dataframe = session.spark.read_csv(path=path,
schema=schema,
timestampFormat=timestamp_format,
dateFormat=timestamp_format,
header=True)
con = Redshift.generate_connection(
database="test",
host=redshift_parameters.get("RedshiftAddress"),
port=redshift_parameters.get("RedshiftPort"),
user="test",
password=redshift_parameters.get("RedshiftPassword"),
)
session.spark.to_redshift(
dataframe=dataframe,
path=f"s3://{bucket}/redshift-load/",
connection=con,
schema="public",
table="test",
iam_role=redshift_parameters.get("RedshiftRole"),
diststyle=diststyle,
distkey=distkey,
sortstyle=sortstyle,
def test_to_redshift_spark_decimal(session, bucket, redshift_parameters):
    df = session.spark_session.createDataFrame(
        pd.DataFrame({
            "id": [1, 2, 3],
            "decimal_2": [Decimal((0, (1, 9, 9), -2)), None, Decimal((0, (1, 9, 0), -2))],
            "decimal_5": [Decimal((0, (1, 9, 9, 9, 9, 9), -5)), None,
                          Decimal((0, (1, 9, 0, 0, 0, 0), -5))]
        }),
        schema="id INTEGER, decimal_2 DECIMAL(3,2), decimal_5 DECIMAL(6,5)")
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load2/"
    session.spark.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test2",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
    )
    con.close()
def test_to_redshift_pandas_exceptions(session, bucket, redshift_parameters, sample_name, mode, factor, diststyle,
                                       distkey, sortstyle, sortkey, exc):
    dataframe = pd.read_csv(f"data_samples/{sample_name}.csv")
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load/"
    with pytest.raises(exc):
        assert session.pandas.to_redshift(
            dataframe=dataframe,
            path=path,
            schema="public",
            table="test",
            connection=con,
            iam_role=redshift_parameters.get("RedshiftRole"),
            diststyle=diststyle,
            # remaining kwargs assumed from the parametrization
            distkey=distkey,
            sortstyle=sortstyle,
            sortkey=sortkey,
            mode=mode,
        )
    con.close()
def test_stress_to_redshift_spark_big(session, bucket, redshift_parameters):
    print("Creating DataFrame...")
    dataframe = session.spark_session.createDataFrame(pd.DataFrame({
        "A": list(range(10_000)),
        "B": list(range(10_000))
    }))
    dataframe.cache()
    for i in range(10):
        print(f"Run number: {i}")
        con = Redshift.generate_connection(
            database="test",
            host=redshift_parameters.get("RedshiftAddress"),
            port=redshift_parameters.get("RedshiftPort"),
            user="test",
            password=redshift_parameters.get("RedshiftPassword"),
        )
        session.spark.to_redshift(
            dataframe=dataframe,
            path=f"s3://{bucket}/redshift-load-{i}/",
            connection=con,
            schema="public",
            table="test",
            iam_role=redshift_parameters.get("RedshiftRole"),
            mode="overwrite",
            min_num_partitions=16,
        )
        con.close()
def test_read_sql_redshift_pandas(session, bucket, redshift_parameters, sample_name):
    if sample_name in ("micro", "small"):
        dates = ["date"]
    else:
        dates = ["date", "time"]
    df = pd.read_csv(f"data_samples/{sample_name}.csv", parse_dates=dates, infer_datetime_format=True)
    df["date"] = df["date"].dt.date
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/test_read_sql_redshift_pandas/"
    session.pandas.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        preserve_index=True,
    )
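    # Sketch of the read-back check (legacy Pandas.read_sql_redshift API and
    # the temp_s3_path value are assumptions): query the loaded table and
    # verify the row count survives the round trip.
    df2 = session.pandas.read_sql_redshift(
        sql="SELECT * FROM public.test",
        iam_role=redshift_parameters.get("RedshiftRole"),
        connection=con,
        temp_s3_path=f"s3://{bucket}/test_read_sql_redshift_pandas_tmp/",
    )
    con.close()
    assert len(df.index) == len(df2.index)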
def test_connection_timeout(redshift_parameters):
    with pytest.raises(pg8000.core.InterfaceError):
        Redshift.generate_connection(
            database="test",
            host=redshift_parameters.get("RedshiftAddress"),
            port=12345,
            user="test",
            password=redshift_parameters.get("RedshiftPassword"),
        )
def test_to_redshift_spark_exceptions(session, bucket, redshift_parameters, sample_name, mode, factor, diststyle,
                                      distkey, sortstyle, sortkey, exc):
    path = f"data_samples/{sample_name}.csv"
    dataframe = session.spark.read_csv(path=path)
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    with pytest.raises(exc):
        assert session.spark.to_redshift(
            dataframe=dataframe,
            path=f"s3://{bucket}/redshift-load/",
            connection=con,
            schema="public",
            table="test",
            iam_role=redshift_parameters.get("RedshiftRole"),
            diststyle=diststyle,
            distkey=distkey,
            # remaining kwargs assumed from the parametrization
            sortstyle=sortstyle,
            sortkey=sortkey,
            mode=mode,
        )
    con.close()
def test_to_redshift_pandas(session, bucket, redshift_parameters, sample_name, mode, factor, diststyle, distkey,
                            sortstyle, sortkey):
    if sample_name in ("micro", "small"):
        dates = ["date"]
    elif sample_name == "nano":
        dates = ["date", "time"]
    dataframe = pd.read_csv(f"data_samples/{sample_name}.csv", parse_dates=dates, infer_datetime_format=True)
    dataframe["date"] = dataframe["date"].dt.date
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load/"
    session.pandas.to_redshift(
        dataframe=dataframe,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        diststyle=diststyle,
        distkey=distkey,
        # remaining kwargs assumed from the parametrization
        sortstyle=sortstyle,
        sortkey=sortkey,
        mode=mode,
    )
    con.close()