How to use awswrangler - 10 common examples

To help you get started, we’ve selected a few awswrangler examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
def test_read_sql_redshift_pandas2(session, bucket, redshift_parameters):
    n: int = 1_000_000
    df = pd.DataFrame({"id": list((range(n))), "val": list(["foo" if i % 2 == 0 else "boo" for i in range(n)])})
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/test_read_sql_redshift_pandas2/"
    session.pandas.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        preserve_index=True,
github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
def test_connection_with_different_port_types(redshift_parameters):
    """Open and close a Redshift connection with the port as ``str``, then as ``float``.

    ``redshift_parameters`` is read through ``.get`` for the
    ``RedshiftAddress``, ``RedshiftPort`` and ``RedshiftPassword`` entries.
    There are no explicit assertions: the test passes when both connections
    are created and closed without raising.
    """
    # Same connection arguments each time; only the type of ``port`` varies.
    for cast_port in (str, float):
        connection = Redshift.generate_connection(
            database="test",
            host=redshift_parameters.get("RedshiftAddress"),
            port=cast_port(redshift_parameters.get("RedshiftPort")),
            user="test",
            password=redshift_parameters.get("RedshiftPassword"),
        )
        connection.close()
github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
path = f"data_samples/{sample_name}.csv"
    if sample_name == "micro":
        schema = "id SMALLINT, name STRING, value FLOAT, date DATE"
        timestamp_format = "yyyy-MM-dd"
    elif sample_name == "small":
        schema = "id BIGINT, name STRING, date DATE"
        timestamp_format = "dd-MM-yy"
    elif sample_name == "nano":
        schema = "id INTEGER, name STRING, value DOUBLE, date DATE, time TIMESTAMP"
        timestamp_format = "yyyy-MM-dd"
    dataframe = session.spark.read_csv(path=path,
                                       schema=schema,
                                       timestampFormat=timestamp_format,
                                       dateFormat=timestamp_format,
                                       header=True)
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    session.spark.to_redshift(
        dataframe=dataframe,
        path=f"s3://{bucket}/redshift-load/",
        connection=con,
        schema="public",
        table="test",
        iam_role=redshift_parameters.get("RedshiftRole"),
        diststyle=diststyle,
        distkey=distkey,
        sortstyle=sortstyle,
github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
def test_to_redshift_spark_decimal(session, bucket, redshift_parameters):
    """Write a Spark DataFrame with DECIMAL columns to Redshift table ``public.test2``.

    A three-row frame is built with an integer ``id`` column and two decimal
    columns (scale 2 and scale 5), each holding a ``None`` in the middle row.
    It is written with ``session.spark.to_redshift`` in ``overwrite`` mode,
    staged under ``s3://{bucket}/redshift-load2/``.

    Args:
        session: object exposing ``spark_session`` and ``spark.to_redshift``.
        bucket: S3 bucket name interpolated into the staging path.
        redshift_parameters: mapping read via ``.get`` for ``RedshiftAddress``,
            ``RedshiftPort``, ``RedshiftPassword`` and ``RedshiftRole``.
    """
    # String literals build exactly the same Decimal values (digits and
    # exponent) as the sign/digits/exponent tuples, and are easier to read.
    pandas_df = pd.DataFrame({
        "id": [1, 2, 3],
        "decimal_2": [Decimal("1.99"), None, Decimal("1.90")],
        "decimal_5": [Decimal("1.99999"), None, Decimal("1.90000")],
    })
    df = session.spark_session.createDataFrame(
        pandas_df,
        schema="id INTEGER, decimal_2 DECIMAL(3,2), decimal_5 DECIMAL(6,5)",
    )
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load2/"
    try:
        session.spark.to_redshift(
            dataframe=df,
            path=path,
            schema="public",
            table="test2",
            connection=con,
            iam_role=redshift_parameters.get("RedshiftRole"),
            mode="overwrite",
        )
    finally:
        # Release the connection even when the load raises, so a failing
        # test does not leave it open.
        con.close()
github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
def test_to_redshift_pandas_exceptions(session, bucket, redshift_parameters, sample_name, mode, factor, diststyle,
                                       distkey, sortstyle, sortkey, exc):
    dataframe = pd.read_csv(f"data_samples/{sample_name}.csv")
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load/"
    with pytest.raises(exc):
        assert session.pandas.to_redshift(
            dataframe=dataframe,
            path=path,
            schema="public",
            table="test",
            connection=con,
            iam_role=redshift_parameters.get("RedshiftRole"),
            diststyle=diststyle,
github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
def test_stress_to_redshift_spark_big(session, bucket, redshift_parameters):
    """Load the same 10,000-row Spark DataFrame into Redshift ten times in a row.

    Each run opens a fresh connection, overwrites table ``public.test`` via
    ``session.spark.to_redshift`` (staging under
    ``s3://{bucket}/redshift-load-{i}/``), and then closes that connection.
    There are no explicit assertions: the test passes when every run
    completes without raising.

    Args:
        session: object exposing ``spark_session`` and ``spark.to_redshift``.
        bucket: S3 bucket name interpolated into each staging path.
        redshift_parameters: mapping read via ``.get`` for ``RedshiftAddress``,
            ``RedshiftPort``, ``RedshiftPassword`` and ``RedshiftRole``.
    """
    print("Creating DataFrame...")
    dataframe = session.spark_session.createDataFrame(pd.DataFrame({
        "A": list(range(10_000)),
        "B": list(range(10_000))
    }))
    # Cached once up front because the same frame is written on every run.
    dataframe.cache()
    for i in range(10):
        print(f"Run number: {i}")
        con = Redshift.generate_connection(
            database="test",
            host=redshift_parameters.get("RedshiftAddress"),
            port=redshift_parameters.get("RedshiftPort"),
            user="test",
            password=redshift_parameters.get("RedshiftPassword"),
        )
        try:
            session.spark.to_redshift(
                dataframe=dataframe,
                path=f"s3://{bucket}/redshift-load-{i}/",
                connection=con,
                schema="public",
                table="test",
                iam_role=redshift_parameters.get("RedshiftRole"),
                mode="overwrite",
                min_num_partitions=16,
            )
        finally:
            # A new connection is opened per iteration; close it so the
            # loop does not accumulate ten open connections.
            con.close()
github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
def test_read_sql_redshift_pandas(session, bucket, redshift_parameters, sample_name):
    if sample_name == "micro":
        dates = ["date"]
    elif sample_name == "small":
        dates = ["date"]
    else:
        dates = ["date", "time"]
    df = pd.read_csv(f"data_samples/{sample_name}.csv", parse_dates=dates, infer_datetime_format=True)
    df["date"] = df["date"].dt.date
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/test_read_sql_redshift_pandas/"
    session.pandas.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        preserve_index=True,
github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
def test_connection_timeout(redshift_parameters):
    """Expect ``pg8000.core.InterfaceError`` when connecting on port 12345.

    The host and password come from ``redshift_parameters``; only the port is
    hard-coded (presumably one the cluster does not listen on, so the
    connection attempt fails).
    """
    expected_error = pg8000.core.InterfaceError
    port = 12345
    with pytest.raises(expected_error):
        Redshift.generate_connection(
            database="test",
            host=redshift_parameters.get("RedshiftAddress"),
            port=port,
            user="test",
            password=redshift_parameters.get("RedshiftPassword"),
        )
github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
def test_to_redshift_spark_exceptions(session, bucket, redshift_parameters, sample_name, mode, factor, diststyle,
                                      distkey, sortstyle, sortkey, exc):
    path = f"data_samples/{sample_name}.csv"
    dataframe = session.spark.read_csv(path=path)
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    with pytest.raises(exc):
        assert session.spark.to_redshift(
            dataframe=dataframe,
            path=f"s3://{bucket}/redshift-load/",
            connection=con,
            schema="public",
            table="test",
            iam_role=redshift_parameters.get("RedshiftRole"),
            diststyle=diststyle,
            distkey=distkey,
github awslabs / aws-data-wrangler / testing / test_awswrangler / test_redshift.py View on Github external
def test_to_redshift_pandas(session, bucket, redshift_parameters, sample_name, mode, factor, diststyle, distkey,
                            sortstyle, sortkey):

    if sample_name == "micro":
        dates = ["date"]
    if sample_name == "small":
        dates = ["date"]
    if sample_name == "nano":
        dates = ["date", "time"]
    dataframe = pd.read_csv(f"data_samples/{sample_name}.csv", parse_dates=dates, infer_datetime_format=True)
    dataframe["date"] = dataframe["date"].dt.date
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load/"
    session.pandas.to_redshift(
        dataframe=dataframe,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        diststyle=diststyle,
        distkey=distkey,