How to use the pyspark.sql.Row class in pyspark

To help you get started, we’ve selected a few pyspark.sql.Row examples, drawn from popular ways it is used in public projects.

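Before the project examples, here is a minimal, self-contained sketch of the basic pattern (assuming a local Spark installation): Row behaves like a named tuple, and a list of Row objects can be passed directly to SparkSession.createDataFrame.

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").appName("row-demo").getOrCreate()

# Row instances behave like named tuples: fields are accessible by name or by position.
people = [Row(name="Alice", age=2), Row(name="Bob", age=5)]
df = spark.createDataFrame(people)

assert df.collect()[0].name == "Alice"
spark.stop()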

github dagster-io / dagster / examples / dagster_examples_tests / airline_demo_tests / test_types.py
def test_spark_dataframe_output_csv():
    spark = SparkSession.builder.getOrCreate()
    num_df = (
        spark.read.format('csv')
        .options(header='true', inferSchema='true')
        .load(file_relative_path(__file__, 'num.csv'))
    )

    assert num_df.collect() == [Row(num1=1, num2=2)]

    @solid
    def emit(_):
        return num_df

    @solid(input_defs=[InputDefinition('df', DataFrame)], output_defs=[OutputDefinition(DataFrame)])
    def passthrough_df(_context, df):
        return df

    @pipeline
    def passthrough():
        passthrough_df(emit())

    with seven.TemporaryDirectory() as tempdir:
        file_name = os.path.join(tempdir, 'output.csv')
        result = execute_pipeline(
github capitalone / datacompy / tests / test_sparkcompare.py
),
        Row(
            acct=10000001236,
            dollar_amt=1345.0,
            name="George Bluth",
            float_fld=1.0,
            accnt_purge=False,
        ),
        Row(
            acct=10000001237,
            dollar_amt=123456.0,
            name="Bob Loblaw",
            float_fld=345.12,
            accnt_purge=False,
        ),
        Row(
            acct=10000001238,
            dollar_amt=1.05,
            name="Loose Seal Bluth",
            float_fld=111.0,
            accnt_purge=True,
        ),
        Row(
            acct=10000001238,
            dollar_amt=1.05,
            name="Loose Seal Bluth",
            float_fld=111.0,
            accnt_purge=True,
        ),
    ]

    return spark.createDataFrame(mock_data2)
github UCLA-VAST / blaze / spark-1.5.1 / python / pyspark / sql / tests.py
def test_infer_nested_schema(self):
        NestedRow = Row("f1", "f2")
        nestedRdd1 = self.sc.parallelize([NestedRow([1, 2], {"row1": 1.0}),
                                          NestedRow([2, 3], {"row2": 2.0})])
        df = self.sqlCtx.inferSchema(nestedRdd1)
        self.assertEqual(Row(f1=[1, 2], f2={u'row1': 1.0}), df.collect()[0])

        nestedRdd2 = self.sc.parallelize([NestedRow([[1, 2], [2, 3]], [1, 2]),
                                          NestedRow([[2, 3], [3, 4]], [2, 3])])
        df = self.sqlCtx.inferSchema(nestedRdd2)
        self.assertEqual(Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), df.collect()[0])

        from collections import namedtuple
        CustomRow = namedtuple('CustomRow', 'field1 field2')
        rdd = self.sc.parallelize([CustomRow(field1=1, field2="row1"),
                                   CustomRow(field1=2, field2="row2"),
                                   CustomRow(field1=3, field2="row3")])
        df = self.sqlCtx.inferSchema(rdd)
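The snippet above also shows Row used as a factory: calling Row with field names returns a row class whose instances carry those fields, much like collections.namedtuple. A minimal sketch of that pattern, with illustrative names:

from pyspark.sql import Row

Person = Row("name", "age")                # Row as a class factory with fixed field names
people = [Person("Alice", 2), Person("Bob", 5)]

assert people[0].name == "Alice"           # instances behave like named tuples
assert people[1]["age"] == 5               # ...and also support item access by field name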
github dagster-io / dagster / python_modules / lakehouse / lakehouse_tests / test_pyspark_custom_url_scheme_lakehouse.py
def test_smoke_test_table_three():
    spark = spark_session_from_config()

    result = invoke_compute(
        TableThree,
        inputs={
            'table_one': spark.createDataFrame([Row(num=1)]),
            'table_two': spark.createDataFrame([Row(num=2)]),
        },
    )

    assert result.success
    assert set(result.output_value().collect()) == set([Row(num=1), Row(num=2)])
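The set comparison in the last assertion works because Row objects compare by value and are hashable. A tiny illustration:

from pyspark.sql import Row

assert Row(num=1) == Row(num=1)                                # Rows compare by value
assert {Row(num=1), Row(num=2)} == {Row(num=2), Row(num=1)}    # ...and are hashable, so sets work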
github capitalone / datacompy / tests / test_sparkcompare.py
def test_rows_only_base_returns_a_dataframe_with_rows_only_in_base(spark, comparison1):
    # an explicit schema is required because the single row contains a None field value, so its type cannot be inferred
    schema = StructType(
        [
            StructField("acct", LongType(), True),
            StructField("date_fld", DateType(), True),
            StructField("dollar_amt", LongType(), True),
            StructField("float_fld", DoubleType(), True),
            StructField("name", StringType(), True),
        ]
    )
    expected_df = spark.createDataFrame(
        [
            Row(
                acct=10000001239,
                dollar_amt=1,
                name="Lucille Bluth",
                float_fld=None,
                date_fld=datetime.date(2017, 1, 1),
            )
        ],
        schema,
    )

    assert comparison1.rows_only_base.count() == 1
    assert expected_df.union(comparison1.rows_only_base).distinct().count() == 1
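The explicit schema above matters because Spark cannot infer a column's type when its only value is None. A minimal sketch of the same situation, assuming an existing SparkSession named spark:

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, LongType, DoubleType

schema = StructType([
    StructField("acct", LongType(), True),
    StructField("float_fld", DoubleType(), True),   # the all-None column takes its type from the schema
])

# Without the schema, createDataFrame could not infer a type for float_fld.
df = spark.createDataFrame([Row(acct=10000001239, float_fld=None)], schema)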
github OXPHOS / GeneMiner / src / pipeline / rnaseq_processor.py
                .map(lambda x: Row(gene_id=x[0].split('.')[0], expr_val=x[1])) \
                .toDF()
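The fragment above is part of a longer RDD chain; a self-contained sketch of the same map-to-Row-then-toDF pattern, with hypothetical gene ids and a local SparkSession standing in for the project's real input, might look like:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Hypothetical (gene_id, expression) pairs standing in for the real input RDD.
pairs = spark.sparkContext.parallelize([("ENSG0000123.4", 2.5), ("ENSG0000456.1", 7.0)])

expr_df = (
    pairs.map(lambda x: Row(gene_id=x[0].split('.')[0], expr_val=x[1]))
         .toDF()
)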
github qubole / spark-on-lambda / python / pyspark / sql / group.py
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.group tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80),
                                   Row(name='Bob', age=5, height=85)]).toDF()
    globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000),
                                   Row(course="Java",   year=2012, earnings=20000),
                                   Row(course="dotNET", year=2012, earnings=5000),
                                   Row(course="dotNET", year=2013, earnings=48000),
                                   Row(course="Java",   year=2013, earnings=30000)]).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.group, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    spark.stop()
    if failure_count:
        exit(-1)
github amesar / mlflow-fun / tools / mlflow_fun / analytics / build_tables.py
def build_status_table(self):
        rtime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time()))
        print("Refreshed at:", rtime)
        tracking_uri = mlflow.tracking.get_tracking_uri()
        rows = [Row(refreshed_at=rtime,
                    tracking_uri=tracking_uri,
                    tracking_host=mlflow_utils.get_mlflow_host(tracking_uri),
                    version=mlflow.version.VERSION)]
        df = spark.createDataFrame(rows)
        self.write_df(df, "mlflow_status")
        self.build_table_ddl("mlflow_status")
github LukeTillman / killrvideo-csharp / data / spark / recommendations.py
video_map = video_ids.map(lambda (x, y): Row(videoid=x.videoid, videoid_int=y)).toDF().cache()
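The tuple-unpacking lambda here is Python 2 only (that syntax was removed in Python 3); a hedged Python 3 rewrite of the same mapping, assuming video_ids is an RDD of (row, int) pairs, would be:

video_map = (
    video_ids
    .map(lambda pair: Row(videoid=pair[0].videoid, videoid_int=pair[1]))   # no tuple unpacking in Python 3 lambdas
    .toDF()
    .cache()
)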
github aws-samples / aws-glue-samples / utilities / Hive_metastore_migration / src / hive_metastore_migration.py
def generate_id_df(self, df, id_name):
        """
        generate_id_df creates a new column with a unique id for each row in df
        :param df: dataframe to be given an id column
        :param id_name: name of the id column
        :return: new df with the generated id column
        """
        initial_id = self.start_id_map[id_name] if id_name in self.start_id_map else 0

        row_with_index = Row(*(["id"] + df.columns))
        df_columns = df.columns

        # use zipWithIndex to generate consecutive ids, rather than monotonically_increasing_id.
        # consecutive ids are desired because unnecessarily large ids complicate future appends
        # to the same metastore (generated ids have to be bigger than the max of the ids
        # already in the database).
        def make_row_with_uid(columns, row, uid):
            row_dict = row.asDict()
            return row_with_index(*([uid] + [row_dict.get(c) for c in columns]))

        df_with_pk = (df.rdd
                      .zipWithIndex()
                      .map(lambda row_uid: make_row_with_uid(df_columns, *row_uid))
                      .toDF(StructType([StructField("zip_id", LongType(), False)] + df.schema.fields)))

        return df_with_pk.withColumn(id_name, df_with_pk.zip_id + initial_id).drop("zip_id")
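Stripped of the metastore-specific pieces, the core of the technique is zipWithIndex plus an offset. A minimal sketch, where the SparkSession, df, and start_id are assumptions and df is assumed to have no existing id column:

from pyspark.sql import Row

start_id = 100   # hypothetical offset, playing the role of initial_id above

df_with_id = (
    df.rdd
    .zipWithIndex()                                              # -> (row, 0), (row, 1), ...
    .map(lambda pair: Row(id=pair[1] + start_id, **pair[0].asDict()))
    .toDF()
)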