def test_spark_dataframe_output_csv():
    spark = SparkSession.builder.getOrCreate()
    num_df = (
        spark.read.format('csv')
        .options(header='true', inferSchema='true')
        .load(file_relative_path(__file__, 'num.csv'))
    )
    assert num_df.collect() == [Row(num1=1, num2=2)]

    @solid
    def emit(_):
        return num_df

    @solid(input_defs=[InputDefinition('df', DataFrame)], output_defs=[OutputDefinition(DataFrame)])
    def passthrough_df(_context, df):
        return df

    @pipeline
    def passthrough():
        passthrough_df(emit())

    with seven.TemporaryDirectory() as tempdir:
        file_name = os.path.join(tempdir, 'output.csv')
        result = execute_pipeline(

        ),
        Row(
            acct=10000001236,
            dollar_amt=1345.0,
            name="George Bluth",
            float_fld=1.0,
            accnt_purge=False,
        ),
        Row(
            acct=10000001237,
            dollar_amt=123456.0,
            name="Bob Loblaw",
            float_fld=345.12,
            accnt_purge=False,
        ),
        Row(
            acct=10000001238,
            dollar_amt=1.05,
            name="Loose Seal Bluth",
            float_fld=111.0,
            accnt_purge=True,
        ),
        Row(
            acct=10000001238,
            dollar_amt=1.05,
            name="Loose Seal Bluth",
            float_fld=111.0,
            accnt_purge=True,
        ),
    ]
    return spark.createDataFrame(mock_data2)
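
# For context: createDataFrame infers both column names and types from keyword-constructed
# Row objects like the ones in the fixture above. A minimal, self-contained sketch (the
# session and the single row are illustrative, not part of the fixture):
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
mock_df = spark.createDataFrame([
    Row(acct=10000001236, dollar_amt=1345.0, name="George Bluth",
        float_fld=1.0, accnt_purge=False),
])
mock_df.printSchema()  # acct: long, dollar_amt: double, name: string, float_fld: double, accnt_purge: boolean
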
def test_infer_nested_schema(self):
    NestedRow = Row("f1", "f2")
    nestedRdd1 = self.sc.parallelize([NestedRow([1, 2], {"row1": 1.0}),
                                      NestedRow([2, 3], {"row2": 2.0})])
    df = self.sqlCtx.inferSchema(nestedRdd1)
    self.assertEqual(Row(f1=[1, 2], f2={u'row1': 1.0}), df.collect()[0])

    nestedRdd2 = self.sc.parallelize([NestedRow([[1, 2], [2, 3]], [1, 2]),
                                      NestedRow([[2, 3], [3, 4]], [2, 3])])
    df = self.sqlCtx.inferSchema(nestedRdd2)
    self.assertEqual(Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), df.collect()[0])

    from collections import namedtuple
    CustomRow = namedtuple('CustomRow', 'field1 field2')
    rdd = self.sc.parallelize([CustomRow(field1=1, field2="row1"),
                               CustomRow(field1=2, field2="row2"),
                               CustomRow(field1=3, field2="row3")])
    df = self.sqlCtx.inferSchema(rdd)
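
# Note: SQLContext.inferSchema is very old PySpark API, long since deprecated in favor of
# createDataFrame; modern code gets the same schema inference by handing the Rows (or the
# RDD) to SparkSession.createDataFrame. A minimal sketch under that assumption (session
# setup is illustrative):
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
NestedRow = Row("f1", "f2")
df = spark.createDataFrame([NestedRow([1, 2], {"row1": 1.0}),
                            NestedRow([2, 3], {"row2": 2.0})])
assert df.collect()[0] == Row(f1=[1, 2], f2={"row1": 1.0})
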
def test_smoke_test_table_three():
    spark = spark_session_from_config()
    result = invoke_compute(
        TableThree,
        inputs={
            'table_one': spark.createDataFrame([Row(num=1)]),
            'table_two': spark.createDataFrame([Row(num=2)]),
        },
    )
    assert result.success
    assert set(result.output_value().collect()) == set([Row(num=1), Row(num=2)])

def test_rows_only_base_returns_a_dataframe_with_rows_only_in_base(spark, comparison1):
    # An explicit schema is required here: the frame has only one row and float_fld is
    # None, so its type could not be inferred from the data alone.
    schema = StructType(
        [
            StructField("acct", LongType(), True),
            StructField("date_fld", DateType(), True),
            StructField("dollar_amt", LongType(), True),
            StructField("float_fld", DoubleType(), True),
            StructField("name", StringType(), True),
        ]
    )
    expected_df = spark.createDataFrame(
        [
            Row(
                acct=10000001239,
                dollar_amt=1,
                name="Lucille Bluth",
                float_fld=None,
                date_fld=datetime.date(2017, 1, 1),
            )
        ],
        schema,
    )
    assert comparison1.rows_only_base.count() == 1
    assert expected_df.union(comparison1.rows_only_base).distinct().count() == 1
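
# Why the explicit schema above matters: when a column's only observed value is None, Spark
# has nothing to infer that column's type from, so createDataFrame needs the StructType.
# A small illustrative sketch (session setup assumed, values made up):
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import DoubleType, LongType, StructField, StructType

spark = SparkSession.builder.master("local[1]").getOrCreate()
rows = [Row(acct=10000001239, float_fld=None)]
# spark.createDataFrame(rows)          # fails: float_fld's type cannot be determined
schema = StructType([StructField("acct", LongType(), True),
                     StructField("float_fld", DoubleType(), True)])
df = spark.createDataFrame(rows, schema)  # succeeds, float_fld is a nullable double
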
    .map(lambda x: Row(gene_id=x[0].split('.')[0], expr_val=x[1])) \
    .toDF()
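
# The two lines above are the tail of an RDD-to-DataFrame pipeline; a self-contained sketch
# of the same pattern (the input pairs and names here are made up for illustration):
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
pairs = spark.sparkContext.parallelize([("GENE0001.14", 10.5), ("GENE0002.6", 0.0)])
expr_df = pairs.map(lambda x: Row(gene_id=x[0].split('.')[0], expr_val=x[1])).toDF()
expr_df.show()  # gene_id with the ".version" suffix stripped, plus expr_val
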
spark = SparkSession.builder \
    .master("local[4]") \
    .appName("sql.group tests") \
    .getOrCreate()
sc = spark.sparkContext
globs['sc'] = sc
globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
    .toDF(StructType([StructField('age', IntegerType()),
                      StructField('name', StringType())]))
globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80),
                               Row(name='Bob', age=5, height=85)]).toDF()
globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000),
                               Row(course="Java", year=2012, earnings=20000),
                               Row(course="dotNET", year=2012, earnings=5000),
                               Row(course="dotNET", year=2013, earnings=48000),
                               Row(course="Java", year=2013, earnings=30000)]).toDF()
(failure_count, test_count) = doctest.testmod(
    pyspark.sql.group, globs=globs,
    optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
spark.stop()
if failure_count:
    exit(-1)
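
# The block above seeds `globs` so the doctests embedded in pyspark.sql.group run against
# ready-made DataFrames. A tiny sketch of the same doctest-with-globs mechanism on its own
# (the function and values are illustrative, unrelated to Spark):
import doctest

def double(x):
    """
    >>> double(n)
    10
    """
    return x * 2

globs = {"double": double, "n": 5}
(failure_count, test_count) = doctest.testmod(globs=globs)
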
def build_status_table(self):
    rtime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time()))
    print("Refreshed at:", rtime)
    tracking_uri = mlflow.tracking.get_tracking_uri()
    rows = [Row(refreshed_at=rtime,
                tracking_uri=tracking_uri,
                tracking_host=mlflow_utils.get_mlflow_host(tracking_uri),
                version=mlflow.version.VERSION)]
    df = spark.createDataFrame(rows)
    self.write_df(df, "mlflow_status")
    self.build_table_ddl("mlflow_status")

# Tuple-unpacking lambdas (`lambda (x, y): ...`) are Python 2 only; unpack explicitly for Python 3.
video_map = video_ids.map(lambda xy: Row(videoid=xy[0].videoid, videoid_int=xy[1])).toDF().cache()

def generate_id_df(self, df, id_name):
    """
    Create a new column with a unique, consecutive id for each row in df.

    :param df: dataframe to be given an id column
    :param id_name: the name of the id column
    :return: new df with the generated id column
    """
    initial_id = self.start_id_map[id_name] if id_name in self.start_id_map else 0
    row_with_index = Row(*(["id"] + df.columns))
    df_columns = df.columns

    # Use zipWithIndex to generate consecutive ids rather than monotonically_increasing_id:
    # consecutive ids are desired because unnecessarily large ids would complicate future
    # appends to the same metastore (generated ids have to be bigger than the max of the
    # ids already in the database).
    def make_row_with_uid(columns, row, uid):
        row_dict = row.asDict()
        return row_with_index(*([uid] + [row_dict.get(c) for c in columns]))

    df_with_pk = (df.rdd
                  .zipWithIndex()
                  .map(lambda row_uid: make_row_with_uid(df_columns, *row_uid))
                  .toDF(StructType([StructField("zip_id", LongType(), False)] + df.schema.fields)))

    return df_with_pk.withColumn(id_name, df_with_pk.zip_id + initial_id).drop("zip_id")
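
# A stripped-down, standalone sketch of the same zipWithIndex idea (names and the offset
# are illustrative): assign consecutive ids starting at an offset, avoiding the large gaps
# that monotonically_increasing_id can produce.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("a",), ("b",), ("c",)], ["value"])

start_id = 100  # e.g. one past the max id already stored in the target table
with_ids = (df.rdd
            .zipWithIndex()                                    # -> (Row, 0), (Row, 1), ...
            .map(lambda pair: (pair[1] + start_id,) + tuple(pair[0]))
            .toDF(["row_id"] + df.columns))
with_ids.show()  # row_id runs 100, 101, 102 in RDD order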