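# Excerpts from tests for dagster's experimental lakehouse example. The snippets
# assume the standard-library and third-party imports below, plus the lakehouse
# helpers used throughout (construct_lakehouse_pipeline, sqlite_table, input_table,
# SqlLiteLakehouse, pyspark_table, and the various lakehouse/resource objects),
# whose exact module paths are not shown here.
import os
import sqlite3

import pytest
from pyspark.sql import Row

from dagster import DagsterInvalidDefinitionError, execute_pipeline, file_relative_path

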
@sqlite_table
def TableTwo(context):
    context.resources.conn.execute('''CREATE TABLE TableTwo AS SELECT 2 as num''')
    context.resources.conn.commit()


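# TableOne is referenced below but its definition is not part of this snippet.
# A minimal sketch of what it presumably looks like, mirroring TableTwo (the
# final assertion expects it to contribute the row (1,)):
@sqlite_table
def TableOne(context):
    context.resources.conn.execute('''CREATE TABLE TableOne AS SELECT 1 as num''')
    context.resources.conn.commit()

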
# input_tables declares the upstream dependencies, so TableThree is only built
# after TableOne and TableTwo exist.
@sqlite_table(
    input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)]
)
def TableThree(context, **_kwargs):
    context.resources.conn.execute(
        'CREATE TABLE TableThree AS SELECT num from TableOne UNION SELECT num from TableTwo'
    )
    context.resources.conn.commit()


# Build and execute the pipeline against an in-memory sqlite database, then
# verify that TableThree contains the union of the two source tables.
conn = sqlite3.connect(':memory:')
pipeline_def = construct_lakehouse_pipeline(
    name='sqllite_lakehouse_pipeline',
    lakehouse_tables=[TableOne, TableTwo, TableThree],
    resources={'conn': conn, 'lakehouse': SqlLiteLakehouse()},
)
result = execute_pipeline(pipeline_def)
assert result.success
assert conn.cursor().execute('SELECT * FROM TableThree').fetchall() == [(1,), (2,)]


def test_execute_typed_in_mem_lakehouse(execute_spark_lakehouse_build):
    # TypedPySparkMemLakehouse keeps the materialized tables in memory, exposed
    # via collected_tables, so the join result can be inspected directly.
    lakehouse = TypedPySparkMemLakehouse()
    pipeline_result = execute_spark_lakehouse_build(
        tables=[NumberTable, StringTable, JoinTable], lakehouse=lakehouse
    )
    assert pipeline_result.success

    # Row field ordering varies on Python 3.5 - compare as dicts
    assert (
        lakehouse.collected_tables['JoinTable'][0].asDict()
        == Row(id=1, number=2, string='23').asDict()
    )


# Module-level pipeline definition so it can be loaded in dagit.
typed_lakehouse_pipeline = construct_lakehouse_pipeline(
    name='typed_lakehouse_pipeline',
    lakehouse_tables=[NumberTable, StringTable, JoinTable],
    resources={'lakehouse': typed_pyspark_mem_lakehouse, 'spark': pyspark_resource},
)


def test_execute_byfeature_parquet_lakehouse():
    with get_temp_dir() as temp_dir:
        lakehouse = ByFeatureParquetLakehouse(temp_dir)
        pipeline_def = construct_lakehouse_pipeline(
            name='test',
            lakehouse_tables=[TableOne, TableTwo, TableThree],
            resources={'spark': pyspark_resource, 'lakehouse': lakehouse},
        )
        pipeline_result = execute_pipeline(pipeline_def)
        assert pipeline_result.success

        def get_table(table_def):
            # Tables are written to parquet under <temp_dir>/<feature area>/<table name>.
            spark = spark_session_from_config()
            return spark.read.parquet(
                os.path.join(temp_dir, table_def.metadata[FEATURE_AREA], table_def.name)
            ).collect()

        assert get_table(TableOne) == [Row(num=1)]
        assert get_table(TableTwo) == [Row(num=2)]


def test_missing_resource():
    with pytest.raises(DagsterInvalidDefinitionError):

        @pyspark_table
        def missing(_):
            pass

        # Constructing the pipeline without the 'spark' resource that the table
        # requires should raise DagsterInvalidDefinitionError.
        construct_lakehouse_pipeline('test', lakehouse_tables=[missing], resources={})


def test_snowflake():
    # Only verifies that the pipeline can be constructed with the snowflake
    # resources; the pipeline is not executed here.
    construct_lakehouse_pipeline(
        name='snowflake_lake',
        lakehouse_tables=[TableOne],
        resources={'snowflake': snowflake_resource, 'lakehouse': SnowflakeLakehouse()},
    )


def test_file_based_sqlite_pipeline():
    def path_for_table(table_name):
        return file_relative_path(
            __file__, 'basic_sqllite_test_files/{table_name}.sql'.format(table_name=table_name)
        )

    # Define the same three tables, but from .sql files on disk rather than
    # from inline table functions.
    TableOne = create_sqllite_table_from_file(path_for_table('TableOne'))
    TableTwo = create_sqllite_table_from_file(path_for_table('TableTwo'))
    TableThree = create_sqllite_table_from_file(
        path_for_table('TableThree'),
        input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)],
    )

    conn = sqlite3.connect(':memory:')
    pipeline_def = construct_lakehouse_pipeline(
        name='sqllite_lakehouse_pipeline',
        lakehouse_tables=[TableOne, TableTwo, TableThree],
        resources={'conn': conn, 'lakehouse': SqlLiteLakehouse()},
    )
    result = execute_pipeline(pipeline_def)
    assert result.success
    assert conn.cursor().execute('SELECT * FROM TableThree').fetchall() == [(1,), (2,)]

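# The .sql files themselves are not shown in this snippet. Presumably, mirroring
# the inline table definitions above (and consistent with the final assertion),
# they contain statements along the lines of:
#
#   -- basic_sqllite_test_files/TableOne.sql
#   CREATE TABLE TableOne AS SELECT 1 as num
#
#   -- basic_sqllite_test_files/TableTwo.sql
#   CREATE TABLE TableTwo AS SELECT 2 as num
#
#   -- basic_sqllite_test_files/TableThree.sql
#   CREATE TABLE TableThree AS SELECT num from TableOne UNION SELECT num from TableTwo

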
def execute_spark_lakehouse_build(tables, lakehouse, environment_dict=None):
    return execute_pipeline(
        construct_lakehouse_pipeline(
            name='spark_lakehouse_pipeline',
            lakehouse_tables=tables,
            resources={'lakehouse': lakehouse, 'spark': spark_session_resource},
        ),
        environment_dict=environment_dict,
    )


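# test_execute_typed_in_mem_lakehouse above receives execute_spark_lakehouse_build
# by name as a test argument, which suggests the helper is also exposed as a pytest
# fixture (typically from a conftest.py). A minimal sketch of such a registration,
# assuming that setup; the actual fixture wiring is not shown in these snippets:
@pytest.fixture(name='execute_spark_lakehouse_build')
def execute_spark_lakehouse_build_fixture():
    return execute_spark_lakehouse_build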