@this_pyspark_table(feature_area=FEATURE_ONE)
def TableTwo(context) -> SparkDF:
    return context.resources.spark.spark_session.createDataFrame([Row(num=2)])


@this_pyspark_table(
    input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)],
    feature_area=FEATURE_TWO,
)
def TableThree(_, table_one: SparkDF, table_two: SparkDF) -> SparkDF:
    return table_one.union(table_two)
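For orientation, the decorated tables above boil down to ordinary pyspark DataFrames. A minimal standalone sketch of what TableThree computes, with the lakehouse decorators and resources stripped away and a locally built SparkSession standing in for the spark resource:

# Standalone pyspark sketch of TableThree's computation (outside the lakehouse framework).
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master('local[1]').appName('sketch').getOrCreate()
table_one = spark.createDataFrame([Row(num=1)])   # analogous to TableOne
table_two = spark.createDataFrame([Row(num=2)])   # TableTwo above
table_three = table_one.union(table_two)          # what TableThree returns
table_three.show()
# +---+
# |num|
# +---+
# |  1|
# |  2|
# +---+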
class ByFeatureParquetLakehouse(Lakehouse):
    """Stores each table as parquet under <root_dir>/<feature area>/<table name>."""

    def __init__(self, root_dir):
        self.lakehouse_path = check.str_param(root_dir, 'root_dir')

    def _path_for_table(self, table_type, table_metadata):
        # Partition the on-disk layout by the table's feature area.
        return os.path.join(self.lakehouse_path, table_metadata[FEATURE_AREA], table_type.name)

    def hydrate(self, context, table_type, table_metadata, _table_handle, _dest_metadata):
        # Load the table back from its parquet location.
        path = self._path_for_table(table_type, table_metadata)
        return context.resources.spark.spark_session.read.parquet(path)

    def materialize(self, _context, table_type, table_metadata, value):
        # Write the table as parquet and report the path as a Materialization.
        path = self._path_for_table(table_type, table_metadata)
        value.write.parquet(path=path, mode='overwrite')
        return Materialization.file(path), None
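A quick, hypothetical check of the path scheme above, using a namedtuple as a stand-in for the framework's table_type object; FEATURE_AREA and the 'feature_two' value are assumed to be the metadata key and a feature-area name from the surrounding example:

# Hypothetical stand-in objects, for illustration only.
from collections import namedtuple

FakeTableType = namedtuple('FakeTableType', 'name')
lakehouse = ByFeatureParquetLakehouse('/tmp/lakehouse')
print(lakehouse._path_for_table(FakeTableType('TableThree'), {FEATURE_AREA: 'feature_two'}))
# -> /tmp/lakehouse/feature_two/TableThree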
def create_column_descriptions(spark_type):
    buildme = '**Columns:**\n\n '
    parts = []
    for spark_field in spark_type.fields:
        parts.append(
            '{name}: {type_name}'.format(
                name=spark_field.name, type_name=spark_field.dataType.typeName()
            )
        )
    return buildme + '\n '.join(parts)
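An example call with a concrete pyspark schema, to show what the formatting above produces; the column names here are illustrative:

from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType([
    StructField('id', IntegerType()),
    StructField('name', StringType()),
])
print(create_column_descriptions(schema))
# **Columns:**
#
#  id: integer
#  name: string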
class TypedPySparkMemLakehouse(Lakehouse):
    """Keeps materialized tables in memory rather than writing them to storage."""

    def __init__(self):
        self.collected_tables = {}

    def hydrate(self, _context, _table_type, _table_metadata, table_handle, _dest_metadata):
        # Nothing was persisted, so hand back the in-memory DataFrame directly.
        return table_handle.value

    def materialize(self, _context, table_type, _table_metadata, value):
        # Record the collected rows for later inspection and keep the DataFrame
        # available to downstream tables via an in-memory handle.
        self.collected_tables[table_type.name] = value.collect()
        return None, InMemTableHandle(value=value)


@resource
def typed_pyspark_mem_lakehouse(_):
    return TypedPySparkMemLakehouse()
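One plausible way to wire this resource into a run, sketched with construct_lakehouse_pipeline and execute_pipeline from the imports shown in the next snippet; the keyword names and the 'spark'/'lakehouse' resource keys are assumptions about that API, not confirmed by this page:

# Hedged sketch only: keyword names and resource keys below are assumptions.
from dagster import execute_pipeline
from dagster_pyspark import spark_session_resource
from lakehouse import construct_lakehouse_pipeline

pipeline = construct_lakehouse_pipeline(
    name='in_mem_lakehouse_pipeline',
    lakehouse_tables=[TableOne, TableTwo, TableThree],
    resources={'spark': spark_session_resource, 'lakehouse': typed_pyspark_mem_lakehouse},
)
result = execute_pipeline(pipeline)
assert result.success
# Instantiate TypedPySparkMemLakehouse directly (rather than via the resource
# function) if you need to inspect collected_tables after the run.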
import os

from dagster import Materialization, check, execute_pipeline
from dagster_pyspark import spark_session_resource
from lakehouse import construct_lakehouse_pipeline, Lakehouse


class LocalOnDiskSparkCsvLakehouse(Lakehouse):
    """Stores each table as a CSV directory under <root_dir>/<table name>."""

    def __init__(self, root_dir):
        self.lakehouse_path = check.str_param(root_dir, 'root_dir')

    def _path_for_table(self, table_type):
        return os.path.join(self.lakehouse_path, table_type.name)

    def hydrate(self, context, table_type, _table_metadata, _table_handle, _dest_metadata):
        # Read the table back from CSV, inferring the schema from the data.
        path = self._path_for_table(table_type)
        return context.resources.spark.read.csv(path, header=True, inferSchema=True)

    def materialize(self, _context, table_type, _table_metadata, value):
        # Write the table as CSV and report the path as a Materialization.
        path = self._path_for_table(table_type)
        value.write.csv(path=path, header=True, mode='overwrite')
        return Materialization.file(path), None
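To see the materialize/hydrate round trip in isolation, here is a standalone sketch that drives the class with plain pyspark and simple stand-ins for the framework's context and table_type objects. The stand-ins, paths, and table name are illustrative only, and context.resources.spark is assumed to be the SparkSession itself, as the hydrate implementation above implies:

# Standalone round-trip sketch with hypothetical stand-ins for the framework objects.
from collections import namedtuple

from pyspark.sql import Row, SparkSession

FakeTableType = namedtuple('FakeTableType', 'name')
FakeResources = namedtuple('FakeResources', 'spark')
FakeContext = namedtuple('FakeContext', 'resources')

spark = SparkSession.builder.master('local[1]').appName('csv_lakehouse_sketch').getOrCreate()
lakehouse = LocalOnDiskSparkCsvLakehouse('/tmp/lakehouse')

# Materialize two rows to /tmp/lakehouse/numbers as CSV.
df = spark.createDataFrame([Row(num=1), Row(num=2)])
lakehouse.materialize(None, FakeTableType('numbers'), None, df)

# Hydrate them back through a stand-in context whose spark resource is the SparkSession.
context = FakeContext(resources=FakeResources(spark=spark))
loaded = lakehouse.hydrate(context, FakeTableType('numbers'), None, None, None)
assert {row.num for row in loaded.collect()} == {1, 2}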