How to use the lakehouse.Lakehouse class in lakehouse

To help you get started, we’ve selected a few lakehouse.Lakehouse examples, based on popular ways the class is used in public projects.


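Every example on this page subclasses Lakehouse and fills in the same two hooks: hydrate, which loads a table's backing data into memory, and materialize, which persists a computed value. A minimal skeleton (the method signatures are copied from the examples below; MyLakehouse is just a placeholder name):

from lakehouse import Lakehouse


class MyLakehouse(Lakehouse):
    # Load a table's stored data and return it as an in-memory value
    # (the examples below return Spark DataFrames).
    def hydrate(self, context, table_type, table_metadata, table_handle, dest_metadata):
        raise NotImplementedError()

    # Persist a computed value and return a (Materialization, table handle)
    # pair; either element may be None, as the examples below show.
    def materialize(self, context, table_type, table_metadata, value):
        raise NotImplementedError()
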
From dagster-io/dagster, python_modules/lakehouse/lakehouse_tests/test_pyspark_custom_url_scheme_lakehouse.py (view on GitHub):
@this_pyspark_table(feature_area=FEATURE_ONE)
def TableTwo(context) -> SparkDF:
    # A source table: builds a one-row DataFrame from the Spark session resource.
    return context.resources.spark.spark_session.createDataFrame([Row(num=2)])


@this_pyspark_table(
    # Declare upstream dependencies; the named DataFrames are passed in as arguments.
    input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)],
    feature_area=FEATURE_TWO,
)
def TableThree(_, table_one: SparkDF, table_two: SparkDF) -> SparkDF:
    # A derived table: the union of its two input tables.
    return table_one.union(table_two)


class ByFeatureParquetLakehouse(Lakehouse):
    def __init__(self, root_dir):
        self.lakehouse_path = check.str_param(root_dir, 'root_dir')

    def _path_for_table(self, table_type, table_metadata):
        # Lay tables out on disk as <root>/<feature area>/<table name>.
        return os.path.join(self.lakehouse_path, table_metadata[FEATURE_AREA], table_type.name)

    def hydrate(self, context, table_type, table_metadata, _table_handle, _dest_metadata):
        # Read a stored table back into a Spark DataFrame.
        path = self._path_for_table(table_type, table_metadata)
        return context.resources.spark.spark_session.read.parquet(path)

    def materialize(self, _context, table_type, table_metadata, value):
        # Write the computed DataFrame as Parquet and report the output path.
        path = self._path_for_table(table_type, table_metadata)
        value.write.parquet(path=path, mode='overwrite')
        return Materialization.file(path), None
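
To execute tables like these against the Parquet lakehouse, the tests assemble a pipeline with construct_lakehouse_pipeline (imported in the common.py example further down). A hedged sketch: the keyword arguments shown here are assumptions about that function's signature, and the root directory is a placeholder.

from dagster import execute_pipeline, resource
from dagster_pyspark import spark_session_resource

from lakehouse import construct_lakehouse_pipeline


@resource
def by_feature_parquet_lakehouse(_):
    # Hypothetical resource wrapper, mirroring typed_pyspark_mem_lakehouse below.
    return ByFeatureParquetLakehouse('/tmp/lakehouse')


pipeline = construct_lakehouse_pipeline(
    name='parquet_lakehouse_pipeline',  # assumed keyword
    # TableOne is defined earlier in the test module, above this excerpt.
    lakehouse_tables=[TableOne, TableTwo, TableThree],  # assumed keyword
    resources={
        'lakehouse': by_feature_parquet_lakehouse,
        'spark': spark_session_resource,
    },
)

execute_pipeline(pipeline)
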
From dagster-io/dagster, python_modules/lakehouse/lakehouse_tests/test_typed_pyspark_lakehouse.py (view on GitHub; the excerpt starts mid-file):


def create_column_descriptions(spark_type):
    # Render a Spark StructType's fields as an indented Markdown block,
    # one 'name: type' line per column.
    header = '**Columns:**\n\n    '
    parts = []
    for spark_field in spark_type.fields:
        parts.append(
            '{name}: {type_name}'.format(
                name=spark_field.name, type_name=spark_field.dataType.typeName()
            )
        )
    return header + '\n    '.join(parts)
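
As a quick illustration of the helper above (the schema fields here are hypothetical):

from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType(
    [StructField('id', IntegerType()), StructField('name', StringType())]
)
print(create_column_descriptions(schema))
# **Columns:**
#
#     id: integer
#     name: string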


class TypedPySparkMemLakehouse(Lakehouse):
    def __init__(self):
        # Tables are kept in memory, keyed by table name, instead of on disk.
        self.collected_tables = {}

    def hydrate(self, _context, _table_type, _table_metadata, table_handle, _dest_metadata):
        # The handle already wraps the in-memory DataFrame; just unwrap it.
        return table_handle.value

    def materialize(self, _context, table_type, _table_metadata, value):
        # Collect the rows for later inspection and hand back an in-memory
        # handle instead of a Materialization.
        self.collected_tables[table_type.name] = value.collect()
        return None, InMemTableHandle(value=value)


@resource
def typed_pyspark_mem_lakehouse(_):
    # Expose the in-memory lakehouse as a Dagster resource definition.
    return TypedPySparkMemLakehouse()
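
Because nothing touches disk, the in-memory lakehouse is easy to exercise directly. A sketch using hand-rolled stand-ins; the fake table type and fake DataFrame below are for illustration only and expose just the attributes the class uses:

from types import SimpleNamespace

house = TypedPySparkMemLakehouse()

# Fake DataFrame exposing only the collect() method materialize() calls.
fake_df = SimpleNamespace(collect=lambda: [{'num': 1}])
fake_table = SimpleNamespace(name='table_one')

_, handle = house.materialize(None, fake_table, None, fake_df)
assert house.collected_tables == {'table_one': [{'num': 1}]}

# hydrate() simply unwraps the handle produced by materialize().
assert house.hydrate(None, None, None, handle, None) is fake_df
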
From dagster-io/dagster, python_modules/lakehouse/lakehouse_tests/common.py (view on GitHub):
import os

from dagster import Materialization, check, execute_pipeline
from dagster_pyspark import spark_session_resource

from lakehouse import construct_lakehouse_pipeline, Lakehouse


class LocalOnDiskSparkCsvLakehouse(Lakehouse):
    def __init__(self, root_dir):
        self.lakehouse_path = check.str_param(root_dir, 'root_dir')

    def _path_for_table(self, table_type):
        # One directory per table, directly under the root.
        return os.path.join(self.lakehouse_path, table_type.name)

    def hydrate(self, context, table_type, _table_metadata, _table_handle, _dest_metadata):
        # Read the table back from CSV, inferring the schema from the data.
        path = self._path_for_table(table_type)
        return context.resources.spark.read.csv(path, header=True, inferSchema=True)

    def materialize(self, _context, table_type, _table_metadata, value):
        # Write the computed DataFrame as CSV with a header row.
        path = self._path_for_table(table_type)
        value.write.csv(path=path, header=True, mode='overwrite')
        return Materialization.file(path), None
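
A round-trip check of this class can be run without the full pipeline machinery, since only materialize() and hydrate() touch storage. A sketch; the SimpleNamespace stand-ins for the table type and the solid context are fakes for illustration:

from types import SimpleNamespace

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
house = LocalOnDiskSparkCsvLakehouse('/tmp/lakehouse')

# Minimal stand-in for the table type the pipeline would pass in;
# this class only reads its .name attribute.
numbers_table = SimpleNamespace(name='numbers')

# Write a DataFrame out through materialize() ...
df = spark.createDataFrame([Row(num=1), Row(num=2)])
house.materialize(None, numbers_table, None, df)

# ... and read it back through hydrate(), faking the context object
# so that context.resources.spark is the live SparkSession.
ctx = SimpleNamespace(resources=SimpleNamespace(spark=spark))
rows = house.hydrate(ctx, numbers_table, None, None, None).collect()
assert sorted(r.num for r in rows) == [1, 2]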