# Test-setup fragment: assemble the table config and build the dataset under test
self.table_config = [self.config_file] + destination_params
emr_system = EMRSystem(
self.config_file,
self.destination_system,
self.destination_database,
self.destination_environment
)
# self.s3_table = S3Table(emr_system, self.destination_table)
if data_type is None:
data_type = DataType.STRUCTURED
self.dataset = DataSetFactory.create_dataset(
emr_system,
HiveTable.TableLoadType.APPEND,
data_type,
self.destination_table
)
config_filename = "append_load-{}-{}.json".format(self.destination_environment, self.dataset.table_lake)
self.config_filepath = os.path.join(self.dataset.dir_apps_append_load, config_filename)
self.db_name_lake = self.scon_emr_dict["environments"][self.destination_environment]["schemas"]["lake"]
self.expected_algorithms_jar_path = "s3://" + os.path.join(
(self.scon_emr_dict["environments"][self.destination_environment]["s3_buckets"]["application"]).strip("/"),
(self.scon_emr_dict["environments"][self.destination_environment]["s3_deployment_dir_base"]).strip("/"),
self.destination_environment,
self.scon_emr_dict["subdir"]["m3d"],
self.config_dict["subdir_projects"]["m3d_api"],
self.scon_emr_dict["spark"]["jar_name"]
)
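For reference, a minimal sketch of how the expected JAR path above is assembled; the scon/config values below (bucket name, deployment dir, jar name) are made-up placeholders, not values from the real configuration:

import os

# Hypothetical scon/config values for illustration only
scon_emr_dict = {
    "environments": {
        "dev": {
            "s3_buckets": {"application": "my-application-bucket/"},
            "s3_deployment_dir_base": "/deployment/",
        }
    },
    "subdir": {"m3d": "m3d"},
    "spark": {"jar_name": "m3d-hadoop-api.jar"},
}
config_dict = {"subdir_projects": {"m3d_api": "m3d-api"}}

env = "dev"
expected_algorithms_jar_path = "s3://" + os.path.join(
    scon_emr_dict["environments"][env]["s3_buckets"]["application"].strip("/"),
    scon_emr_dict["environments"][env]["s3_deployment_dir_base"].strip("/"),
    env,
    scon_emr_dict["subdir"]["m3d"],
    config_dict["subdir_projects"]["m3d_api"],
    scon_emr_dict["spark"]["jar_name"],
)
# -> s3://my-application-bucket/deployment/dev/m3d/m3d-api/m3d-hadoop-api.jar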
def _get_supported_emr_load_types():
"""
    Return the available EMR load types
    :return: dictionary mapping load type to load class
"""
return {
HiveTable.TableLoadType.FULL: FullLoad,
HiveTable.TableLoadType.DELTA: DeltaLoad,
HiveTable.TableLoadType.APPEND: AppendLoad
}
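The mapping doubles as a dispatch table. A minimal sketch, assuming the surrounding module's imports and assuming the load classes share a constructor taking the execution system and table name (an assumption, not the project's actual signatures):

def create_load(execution_system, load_type, table_name):
    # Look up the load class registered for the requested load type
    supported_load_types = _get_supported_emr_load_types()
    if load_type not in supported_load_types:
        raise M3DUnsupportedLoadTypeException(
            load_type=load_type,
            message="Loading algorithm {} is not supported.".format(load_type)
        )
    load_class = supported_load_types[load_type]
    # Assumed constructor signature, for illustration only
    return load_class(execution_system, table_name)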
# AppendLoad identifies itself with the APPEND load type
def get_load_type(self):
    return HiveTable.TableLoadType.APPEND
# FullLoad identifies itself with the FULL load type
def get_load_type(self):
    return HiveTable.TableLoadType.FULL
def __init__(
    self,
    config,
    destination_system,
    destination_database,
    destination_environment,
    destination_table,
    **kwargs
):
"""
Initialize Hive table
:param config: system config file
:param destination_system: destination system code
:param destination_database: destination database code
:param destination_environment: destination environment code
:param destination_table: destination table code
"""
# call super constructor
super(HiveTable, self).__init__(
config,
destination_system,
destination_database,
destination_environment,
destination_table,
**kwargs
)
# DeltaLoad identifies itself with the DELTA load type
def get_load_type(self):
    return HiveTable.TableLoadType.DELTA
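Taken together, the three overrides mirror the dispatch table above. An illustrative consistency check, assuming FullLoad, DeltaLoad and AppendLoad are importable (passing None for self works only because get_load_type ignores it):

for load_type, load_class in _get_supported_emr_load_types().items():
    # Each registered class reports exactly the load type it is mapped to
    assert load_class.get_load_type(None) == load_type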
# Static factory on DataSetFactory (see the call near the top of this page)
@staticmethod
def create_dataset(execution_system, load_type, data_type, dataset_name):
if data_type == DataType.STRUCTURED:
dataset = S3Table(
emr_system=execution_system,
destination_table=dataset_name
)
elif data_type == DataType.SEMISTRUCTURED:
if load_type == HiveTable.TableLoadType.APPEND:
dataset = SemistructuredDataSet(
emr_system=execution_system,
dataset_name=dataset_name
)
else:
raise M3DUnsupportedLoadTypeException(
load_type=load_type,
message="Loading algorithm {} not support for data type {}.".format(load_type, data_type)
)
else:
raise M3DUnsupportedDataTypeException(
message="Data Type {} not available.".format(data_type)
)
return dataset
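A usage sketch of the factory, mirroring the call near the top of this page; the EMRSystem arguments are placeholder values:

emr_system = EMRSystem("config/system.json", "s3", "my_database", "dev")

# Structured data always becomes an S3Table, whatever the load type
table = DataSetFactory.create_dataset(
    emr_system,
    HiveTable.TableLoadType.FULL,
    DataType.STRUCTURED,
    "my_table"
)

# Semi-structured data is only valid with APPEND loads; any other
# load type raises M3DUnsupportedLoadTypeException
dataset = DataSetFactory.create_dataset(
    emr_system,
    HiveTable.TableLoadType.APPEND,
    DataType.SEMISTRUCTURED,
    "my_dataset"
)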
import logging
import os
from m3d.config.config_service import ConfigService
from m3d.exceptions.m3d_exceptions import M3DDatabaseException, M3DException
from m3d.hadoop.core.hive_table import HiveTable
from m3d.hadoop.core.spark_parameters import SparkParameters
from m3d.hadoop.emr.emr_exceptions import M3DEMRStepException
from m3d.util.hql_generator import HQLGenerator
class S3Table(HiveTable):
def __init__(self, emr_system, destination_table, spark_params=None, **kwargs):
"""
Initialize representation of Hive table on S3
:param config: system config file
:param destination_system: destination system code
:param destination_database: destination database code
:param destination_environment: destination environment code
:param destination_table: destination table code
:param emr_cluster_id: emr cluster id
:param spark_params: external spark parameters to override scon defaults
"""
        # call super constructor; the attribute names on emr_system below are
        # assumed from context to supply HiveTable's parameters
        super(S3Table, self).__init__(
            emr_system.config,
            emr_system.source_system,
            emr_system.database,
            emr_system.environment,
            destination_table,
            **kwargs
        )
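A minimal instantiation sketch, assuming an EMRSystem built as in the fragment at the top of this page (all arguments are placeholders):

emr_system = EMRSystem("config/system.json", "s3", "my_database", "dev")
s3_table = S3Table(
    emr_system=emr_system,
    destination_table="my_table"
)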