Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
# Fragment of an algorithm wrapper's initialisation (the enclosing def is not
# visible here): builds the qualified target table name, bundles the run
# parameters into a dictionary and tags the cluster with the table names.
# self.source_location = os.path.join("s3://",
# self._execution_system.bucket_lake, self._parameters["source_location"])
# Qualified target table name: "<db_lake>.<target_table parameter>".
self.target_table = self._execution_system.db_lake + "." + self._parameters["target_table"]
# Bundle of table names and the date_from/date_to parameters; how it is
# consumed is not visible in this fragment.
# NOTE(review): self.source_table is assigned outside this fragment.
self.output_dictionary = {
"source_table": self.source_table,
# you can use a source location as parquet files on the lake instead of a hive table
# "source_location": self.source_location,
"target_table": self.target_table,
"date_from": self._parameters["date_from"],
"date_to": self._parameters["date_to"]
}
# Tag the cluster with the source and target table names.
# NOTE(review): uses the bare `execution_system` name while the lines above
# use `self._execution_system` - presumably the same object; confirm.
execution_system.add_cluster_tags({
EMRSystem.EMRClusterTag.SOURCE_TABLE: self.source_table,
EMRSystem.EMRClusterTag.TARGET_TABLE: self.target_table
})
# Fragment (the enclosing def is not visible here): looks up the wrapper
# registered for the configured algorithm class, instantiates it and tags
# the cluster with the details of this run.
python_class = algorithm_config.get_python_class()
available_algorithms = self._get_supported_emr_algorithms()
# Reject algorithm classes that are not among the supported ones.
if python_class not in available_algorithms:
raise M3DUnsupportedAlgorithmException(python_class)
self._spark_parameters = algorithm_config.get_spark_params()
self._algorithm_instance = algorithm_config.get_algorithm_instance()
# available_algorithms is indexed by python_class and the result is called
# with keyword arguments to build the wrapper.
self._algorithm_wrapper = available_algorithms[python_class](
execution_system=self._execution_system,
algorithm_instance=algorithm_config.get_algorithm_instance(),
algorithm_params=algorithm_config.get_algorithm_params()
)
# Tag the cluster with the API method name, algorithm class and instance.
self._execution_system.add_cluster_tags({
EMRSystem.EMRClusterTag.API_METHOD: M3D.run_algorithm.__name__,
EMRSystem.EMRClusterTag.ALGORITHM_CLASS: python_class,
EMRSystem.EMRClusterTag.ALGORITHM_INSTANCE: algorithm_config.get_algorithm_instance()
})
"""
# Fragment of an initialiser (signature not visible here): runs the parent
# initialiser, reads the algorithm parameters, derives the source view and
# target table names, and tags the cluster with them.
super(AlgorithmPartitionMaterialization.BasePartitionMaterialization, self).__init__(
execution_system,
algorithm_instance,
algorithm_params
)
# VIEW and TARGET_PARTITIONS are read by key; the metadata update strategy
# is optional and falls back to None when absent.
view_name = self._parameters[self.ConfigKeys.VIEW]
self.target_partitions = self._parameters[self.ConfigKeys.TARGET_PARTITIONS]
self.metadata_update_strategy = self._parameters.get(self.ConfigKeys.METADATA_UPDATE_STRATEGY, None)
# The same view name is qualified with db_mart_mod for the source view and
# with db_mart_cal for the target table.
self.source_view = "{}.{}".format(execution_system.db_mart_mod, view_name)
self.target_table = "{}.{}".format(execution_system.db_mart_cal, view_name)
# Tag the cluster with the source view and target table names.
execution_system.add_cluster_tags({
EMRSystem.EMRClusterTag.SOURCE_VIEW: self.source_view,
EMRSystem.EMRClusterTag.TARGET_TABLE: self.target_table
})
:param execution_system: an instance of EMRSystem
:param algorithm_instance: name of the algorithm instance
:param algorithm_params: algorithm configuration
"""
# Fragment of an initialiser (signature not visible here): runs the parent
# initialiser, validates the parameters, builds the qualified source and
# target table names, and tags the cluster with them.
super(AlgorithmFixedLengthStringExtractor,
self).__init__(execution_system, algorithm_instance, algorithm_params)
# Validation runs before any parameter is read below.
self.validate_parameters()
# Both tables are qualified with the same db_lake database name.
self.source_table = self._execution_system.db_lake + "." + self._parameters["source_table"]
self.target_table = self._execution_system.db_lake + "." + self._parameters["target_table"]
# Optional parameter; None when "metadata_update_strategy" is not present.
self.metadata_update_strategy = self._parameters.get("metadata_update_strategy", None)
# Tag the cluster with the source and target table names.
execution_system.add_cluster_tags({
EMRSystem.EMRClusterTag.SOURCE_TABLE: self.source_table,
EMRSystem.EMRClusterTag.TARGET_TABLE: self.target_table
})
destination_table
)
# Fragment (the enclosing def is not visible here): dispatches on the table's
# destination technology and storage type, and drops the lake-out view
# through an EMRSystem when the combination is HIVE on S3.
# NOTE(review): the original indentation is lost in this view; the nesting
# described below is inferred from the order of the if/else lines.
destination_system_technology = abstract_table.get_destination_technology()
# hadoop
if destination_system_technology == DataSystem.SystemTechnology.HIVE:
if abstract_table.storage_type == DataSystem.StorageType.S3:
# Imported at the point of use rather than at module level (the reason is
# not visible in this fragment).
from m3d.hadoop.emr.emr_system import EMRSystem
emr_system = EMRSystem(
config,
destination_system,
destination_database,
destination_environment,
emr_cluster_id
)
# Record which API method triggered the work, then drop the view.
emr_system.add_cluster_tag(EMRSystem.EMRClusterTag.API_METHOD, M3D.drop_lake_out_view.__name__)
emr_system.drop_lake_out_view(destination_table)
else:
# Presumably pairs with the S3 check: other storage types are rejected.
raise m3d_exceptions.M3DUnsupportedStorageException(abstract_table.storage_type)
else:
# Presumably pairs with the HIVE check: other technologies are rejected.
raise m3d_exceptions.M3DUnsupportedDestinationSystemException(destination_system_technology)
# Fragment (the enclosing def is not visible here): looks up the wrapper
# registered for the configured algorithm class, instantiates it and tags
# the cluster with the details of this run.
# NOTE(review): python_class is assigned before this fragment begins.
available_algorithms = self._get_supported_emr_algorithms()
# Reject algorithm classes that are not among the supported ones.
if python_class not in available_algorithms:
raise M3DUnsupportedAlgorithmException(python_class)
self._spark_parameters = algorithm_config.get_spark_params()
self._algorithm_instance = algorithm_config.get_algorithm_instance()
# available_algorithms is indexed by python_class and the result is called
# with keyword arguments to build the wrapper.
self._algorithm_wrapper = available_algorithms[python_class](
execution_system=self._execution_system,
algorithm_instance=algorithm_config.get_algorithm_instance(),
algorithm_params=algorithm_config.get_algorithm_params()
)
# Tag the cluster with the API method name, algorithm class and instance.
self._execution_system.add_cluster_tags({
EMRSystem.EMRClusterTag.API_METHOD: M3D.run_algorithm.__name__,
EMRSystem.EMRClusterTag.ALGORITHM_CLASS: python_class,
EMRSystem.EMRClusterTag.ALGORITHM_INSTANCE: algorithm_config.get_algorithm_instance()
})
# Fragment (the enclosing def is not visible here): starts an EMR cluster,
# binds a cluster client to it, tags it and returns the new cluster id.
# NOTE(review): this assignment presumably sits under a guard such as
# `if emr_version is None:` that is cut off above - confirm.
emr_version = self.default_emr_version
# Fall back to the default EBS size when none was supplied.
if ebs_size is None:
ebs_size = self.default_ebs_size
client = OpenSourceClient(aws_region, log_uri, credentials)
# start cluster without job
cluster_id = client.start_cluster(log_uri, emr_version, ebs_size, master_instance_type,
core_instance_type, core_instance_count)
# Despite its name, cluster_id is a mapping; the id is under 'JobFlowId'.
emr_cluster_id = cluster_id['JobFlowId']
self.emr_cluster_client = self._create_emr_cluster_client(emr_cluster_id)
# Tag the new cluster with the source system and environment.
self.add_cluster_tags({
EMRSystem.EMRClusterTag.SYSTEM: self.source_system,
EMRSystem.EMRClusterTag.ENVIRONMENT: self.environment
})
logging.info("Creation of \"{}\" EMR cluster has been initialized.".format(
emr_cluster_id
))
return emr_cluster_id
# Fragment of an initialiser (signature not visible here): runs the parent
# initialiser, reads the algorithm parameters, derives the source view and
# target table names, and tags the cluster with them.
super(AlgorithmPartitionMaterialization.BasePartitionMaterialization, self).__init__(
execution_system,
algorithm_instance,
algorithm_params
)
# VIEW and TARGET_PARTITIONS are read by key; the metadata update strategy
# is optional and falls back to None when absent.
view_name = self._parameters[self.ConfigKeys.VIEW]
self.target_partitions = self._parameters[self.ConfigKeys.TARGET_PARTITIONS]
self.metadata_update_strategy = self._parameters.get(self.ConfigKeys.METADATA_UPDATE_STRATEGY, None)
# The same view name is qualified with db_mart_mod for the source view and
# with db_mart_cal for the target table.
self.source_view = "{}.{}".format(execution_system.db_mart_mod, view_name)
self.target_table = "{}.{}".format(execution_system.db_mart_cal, view_name)
# Tag the cluster with the source view and target table names.
execution_system.add_cluster_tags({
EMRSystem.EMRClusterTag.SOURCE_VIEW: self.source_view,
EMRSystem.EMRClusterTag.TARGET_TABLE: self.target_table
})
execution_system,
load_type,
data_type,
destination_table
)
# Fragment (the enclosing def is not visible here): builds the load wrapper
# for the requested load type and tags the cluster with the load details.
# available_loads is indexed by load_type and the result is called with
# keyword arguments to build the wrapper.
# NOTE(review): available_loads, dataset and load_params are assigned
# outside this fragment.
self._load_wrapper = available_loads[load_type](
execution_system=self._execution_system,
dataset=dataset,
load_params=load_params
)
# Tag the cluster with the API method name, load type and target table.
self._execution_system.add_cluster_tags({
EMRSystem.EMRClusterTag.API_METHOD: M3D.load_table.__name__,
EMRSystem.EMRClusterTag.LOAD_TYPE: load_type,
EMRSystem.EMRClusterTag.TARGET_TABLE: destination_table
})