How to use the m3d.hadoop.emr.emr_system.EMRSystem.EMRClusterTag class in m3d

To help you get started, we’ve selected a few m3d examples based on popular ways it is used in public projects.

Secure your code as it’s written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

github adidas / m3d-api / m3d / hadoop / algorithm / algorithm_algorithm_template.py View on Github external
# self.source_location = os.path.join("s3://",
        #                                      self._execution_system.bucket_lake, self._parameters["source_location"])
        self.target_table = self._execution_system.db_lake + "." + self._parameters["target_table"]

        self.output_dictionary = {
            "source_table": self.source_table,
            # you can use a source location as parquet files on the lake instead of a hive table
            # "source_location": self.source_location,
            "target_table": self.target_table,
            "date_from": self._parameters["date_from"],
            "date_to": self._parameters["date_to"]
        }

        execution_system.add_cluster_tags({
            EMRSystem.EMRClusterTag.SOURCE_TABLE: self.source_table,
            EMRSystem.EMRClusterTag.TARGET_TABLE: self.target_table
        })
github adidas / m3d-api / m3d / hadoop / algorithm / algorithm_executor_hadoop.py View on Github external
python_class = algorithm_config.get_python_class()
        available_algorithms = self._get_supported_emr_algorithms()
        if python_class not in available_algorithms:
            raise M3DUnsupportedAlgorithmException(python_class)

        self._spark_parameters = algorithm_config.get_spark_params()
        self._algorithm_instance = algorithm_config.get_algorithm_instance()
        self._algorithm_wrapper = available_algorithms[python_class](
            execution_system=self._execution_system,
            algorithm_instance=algorithm_config.get_algorithm_instance(),
            algorithm_params=algorithm_config.get_algorithm_params()
        )

        self._execution_system.add_cluster_tags({
            EMRSystem.EMRClusterTag.API_METHOD: M3D.run_algorithm.__name__,
            EMRSystem.EMRClusterTag.ALGORITHM_CLASS: python_class,
            EMRSystem.EMRClusterTag.ALGORITHM_INSTANCE: algorithm_config.get_algorithm_instance()
        })
github adidas / m3d-api / m3d / hadoop / algorithm / algorithm_partition_materialization.py View on Github external
"""

            super(AlgorithmPartitionMaterialization.BasePartitionMaterialization, self).__init__(
                execution_system,
                algorithm_instance,
                algorithm_params
            )

            view_name = self._parameters[self.ConfigKeys.VIEW]
            self.target_partitions = self._parameters[self.ConfigKeys.TARGET_PARTITIONS]
            self.metadata_update_strategy = self._parameters.get(self.ConfigKeys.METADATA_UPDATE_STRATEGY, None)
            self.source_view = "{}.{}".format(execution_system.db_mart_mod, view_name)
            self.target_table = "{}.{}".format(execution_system.db_mart_cal, view_name)

            execution_system.add_cluster_tags({
                EMRSystem.EMRClusterTag.SOURCE_VIEW: self.source_view,
                EMRSystem.EMRClusterTag.TARGET_TABLE: self.target_table
            })
github adidas / m3d-api / m3d / hadoop / algorithm / algorithm_fixed_length_string_extractor.py View on Github external
:param execution_system: an instance of EMRSystem object
        :param algorithm_instance: name of the algorithm instance
        :param algorithm_params: algorithm configuration
        """

        super(AlgorithmFixedLengthStringExtractor,
              self).__init__(execution_system, algorithm_instance, algorithm_params)

        self.validate_parameters()

        self.source_table = self._execution_system.db_lake + "." + self._parameters["source_table"]
        self.target_table = self._execution_system.db_lake + "." + self._parameters["target_table"]
        self.metadata_update_strategy = self._parameters.get("metadata_update_strategy", None)

        execution_system.add_cluster_tags({
            EMRSystem.EMRClusterTag.SOURCE_TABLE: self.source_table,
            EMRSystem.EMRClusterTag.TARGET_TABLE: self.target_table
        })
github adidas / m3d-api / m3d / m3d.py View on Github external
destination_table
        )
        destination_system_technology = abstract_table.get_destination_technology()

        # hadoop
        if destination_system_technology == DataSystem.SystemTechnology.HIVE:
            if abstract_table.storage_type == DataSystem.StorageType.S3:
                from m3d.hadoop.emr.emr_system import EMRSystem
                emr_system = EMRSystem(
                    config,
                    destination_system,
                    destination_database,
                    destination_environment,
                    emr_cluster_id
                )
                emr_system.add_cluster_tag(EMRSystem.EMRClusterTag.API_METHOD, M3D.drop_lake_out_view.__name__)
                emr_system.drop_lake_out_view(destination_table)
            else:
                raise m3d_exceptions.M3DUnsupportedStorageException(abstract_table.storage_type)
        else:
            raise m3d_exceptions.M3DUnsupportedDestinationSystemException(destination_system_technology)
github adidas / m3d-api / m3d / hadoop / algorithm / algorithm_executor_hadoop.py View on Github external
available_algorithms = self._get_supported_emr_algorithms()
        if python_class not in available_algorithms:
            raise M3DUnsupportedAlgorithmException(python_class)

        self._spark_parameters = algorithm_config.get_spark_params()
        self._algorithm_instance = algorithm_config.get_algorithm_instance()
        self._algorithm_wrapper = available_algorithms[python_class](
            execution_system=self._execution_system,
            algorithm_instance=algorithm_config.get_algorithm_instance(),
            algorithm_params=algorithm_config.get_algorithm_params()
        )

        self._execution_system.add_cluster_tags({
            EMRSystem.EMRClusterTag.API_METHOD: M3D.run_algorithm.__name__,
            EMRSystem.EMRClusterTag.ALGORITHM_CLASS: python_class,
            EMRSystem.EMRClusterTag.ALGORITHM_INSTANCE: algorithm_config.get_algorithm_instance()
        })
github adidas / m3d-api / m3d / hadoop / emr / emr_system.py View on Github external
emr_version = self.default_emr_version

        if ebs_size is None:
            ebs_size = self.default_ebs_size

        client = OpenSourceClient(aws_region, log_uri, credentials)

        # start cluster without job
        cluster_id = client.start_cluster(log_uri, emr_version, ebs_size, master_instance_type,
                                          core_instance_type, core_instance_count)

        emr_cluster_id = cluster_id['JobFlowId']

        self.emr_cluster_client = self._create_emr_cluster_client(emr_cluster_id)
        self.add_cluster_tags({
            EMRSystem.EMRClusterTag.SYSTEM: self.source_system,
            EMRSystem.EMRClusterTag.ENVIRONMENT: self.environment
        })

        logging.info("Creation of \"{}\" EMR cluster has been initialized.".format(
            emr_cluster_id
        ))
        return emr_cluster_id
github adidas / m3d-api / m3d / hadoop / algorithm / algorithm_partition_materialization.py View on Github external
super(AlgorithmPartitionMaterialization.BasePartitionMaterialization, self).__init__(
                execution_system,
                algorithm_instance,
                algorithm_params
            )

            view_name = self._parameters[self.ConfigKeys.VIEW]
            self.target_partitions = self._parameters[self.ConfigKeys.TARGET_PARTITIONS]
            self.metadata_update_strategy = self._parameters.get(self.ConfigKeys.METADATA_UPDATE_STRATEGY, None)
            self.source_view = "{}.{}".format(execution_system.db_mart_mod, view_name)
            self.target_table = "{}.{}".format(execution_system.db_mart_cal, view_name)

            execution_system.add_cluster_tags({
                EMRSystem.EMRClusterTag.SOURCE_VIEW: self.source_view,
                EMRSystem.EMRClusterTag.TARGET_TABLE: self.target_table
            })
github adidas / m3d-api / m3d / hadoop / load / load_executor_hadoop.py View on Github external
execution_system,
            load_type,
            data_type,
            destination_table
        )

        self._load_wrapper = available_loads[load_type](
            execution_system=self._execution_system,
            dataset=dataset,
            load_params=load_params
        )

        self._execution_system.add_cluster_tags({
            EMRSystem.EMRClusterTag.API_METHOD: M3D.load_table.__name__,
            EMRSystem.EMRClusterTag.LOAD_TYPE: load_type,
            EMRSystem.EMRClusterTag.TARGET_TABLE: destination_table
        })