How to use m3d - 8 common examples

To help you get started, we’ve selected a few m3d examples based on popular ways the library is used in public projects.
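Most of the snippets below drive m3d through its M3D facade together with the EMRSystem class. As a quick orientation, here is a minimal sketch of the call pattern they exercise; the import path, the composition of table_config, and all concrete values are assumptions reconstructed from the test fixtures, so consult the m3d-api documentation for the exact signature.

from m3d.m3d import M3D  # assumed import path

# Positional configuration, in the order the tests below appear to use
# (reconstructed, not authoritative):
table_config = [
    "config/m3d/config.json",  # m3d_config_file (hypothetical path)
    "bdp",                     # destination_system (value used in the tests)
    "emr_test",                # destination_database
    "prod",                    # destination_environment
    "bi_test101",              # destination_table (hypothetical name)
]

# emr_cluster_id is passed as a keyword argument, as in the tests below.
M3D.create_lake_out_view(*table_config, emr_cluster_id="j-0123456789ABC")  # hypothetical cluster id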


adidas/m3d-api: test/integration/test_load_table_append_s3.py
def test_load_table_append(self, remove_json_patch, add_tags_patch, _0, _1):

        target_partitions = ["year", "month", "day"]
        regex_filename = ["[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})", "(?<=[0-9]{6})([0-9]{2})"]
        spark_external_parameters = '''
                {
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''
        null_value = "test_null_value"
        quote_character = "test_quote"
        compute_table_statistics = True
        verify_schema = False
        data_type = DataType.STRUCTURED
        reader_mode = "DROPMALFORMED"
        metadata_update_strategy = "SparkRecoverPartitionsCustom"

        source_system = AppendLoadConfig.destination_table.split("_", 1)[0]
        table = AppendLoadConfig.destination_table.split("_", 1)[-1]
        test_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
            lake_bucket=self.default_dev_lake_bucket,
            destination_environment=AppendLoadConfig.destination_environment,
            system=source_system,
            table=table
        )

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            target_partitions,
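The three regex_filename patterns above pull the year, month, and day partitions out of the 8-digit date embedded in a landing file's name, one pattern per entry in target_partitions. A standalone illustration of how they behave (the file name is hypothetical):

import re

regex_filename = [
    "[0-9]{4}",                             # year: first four digits
    "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})",  # month: two digits after the year
    "(?<=[0-9]{6})([0-9]{2})",              # day: two digits after year+month
]

filename = "20200317_data.csv"  # hypothetical landing file name
year, month, day = (re.search(p, filename).group(0) for p in regex_filename)
print(year, month, day)  # -> 2020 03 17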
adidas/m3d-api: test/integration/test_algorithm_gzip_decompressor_emr.py
def run_command_in_cluster_patch(cmd, name):
            # Check command name
            assert "Running Spark Application" in str(name)
            print("Command is: {0}".format(cmd))
            command_components = cmd.split()

            # Check algorithm name from the spark command
            algorithm_class_name = command_components[-3]
            assert algorithm_class_name == ScalaClasses.GZIP_DECOMPRESSOR

            # Check configuration file content
            algorithm_config_file_name = command_components[-2]
            actual_config_file_content = self.get_object_content_from_s3(algorithm_config_file_name)
            print("Actual config content: {0}".format(actual_config_file_content))

            algorithm_config_file_dict = json.loads(actual_config_file_content)

            assert algorithm_config_file_dict == expected_param_dict
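The function above is never called directly; the test installs it as a mock side effect for the EMR command runner so the spark-submit invocation can be intercepted and inspected. A sketch of that wiring, assuming the runner is a method named run_command_in_cluster on EMRSystem (the real dotted patch target in m3d-api may differ):

from unittest.mock import patch

def run_command_in_cluster_patch(cmd, name):
    assert "Running Spark Application" in str(name)

# Hypothetical patch target; adjust the dotted path to the real module.
with patch("m3d.hadoop.emr.emr_system.EMRSystem.run_command_in_cluster",
           side_effect=run_command_in_cluster_patch):
    pass  # trigger the algorithm here, e.g. through the M3D facade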
adidas/m3d-api: test/unit/m3d/hadoop/algorithm/test_algorithm_partition_materialization_emr.py
def _create_emr_system(self):
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "prod"

        m3d_config_file, _, _, _ = self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment
        )
        return EMRSystem(
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            self.emr_cluster_id
        )
adidas/m3d-api: test/integration/test_create_out_view_s3.py
            destination_database,
            destination_environment,
            destination_table
        ]

        table_config_kwargs = {
            "emr_cluster_id": self.emr_cluster_id
        }

        emr_steps_completer = self.create_emr_steps_completer(expected_steps_count=1, timeout_seconds=3)

        with ConcurrentExecutor(emr_steps_completer, delay_sec=0.4):
            logging.info("Calling M3D.create_lake_out_view().")
            M3D.create_lake_out_view(*table_config, **table_config_kwargs)

        emr_system = EMRSystem(*table_config[:5])
        s3_table = S3Table(emr_system, destination_table)

        mock_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
        assert 1 == len(mock_cluster.steps)

        hive_step = mock_cluster.steps[0]

        assert hive_step.args[0] == "hive"
        assert hive_step.args[1] == "--silent"
        assert hive_step.args[2] == "-f"

        actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])

        column_name_pairs = [
            ("record_date", "v_record_date"),
            ("p_string", "v_string"),
adidas/m3d-api: test/unit/m3d/hadoop/emr/test_emr_system.py
    @patch.object(EMRSystem, 'config_service', new=MockConfigService, spec=None, create=True)
    def test_parses_basic_attributes_from_system_config_file(self, _, __):
        """
        Test case checks that all relevant key-values are extracted from the scon file and assigned to the
        correct member variables of the EMRSystem object.
        """
        aws_api_credentials = AWSCredentials("fake_aws_api_access_key", "fake_aws_api_secret_key")
        aws_api_credentials_file = self.local_run_dir.join("aws-credentials-emr-api.json")
        self.dump_aws_credentials(aws_api_credentials, str(aws_api_credentials_file))

        aws_s3_put_credentials = AWSCredentials("fake_aws_s3_put_access_key", "fake_aws_s3_put_secret_key")
        aws_s3_put_credentials_file = self.local_run_dir.join("aws-credentials-emr-s3_put.json")
        self.dump_aws_credentials(aws_s3_put_credentials, str(aws_s3_put_credentials_file))

        aws_s3_del_credentials = AWSCredentials("fake_aws_s3_del_access_key", "fake_aws_s3_del_secret_key")
        aws_s3_del_credentials_file = self.local_run_dir.join("aws-credentials-emr-s3_del.json")
        self.dump_aws_credentials(aws_s3_del_credentials, str(aws_s3_del_credentials_file))
adidas/m3d-api: test/integration/test_load_table_append_s3.py
        spark_external_parameters = '''
        {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }
        '''

        config = AppendLoadConfig(
            self.local_run_dir,
            self.env_setup,
            ["year", "month"],
            ["[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})", "(?<=[0-9]{6})([0-9]{2})"]
        )
        with pytest.raises(M3DIllegalArgumentException) as ex:
            config.load_table(self.emr_cluster_id, spark_external_parameters)

        assert str(ex.value).startswith("Lengths of target_partitions and regex_filename do not match")
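The exception comes from an up-front sanity check that every target partition has exactly one extraction pattern. A sketch of the validation implied by the assertion message (the real implementation inside m3d-api may differ, and the exception's import path is an assumption):

from m3d.exceptions.m3d_exceptions import M3DIllegalArgumentException  # assumed path

def validate_append_load_config(target_partitions, regex_filename):
    # One regex per target partition; otherwise the load is rejected early.
    if len(target_partitions) != len(regex_filename):
        raise M3DIllegalArgumentException(
            "Lengths of target_partitions and regex_filename do not match"
        )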
adidas/m3d-api: test/integration/test_load_table_delta_s3.py
        assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

        delta_load_config_s3 = app_files[0]
        delta_load_config_content = self.get_object_content_from_s3(delta_load_config_s3)

        load_table_parameters = json.loads(delta_load_config_content)

        assert load_table_parameters["active_records_table_lake"] == s3_table_active.db_table_lake
        assert load_table_parameters["active_records_dir_lake"] == s3_table_active.dir_lake_final
        assert load_table_parameters["delta_records_file_path"] == s3_table_active.dir_landing_data
        assert load_table_parameters["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
        assert load_table_parameters["business_key"] == s3_table_active.business_key

        if s3_table_active.partitioned_by in Util.defined_partitions:
            target_partitions = Util.get_target_partitions_list(s3_table_active.partitioned_by)
        else:
            target_partitions = s3_table_active.partitioned_by

        assert load_table_parameters["target_partitions"] == target_partitions
        assert load_table_parameters["partition_column"] == s3_table_active.partition_column
        assert load_table_parameters["partition_column_format"] == s3_table_active.partition_column_format

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
            destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
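For reference, the parameter file those assertions inspect is a flat JSON document. The keys below are taken from the test; the values are illustrative placeholders, not output from a real run:

load_table_parameters = {
    "active_records_table_lake": "prod_lake.bi_test101",               # hypothetical
    "active_records_dir_lake": "s3://lake-bucket/prod/bi/test101/",    # hypothetical
    "delta_records_file_path": "s3://landing-bucket/prod/bi/test101/", # hypothetical
    "technical_key": ["m3d_timestamp", "datapakid", "partno", "record"],
    "business_key": ["customer_id"],                                   # hypothetical
    "target_partitions": ["year", "month"],
    "partition_column": "record_date",
    "partition_column_format": "yyyyMMdd",
}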
adidas/m3d-api: test/unit/m3d/util/test_util.py
def test_get_target_partitions_list(self):
        """
        Tests the behavior of Util.get_target_partitions_list for each supported partition type.
        """
        assert Util.get_target_partitions_list("year") == ["year"]
        assert Util.get_target_partitions_list("month") == ["year", "month"]
        assert Util.get_target_partitions_list("day") == ["year", "month", "day"]
        assert Util.get_target_partitions_list("") == []

        with pytest.raises(Exception) as exc_info:
            Util.get_target_partitions_list("country")
        assert "Partition type country not supported" in str(exc_info.value)