def test_load_table_append(self, remove_json_patch, add_tags_patch, _0, _1):
    target_partitions = ["year", "month", "day"]
    regex_filename = ["[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})", "(?<=[0-9]{6})([0-9]{2})"]
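    # The three patterns above are assumed to cut year, month and day out of an
    # 8-digit date embedded in the landing file name (a hypothetical
    # "data20180715.gz", say): "[0-9]{4}" yields "2018", the look-around pattern
    # "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})" yields "07", and
    # "(?<=[0-9]{6})([0-9]{2})" yields "15". A quick standalone check:
    #
    #     import re
    #     assert [re.search(p, "20180715").group(0) for p in regex_filename] == ["2018", "07", "15"]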
    spark_external_parameters = '''
    {
        "spark.driver.memory": "99G",
        "spark.executor.instances": "99",
        "spark.executor.memory": "90G"
    }
    '''

    null_value = "test_null_value"
    quote_character = "test_quote"
    compute_table_statistics = True
    verify_schema = False
    data_type = DataType.STRUCTURED
    reader_mode = "DROPMALFORMED"
    metadata_update_strategy = "SparkRecoverPartitionsCustom"

    source_system = AppendLoadConfig.destination_table.split("_", 1)[0]
    table = AppendLoadConfig.destination_table.split("_", 1)[-1]
    test_target_dir = "s3://{lake_bucket}/{destination_environment}/{system}/{table}/data/".format(
        lake_bucket=self.default_dev_lake_bucket,
        destination_environment=AppendLoadConfig.destination_environment,
        system=source_system,
        table=table
    )

    config = AppendLoadConfig(
        self.local_run_dir,
        self.env_setup,
        target_partitions,
        regex_filename
    )
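    # From here the config is presumably exercised the same way as in the
    # mismatched-lengths test further down, i.e. roughly:
    #
    #     config.load_table(self.emr_cluster_id, spark_external_parameters)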

def run_command_in_cluster_patch(cmd, name):
    # Check command name
    assert "Running Spark Application" in str(name)

    print("Command is: {0}".format(cmd))
    command_components = cmd.split()

    # Check algorithm name from the spark command
    algorithm_class_name = command_components[-3]
    assert algorithm_class_name == ScalaClasses.GZIP_DECOMPRESSOR

    # Check configuration file content
    algorithm_config_file_name = command_components[-2]
    actual_config_file_content = self.get_object_content_from_s3(algorithm_config_file_name)
    print("Actual config content: {0}".format(actual_config_file_content))

    algorithm_config_file_dict = json.loads(actual_config_file_content)
    assert algorithm_config_file_dict == expected_param_dict
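
# A sketch of how a checker like run_command_in_cluster_patch is typically wired
# in with unittest.mock (illustrative only; the patched target name is an
# assumption, not taken from this snippet):
#
#     with patch.object(EMRSystem, "run_command_in_cluster",
#                       side_effect=run_command_in_cluster_patch):
#         ...  # drive the code path that submits the Spark application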

def _create_emr_system(self):
    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "prod"

    m3d_config_file, _, _, _ = self.env_setup(
        self.local_run_dir,
        destination_system,
        destination_database,
        destination_environment
    )

    return EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        self.emr_cluster_id
    )

    destination_database,
    destination_environment,
    destination_table
]

table_config_kwargs = {
    "emr_cluster_id": self.emr_cluster_id
}

emr_steps_completer = self.create_emr_steps_completer(expected_steps_count=1, timeout_seconds=3)

with ConcurrentExecutor(emr_steps_completer, delay_sec=0.4):
    logging.info("Calling M3D.create_lake_out_view().")
    M3D.create_lake_out_view(*table_config, **table_config_kwargs)

emr_system = EMRSystem(*table_config[:5])
s3_table = S3Table(emr_system, destination_table)

mock_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
assert 1 == len(mock_cluster.steps)

hive_step = mock_cluster.steps[0]

assert hive_step.args[0] == "hive"
assert hive_step.args[1] == "--silent"
assert hive_step.args[2] == "-f"

actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])
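
# The assertions above pin the submitted EMR step down to the form
# "hive --silent -f <s3 path to the generated HQL file>", and the HQL body is
# then fetched from S3 so that, presumably, its content can be checked against
# the (column, view-column) pairs listed below.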

column_name_pairs = [
    ("record_date", "v_record_date"),
    ("p_string", "v_string"),

@patch.object(EMRSystem, 'config_service', new=MockConfigService, spec=None, create=True)
def test_parses_basic_attributes_from_system_config_file(self, _, __):
    """
    Test case checks that all relevant key-value pairs are extracted from the sconx
    file and assigned to the correct member variables of the EMRSystem object.
    """
    aws_api_credentials = AWSCredentials("fake_aws_api_access_key", "fake_aws_api_secret_key")
    aws_api_credentials_file = self.local_run_dir.join("aws-credentials-emr-api.json")
    self.dump_aws_credentials(aws_api_credentials, str(aws_api_credentials_file))

    aws_s3_put_credentials = AWSCredentials("fake_aws_s3_put_access_key", "fake_aws_s3_put_secret_key")
    aws_s3_put_credentials_file = self.local_run_dir.join("aws-credentials-emr-s3_put.json")
    self.dump_aws_credentials(aws_s3_put_credentials, str(aws_s3_put_credentials_file))

    aws_s3_del_credentials = AWSCredentials("fake_aws_s3_del_access_key", "fake_aws_s3_del_secret_key")
    aws_s3_del_credentials_file = self.local_run_dir.join("aws-credentials-emr-s3_del.json")
    self.dump_aws_credentials(aws_s3_del_credentials, str(aws_s3_del_credentials_file))
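    # dump_aws_credentials is assumed to serialise each AWSCredentials pair to a
    # small JSON file that the EMRSystem under test later picks up via its sconx
    # configuration; the exact field names are an assumption, e.g. roughly:
    #
    #     with open(str(aws_api_credentials_file), "w") as f:
    #         json.dump({"access_key_id": aws_api_credentials.access_key_id,
    #                    "secret_access_key": aws_api_credentials.secret_access_key}, f)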

spark_external_parameters = '''
{
    "spark.driver.memory": "99G",
    "spark.executor.instances": "99",
    "spark.executor.memory": "90G"
}
'''

config = AppendLoadConfig(
    self.local_run_dir,
    self.env_setup,
    ["year", "month"],
    ["[0-9]{4}", "(?<=[0-9]{4})([0-9]{2})(?=[0-9]{2})", "(?<=[0-9]{6})([0-9]{2})"]
)

with pytest.raises(M3DIllegalArgumentException) as ex:
    config.load_table(self.emr_cluster_id, spark_external_parameters)

assert str(ex.value).startswith("Lengths of target_partitions and regex_filename do not match")
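
# Two partition names against three filename regexes is what trips the check
# above. A minimal sketch of the validation assumed to sit behind load_table
# (hypothetical helper, not the project's actual code):
#
#     def _validate_append_load_args(target_partitions, regex_filename):
#         if len(target_partitions) != len(regex_filename):
#             raise M3DIllegalArgumentException(
#                 "Lengths of target_partitions and regex_filename do not match: "
#                 "{} != {}".format(len(target_partitions), len(regex_filename))
#             )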

assert len(app_files) == 1
assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

delta_load_config_s3 = app_files[0]
delta_load_config_content = self.get_object_content_from_s3(delta_load_config_s3)
load_table_parameters = json.loads(delta_load_config_content)

assert load_table_parameters["active_records_table_lake"] == s3_table_active.db_table_lake
assert load_table_parameters["active_records_dir_lake"] == s3_table_active.dir_lake_final
assert load_table_parameters["delta_records_file_path"] == s3_table_active.dir_landing_data
assert load_table_parameters["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
assert load_table_parameters["business_key"] == s3_table_active.business_key

if s3_table_active.partitioned_by in Util.defined_partitions:
    target_partitions = Util.get_target_partitions_list(s3_table_active.partitioned_by)
else:
    target_partitions = s3_table_active.partitioned_by

assert load_table_parameters["target_partitions"] == target_partitions
assert load_table_parameters["partition_column"] == s3_table_active.partition_column
assert load_table_parameters["partition_column_format"] == s3_table_active.partition_column_format

# Check EMR steps.
fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
assert 1 == len(fake_cluster.steps)

expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
    scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
    destination_environment,
    scon_emr_dict["subdir"]["m3d"],
    m3d_config_dict["subdir_projects"]["m3d_api"],

def test_get_target_partitions_list(self):
    """
    Tests that Util.get_target_partitions_list returns the correct partition
    hierarchy for each supported partition type.
    """
    assert Util.get_target_partitions_list("year") == ["year"]
    assert Util.get_target_partitions_list("month") == ["year", "month"]
    assert Util.get_target_partitions_list("day") == ["year", "month", "day"]
    assert Util.get_target_partitions_list("") == []

    with pytest.raises(Exception) as exc_info:
        Util.get_target_partitions_list("country")

    assert "Partition type country not supported" in str(exc_info.value)