Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Copy the single input artifact's payload to the output artifact URI.

  Args:
    input_dict: Input dict from input key to a list of artifacts; the
      single artifact under key 'input' provides the source URI.
    output_dict: Output dict from output key to a list of artifacts; the
      single artifact under key 'output' provides the destination URI.
    exec_properties: A dict of execution properties. Not used here.

  Returns:
    None
  """
  # Resolve source and destination from the single artifacts, then copy.
  source_uri = artifact_utils.get_single_uri(input_dict['input'])
  destination_uri = artifact_utils.get_single_uri(output_dict['output'])
  tf.io.gfile.copy(source_uri, destination_uri)
will include a single pbtxt file which contains all anomalies found.
exec_properties: A dict of execution properties. Not used yet.
Returns:
None
"""
self._log_startup(input_dict, output_dict, exec_properties)
tf.logging.info('Validating schema against the computed statistics.')
schema = io_utils.SchemaReader().read(
io_utils.get_only_uri_in_dir(
artifact_utils.get_single_uri(input_dict['schema'])))
stats = tfdv.load_statistics(
io_utils.get_only_uri_in_dir(
artifact_utils.get_split_uri(input_dict['stats'], 'eval')))
output_uri = artifact_utils.get_single_uri(output_dict['output'])
anomalies = tfdv.validate_statistics(stats, schema)
io_utils.write_pbtxt_file(
os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
tf.logging.info(
'Validation complete. Anomalies written to {}.'.format(output_uri))
"""Read TFRecord files to PCollection of TF examples.
Note that each input split will be transformed by this function separately.
Args:
pipeline: beam pipeline.
input_dict: Input dict from input key to a list of Artifacts.
- input_base: input dir that contains tf example data.
exec_properties: A dict of execution properties.
split_pattern: Split.pattern in Input config, glob relative file pattern
that maps to input files with root directory given by input_base.
Returns:
PCollection of TF examples.
"""
input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
input_split_pattern = os.path.join(input_base_uri, split_pattern)
absl.logging.info(
'Reading input TFExample data {}.'.format(input_split_pattern))
# TODO(jyzhao): profile input examples.
return (pipeline
# TODO(jyzhao): support multiple input format.
| 'ReadFromTFRecord' >>
beam.io.ReadFromTFRecord(file_pattern=input_split_pattern)
# TODO(jyzhao): consider move serialization out of base example gen.
| 'ToTFExample' >> beam.Map(tf.train.Example.FromString))
Raises:
TimeoutError:
When there is no decision made within timeout_sec.
ConnectionError:
When connection to slack server cannot be established.
"""
self._log_startup(input_dict, output_dict, exec_properties)
# Fetch execution properties from exec_properties dict.
slack_token = exec_properties['slack_token']
slack_channel_id = exec_properties['slack_channel_id']
timeout_sec = exec_properties['timeout_sec']
# Fetch input URIs from input_dict.
model_export_uri = artifact_utils.get_single_uri(input_dict['model'])
model_blessing = artifact_utils.get_single_instance(
input_dict['model_blessing'])
# Fetch output artifact from output_dict.
slack_blessing = artifact_utils.get_single_instance(
output_dict['slack_blessing'])
# We only consider a model as blessed if both of the following conditions
# are met:
# - The model is blessed by model validator. This is determined by looking
# for file named 'BLESSED' from the output from Model Validator.
# - The model is blessed by a human reviewer. This logic is in
# _fetch_slack_blessing().
slack_response = None
with Timeout(timeout_sec):
if model_utils.is_model_blessed(model_blessing):
instance, providing the way to slice the data.
Returns:
None
"""
if 'model_exports' not in input_dict:
raise ValueError('\'model_exports\' is missing in input dict.')
if 'examples' not in input_dict:
raise ValueError('\'examples\' is missing in input dict.')
if 'output' not in output_dict:
raise ValueError('\'output\' is missing in output dict.')
self._log_startup(input_dict, output_dict, exec_properties)
# Extract input artifacts
model_exports_uri = artifact_utils.get_single_uri(
input_dict['model_exports'])
feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
json_format.Parse(exec_properties['feature_slicing_spec'],
feature_slicing_spec)
slice_spec = self._get_slice_spec_from_feature_slicing_spec(
feature_slicing_spec)
output_uri = artifact_utils.get_single_uri(output_dict['output'])
eval_model_path = path_utils.eval_model_path(model_exports_uri)
# Add fairness indicator metric callback if necessary.
fairness_indicator_thresholds = exec_properties.get(
'fairness_indicator_thresholds', None)
add_metrics_callbacks = None
Args:
pipeline: beam pipeline.
input_dict: Input dict from input key to a list of Artifacts.
- input_base: input dir that contains csv data. csv files must have header
line.
exec_properties: A dict of execution properties.
split_pattern: Split.pattern in Input config, glob relative file pattern
that maps to input files with root directory given by input_base.
Returns:
PCollection of TF examples.
Raises:
RuntimeError: if split is empty or csv headers are not equal.
"""
input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
csv_pattern = os.path.join(input_base_uri, split_pattern)
absl.logging.info(
'Processing input csv data {} to TFExample.'.format(csv_pattern))
csv_files = tf.io.gfile.glob(csv_pattern)
if not csv_files:
raise RuntimeError(
'Split pattern {} does not match any files.'.format(csv_pattern))
column_names = io_utils.load_csv_column_names(csv_files[0])
for csv_files in csv_files[1:]:
if io_utils.load_csv_column_names(csv_files) != column_names:
raise RuntimeError(
'Files in same split {} have different header.'.format(csv_pattern))
parsed_csv_lines = (
Returns:
None
Raises:
ValueError: if ai_platform_serving_args is not in
exec_properties.custom_config.
RuntimeError: if the Google Cloud AI Platform training job failed.
"""
self._log_startup(input_dict, output_dict, exec_properties)
if not self.CheckBlessing(input_dict, output_dict):
return
model_export = artifact_utils.get_single_instance(
input_dict['model_export'])
model_export_uri = model_export.uri
model_blessing_uri = artifact_utils.get_single_uri(
input_dict['model_blessing'])
model_push = artifact_utils.get_single_instance(output_dict['model_push'])
# TODO(jyzhao): should this be in driver or executor.
if not tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
model_push.set_int_custom_property('pushed', 0)
tf.logging.info('Model on %s was not blessed',)
return
exec_properties_copy = exec_properties.copy()
custom_config = exec_properties_copy.pop('custom_config', {})
ai_platform_serving_args = custom_config['ai_platform_serving_args']
# Deploy the model.
model_path = path_utils.serving_model_path(model_export_uri)
# Note: we do not have a logical model version right now. This
# model_version is a timestamp mapped to trainer's exporter.
train_files = [
_all_files_pattern(
artifact_utils.get_split_uri(input_dict['examples'], 'train'))
]
transform_output = artifact_utils.get_single_uri(
input_dict['transform_output']) if input_dict.get(
'transform_output', None) else None
eval_files = [
_all_files_pattern(
artifact_utils.get_split_uri(input_dict['examples'], 'eval'))
]
schema_file = io_utils.get_only_uri_in_dir(
artifact_utils.get_single_uri(input_dict['schema']))
# TODO(ruoyu): Make this a dict of tag -> uri instead of list.
base_model = path_utils.serving_model_path(
artifact_utils.get_single_uri(
input_dict['base_model'])) if input_dict.get('base_model') else None
train_args = trainer_pb2.TrainArgs()
eval_args = trainer_pb2.EvalArgs()
json_format.Parse(exec_properties['train_args'], train_args)
json_format.Parse(exec_properties['eval_args'], eval_args)
# https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
# num_steps=None. Conversion of the proto to python will set the default
# value of an int as 0 so modify the value here. Tensorflow will raise an
# error if num_steps <= 0.
train_steps = train_args.num_steps or None
eval_steps = eval_args.num_steps or None
output_path = artifact_utils.get_single_uri(output_dict['output'])
serving_model_dir = path_utils.serving_model_dir(output_path)
split 'train'. Stats on other splits are ignored.
- 'statistics': Synonym for 'stats'.
output_dict: Output dict from key to a list of artifacts, including:
- output: A list of 'Schema' artifact of size one.
exec_properties: A dict of execution properties, includes:
- infer_feature_shape: Whether or not to infer the shape of the feature.
Returns:
None
"""
# TODO(zhitaoli): Move constants between this file and component.py to a
# constants.py.
train_stats_uri = io_utils.get_only_uri_in_dir(
artifact_utils.get_split_uri(input_dict['stats'], 'train'))
output_uri = os.path.join(
artifact_utils.get_single_uri(output_dict['output']),
_DEFAULT_FILE_NAME)
infer_feature_shape = exec_properties['infer_feature_shape']
absl.logging.info('Infering schema from statistics.')
schema = tfdv.infer_schema(
tfdv.load_statistics(train_stats_uri), infer_feature_shape)
io_utils.write_pbtxt_file(output_uri, schema)
absl.logging.info('Schema written to %s.' % output_uri)