How to use the a2ml.api.utils.dataframe.DataFrame class in a2ml

To help you get started, we’ve selected a few a2ml examples that show popular ways DataFrame is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github augerai / a2ml / tests / model_review / test_model_review.py View on Github external
]

  res = ModelReview({'model_path': model_path}).score_actuals(
    actual_records=actuals, prediction_group_id=prediction_group_id,
    primary_prediction_group_id=primary_prediction_group_id, primary_model_path=primary_model_path
  )

  assert type(res) == dict
  assert res['accuracy'] == 1.0

  actual_files = glob.glob(model_path + '/predictions/*_actuals.feather.zstd')
  assert len(actual_files) == 1
  actual_file = actual_files[0]
  assert str(datetime.date.today()) in actual_file

  stored_actuals = DataFrame({})
  stored_actuals.loadFromFeatherFile(actual_file)
  assert 'prediction_group_id' in stored_actuals.columns

  stored_actuals = json.loads(
    stored_actuals.df.sort_values(by=['prediction_id']).to_json(orient='records')
  )

  assert stored_actuals[0]['prediction_id'] == 'bef9be07-5534-434e-ab7c-c379d8fcfe77'
  assert stored_actuals[0]['prediction_group_id'] == prediction_group_id
  assert stored_actuals[0]['species'] == 'versicolor'

  assert stored_actuals[1]['prediction_id'] == 'f61b1bbc-6f7b-4e7e-9a3b-6acb6e1462cd'
  assert stored_actuals[1]['prediction_group_id'] == prediction_group_id
  assert stored_actuals[1]['species'] == 'virginica'
github augerai / a2ml / tests / model_review / test_model_helper.py View on Github external
self.assertTrue(fsclient.is_file_exists(results_file_path))

        ds = DataFrame.create_dataframe(os.path.join(model_path, "iris_test.csv"))
        fsclient.remove_file(results_file_path)
        self.assertFalse(fsclient.is_file_exists(results_file_path))
        fsclient.remove_file(predicted_file_path)
        self.assertFalse(fsclient.is_file_exists(predicted_file_path))

        res = ModelHelper.save_prediction(ds, prediction_id, 
            support_review_model=True, json_result=True, count_in_result=False, prediction_date=prediction_date, 
            model_path=model_path, model_id=options.get('uid'))
        res = json.loads(res)
        self.assertEqual( res['columns'], ds.columns)
        self.assertEqual( len(res['data']), 6)

        ds = DataFrame.create_dataframe(os.path.join(model_path, "iris_test.csv"))
        fsclient.remove_file(results_file_path)
        self.assertFalse(fsclient.is_file_exists(results_file_path))
        fsclient.remove_file(predicted_file_path)
        self.assertFalse(fsclient.is_file_exists(predicted_file_path))

        ds.options['data_path'] = None
        res = ModelHelper.save_prediction(ds, prediction_id, 
            support_review_model=False, json_result=False, count_in_result=False, prediction_date=prediction_date, 
            model_path=model_path, model_id=options.get('uid'))
        self.assertEqual( type(res[0]), dict)
        self.assertEqual( res[0][options['targetFeature']], 'setosa')

        ds = DataFrame.create_dataframe(os.path.join(model_path, "iris_test.csv"))
        fsclient.remove_file(results_file_path)
        self.assertFalse(fsclient.is_file_exists(results_file_path))
        fsclient.remove_file(predicted_file_path)
github augerai / a2ml / a2ml / api / azure / model.py View on Github external
def predict(self, filename, model_id, threshold=None, locally=False, data=None, columns=None, 
        predicted_at=None, output=None, json_result=False, count_in_result=False, prediction_id=None
        ):
        ds = DataFrame.create_dataframe(filename, data, columns)
        model_path = self.ctx.config.get_model_path(model_id)
        options = fsclient.read_json_file(os.path.join(model_path, "options.json"))

        results, results_proba, proba_classes, target_categories = \
            self._predict_locally(ds.df, model_id, threshold) if locally else self._predict_remotely(ds.df, model_id, threshold)

        if target_categories and len(target_categories) == 2:
            for idx, item in enumerate(target_categories):
                if item == "False":
                    target_categories[idx] = False
                if item == "True":
                    target_categories[idx] = True

        ModelHelper.process_prediction(ds,
            results, results_proba, proba_classes,
            threshold,
github augerai / a2ml / a2ml / api / model_review / model_helper.py View on Github external
def preprocess_target(model_path, data_path=None, records=None, features=None):
        """Build a dataset from the given inputs and preprocess its target.

        The dataset is created with ``DataFrame.create_dataframe`` from
        ``data_path``, ``records`` and ``features`` (passed positionally, in
        that order) and then handed to ``ModelHelper.preprocess_target_ds``
        together with ``model_path``.

        Args:
            model_path: forwarded unchanged to ``preprocess_target_ds``.
            data_path: first argument for ``create_dataframe``
                (presumably a file to load -- confirm against DataFrame).
            records: second argument for ``create_dataframe``
                (presumably in-memory rows -- confirm against DataFrame).
            features: third argument for ``create_dataframe``.

        Returns:
            Whatever ``ModelHelper.preprocess_target_ds`` returns for the
            freshly created dataset.
        """
        dataset = DataFrame.create_dataframe(data_path, records, features)
        preprocessed = ModelHelper.preprocess_target_ds(model_path, dataset)

        return preprocessed
github augerai / a2ml / a2ml / api / model_review / model_review.py View on Github external
def score_actuals(self, actuals_path = None, actual_records=None, actuals_ds=None,
            prediction_group_id=None, primary_prediction_group_id=None, primary_model_path=None,
            actual_date=None, actuals_id = None):

        ds_actuals = actuals_ds or DataFrame.create_dataframe(actuals_path, actual_records, 
            features=['prediction_id', 'a2ml_actual'])

        actuals_count = ds_actuals.count()

        primary_ds = None
        if primary_prediction_group_id:
            files = ModelReview._get_prediction_files(primary_model_path, primary_prediction_group_id)
            for (_, df) in DataFrame.load_from_files(files, features=['prediction_id']):
                primary_ds = df
                # should be only one file
                break

        origin_dtypes = []
        origin_columns = []
        prediction_files = ModelReview._get_prediction_files(self.model_path, prediction_group_id)
        actual_index = False
github augerai / a2ml / a2ml / api / auger / impl / mparts / predict.py View on Github external
def _predict_locally(self, filename_arg, model_id, threshold, data, columns, output):
        model_deploy = ModelDeploy(self.ctx, None)
        is_model_loaded, model_path, model_name = \
            model_deploy.verify_local_model(model_id)

        if not is_model_loaded:
            raise AugerException('Model isn\'t loaded locally. '
                'Please use a2ml deploy command to download model.')

        model_path, model_existed = self._extract_model(model_name)
        model_options = fsclient.read_json_file(os.path.join(model_path, "model", "options.json"))

        filename = filename_arg
        if not filename:
            ds = DataFrame.create_dataframe(filename, data, columns)            
            filename = os.path.join(self.ctx.config.get_path(), '.augerml', 'predict_data.csv')
            ds.saveToCsvFile(filename, compression=None)

        try:
            predicted = \
                self._docker_run_predict(filename, threshold, model_path)
        finally:
            # clean up unzipped model
            # if it wasn't unzipped before
            if not model_existed:
                shutil.rmtree(model_path, ignore_errors=True)

        if not filename_arg:
            ds_result = DataFrame.create_dataframe(predicted)

            ds_result.options['data_path'] = None
github augerai / a2ml / a2ml / api / utils / dataframe.py View on Github external
if filename:
            if filename.endswith('.json') or filename.endswith('.json.gz'):
                df = pandas.read_json(filename)
            elif filename.endswith('.xlsx') or filename.endswith('.xls'):
                df = pandas.read_excel(filename)
            elif filename.endswith('.feather') or filename.endswith('.feather.gz'):
                import feather
                with fsclient.open_file(filename, 'rb', encoding=None) as local_file:
                    df = feather.read_dataframe(local_file, columns=features, use_threads=bool(True))

            if df is None:        
                try:
                    df = DataFrame._read_csv(filename, ',', features, nrows)
                except Exception as e:
                    df = DataFrame._read_csv(filename, '|', features, nrows)

        else:
            df = DataFrame.load_data(data, features)

        features = df.columns.tolist()
        if target in features:
            df.drop(columns=[target], inplace=True)

        return df
github augerai / a2ml / a2ml / api / model_review / model_review.py View on Github external
ds_actuals.df.reset_index(inplace=True)
        ds_actuals.dropna(columns=[self.target_feature, 'a2ml_actual'])

        # combine_first changes orginal non float64 types to float64 when NaN values appear during merging tables
        # Good explanations https://stackoverflow.com/a/15353297/898680
        # Fix: store original datypes and force them after merging
        for col in origin_columns:
            if col != 'prediction_id':
                ds_actuals.df[col] = ds_actuals.df[col].astype(origin_dtypes[col], copy=False)

        ds_actuals.df['a2ml_actual'] = ds_actuals.df['a2ml_actual'].astype(
            origin_dtypes[self.target_feature], copy=False
        )

        ds_true = DataFrame({})
        ds_true.df = ds_actuals.df[['a2ml_actual']].rename(columns={'a2ml_actual':self.target_feature})

        y_pred, _ = ModelHelper.preprocess_target_ds(self.model_path, ds_actuals)
        y_true, _ = ModelHelper.preprocess_target_ds(self.model_path, ds_true)

        score = ModelHelper.calculate_scores(self.options, y_test=y_true, y_pred=y_pred)

        if not actuals_ds:
            ds_actuals.drop(self.target_feature)
            ds_actuals.df = ds_actuals.df.rename(columns={'a2ml_actual':self.target_feature})

            if not actuals_id:
                actuals_id = get_uid()

            file_name = str(actual_date or datetime.date.today()) + '_' + actuals_id + "_actuals.feather.zstd"
            ds_actuals.saveToFeatherFile(os.path.join(self.model_path, "predictions", file_name))
github augerai / a2ml / a2ml / api / model_review / model_review.py View on Github external
def score_model_performance_daily(self, date_from, date_to):
        """Score the stored actuals separately for each day of a date range.

        For every ``(day, files)`` pair produced by
        ``ModelReview._prediction_files_by_day`` (queried with the
        ``"_*_actuals.feather.zstd"`` pattern) the files are loaded with only
        the ``prediction_id`` and target columns requested, combined into one
        dataset and scored through ``score_actuals``.

        Args:
            date_from: start of the range, forwarded to
                ``_prediction_files_by_day``.
            date_to: end of the range, forwarded to
                ``_prediction_files_by_day``.

        Returns:
            dict mapping ``str(day)`` to the entry of the ``score_actuals``
            result stored under ``self.options.get('score_name')``. Days whose
            combined dataset has no rows are left out.
        """
        features = ['prediction_id', self.target_feature]
        res = {}

        for (curr_date, files) in ModelReview._prediction_files_by_day(
                self.model_path, date_from, date_to, "_*_actuals.feather.zstd"):
            df_actuals = DataFrame({})

            # Gather every frame first and concatenate once per day: calling
            # pd.concat inside the loop re-copies the accumulated rows for
            # each additional file.
            loaded = [ds.df for (_, ds) in DataFrame.load_from_files(files, features)]
            if loaded:
                # Keep the dataset's initial frame as the first element, as the
                # incremental concatenation did.
                df_actuals.df = pd.concat([df_actuals.df] + loaded)

            if df_actuals.count() > 0:
                # score_actuals expects the actual values in 'a2ml_actual'.
                df_actuals.df.rename(columns={self.target_feature: 'a2ml_actual'}, inplace=True)
                scores = self.score_actuals(actuals_ds=df_actuals)
                res[str(curr_date)] = scores[self.options.get('score_name')]

        return res