How to use the mindsdb.libs.helpers.general_helpers.get_value_bucket function in MindsDB

To help you get started, we’ve selected a few MindsDB examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github mindsdb / mindsdb / mindsdb / libs / phases / model_analyzer / helpers / column_evaluator.py View on Github external
columnless_prediction_distribution[output_column][input_column] = col_missing_output_histogram

        # @TODO should we go back to generating this information based on the buckets of the input columns? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column?

        for output_column in output_columns:
                buckets_stats[output_column] = {}

                bucket_indexes = {}
                for index,row in full_dataset.iterrows():
                    value = row[output_column]
                    if 'percentage_buckets' in stats[output_column]:
                        percentage_buckets = stats[output_column]['percentage_buckets']
                    else:
                        percentage_buckets = None

                    value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column], self.transaction.hmd)
                    if value_bucket not in bucket_indexes:
                        bucket_indexes[value_bucket] = []
                    bucket_indexes[value_bucket].append(index)

                for bucket in bucket_indexes:
                    buckets_stats[output_column][bucket] = {}
                    input_data = TransactionData()
                    input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
                    input_data.columns = input_data.columns

                    stats_generator = StatsGenerator(session=None, transaction=self.transaction)
                    try:
                        with disable_console_output():
                            col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False, print_logs=False)
                        buckets_stats[output_column][bucket].update(col_buckets_stats)
                    except Exception as e:
github mindsdb / mindsdb / mindsdb / libs / model_examination / column_evaluator.py View on Github external
if col_missing_output_stats is None:
                    pass
                elif 'histogram' in col_missing_output_stats[output_column]:
                    columnless_prediction_distribution[output_column][input_column] = col_missing_output_stats[output_column]['histogram']

            # If this column is either very important or not important at all, compute stats for each of the buckets (in the validation data)
            if column_importance > 0.8 or column_importance < 0.2:
                split_data = {}
                for value in full_dataset[input_column]:

                    if 'percentage_buckets' in stats[input_column]:
                        bucket = stats[input_column]['percentage_buckets']
                    else:
                        bucket = None

                    vb = get_value_bucket(value, bucket, stats[input_column])
                    if f'{input_column}_bucket_{vb}' not in split_data:
                        split_data[f'{input_column}_bucket_{vb}'] = []

                    split_data[f'{input_column}_bucket_{vb}'].append(value)

                row_wise_data = []
                max_length = max(list(map(len, split_data.values())))

                columns = []
                for i in range(max_length):
                    row_wise_data.append([])
                    for k in split_data.keys():
                        # If the sub bucket has 6 or fewer values, it's not relevant
                        if len(split_data[k]) > 6:
                            columns.append(k)
                            if len(split_data[k]) > i:
github mindsdb / mindsdb / mindsdb / libs / helpers / probabilistic_validator.py View on Github external
:param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
        """
        try:
            predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
        except:
            predicted_value = None

        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
        except:
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats, hmd)
            real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats, hmd)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence

            self._X_buff.append(X)
            self._Y_buff.append(real_value_b)
            self._real_buckets_buff = self._Y_buff
            self._predicted_buckets_buff.append(predicted_value_b)

            if is_original_data:
                self._original_real_buckets_buff.append(real_value_b)
                self._original_predicted_buckets_buff.append(predicted_value_b)


            # If no column is ignored, compute the accuracy for this bucket
github mindsdb / mindsdb / mindsdb / libs / model_examination / probabilistic_validator.py View on Github external
def evaluate_prediction_accuracy(self, features_existence, predicted_value):
        """
        Query the fitted probabilistic model for how likely a prediction is to be accurate.

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
        """
        if self.buckets is not None:
            # One-hot encode the bucket of the predicted value (len(buckets) + 1 slots)
            # and append the feature-existence flags to form a single model input row.
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = [X + features_existence]
        else:
            # Without buckets the model only sees the feature-existence flags.
            X = [features_existence]

        # Silence divide warnings only for the duration of the call. The context
        # manager restores the previous numpy error settings even if predict_proba
        # raises, so the process-wide state can never be left at 'ignore'.
        with np.errstate(divide='ignore'):
            distribution = self._probabilistic_model.predict_proba(np.array(X))

        if self.buckets is not None:
            return ProbabilityEvaluation(self.buckets, distribution[0].tolist(), predicted_value).most_likely_probability

        # No buckets: return the second entry of the first (only) row of predict_proba's output.
        return distribution[0][1]
github mindsdb / mindsdb / mindsdb / libs / model_examination / probabilistic_validator.py View on Github external
"""
        # Register an observation in the validator's internal buffers

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
        """
        predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
        except:
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence
            self.X_buff.append(X)
            self.Y_buff.append(real_value_b)
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self.X_buff.append(features_existence)
            self.Y_buff.append(real_value_b == predicted_value_b)
github mindsdb / mindsdb / mindsdb / libs / model_examination / probabilistic_validator.py View on Github external
def evaluate_prediction_accuracy(self, features_existence, predicted_value):
        """
        Ask the probabilistic model how trustworthy a given prediction is.

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
        """
        if self.buckets is not None:
            # Build one input row: a one-hot vector marking the predicted value's
            # bucket (len(buckets) + 1 positions) followed by the existence flags.
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = [X + features_existence]
        else:
            # No buckets available: the existence flags are the whole input row.
            X = [features_existence]

        # np.errstate scopes the "ignore divide" setting to this call and restores
        # the caller's settings on exit, including when predict_proba raises.
        with np.errstate(divide='ignore'):
            distribution = self._probabilistic_model.predict_proba(np.array(X))

        if self.buckets is not None:
            return ProbabilityEvaluation(self.buckets, distribution[0].tolist(), predicted_value).most_likely_probability

        # No buckets: return the second entry of the first (only) row of predict_proba's output.
        return distribution[0][1]
github mindsdb / mindsdb / mindsdb / libs / model_examination / probabilistic_validator.py View on Github external
:param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
        """
        nr_missing_features = len([x for x in features_existence if x is False or x is 0])

        predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
        except:
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence
            self.X_buff.append(X)
            self.Y_buff.append(real_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            if nr_missing_features == 0:
                if predicted_value_b not in self.bucket_accuracy:
                    self.bucket_accuracy[predicted_value_b] = []
                self.bucket_accuracy[predicted_value_b].append(int(real_value_b == predicted_value_b))
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self.X_buff.append(features_existence)
github mindsdb / mindsdb / mindsdb / libs / model_examination / probabilistic_validator.py View on Github external
:param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
        """
        nr_missing_features = len([x for x in features_existence if x is False or x is 0])

        predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
        except:
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence
            self.X_buff.append(X)
            self.Y_buff.append(real_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            if nr_missing_features == 0:
                if predicted_value_b not in self.bucket_accuracy:
                    self.bucket_accuracy[predicted_value_b] = []
                self.bucket_accuracy[predicted_value_b].append(int(real_value_b == predicted_value_b))
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self.X_buff.append(features_existence)
            self.Y_buff.append(real_value_b == predicted_value_b)
github mindsdb / mindsdb / mindsdb / libs / model_examination / probabilistic_validator.py View on Github external
# Register an observation in the validator's internal buffers

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
        """
        predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
        except:
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence
            self.X_buff.append(X)
            self.Y_buff.append(real_value_b)
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self.X_buff.append(features_existence)
            self.Y_buff.append(real_value_b == predicted_value_b)
github mindsdb / mindsdb / mindsdb / libs / helpers / probabilistic_validator.py View on Github external
:param predicted_value: The predicted value/label
        :param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
        """
        try:
            predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
        except:
            predicted_value = None

        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
        except:
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats, hmd)
            real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats, hmd)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence

            self._X_buff.append(X)
            self._Y_buff.append(real_value_b)
            self._real_buckets_buff = self._Y_buff
            self._predicted_buckets_buff.append(predicted_value_b)

            if is_original_data:
                self._original_real_buckets_buff.append(real_value_b)
                self._original_predicted_buckets_buff.append(predicted_value_b)


            # If no column is ignored, compute the accuracy for this bucket
            nr_missing_features = len([x for x in features_existence if x is False or x is 0])