How to use the lightwood.constants.lightwood.COLUMN_DATA_TYPES.CATEGORICAL constant in lightwood

To help you get started, we’ve selected a few lightwood examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github mindsdb / lightwood / lightwood / mixers / boost / boost.py View on Github external
}
            if 'weights' in output_feature:
                self.targets[output_feature['name']]['weights'] = output_feature['weights']
            else:
                self.targets[output_feature['name']]['weights'] = None

        X = []

        for row in train_ds:
            X.append(np.array(row[0]))

        X = np.array(X)
        for target_col_name in self.targets:
            Y = train_ds.get_column_original_data(target_col_name)

            if self.targets[target_col_name]['type'] == COLUMN_DATA_TYPES.CATEGORICAL:
                weight_map = self.targets[target_col_name]['weights']
                if weight_map is None:
                    sample_weight = [1 for x in real]
                else:
                    sample_weight = []
                    for val in Y:
                        sample_weight.append(weight_map[val])

                self.targets[target_col_name]['model'] = GradientBoostingClassifier(n_estimators=600)
                self.targets[target_col_name]['model'].fit(X,Y,sample_weight=sample_weight)

            elif self.targets[target_col_name]['type'] == COLUMN_DATA_TYPES.NUMERIC:
                self.targets[target_col_name]['model'] = GradientBoostingRegressor(n_estimators=600)
                self.targets[target_col_name]['model'].fit(X,Y)
                if self.quantiles is not None:
                    self.targets[target_col_name]['quantile_models'] = {}
github mindsdb / lightwood / lightwood / mixers / nn / nn.py View on Github external
if self.batch_size < self.net.available_devices:
                self.batch_size = self.net.available_devices

            self.awareness_criterion = torch.nn.MSELoss()

            if self.criterion_arr is None:
                self.criterion_arr = []
                self.unreduced_criterion_arr = []
                if ds.output_weights is not None and ds.output_weights is not False:
                    output_weights = torch.Tensor(ds.output_weights).to(self.net.device)
                else:
                    output_weights = None

                for k, output_type in enumerate(self.out_types):
                    if output_type == COLUMN_DATA_TYPES.CATEGORICAL:
                        if output_weights is None:
                            weights_slice = None
                        else:
                            weights_slice = output_weights[ds.out_indexes[k][0]:ds.out_indexes[k][1]]

                        self.criterion_arr.append(TransformCrossEntropyLoss(weight=weights_slice))
                        self.unreduced_criterion_arr.append(TransformCrossEntropyLoss(weight=weights_slice,reduce=False))
                    elif output_type == COLUMN_DATA_TYPES.MULTIPLE_CATEGORICAL:
                        if output_weights is None:
                            weights_slice = None
                        else:
                            weights_slice = output_weights[ds.out_indexes[k][0]:ds.out_indexes[k][1]]

                        self.criterion_arr.append(torch.nn.BCEWithLogitsLoss(weight=weights_slice))
                        self.unreduced_criterion_arr.append(torch.nn.BCEWithLogitsLoss(weight=weights_slice, reduce=False))
                    elif output_type == COLUMN_DATA_TYPES.NUMERIC:
github mindsdb / lightwood / lightwood / api / predictor.py View on Github external
def type_map(col_name):
    """Guess the lightwood column data type for one column of ``from_data``.

    The decision is based on the string form of the column's dtype:

    * ``int64``, ``float64`` or ``timedelta`` -> NUMERIC
    * ``bool`` or ``category`` -> CATEGORICAL
    * anything else -> CATEGORICAL when the column has fewer than 100
      distinct values, or fewer distinct values than a tenth of its
      length; otherwise TEXT.

    ``from_data`` and ``COLUMN_DATA_TYPES`` are not defined in this
    function; they are resolved from the surrounding scope
    (``from_data`` is presumably a pandas DataFrame -- confirm at caller).
    """
    column = from_data[col_name]
    dtype_name = str(column.dtype)

    # NOTE(review): this is an exact string match, so dtypes spelled
    # differently (e.g. other integer/float widths) fall through to the
    # uniqueness heuristic below -- confirm that is intended.
    if dtype_name in ('int64', 'float64', 'timedelta'):
        return COLUMN_DATA_TYPES.NUMERIC
    if dtype_name in ('bool', 'category'):
        return COLUMN_DATA_TYPES.CATEGORICAL

    # Few distinct values (absolute, or relative to the row count) means
    # the column is kept as categorical; otherwise assume free text.
    distinct_count = column.nunique()
    few_distinct = distinct_count < 100 or distinct_count < len(column) / 10
    return COLUMN_DATA_TYPES.CATEGORICAL if few_distinct else COLUMN_DATA_TYPES.TEXT
github mindsdb / lightwood / lightwood / api / predictor.py View on Github external
def type_map(col_name):
    """Map a column of ``from_data`` to a lightwood column data type.

    Rules, applied to ``str(dtype)`` of the column in order:

    1. ``int64`` / ``float64`` / ``timedelta`` give NUMERIC.
    2. ``bool`` / ``category`` give CATEGORICAL.
    3. Otherwise the number of distinct values decides: fewer than 100,
       or fewer than 10% of the number of rows, gives CATEGORICAL;
       anything else is treated as TEXT.

    Both ``from_data`` and ``COLUMN_DATA_TYPES`` are looked up outside
    this function (``from_data`` presumably being a pandas DataFrame --
    verify against the caller).
    """
    series = from_data[col_name]
    pd_type = str(series.dtype)

    numeric_names = ('int64', 'float64', 'timedelta')
    categorical_names = ('bool', 'category')

    if pd_type in numeric_names:
        return COLUMN_DATA_TYPES.NUMERIC
    if pd_type in categorical_names:
        return COLUMN_DATA_TYPES.CATEGORICAL

    # Unknown dtype: low cardinality is kept as categorical, the rest is
    # assumed to be text.
    n_unique = series.nunique()
    if n_unique < 100 or n_unique < len(series) / 10:
        return COLUMN_DATA_TYPES.CATEGORICAL
    return COLUMN_DATA_TYPES.TEXT
github mindsdb / lightwood / lightwood / encoders / text / distilbert.py View on Github external
def prepare_encoder(self, priming_data, training_data=None):
        if self._prepared:
            raise Exception('You can only call "prepare_encoder" once for a given encoder.')

        priming_data = [x if x is not None else '' for x in priming_data]

        self._max_len = min(max([len(x) for x in priming_data]), self._model_max_len)
        self._tokenizer = self._tokenizer_class.from_pretrained(self._pretrained_model_name)
        self._pad_id = self._tokenizer.convert_tokens_to_ids([self._tokenizer.pad_token])[0]
        # @TODO: Support multiple targets if they are all categorical
        # or train for the categorical target if it's a mix (maybe ?)

        # @TODO: Attach a language modeling head and/or use GPT2
        # and/or provide outputs better suited to a LM head (which will be the mixer) if the output is text
        if training_data is not None and 'targets' in training_data and len(training_data['targets']) == 1 and training_data['targets'][0]['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL:

            self._model_type = 'classifier'
            self._model = self._classifier_model_class.from_pretrained(self._pretrained_model_name, num_labels=len(
                set(training_data['targets'][0]['unencoded_output'])) + 1).to(self.device)
            batch_size = 10

            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in self._model.named_parameters() if not any(
                    nd in n for nd in no_decay)], 'weight_decay': 0.000001},
                {'params': [p for n, p in self._model.named_parameters() if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0}
            ]

            optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
            scheduler = get_linear_schedule_with_warmup(