How to use the mlprimitives.custom.feature_extraction.FeatureExtractor function in mlprimitives

To help you get started, we’ve selected a few mlprimitives examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github HDI-Project / MLPrimitives / tests / custom / test_feature_extraction.py View on Github external
def test_fit_auto_pandas(self):
        class FE(FeatureExtractor):
            _detect_features = Mock(return_value=['a', 'b'])
            _fit = Mock()

        fe = FE(features='auto')
        X = pd.DataFrame({
            'a': ['a', 'b', 'c'],
            'b': ['d', 'e', 'f'],
            'c': [1, 2, 3]
        })

        fe.fit(X)

        assert fe._features == ['a', 'b']
        assert fe._detect_features.called_once_with(X)
        expected_calls = [
            ((pd.Series(['a', 'b', 'c']), ), {}),
github HDI-Project / MLPrimitives / mlprimitives / custom / feature_extraction.py View on Github external
self.vectorizers = dict()
        super(StringVectorizer, self).fit(X)

    def _fit(self, x):
        """Fit a ``CountVectorizer`` to the column and store it under the column name."""
        column_vectorizer = CountVectorizer(**self.kwargs)
        cleaned = x.fillna('').astype(str)
        column_vectorizer.fit(cleaned)
        self.vectorizers[x.name] = column_vectorizer

    def _transform(self, x):
        """Turn the column into its bag-of-words representation.

        Uses the vectorizer previously fitted for this column and returns a
        DataFrame with one count column per vocabulary term, each named
        ``<column name>_<term>`` and aligned on the input index.
        """
        fitted = self.vectorizers[x.name]
        cleaned = x.fillna('').astype(str)
        counts = fitted.transform(cleaned)
        term_columns = [
            '{}_{}'.format(x.name, term)
            for term in fitted.get_feature_names()
        ]
        return pd.DataFrame(counts.toarray(), columns=term_columns, index=x.index)


class DatetimeFeaturizer(FeatureExtractor):
    """Extract numeric calendar features from datetime columns.

    Each detected datetime column is expanded into integer columns for the
    year, month, day, weekday and hour, each prefixed with the original
    column name.
    """

    def _detect_features(self, X):
        """Return the names of the columns of ``X`` with a datetime dtype."""
        return list(X.select_dtypes('datetime').columns)

    def _transform(self, x):
        """Expand a datetime Series into a DataFrame of calendar features.

        Args:
            x (pandas.Series): datetime column to expand.

        Returns:
            pandas.DataFrame: one integer column per extracted feature,
            named ``<original name>_<feature>``.
        """
        prefix = x.name + '_'
        features = {
            prefix + 'year': x.dt.year,
            prefix + 'month': x.dt.month,
            prefix + 'day': x.dt.day,
            # Bug fix: this entry previously duplicated ``x.dt.day`` instead
            # of extracting the day of the week.
            prefix + 'weekday': x.dt.weekday,
            prefix + 'hour': x.dt.hour,
        }
        return pd.DataFrame(features)
github HDI-Project / MLPrimitives / mlprimitives / custom / feature_extraction.py View on Github external
def fit(self, X, y=None):
        self.encoders = dict()
        super(CategoricalEncoder, self).fit(X)

    def _fit(self, x):
        """Create and fit a ``OneHotLabelEncoder`` for the given column."""
        column_encoder = OneHotLabelEncoder(x.name, self.max_labels, self.dropna)
        column_encoder.fit(x)
        self.encoders[x.name] = column_encoder

    def _transform(self, x):
        """One-hot encode the column using the encoder fitted for it."""
        return self.encoders[x.name].transform(x)


class StringVectorizer(FeatureExtractor):
    """FeatureExtractor that encodes text features using a scikit-learn CountVectorizer.

    When autodetecting features, only features with dtype ``object`` are considered.

    Optionally, a ``min_words`` can be passed, which allows ignoring features
    that have less than the given value of words in all their occurrences.

    Args:
        copy (bool):
            Whether to make a copy of the input data or modify it in place.
            Defaults to ``True``.
        features (list or str):
            List of features to apply the feature extractor to. If ``'auto'`` is passed,
            try to detect the feature automatically. Defaults to an empty list.
        keep (bool):
            Whether to keep the original features instead of replacing them.
github HDI-Project / MLPrimitives / mlprimitives / custom / feature_extraction.py View on Github external
if self.keep:
                x = X[feature]
            else:
                x = X.pop(feature)

            extracted = self._transform(x)
            X = pd.concat([X, extracted], axis=1)

        return X

    def fit_transform(self, X, y=None):
        """Fit the extractor to ``X`` and return the transformed data in one call."""
        self.fit(X, y)
        transformed = self.transform(X)
        return transformed


class CategoricalEncoder(FeatureExtractor):
    """FeatureExtractor that encodes categorical features using OneHotLabelEncoder.

    When autodetecting features, only features with dtype ``category`` or ``object``
    are considered.

    Optionally, a ``max_unique_ratio`` can be passed, which allows ignoring features
    that have a high number of unique values, such as primary keys.

    Args:
        max_labels (int or None):
            Maximum number of labels to use by feature. Defaults to ``None``.
        max_unique_ratio (int):
            Max proportion of unique values that a feature must have in order
            to be considered a categorical feature. If ``0`` is given, the ratio is ignored.
            Defaults to ``0``.
        dropna (bool):