How to use featuretools - 10 common examples

To help you get started, we’ve selected a few featuretools examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Featuretools / DL-DB / tests / test_atm.py View on Github external
def score_model_baseline(fm, labels, fl, hyperparams):
    """Cross-validate the baseline pipeline on one feature row per customer.

    The feature matrix is reduced to the last row for each ``customer_id``,
    encoded with ``ft.encode_features`` and pruned with
    ``remove_low_information_features``. It is then scored over five
    shuffled, stratified folds.

    NOTE(review): this excerpt ends right after the per-fold ``score`` is
    computed; how ``score`` is accumulated into ``cv_score`` and what the
    function returns is not visible here.

    Args:
        fm: Feature matrix that has a ``customer_id`` index level.
        labels: Labels supporting ``.iloc`` and ``.index`` -- presumably a
            pandas Series indexed by customer id; confirm against the caller.
        fl: Feature definitions passed to ``ft.encode_features`` with ``fm``.
        hyperparams: Raw hyperparameters, normalized by
            ``parse_hyperparams_baseline`` before use.
    """
    # Keep only the last row per customer, then restore customer_id as the index.
    baseline_fm = (fm.reset_index('customer_id', drop=False)
                     .drop_duplicates('customer_id', keep='last')
                     .set_index('customer_id'))
    baseline_fm, baseline_fl = ft.encode_features(baseline_fm, fl)
    baseline_fm, baseline_fl = remove_low_information_features(baseline_fm, baseline_fl)

    hyperparams = parse_hyperparams_baseline(hyperparams)
    print("HYPERPARAMS:", hyperparams)
    cv_score = []
    n_splits = 5
    # NOTE(review): shuffle=True without a random_state, so folds differ between runs.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True)
    # The labels are passed as both X and y: only their values drive the stratification.
    for train_index, test_index in splitter.split(labels, labels):
        baseline_train_labels = labels.iloc[train_index]
        baseline_test_labels = labels.iloc[test_index]
        # Select feature rows by label index so features and labels stay aligned.
        baseline_train_fm = baseline_fm.loc[baseline_train_labels.index, :]
        baseline_test_fm = baseline_fm.loc[baseline_test_labels.index, :]

        score = score_baseline_pipeline(baseline_train_fm, baseline_train_labels,
                                        baseline_test_fm, baseline_test_labels,
                                        **hyperparams)
github FeatureLabs / featuretools-tsfresh-primitives / featuretools_tsfresh_primitives / test_primitives.py View on Github external
def test_all_primitives(entityset, parameters):
    """Run DFS on ``sessions`` with every aggregation primitive.

    Each aggregation primitive class found in ``primitives`` is instantiated
    with the keyword arguments registered for it in ``parameters`` (none when
    it has no entry). The test passes when the resulting feature matrix is
    not empty.
    """
    agg_primitives = []
    for name in primitives:
        primitive_class = primitives[name]
        # Only aggregation primitives are exercised here.
        if not issubclass(primitive_class, ft.primitives.AggregationPrimitive):
            continue
        init_kwargs = parameters.get(name, {})
        agg_primitives.append(primitive_class(**init_kwargs))

    feature_matrix, _ = ft.dfs(
        entityset=entityset,
        target_entity='sessions',
        agg_primitives=agg_primitives,
    )
    assert not feature_matrix.empty
github FeatureLabs / featuretools-tsfresh-primitives / featuretools_tsfresh_primitives / test_primitives.py View on Github external
    is_agg_primitive = lambda name: issubclass(primitives[name], ft.primitives.AggregationPrimitive)
    construct_primitive = lambda name: primitives[name](**parameters.get(name, {}))
github HDI-Project / MLBlocks / mlblocks / primitives / custom / preprocessors / multitable / dfs.py View on Github external
def fit(self, X, **kwargs):
        """Generate DFS feature definitions and store them on ``self.features``.

        ``X`` is handed to ``ft.dfs`` as the cutoff times, the search depth is
        taken from ``self.max_depth``, and the call is made with
        ``features_only=True``. Any extra keyword arguments are forwarded to
        ``ft.dfs`` unchanged.
        """
        feature_defs = ft.dfs(cutoff_time=X, features_only=True, max_depth=self.max_depth, **kwargs)
        self.features = feature_defs
github FeatureLabs / nlp_primitives / nlp_primitives / lsa.py View on Github external
[[0.0, 0.0, 0.01], [0.0, 0.0, 0.0]]

        Now, if we change the values of the input corpus, to something that better resembles
        the given text, the same given input text will result in a different, more discerning,
        output. Also, NaN values are handled, as well as strings without words.

        >>> lsa = LSA()
        >>> x = ["the earth is round", "", np.NaN, ".,/"]
        >>> res = lsa(x).tolist()
        >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
        >>> res
        [[0.01, 0.0, nan, 0.0], [0.0, 0.0, nan, 0.0]]

    """
    name = "lsa"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def __init__(self):
        """Fit the TF-IDF + truncated-SVD pipeline on the NLTK Brown corpus.

        If the corpus cannot be found locally, a download is attempted and
        the lookup is retried once.

        Raises:
            LookupError: Presumably when the Brown corpus is still unavailable
                after the download attempt (raised by the retried lookup).
        """
        # TODO: allow user to use own corpus
        self.number_output_features = 2
        self.n = 2

        try:
            brown = nltk.corpus.brown.sents()
        except LookupError:
            nltk.download('brown')
            brown = nltk.corpus.brown.sents()

        # Train only after the corpus has been loaded. Doing this in a
        # ``finally`` clause would also run when loading failed, where
        # ``brown`` is unbound and the resulting UnboundLocalError would
        # hide the real error.
        self.trainer = make_pipeline(TfidfVectorizer(), TruncatedSVD())
        self.trainer.fit([" ".join(sent) for sent in brown])
github FeatureLabs / featuretools-tsfresh-primitives / featuretools_tsfresh_primitives / primitives / linear_trend.py View on Github external
class LinearTrend(AggregationPrimitive):
    """Fit a linear least-squares regression of the time series values
    against the sequence 0, 1, ..., len(series) - 1 and report one attribute
    of the fit. The signal is assumed to be uniformly sampled; time stamps
    are not used when fitting the model.

    Args:
        attr (str) : Selects which characteristic of the fit is returned.
            Possible extracted attributes are:
                ['pvalue', 'rvalue', 'intercept', 'slope', 'stderr'].

    Docstring source:
    https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.linear_trend
    """
    name = "linear_trend"
    input_types = [Numeric]
    return_type = Numeric
    stack_on_self = False

    def __init__(self, attr):
        self.attr = attr

    def get_function(self):
        """Return a callable that computes the configured attribute for one series."""
        def function(values):
            # A single parameter set is requested, so only the first
            # (name, value) result is relevant; keep its value.
            results = list(linear_trend(values, [{'attr': self.attr}]))
            first_result = results[0]
            return first_result[1]

        return function
github FeatureLabs / featuretools-tsfresh-primitives / featuretools_tsfresh_primitives / primitives / maximum.py View on Github external
from featuretools.primitives import AggregationPrimitive
from featuretools.variable_types import Numeric
from tsfresh.feature_extraction.feature_calculators import maximum


class Maximum(AggregationPrimitive):
    """Aggregates a numeric time series x to its highest value.

    Docstring source:
    https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.maximum
    """
    name = "maximum"
    stack_on_self = False
    input_types = [Numeric]
    return_type = Numeric

    def get_function(self):
        """Return the tsfresh ``maximum`` calculator as-is."""
        return maximum
github FeatureLabs / featuretools-tsfresh-primitives / featuretools_tsfresh_primitives / primitives / quantile.py View on Github external
from featuretools.variable_types import Numeric
from tsfresh.feature_extraction.feature_calculators import quantile


class Quantile(AggregationPrimitive):
    """Calculates the q quantile of x, i.e. the value of x that is greater
    than q% of the ordered values from x.

    Args:
        q (float) : The quantile to calculate.

    Docstring source:
    https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.quantile
    """
    name = "quantile"
    stack_on_self = False
    input_types = [Numeric]
    return_type = Numeric

    def __init__(self, q):
        self.q = q

    def get_function(self):
        """Return a callable that computes the configured quantile of one series."""
        def function(values):
            # ``self.q`` is read at call time, not when the callable is created.
            result = quantile(values, q=self.q)
            return result

        return function
github FeatureLabs / featuretools-tsfresh-primitives / featuretools_tsfresh_primitives / primitives / count_below_mean.py View on Github external
from featuretools.primitives import AggregationPrimitive
from featuretools.variable_types import Numeric
from tsfresh.feature_extraction.feature_calculators import count_below_mean


class CountBelowMean(AggregationPrimitive):
    """Counts the values in x that are lower than the mean of x.

    Docstring source:
    https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.count_below_mean
    """
    name = "count_below_mean"
    stack_on_self = False
    input_types = [Numeric]
    return_type = Numeric

    def get_function(self):
        """Return the tsfresh ``count_below_mean`` calculator as-is."""
        return count_below_mean
github FeatureLabs / featuretools / featuretools / utils / entity_utils.py View on Github external
# catch cases where object dtype cannot be interpreted as a string
                try:
                    avg_length = sample.str.len().mean()
                    if avg_length > 50:
                        inferred_type = vtypes.Text
                except AttributeError:
                    pass

        elif df[variable].dtype == "bool":
            inferred_type = vtypes.Boolean

        elif pdtypes.is_categorical_dtype(df[variable].dtype):
            inferred_type = vtypes.Categorical

        elif pdtypes.is_numeric_dtype(df[variable].dtype):
            inferred_type = vtypes.Numeric

        elif col_is_datetime(df[variable]):
            inferred_type = vtypes.Datetime

        elif len(df[variable]):
            sample = df[variable] \
                .sample(min(10000, df[variable].nunique(dropna=False)))

            unique = sample.unique()
            percent_unique = sample.size / len(unique)

            if percent_unique < .05:
                inferred_type = vtypes.Categorical
            else:
                inferred_type = vtypes.Numeric