How to use the featuretools.variable_types.Discrete class in featuretools

To help you get started, we’ve selected a few featuretools examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github FeatureLabs / featuretools / featuretools / entityset / base_entityset.py View on Github external
relationship (Relationship) : Instance of new
                relationship to be added.
        """
        if relationship in self.relationships:
            logger.warning(
                "Not adding duplicate relationship: %s", relationship)
            return self

        # _operations?

        # this is a new pair of entities
        child_e = relationship.child_entity
        child_v = relationship.child_variable.id
        parent_e = relationship.parent_entity
        parent_v = relationship.parent_variable.id
        if not isinstance(child_e[child_v], vtypes.Discrete):
            child_e.convert_variable_type(variable_id=child_v,
                                          new_type=vtypes.Id,
                                          convert_data=False)

        if not isinstance(parent_e[parent_v], vtypes.Discrete):
            parent_e.convert_variable_type(variable_id=parent_v,
                                           new_type=vtypes.Index,
                                           convert_data=False)

        parent_dtype = parent_e.df[parent_v].dtype
        child_dtype = child_e.df[child_v].dtype
        msg = "Unable to add relationship because {} in {} is Pandas dtype {}"\
            " and {} in {} is Pandas dtype {}."
        if not is_dtype_equal(parent_dtype, child_dtype):
            raise ValueError(msg.format(parent_v, parent_e.name, parent_dtype,
                                        child_v, child_e.name, child_dtype))
github FeatureLabs / featuretools / featuretools / utils / entity_utils.py View on Github external
nonnull = df[column_id].dropna().shape[0]
        if nonnull == 0 and orig_nonnull != 0:
            raise TypeError("Attempted to convert all string column {} to numeric".format(column_id))
    elif issubclass(new_type, vtypes.Datetime):
        format = kwargs.get("format", None)
        # TODO: if float convert to int?
        df[column_id] = pd.to_datetime(df[column_id], format=format,
                                       infer_datetime_format=True)
    elif new_type == vtypes.Boolean:
        map_dict = {kwargs.get("true_val", True): True,
                    kwargs.get("false_val", False): False,
                    True: True,
                    False: False}
        # TODO: what happens to nans?
        df[column_id] = df[column_id].map(map_dict).astype(np.bool)
    elif not issubclass(new_type, vtypes.Discrete):
        raise Exception("Cannot convert column %s to %s" %
                        (column_id, new_type))
    return df
github Featuretools / DL-DB / dldb / dldb.py View on Github external
categorical_feature_names (list[str], optional): List of feature names that are categorical

        Note: If neither categorical_feature_names nor fl provided, will assume all features of dtype object
        are categorical

        Assumes all provided features can be treated as either categorical or numeric (including Booleans).
        If a data type exists in fm other than a numeric or object type (such as a datetime), then make sure to
        include that feature in categorical_feature_names to treat it as a categorical.
        '''

        if categorical_feature_names is not None:
            self.categorical_feature_names = categorical_feature_names
        elif fl is not None:
            self.categorical_feature_names = [f.get_name() for f in fl
                                              if issubclass(f.variable_type,
                                                            Discrete)
                                              and not f.variable_type == Boolean]
        else:
            self.categorical_feature_names = [c for c in fm.columns
                                              if fm[c].dtype == object]

        self.categorical_vocab = self._gen_categorical_mapping(fm)
        fm = self._map_categorical_fm_to_int(fm)

        self.name_mapping = {c: feature_name_to_valid_keras_name(c)
                             for c in fm.columns}

        self.numeric_columns = [f for f in fm.columns
                                if f not in self.categorical_feature_names]

        self._fit_scaler_imputer(fm)
github FeatureLabs / featuretools / featuretools / feature_base / feature_base.py View on Github external
def __init__(self, base_features, primitive, groupby, name=None):
        """Create a transform feature computed within groups of a discrete variable.

        Args:
            base_features (FeatureBase or iterable[FeatureBase]): Feature(s)
                the primitive is applied to.
            primitive: Transform primitive applied within each group.
            groupby (FeatureBase or Variable): Defines the groups; a bare
                variable is wrapped in an IdentityFeature. Its variable type
                must be Discrete.
            name (str, optional): Explicit name for the resulting feature.
        """
        if not isinstance(groupby, FeatureBase):
            groupby = IdentityFeature(groupby)
        assert issubclass(groupby.variable_type, Discrete)
        self.groupby = groupby

        # Build a fresh list rather than appending to the argument:
        # the previous `base_features.append(groupby)` mutated the caller's
        # list in place, a surprising side effect when the same list is
        # reused to build several features.
        if hasattr(base_features, '__iter__'):
            base_features = list(base_features) + [groupby]
        else:
            base_features = [base_features, groupby]

        super(GroupByTransformFeature, self).__init__(base_features=base_features,
                                                      primitive=primitive,
                                                      name=name)
github FeatureLabs / featuretools / featuretools / primitives / standard / cum_transform_feature.py View on Github external
class CumCount(TransformPrimitive):
    """Calculates the cumulative count.

    Description:
        Returns the running count over a list of values.
        No window is applied: the count at each position
        covers every value up to and including it, and
        `NaN` entries are included in the count.

    Examples:
        >>> cum_count = CumCount()
        >>> cum_count([1, 2, 3, 4, None, 5]).tolist()
        [1, 2, 3, 4, 5, 6]
    """
    name = "cum_count"
    input_types = [[Id], [Discrete]]
    return_type = Numeric
    uses_full_entity = True

    def get_function(self):
        def cum_count(values):
            # The running count is just the 1-based position of each
            # element, regardless of the values themselves (NaNs count too).
            return np.arange(len(values)) + 1

        return cum_count


class CumMean(TransformPrimitive):
    """Calculates the cumulative mean.

    Description:
        Given a list of values, return the cumulative mean
        (or running mean). There is no set window, so the
github FeatureLabs / featuretools / featuretools / primitives / options_utils.py View on Github external
def variable_filter(f, options, groupby=False):
    if groupby and not issubclass(f.variable_type, Discrete):
        return False
    include_vars = 'include_groupby_variables' if groupby else 'include_variables'
    ignore_vars = 'ignore_groupby_variables' if groupby else 'ignore_variables'
    include_entities = 'include_groupby_entities' if groupby else 'include_entities'
    ignore_entities = 'ignore_groupby_entities' if groupby else 'ignore_entities'

    dependencies = f.get_dependencies(deep=True) + [f]
    for base_f in dependencies:
        if isinstance(base_f, IdentityFeature):
            if include_vars in options and base_f.entity.id in options[include_vars]:
                if base_f.get_name() in options[include_vars][base_f.entity.id]:
                    continue  # this is a valid feature, go to next
                else:
                    return False  # this is not an included feature
            if ignore_vars in options and base_f.entity.id in options[ignore_vars]:
                if base_f.get_name() in options[ignore_vars][base_f.entity.id]:
github Featuretools / DL-DB / dldb / preprocessor.py View on Github external
def fit_transform(self, ftens, fl=None, categorical_feature_names=None, labels=None):
        if categorical_feature_names is not None:
            self.categorical_feature_names = categorical_feature_names
        elif fl is not None:
            self.categorical_feature_names = [f.get_name() for f in fl
                                              if issubclass(f.variable_type,
                                                            Discrete)
                                              and not
                                              f.variable_type == Boolean]
        else:
            self.categorical_feature_names = [c for c in ftens.columns
                                              if ftens[c].dtype == object]

        # Can't handle multiindex
        if len(ftens.index.names) > 1:
            index_name = ftens.index.names[0]
            ftens = ftens.reset_index(index_name, drop=False).set_index(index_name)
        self.categorical_vocab = self._gen_categorical_mapping(ftens)

        self.numeric_columns = [f for f in ftens.columns
                                if f not in self.categorical_feature_names]

        ftens = self.fit_transform_scaler_imputer(ftens)
github FeatureLabs / featuretools / featuretools / primitives / standard / aggregation_primitives.py View on Github external
class Mode(AggregationPrimitive):
    """Determines the most commonly repeated value.

    Description:
        Returns the value with the highest number of
        occurrences in the input. Returns `NaN` when the
        input is empty.

    Examples:
        >>> mode = Mode()
        >>> mode(['red', 'blue', 'green', 'blue'])
        'blue'
    """
    name = "mode"
    input_types = [Discrete]
    return_type = None

    def get_function(self):
        def pd_mode(s):
            # Series.mode() returns an empty Series for empty input;
            # .get(0, np.nan) then falls back to NaN.
            modes = s.mode()
            return modes.get(0, np.nan)
        return pd_mode


class Min(AggregationPrimitive):
    """Calculates the smallest value, ignoring `NaN` values.

    Examples:
        >>> min = Min()
        >>> min([1, 2, 3, 4, 5, None])
        1.0
    """
github FeatureLabs / featuretools / featuretools / primitives / standard / aggregation_primitives.py View on Github external
which appear the most frequently. If there are
        fewer than `n` unique values, the output will be
        filled with `NaN`.

    Args:
        n (int): defines "n" in "n most common." Defaults
            to 3.

    Examples:
        >>> n_most_common = NMostCommon(n=2)
        >>> x = ['orange', 'apple', 'orange', 'apple', 'orange', 'grapefruit']
        >>> n_most_common(x).tolist()
        ['orange', 'apple']
    """
    name = "n_most_common"
    input_types = [Discrete]
    return_type = Discrete

    def __init__(self, n=3):
        # One output column per requested value, so keep the output
        # feature count in lockstep with n.
        self.n = self.number_output_features = n

    def get_function(self):
        def n_most_common(x):
            # value_counts sorts by frequency descending, so the first
            # n index entries are the n most common values.
            top = np.array(x.value_counts().index[:self.n])
            missing = self.n - len(top)
            if missing > 0:
                # Fewer than n unique values: pad the output with NaN.
                top = np.append(top, np.full(missing, np.nan))
            return top
        return n_most_common

github FeatureLabs / featuretools / featuretools / selection / variance_selection.py View on Github external
means = feature_matrix[numeric_features].mean(axis=0, skipna=True)
    else:
        stds = feature_matrix.std(axis=0, skipna=True, numeric_only=True)
        means = feature_matrix.mean(axis=0, skipna=True, numeric_only=True)
    cvs = stds / means
    high_variances = cvs[cvs.abs() > cv_threshold]
    if features is None:
        high_variance_feature_names = [f for f in feature_matrix.columns if f in high_variances or f in keep]
    else:
        high_variance_features = [f for f in features if f.get_name() in high_variances.index or f.get_name() in keep]
        high_variance_feature_names = [f.get_name() for f in high_variance_features]

    high_variance_feature_matrix = feature_matrix[high_variance_feature_names]
    if categorical_nunique_ratio is not None:
        if features is not None:
            discrete_features = [f.get_name() for f in features if issubclass(f.variable_type, Discrete)]
            ratio = get_categorical_nunique_ratio(feature_matrix[discrete_features], drop_nonumeric=False)
        else:
            ratio = get_categorical_nunique_ratio(feature_matrix)

        high_ratio = ratio[ratio > categorical_nunique_ratio]
        if features is None:
            high_cat_feature_names = [f for f in feature_matrix if f in high_ratio.index]
        else:
            high_cat_features = [f for f in features if f.get_name() in high_ratio.index]
            high_cat_feature_names = [f.get_name() for f in high_cat_features]
            high_variance_features += high_cat_features
        high_cat_fm = feature_matrix[high_cat_feature_names]
        high_variance_feature_matrix = pd.concat([high_variance_feature_matrix, high_cat_fm], axis=1)
    if features is None:
        return high_variance_feature_matrix
    else: