How to use the featuretools.variable_types module in featuretools

To help you get started, we’ve selected a few featuretools examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github FeatureLabs / featuretools / featuretools / computational_backends / feature_set_calculator.py View on Github external
def _necessary_columns(self, entity, feature_names):
        # We have to keep all Id columns because we don't know what forward
        # relationships will come from this node.
        index_columns = {v.id for v in entity.variables
                         if isinstance(v, (variable_types.Index,
                                           variable_types.Id,
                                           variable_types.TimeIndex))}
        features = (self.feature_set.features_by_name[name]
                    for name in feature_names)
        feature_columns = {f.variable.id for f in features
                           if isinstance(f, IdentityFeature)}
        return list(index_columns | feature_columns)
github FeatureLabs / featuretools / featuretools / entityset / entity.py View on Github external
def set_secondary_time_index(self, secondary_time_index):
    """Validate and store the secondary time index mapping.

    For every ``time_index -> columns`` pair a sample value is type-checked
    with ``_check_time_type``: the first row of ``self.df[time_index]``,
    or the default value for the variable's pandas dtype when the
    dataframe is empty.

    Args:
        secondary_time_index: mapping from a time index column id to the
            list of column ids associated with it. Each list is modified
            in place so that it contains its own time index column.

    Raises:
        TypeError: if the sample value is not recognised as a time type,
            or if its time type differs from ``self.entityset.time_type``.
    """
    for time_index, columns in secondary_time_index.items():
        # An empty dataframe has no row to sample, so fall back to the
        # default value registered for the variable's pandas dtype.
        sample = (
            vtypes.DEFAULT_DTYPE_VALUES[self[time_index]._default_pandas_dtype]
            if self.df.empty
            else self.df[time_index].iloc[0]
        )

        detected_type = _check_time_type(sample)
        if detected_type is None:
            raise TypeError("%s time index not recognized as numeric or"
                            " datetime" % (self.id))
        if self.entityset.time_type != detected_type:
            raise TypeError("%s time index is %s type which differs from"
                            " other entityset time indexes" %
                            (self.id, detected_type))

        # The time index column itself must be part of its column list.
        if time_index not in columns:
            columns.append(time_index)

    self.secondary_time_index = secondary_time_index
github FeatureLabs / featuretools / featuretools / utils / entity_utils.py View on Github external
def convert_variable_data(df, column_id, new_type, **kwargs):
    """Convert dataframe's variable to different type.
    """
    if df[column_id].empty:
        return df
    if new_type == vtypes.Numeric:
        orig_nonnull = df[column_id].dropna().shape[0]
        df[column_id] = pd.to_numeric(df[column_id], errors='coerce')
        # This will convert strings to nans
        # If column contained all strings, then we should
        # just raise an error, because that shouldn't have
        # been converted to numeric
        nonnull = df[column_id].dropna().shape[0]
        if nonnull == 0 and orig_nonnull != 0:
            raise TypeError("Attempted to convert all string column {} to numeric".format(column_id))
    elif issubclass(new_type, vtypes.Datetime):
        format = kwargs.get("format", None)
        # TODO: if float convert to int?
        df[column_id] = pd.to_datetime(df[column_id], format=format,
                                       infer_datetime_format=True)
    elif new_type == vtypes.Boolean:
        map_dict = {kwargs.get("true_val", True): True,
github FeatureLabs / featuretools / featuretools / synthesis / deep_feature_synthesis.py View on Github external
def _features_by_type(self, all_features, entity, variable_type, max_depth):
        selected_features = []

        if max_depth is not None and max_depth < 0:
            return selected_features

        for feat in all_features[entity.id]:
            f = all_features[entity.id][feat]

            if (variable_type == variable_types.PandasTypes._all or
                    f.variable_type == variable_type or
                    any(issubclass(f.variable_type, vt) for vt in variable_type)):
                if ((max_depth is None or self._get_depth(f) <= max_depth) and
                        (self.max_hlevel is None or
                         self._max_hlevel(f) <= self.max_hlevel)):
                    selected_features.append(f)

        return selected_features
github FeatureLabs / featuretools / featuretools / utils / wrangle.py View on Github external
def _check_time_against_column(time, time_column):
    '''
    Check to make sure that time is compatible with time_column,
    where time could be a timestamp, or a Timedelta, number, or None,
    and time_column is a Variable. Compatibility means that
    arithmetic can be performed between time and elements of time_columnj

    If time is None, then we don't care if arithmetic can be performed
    (presumably it won't ever be performed)
    '''
    if time is None:
        return True
    elif isinstance(time, (int, float)):
        return isinstance(time_column,
                          variable_types.Numeric)
    elif isinstance(time, (pd.Timestamp, datetime, pd.DateOffset)):
        return isinstance(time_column,
                          variable_types.Datetime)
    elif isinstance(time, Timedelta):
        return (isinstance(time_column, (variable_types.Datetime, variable_types.DatetimeTimeIndex)) or
                (isinstance(time_column, (variable_types.Ordinal, variable_types.Numeric, variable_types.TimeIndex)) and
                 time.unit not in Timedelta._time_units))
    else:
        return False
github HDI-Project / MLBlocks / examples / pipelines / multitable / multitable.py View on Github external
def make_entity_set(orders_table, order_products_table):
    """Assemble the "instacart" EntitySet from the two input dataframes.

    Steps performed, in order:

    1. register ``order_products_table`` as entity ``order_products``
       (index ``order_product_id``, time index ``order_time``), declaring
       ``aisle_id`` as Categorical and ``reordered`` as Boolean;
    2. register ``orders_table`` as entity ``orders`` (index ``order_id``,
       time index ``order_time``);
    3. relate ``orders.order_id`` to ``order_products.order_id``;
    4. normalize ``orders`` into a new ``users`` entity indexed by
       ``user_id``.

    NOTE(review): the visible excerpt ends without returning ``es`` --
    confirm against the full source whether more statements follow.
    """
    es = ft.EntitySet("instacart")

    order_product_types = {
        "aisle_id": ft.variable_types.Categorical,
        "reordered": ft.variable_types.Boolean,
    }
    es.entity_from_dataframe(
        entity_id="order_products",
        dataframe=order_products_table,
        index="order_product_id",
        variable_types=order_product_types,
        time_index="order_time",
    )

    es.entity_from_dataframe(
        entity_id="orders",
        dataframe=orders_table,
        index="order_id",
        time_index="order_time",
    )

    # Both entities must already be registered before they can be related.
    parent_variable = es["orders"]["order_id"]
    child_variable = es["order_products"]["order_id"]
    es.add_relationship(ft.Relationship(parent_variable, child_variable))

    es.normalize_entity(
        base_entity_id="orders",
        new_entity_id="users",
        index="user_id",
    )
github FeatureLabs / featuretools / featuretools / entityset / entity.py View on Github external
from featuretools.utils.entity_utils import (
    col_is_datetime,
    convert_all_variable_data,
    convert_variable_data,
    get_linked_vars,
    infer_variable_types
)
from featuretools.utils.wrangle import (
    _check_time_type,
    _check_timedelta,
    _dataframes_equal
)

# Logger registered under the 'featuretools.entityset' name.
logger = logging.getLogger('featuretools.entityset')

# Module-level shorthands for the dtype groupings exposed on
# vtypes.PandasTypes.
_numeric_types = vtypes.PandasTypes._pandas_numerics
# The single `_categorical` entry is wrapped in a one-element list.
_categorical_types = [vtypes.PandasTypes._categorical]
_datetime_types = vtypes.PandasTypes._pandas_datetimes


class Entity(object):
    """Represents an entity in a Entityset, and stores relevant metadata and data

    An Entity is analogous to a table in a relational database

    See Also:
        :class:`.Relationship`, :class:`.Variable`, :class:`.EntitySet`

    """

    def __init__(self, id, df, entityset, variable_types=None,
                 index=None, time_index=None, secondary_time_index=None,
github FeatureLabs / featuretools / featuretools / entityset / entity.py View on Github external
def set_time_index(self, variable_id, already_sorted=False):
        # check time type
        if self.df.empty:
            time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[variable_id]._default_pandas_dtype]
        else:
            time_to_check = self.df[variable_id].iloc[0]

        time_type = _check_time_type(time_to_check)
        if time_type is None:
            raise TypeError("%s time index not recognized as numeric or"
                            " datetime" % (self.id))

        if self.entityset.time_type is None:
            self.entityset.time_type = time_type
        elif self.entityset.time_type != time_type:
            raise TypeError("%s time index is %s type which differs from"
                            " other entityset time indexes" %
                            (self.id, time_type))

        # use stable sort