def _necessary_columns(self, entity, feature_names):
    # We have to keep all Id columns because we don't know what forward
    # relationships will come from this node.
    index_columns = {v.id for v in entity.variables
                     if isinstance(v, (variable_types.Index,
                                       variable_types.Id,
                                       variable_types.TimeIndex))}
    features = (self.feature_set.features_by_name[name]
                for name in feature_names)
    feature_columns = {f.variable.id for f in features
                       if isinstance(f, IdentityFeature)}
    return list(index_columns | feature_columns)
def set_secondary_time_index(self, secondary_time_index):
    for time_index, columns in secondary_time_index.items():
        if self.df.empty:
            time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[time_index]._default_pandas_dtype]
        else:
            time_to_check = self.df[time_index].iloc[0]
        time_type = _check_time_type(time_to_check)
        if time_type is None:
            raise TypeError("%s time index not recognized as numeric or"
                            " datetime" % (self.id))
        if self.entityset.time_type != time_type:
            raise TypeError("%s time index is %s type which differs from"
                            " other entityset time indexes" %
                            (self.id, time_type))
        if time_index not in columns:
            columns.append(time_index)
    self.secondary_time_index = secondary_time_index
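# A minimal sketch of the dict shape set_secondary_time_index expects: each
# secondary time index column maps to the list of columns whose values become
# known at that time (the method appends the time index itself if missing).
# In practice this dict is usually passed through entity_from_dataframe; the
# table below is made-up illustrative data.
import pandas as pd
import featuretools as ft

customers_df = pd.DataFrame({
    "customer_id": [1, 2],
    "join_date": pd.to_datetime(["2019-01-01", "2019-02-01"]),
    "cancel_date": pd.to_datetime(["2019-06-01", "2019-07-01"]),
    "cancel_reason": ["too expensive", "not used"],
})

es = ft.EntitySet("demo")
es.entity_from_dataframe(
    entity_id="customers",
    dataframe=customers_df,
    index="customer_id",
    time_index="join_date",
    secondary_time_index={"cancel_date": ["cancel_reason"]})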
def convert_variable_data(df, column_id, new_type, **kwargs):
    """Convert dataframe's variable to different type.
    """
    if df[column_id].empty:
        return df
    if new_type == vtypes.Numeric:
        orig_nonnull = df[column_id].dropna().shape[0]
        df[column_id] = pd.to_numeric(df[column_id], errors='coerce')
        # This will convert strings to nans
        # If column contained all strings, then we should
        # just raise an error, because that shouldn't have
        # been converted to numeric
        nonnull = df[column_id].dropna().shape[0]
        if nonnull == 0 and orig_nonnull != 0:
            raise TypeError("Attempted to convert all string column {} to numeric".format(column_id))
    elif issubclass(new_type, vtypes.Datetime):
        format = kwargs.get("format", None)
        # TODO: if float convert to int?
        df[column_id] = pd.to_datetime(df[column_id], format=format,
                                       infer_datetime_format=True)
    elif new_type == vtypes.Boolean:
        map_dict = {kwargs.get("true_val", True): True,
                    kwargs.get("false_val", False): False}
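# A minimal usage sketch for convert_variable_data, assuming a toy DataFrame
# whose "amount" column holds numeric strings. Converting to vtypes.Numeric
# coerces with pd.to_numeric, so unparseable strings become NaN; a column of
# nothing but such strings would raise the TypeError above instead.
import pandas as pd
from featuretools import variable_types as vtypes
from featuretools.utils.entity_utils import convert_variable_data

df = pd.DataFrame({"amount": ["1.5", "2", "not a number"]})
df = convert_variable_data(df, "amount", vtypes.Numeric)
print(df["amount"].tolist())  # [1.5, 2.0, nan]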
def _features_by_type(self, all_features, entity, variable_type, max_depth):
    selected_features = []
    if max_depth is not None and max_depth < 0:
        return selected_features
    for feat in all_features[entity.id]:
        f = all_features[entity.id][feat]
        if (variable_type == variable_types.PandasTypes._all or
                f.variable_type == variable_type or
                any(issubclass(f.variable_type, vt) for vt in variable_type)):
            if ((max_depth is None or self._get_depth(f) <= max_depth) and
                    (self.max_hlevel is None or
                     self._max_hlevel(f) <= self.max_hlevel)):
                selected_features.append(f)
    return selected_features
def _check_time_against_column(time, time_column):
    '''
    Check to make sure that time is compatible with time_column,
    where time could be a timestamp, a Timedelta, a number, or None,
    and time_column is a Variable. Compatibility means that
    arithmetic can be performed between time and elements of time_column.
    If time is None, then we don't care whether arithmetic can be performed
    (presumably it won't ever be performed).
    '''
    if time is None:
        return True
    elif isinstance(time, (int, float)):
        return isinstance(time_column,
                          variable_types.Numeric)
    elif isinstance(time, (pd.Timestamp, datetime, pd.DateOffset)):
        return isinstance(time_column,
                          variable_types.Datetime)
    elif isinstance(time, Timedelta):
        return (isinstance(time_column, (variable_types.Datetime, variable_types.DatetimeTimeIndex)) or
                (isinstance(time_column, (variable_types.Ordinal, variable_types.Numeric, variable_types.TimeIndex)) and
                 time.unit not in Timedelta._time_units))
    else:
        return False
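# An illustrative sketch of the compatibility rules above. It assumes
# _check_time_against_column is in scope and builds a tiny EntitySet so we
# have real Variable objects to pass as time_column; the data is made up.
import pandas as pd
import featuretools as ft

events = pd.DataFrame({
    "id": [0, 1],
    "amount": [1.5, 2.0],
    "ts": pd.to_datetime(["2020-01-01", "2020-01-02"]),
})
es = ft.EntitySet("demo")
es.entity_from_dataframe(entity_id="events", dataframe=events,
                         index="id", time_index="ts")

numeric_var = es["events"]["amount"]   # Numeric variable
datetime_var = es["events"]["ts"]      # DatetimeTimeIndex (a Datetime subclass)

_check_time_against_column(5, numeric_var)                            # True
_check_time_against_column(5, datetime_var)                           # False
_check_time_against_column(pd.Timestamp("2020-01-03"), datetime_var)  # True
_check_time_against_column(None, numeric_var)                         # True, no arithmetic needed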
def make_entity_set(orders_table, order_products_table):
    es = ft.EntitySet("instacart")
    es.entity_from_dataframe(
        entity_id="order_products",
        dataframe=order_products_table,
        index="order_product_id",
        variable_types={
            "aisle_id": ft.variable_types.Categorical,
            "reordered": ft.variable_types.Boolean
        },
        time_index="order_time")
    es.entity_from_dataframe(
        entity_id="orders",
        dataframe=orders_table,
        index="order_id",
        time_index="order_time")
    es.add_relationship(
        ft.Relationship(es["orders"]["order_id"],
                        es["order_products"]["order_id"]))
    es.normalize_entity(
        base_entity_id="orders", new_entity_id="users", index="user_id")
from featuretools.utils.entity_utils import (
    col_is_datetime,
    convert_all_variable_data,
    convert_variable_data,
    get_linked_vars,
    infer_variable_types
)
from featuretools.utils.wrangle import (
    _check_time_type,
    _check_timedelta,
    _dataframes_equal
)
logger = logging.getLogger('featuretools.entityset')
_numeric_types = vtypes.PandasTypes._pandas_numerics
_categorical_types = [vtypes.PandasTypes._categorical]
_datetime_types = vtypes.PandasTypes._pandas_datetimes
class Entity(object):
    """Represents an entity in an EntitySet, and stores relevant metadata and data

    An Entity is analogous to a table in a relational database

    See Also:
        :class:`.Relationship`, :class:`.Variable`, :class:`.EntitySet`
    """
    def __init__(self, id, df, entityset, variable_types=None,
                 index=None, time_index=None, secondary_time_index=None,
def set_time_index(self, variable_id, already_sorted=False):
    # check time type
    if self.df.empty:
        time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[variable_id]._default_pandas_dtype]
    else:
        time_to_check = self.df[variable_id].iloc[0]
    time_type = _check_time_type(time_to_check)
    if time_type is None:
        raise TypeError("%s time index not recognized as numeric or"
                        " datetime" % (self.id))
    if self.entityset.time_type is None:
        self.entityset.time_type = time_type
    elif self.entityset.time_type != time_type:
        raise TypeError("%s time index is %s type which differs from"
                        " other entityset time indexes" %
                        (self.id, time_type))
    # use stable sort