relationship (Relationship): Instance of the new
    relationship to be added.
"""
if relationship in self.relationships:
logger.warning(
"Not adding duplicate relationship: %s", relationship)
return self
# this is a new pair of entities
child_e = relationship.child_entity
child_v = relationship.child_variable.id
parent_e = relationship.parent_entity
parent_v = relationship.parent_variable.id
# relationship keys must be discrete: coerce the child column to an Id
# and the parent column to an Index, without converting the data itself
if not isinstance(child_e[child_v], vtypes.Discrete):
    child_e.convert_variable_type(variable_id=child_v,
                                  new_type=vtypes.Id,
                                  convert_data=False)
if not isinstance(parent_e[parent_v], vtypes.Discrete):
    parent_e.convert_variable_type(variable_id=parent_v,
                                   new_type=vtypes.Index,
                                   convert_data=False)
parent_dtype = parent_e.df[parent_v].dtype
child_dtype = child_e.df[child_v].dtype
msg = "Unable to add relationship because {} in {} is Pandas dtype {}"\
" and {} in {} is Pandas dtype {}."
if not is_dtype_equal(parent_dtype, child_dtype):
raise ValueError(msg.format(parent_v, parent_e.name, parent_dtype,
child_v, child_e.name, child_dtype))
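# Usage sketch (hedged): adding a relationship with the legacy 0.x
# Featuretools API shown above. The entity and column names below are
# hypothetical, invented for illustration.
import pandas as pd
import featuretools as ft
customers = pd.DataFrame({"customer_id": [1, 2], "age": [34, 51]})
sessions = pd.DataFrame({"session_id": [10, 11, 12], "customer_id": [1, 2, 1]})
es = ft.EntitySet(id="example")
es = es.entity_from_dataframe(entity_id="customers", dataframe=customers,
                              index="customer_id")
es = es.entity_from_dataframe(entity_id="sessions", dataframe=sessions,
                              index="session_id")
# Relationship(parent_variable, child_variable); add_relationship coerces the
# child column to an Id variable if it is not already Discrete (see above)
rel = ft.Relationship(es["customers"]["customer_id"],
                      es["sessions"]["customer_id"])
es = es.add_relationship(rel)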
nonnull = df[column_id].dropna().shape[0]
if nonnull == 0 and orig_nonnull != 0:
raise TypeError("Attempted to convert all-string column {} to numeric".format(column_id))
elif issubclass(new_type, vtypes.Datetime):
format = kwargs.get("format", None)
# TODO: if float convert to int?
df[column_id] = pd.to_datetime(df[column_id], format=format,
infer_datetime_format=True)
elif new_type == vtypes.Boolean:
map_dict = {kwargs.get("true_val", True): True,
kwargs.get("false_val", False): False,
True: True,
False: False}
# TODO: what happens to nans?
df[column_id] = df[column_id].map(map_dict).astype(bool)  # np.bool is removed in NumPy >= 1.24
elif not issubclass(new_type, vtypes.Discrete):
raise Exception("Cannot convert column %s to %s" %
(column_id, new_type))
return df
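# Standalone sketch of the Boolean branch above, using hypothetical
# "t"/"f" flag values passed as true_val/false_val.
import pandas as pd
s = pd.Series(["t", "f", "t", None])
map_dict = {"t": True, "f": False, True: True, False: False}
# Unmapped values (including None) become NaN, and astype(bool) turns NaN
# into True -- the behavior the "what happens to nans?" TODO asks about.
print(s.map(map_dict).astype(bool).tolist())  # [True, False, True, True]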
categorical_feature_names (list[str], optional): List of feature names that are categorical.
    Note: If neither categorical_feature_names nor fl is provided, all features of
    dtype object are assumed to be categorical.
Assumes all provided features can be treated as either categorical or numeric (including
Booleans). If fm contains a dtype other than numeric or object (such as a datetime),
include that feature in categorical_feature_names to treat it as categorical.
'''
if categorical_feature_names is not None:
self.categorical_feature_names = categorical_feature_names
elif fl is not None:
self.categorical_feature_names = [f.get_name() for f in fl
if issubclass(f.variable_type,
Discrete)
and not f.variable_type == Boolean]
else:
self.categorical_feature_names = [c for c in fm.columns
if fm[c].dtype == object]
self.categorical_vocab = self._gen_categorical_mapping(fm)
fm = self._map_categorical_fm_to_int(fm)
self.name_mapping = {c: feature_name_to_valid_keras_name(c)
for c in fm.columns}
self.numeric_columns = [f for f in fm.columns
if f not in self.categorical_feature_names]
self._fit_scaler_imputer(fm)
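# Sketch of the fallback branch above: with neither categorical_feature_names
# nor fl given, object-dtype columns are treated as categorical. The column
# names here are hypothetical.
import pandas as pd
fm = pd.DataFrame({"zip_code": ["60091", "13244"], "amount": [10.5, 20.0]})
print([c for c in fm.columns if fm[c].dtype == object])  # ['zip_code']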
def __init__(self, base_features, primitive, groupby, name=None):
if not isinstance(groupby, FeatureBase):
groupby = IdentityFeature(groupby)
assert issubclass(groupby.variable_type, Discrete)
self.groupby = groupby
# calculate the groupby feature alongside the base features
if hasattr(base_features, '__iter__'):
    base_features.append(groupby)
else:
    base_features = [base_features, groupby]
super(GroupByTransformFeature, self).__init__(base_features=base_features,
primitive=primitive,
name=name)
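# Hedged sketch: DFS builds GroupByTransformFeature instances when
# groupby_trans_primitives is passed (featuretools 0.x API), e.g. a
# per-session cumulative sum over the bundled mock dataset.
import featuretools as ft
es = ft.demo.load_mock_customer(return_entityset=True)
fm, feature_defs = ft.dfs(entityset=es, target_entity="transactions",
                          agg_primitives=[], trans_primitives=[],
                          groupby_trans_primitives=["cum_sum"])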
class CumCount(TransformPrimitive):
"""Calculates the cumulative count.
Description:
Given a list of values, return the cumulative count
(or running count). There is no set window, so the
count at each point is calculated over all prior
values. `NaN` values are counted.
Examples:
>>> cum_count = CumCount()
>>> cum_count([1, 2, 3, 4, None, 5]).tolist()
[1, 2, 3, 4, 5, 6]
"""
name = "cum_count"
input_types = [[Id], [Discrete]]
return_type = Numeric
uses_full_entity = True
def get_function(self):
def cum_count(values):
return np.arange(1, len(values) + 1)
return cum_count
class CumMean(TransformPrimitive):
"""Calculates the cumulative mean.
Description:
Given a list of values, return the cumulative mean
(or running mean). There is no set window, so the
mean at each point is calculated over all prior
values.
"""
def variable_filter(f, options, groupby=False):
if groupby and not issubclass(f.variable_type, Discrete):
return False
include_vars = 'include_groupby_variables' if groupby else 'include_variables'
ignore_vars = 'ignore_groupby_variables' if groupby else 'ignore_variables'
include_entities = 'include_groupby_entities' if groupby else 'include_entities'
ignore_entities = 'ignore_groupby_entities' if groupby else 'ignore_entities'
dependencies = f.get_dependencies(deep=True) + [f]
for base_f in dependencies:
if isinstance(base_f, IdentityFeature):
if include_vars in options and base_f.entity.id in options[include_vars]:
if base_f.get_name() in options[include_vars][base_f.entity.id]:
continue # this is a valid feature, go to next
else:
return False # this is not an included feature
if ignore_vars in options and base_f.entity.id in options[ignore_vars]:
if base_f.get_name() in options[ignore_vars][base_f.entity.id]:
    return False  # this is an ignored feature
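# Hypothetical shape of the options dict that variable_filter consults:
# each option key maps an entity id to the variable names to include or
# ignore. Entity and variable names here are invented for illustration.
options = {
    "include_variables": {"transactions": ["amount"]},
    "ignore_groupby_variables": {"customers": ["zip_code"]},
}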
def fit_transform(self, ftens, fl=None, categorical_feature_names=None, labels=None):
if categorical_feature_names is not None:
self.categorical_feature_names = categorical_feature_names
elif fl is not None:
    self.categorical_feature_names = [f.get_name() for f in fl
                                      if issubclass(f.variable_type, Discrete)
                                      and not f.variable_type == Boolean]
else:
self.categorical_feature_names = [c for c in ftens.columns
if ftens[c].dtype == object]
# Can't handle a MultiIndex: keep only the first index level
if len(ftens.index.names) > 1:
index_name = ftens.index.names[0]
ftens = ftens.reset_index(index_name, drop=False).set_index(index_name)
self.categorical_vocab = self._gen_categorical_mapping(ftens)
self.numeric_columns = [f for f in ftens.columns
if f not in self.categorical_feature_names]
ftens = self.fit_transform_scaler_imputer(ftens)
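# Standalone sketch of the MultiIndex workaround above: only the first
# index level survives. The index names here are hypothetical.
import pandas as pd
idx = pd.MultiIndex.from_tuples([("a", 0), ("b", 1)], names=["id", "time"])
ftens = pd.DataFrame({"v": [1, 2]}, index=idx)
index_name = ftens.index.names[0]
ftens = ftens.reset_index(index_name, drop=False).set_index(index_name)
print(ftens.index.names)  # ['id']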
class Mode(AggregationPrimitive):
"""Determines the most commonly repeated value.
Description:
Given a list of values, return the value with the
highest number of occurrences. If the list is
empty, return `NaN`.
Examples:
>>> mode = Mode()
>>> mode(['red', 'blue', 'green', 'blue'])
'blue'
"""
name = "mode"
input_types = [Discrete]
return_type = None  # the return type matches the input type
def get_function(self):
def pd_mode(s):
return s.mode().get(0, np.nan)
return pd_mode
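# The .get(0, np.nan) above is what returns NaN for an empty input; a
# quick check of both paths:
import numpy as np
import pandas as pd
print(pd.Series(["red", "blue", "green", "blue"]).mode().get(0, np.nan))  # blue
print(pd.Series([], dtype=object).mode().get(0, np.nan))  # nan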
class Min(AggregationPrimitive):
"""Calculates the smallest value, ignoring `NaN` values.
Examples:
>>> min = Min()
>>> min([1, 2, 3, 4, 5, None])
1.0
"""
which appear the most frequently. If there are
fewer than `n` unique values, the output will be
filled with `NaN`.
Args:
n (int): defines "n" in "n most common." Defaults
to 3.
Examples:
>>> n_most_common = NMostCommon(n=2)
>>> x = ['orange', 'apple', 'orange', 'apple', 'orange', 'grapefruit']
>>> n_most_common(x).tolist()
['orange', 'apple']
"""
name = "n_most_common"
input_types = [Discrete]
return_type = Discrete
def __init__(self, n=3):
self.n = n
self.number_output_features = n
def get_function(self):
def n_most_common(x):
array = np.array(x.value_counts().index[:self.n])
if len(array) < self.n:
filler = np.full(self.n - len(array), np.nan)
array = np.append(array, filler)
return array
return n_most_common
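# Sketch of the padding branch: with fewer than n unique values, the
# output array is filled out with NaN (n=4 here for illustration).
import pandas as pd
func = NMostCommon(n=4).get_function()
print(func(pd.Series(["a", "a", "b"])))  # ['a' 'b' nan nan]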
stds = feature_matrix[numeric_features].std(axis=0, skipna=True)
means = feature_matrix[numeric_features].mean(axis=0, skipna=True)
else:
stds = feature_matrix.std(axis=0, skipna=True, numeric_only=True)
means = feature_matrix.mean(axis=0, skipna=True, numeric_only=True)
cvs = stds / means
high_variances = cvs[cvs.abs() > cv_threshold]
if features is None:
high_variance_feature_names = [f for f in feature_matrix.columns if f in high_variances.index or f in keep]
else:
high_variance_features = [f for f in features if f.get_name() in high_variances.index or f.get_name() in keep]
high_variance_feature_names = [f.get_name() for f in high_variance_features]
high_variance_feature_matrix = feature_matrix[high_variance_feature_names]
if categorical_nunique_ratio is not None:
if features is not None:
discrete_features = [f.get_name() for f in features if issubclass(f.variable_type, Discrete)]
ratio = get_categorical_nunique_ratio(feature_matrix[discrete_features], drop_nonumeric=False)
else:
ratio = get_categorical_nunique_ratio(feature_matrix)
high_ratio = ratio[ratio > categorical_nunique_ratio]
if features is None:
high_cat_feature_names = [f for f in feature_matrix if f in high_ratio.index]
else:
high_cat_features = [f for f in features if f.get_name() in high_ratio.index]
high_cat_feature_names = [f.get_name() for f in high_cat_features]
high_variance_features += high_cat_features
high_cat_fm = feature_matrix[high_cat_feature_names]
high_variance_feature_matrix = pd.concat([high_variance_feature_matrix, high_cat_fm], axis=1)
if features is None:
return high_variance_feature_matrix
else:
    return high_variance_feature_matrix, high_variance_features
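# Standalone sketch of the coefficient-of-variation test above, with a
# hypothetical cv_threshold of 0.2: features whose |std/mean| is small
# are treated as near-constant and dropped.
import pandas as pd
fm = pd.DataFrame({"nearly_constant": [100.0, 101.0, 100.0],
                   "high_variance": [1.0, 10.0, 100.0]})
cvs = fm.std(axis=0) / fm.mean(axis=0)
print(cvs[cvs.abs() > 0.2].index.tolist())  # ['high_variance']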