Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _prec(self, df):
return self._column in df.columns
def _transform(self, df, verbose):
inter_df = df
before_count = len(inter_df)
valcount = df[self._column].value_counts()
to_drop = valcount[valcount < self._threshold].index
inter_df = inter_df[~inter_df[self._column].isin(to_drop)]
if verbose:
print("{} rows dropped.".format(before_count - len(inter_df)))
return inter_df
class ColReorder(PdPipelineStage):
"""A pipeline stage that reorders columns.
Parameters
----------
positions : dict
A mapping of column names to their desired positions after reordering.
Columns not included in the mapping will maintain their relative
positions over the non-mapped columns.
Example
-------
>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[8,4,3,7]], columns=['a', 'b', 'c', 'd'])
>>> pdp.ColReorder({'b': 0, 'c': 3}).apply(df)
b a d c
0 4 8 7 3
loc=loc,
column_name=colname,
)
loc += 1
return inter_df
assign_map = {
colname: new_cols[colname] for colname in new_cols.columns
}
return df.assign(**assign_map)
raise TypeError( # pragma: no cover
"Unexpected type generated by applying a function to a DataFrame."
" Only Series and DataFrame are allowed."
)
class ApplyByCols(PdPipelineStage):
"""A pipeline stage applying an element-wise function to columns.
Parameters
----------
columns : str or list-like
Names of columns on which to apply the given function.
func : function
The function to be applied to each element of the given columns.
result_columns : str or list-like, default None
The names of the new columns resulting from the mapping operation. Must
be of the same length as columns. If None, behavior depends on the
drop parameter: If drop is True, the name of the source column is used;
otherwise, the name of the source column is used with the suffix
'_app'.
drop : bool, default True
If set to True, source columns are dropped after being mapped.
prec = _always_true
self._adhoc_transform = transform
self._adhoc_prec = prec
super().__init__(**kwargs)
def _prec(self, df):
return self._adhoc_prec(df)
def _transform(self, df, verbose):
try:
return self._adhoc_transform(df, verbose=verbose)
except TypeError:
return self._adhoc_transform(df)
class PdPipeline(PdPipelineStage, collections.abc.Sequence):
"""A pipeline for processing pandas DataFrame objects.
transformer_getter is useful to avoid applying pipeline stages that are
aimed to filter out items in a big dataset to create a training set for a
machine learning model, for example, but should not be applied on future
individual items to be transformed by the fitted pipeline.
Parameters
----------
stages : list
A list of PdPipelineStage objects making up this pipeline.
transform_getter : callable, optional
A callable that can be applied to the fitted pipeline to produce a
sub-pipeline of it which should be used to transform dataframes after
the pipeline has been fitted. If not given, the fitted pipeline is used
entirely.
"""Basic pdpipe PdPipelineStages."""
import numpy as np
import pandas as pd
import sortedcontainers as sc
import tqdm
from pdpipe.core import PdPipelineStage
from pdpipe.util import out_of_place_col_insert, get_numeric_column_names
from pdpipe.shared import _interpret_columns_param, _list_str
from .exceptions import PipelineApplicationError
class Bin(PdPipelineStage):
"""A pipeline stage that adds a binned version of a column or columns.
If drop is set to True the new columns retain the names of the source
columns; otherwise, the resulting columns gain the suffix '_bin'.
Parameters
----------
bin_map : dict
Maps column labels to bin arrays. The bin array is interpreted as
containing start points of consecutive bins, except for the final
point, assumed to be the end point of the last bin. Additionally, a
bin array implicitly projects a left-most bin containing all elements
smaller than the left-most end point and a right-most bin containing
all elements larger than the right-most end point. For example, the
list [0, 5, 8] is interpreted as the bins (-∞, 0),
[0-5), [5-8) and [8, ∞).
inter_df = inter_df.drop(colname, axis=1)
loc -= 1
inter_df = out_of_place_col_insert(
df=inter_df,
series=source_col.map(self._value_map),
loc=loc,
column_name=new_name,
)
return inter_df
def _always_true(x):
return True
class ApplyToRows(PdPipelineStage):
"""A pipeline stage generating columns by applying a function to each row.
Parameters
----------
func : function
The function to be applied to each row of the processed DataFrame.
colname : single label, default None
The label of the new column resulting from the function application. If
None, 'new_col' is used. Ignored if a DataFrame is generated by the
function (i.e. each row generates a Series rather than a value), in
which case the label of each column in the resulting DataFrame is used.
follow_column : str, default None
Resulting columns will be inserted after this column. If None, new
columns are inserted at the end of the processed DataFrame.
func_desc : str, default None
A function description of the given function; e.g. 'normalizing revenue
'exmsg': ColRename._DEF_COLDRENAME_EXC_MSG.format(columns_str),
'appmsg': ColRename._DEF_COLDRENAME_APP_MSG.format(
suffix, columns_str),
'desc': "Rename column{} with {}".format(suffix, self._rename_map)
}
super_kwargs.update(**kwargs)
super().__init__(**super_kwargs)
def _prec(self, df):
return set(self._rename_map.keys()).issubset(df.columns)
def _transform(self, df, verbose):
return df.rename(columns=self._rename_map)
class DropNa(PdPipelineStage):
"""A pipeline stage that drops null values.
Supports all parameter supported by pandas.dropna function.
Example
-------
>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[1,4],[4,None],[1,11]], [1,2,3], ['a','b'])
>>> pdp.DropNa().apply(df)
a b
1 1 4.0
3 1 11.0
"""
_DEF_DROPNA_EXC_MSG = "DropNa stage failed."
_DEF_DROPNA_APP_MSG = "Dropping null values..."
def _is_fittable(self):
    """Return whether this stage defines its own fit logic.

    A stage is considered fittable iff its class overrides the base
    ``_fit_transform`` implementation.
    """
    return self.__class__._fit_transform != PdPipelineStage._fit_transform
def __init__(self, exraise=True, exmsg=None, appmsg=None, desc=None):
    """Initialize the base state of a pipeline stage.

    Parameters
    ----------
    exraise : bool, default True
        Whether to raise on a failed precondition.
    exmsg : str, optional
        Exception message; falls back to the class default when None.
    appmsg : str, optional
        Application message; falls back to the class default when None.
    desc : str, optional
        Stage description; falls back to the class default when None.
    """
    base = PdPipelineStage
    self._exraise = exraise
    self._exmsg = base._DEF_EXC_MSG if exmsg is None else exmsg
    self._appmsg = base._DEF_APPLY_MSG if appmsg is None else appmsg
    self._desc = base._DEF_DESCRIPTION if desc is None else desc
    # Stages start out unfitted; fitting flips this flag.
    self.is_fitted = False
def _prec(self, df):
return True
def _transform(self, df, verbose):
before_count = len(df)
ncols_before = len(df.columns)
inter_df = df.dropna(**self.dropna_kwargs)
if verbose:
print("{} rows, {} columns dropeed".format(
before_count - len(inter_df),
ncols_before - len(inter_df.columns),
))
return inter_df
class FreqDrop(PdPipelineStage):
"""A pipeline stage that drops rows by value frequency.
Parameters
----------
threshold : int
The minimum frequency required for a value to be kept.
column : str
The name of the column to check for the given value frequency.
Example
-------
>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[1,4],[4,5],[1,11]], [1,2,3], ['a','b'])
>>> pdp.FreqDrop(2, 'a').apply(df)
a b
1 1 4
if self._drop:
inter_df = inter_df.drop(colname, axis=1)
new_name = colname
loc -= 1
inter_df = out_of_place_col_insert(
df=inter_df,
series=source_col.apply(
self._get_col_binner(self._bin_map[colname])
),
loc=loc,
column_name=new_name,
)
return inter_df
class OneHotEncode(PdPipelineStage):
"""A pipeline stage that one-hot-encodes categorical columns.
By default only k-1 dummies are created for k categorical levels, so as
to avoid perfect multicollinearity between the dummy features (also
called the dummy variable trap). This is done since features are usually one-hot
encoded for use with linear models, which require this behaviour.
Parameters
----------
columns : single label or list-like, default None
Column labels in the DataFrame to be encoded. If columns is None then
all the columns with object or category dtype will be converted, except
those given in the exclude_columns parameter.
dummy_na : bool, default False
Add a column to indicate NaNs, if False NaNs are ignored.
exclude_columns : str or list-like, default None