How to use the pdpipe.core.PdPipelineStage function in pdpipe

To help you get started, we've selected a few pdpipe examples, based on popular ways it is used in public projects.

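All of the examples below follow the same contract: a PdPipelineStage subclass implements _prec, which checks a precondition against the input DataFrame, and _transform, which returns the transformed DataFrame. A minimal sketch of a custom stage (the ColDropSketch name and its column parameter are illustrative, not part of pdpipe):

from pdpipe.core import PdPipelineStage


class ColDropSketch(PdPipelineStage):
    """Illustrative stage that drops a single column."""

    def __init__(self, column, **kwargs):
        self._column = column
        super().__init__(**kwargs)

    def _prec(self, df):
        # Precondition: the stage only applies if the column is present.
        return self._column in df.columns

    def _transform(self, df, verbose):
        return df.drop(self._column, axis=1)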

From pdpipe/basic_stages.py:
    def _prec(self, df):
        return self._column in df.columns

    def _transform(self, df, verbose):
        inter_df = df
        before_count = len(inter_df)
        valcount = df[self._column].value_counts()
        to_drop = valcount[valcount < self._threshold].index
        inter_df = inter_df[~inter_df[self._column].isin(to_drop)]
        if verbose:
            print("{} rows dropped.".format(before_count - len(inter_df)))
        return inter_df


class ColReorder(PdPipelineStage):
    """A pipeline stage that reorders columns.

    Parameters
    ----------
    positions : dict
        A mapping of column names to their desired positions after reordering.
        Columns not included in the mapping will maintain their relative
        positions over the non-mapped columns.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> df = pd.DataFrame([[8,4,3,7]], columns=['a', 'b', 'c', 'd'])
    >>> pdp.ColReorder({'b': 0, 'c': 3}).apply(df)
       b  a  d  c
    0  4  8  7  3
From pdpipe/col_generation.py:
                        loc=loc,
                        column_name=colname,
                    )
                    loc += 1
                return inter_df
            assign_map = {
                colname: new_cols[colname] for colname in new_cols.columns
            }
            return df.assign(**assign_map)
        raise TypeError(  # pragma: no cover
            "Unexpected type generated by applying a function to a DataFrame."
            " Only Series and DataFrame are allowed."
        )


class ApplyByCols(PdPipelineStage):
    """A pipeline stage applying an element-wise function to columns.

    Parameters
    ----------
    columns : str or list-like
        Names of columns on which to apply the given function.
    func : function
        The function to be applied to each element of the given columns.
    result_columns : str or list-like, default None
        The names of the new columns resulting from the mapping operation. Must
        be of the same length as columns. If None, behavior depends on the
        drop parameter: If drop is True, the name of the source column is used;
        otherwise, the name of the source column is used with the suffix
        '_app'.
    drop : bool, default True
        If set to True, source columns are dropped after being mapped.
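For example, applying math.ceil element-wise to a 'ph' column (a sketch in the spirit of the doctests above, based on the parameter list; the output shown is indicative):

>>> import pandas as pd; import pdpipe as pdp; import math;
>>> df = pd.DataFrame([[3.2, 'acd'], [7.2, 'alk']], [1, 2], ['ph', 'lbl'])
>>> pdp.ApplyByCols('ph', math.ceil).apply(df)
   ph  lbl
1   4  acd
2   8  alk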
From pdpipe/core.py:
            prec = _always_true
        self._adhoc_transform = transform
        self._adhoc_prec = prec
        super().__init__(**kwargs)

    def _prec(self, df):
        return self._adhoc_prec(df)

    def _transform(self, df, verbose):
        try:
            # Pass verbose through, for user functions that accept it.
            return self._adhoc_transform(df, verbose=verbose)
        except TypeError:
            # Fall back for functions that take only the DataFrame.
            return self._adhoc_transform(df)


class PdPipeline(PdPipelineStage, collections.abc.Sequence):
    """A pipeline for processing pandas DataFrame objects.

    transform_getter is useful for skipping pipeline stages that are meant to
    filter out items in a big dataset when creating a training set for a
    machine learning model, for example, but that should not be applied to
    future individual items transformed by the fitted pipeline.

    Parameters
    ----------
    stages : list
        A list of PdPipelineStage objects making up this pipeline.
    transform_getter : callable, optional
        A callable that can be applied to the fitted pipeline to produce a
        sub-pipeline of it which should be used to transform dataframes after
        the pipeline has been fitted. If not given, the fitted pipeline is used
        entirely.
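A short composition sketch (ColDrop and OneHotEncode are other pdpipe stages; the stage order and column names here are illustrative):

import pandas as pd
import pdpipe as pdp

df = pd.DataFrame([['Alice', 'F', 22], ['Bob', 'M', 31]],
                  columns=['name', 'gender', 'age'])
# Stages run in order; the pipeline is itself a PdPipelineStage.
pipeline = pdp.PdPipeline([pdp.ColDrop('name'), pdp.OneHotEncode('gender')])
result = pipeline.apply(df)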
From pdpipe/col_generation.py:
"""Basic pdpipe PdPipelineStages."""

import numpy as np
import pandas as pd
import sortedcontainers as sc
import tqdm

from pdpipe.core import PdPipelineStage
from pdpipe.util import out_of_place_col_insert, get_numeric_column_names

from pdpipe.shared import _interpret_columns_param, _list_str

from .exceptions import PipelineApplicationError


class Bin(PdPipelineStage):
    """A pipeline stage that adds a binned version of a column or columns.

    If drop is set to True, the new columns retain the names of the source
    columns; otherwise, the resulting columns gain the suffix '_bin'.

    Parameters
    ----------
    bin_map : dict
        Maps column labels to bin arrays. The bin array is interpreted as
        containing start points of consecutive bins, except for the final
        point, assumed to be the end point of the last bin. Additionally, a
        bin array implicitly projects a left-most bin containing all elements
        smaller than the left-most end point and a right-most bin containing
        all elements larger than the right-most end point. For example, the
        list [0, 5, 8] is interpreted as the bins (-∞, 0),
        [0-5), [5-8) and [8, ∞).
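A usage sketch of the bin_map semantics above (a single cut point yields two implicit bins; the 'speed' column name is illustrative, and the new column's name follows the '_bin' suffix rule since drop=False is passed):

import pandas as pd
import pdpipe as pdp

df = pd.DataFrame([[-3], [4], [5], [9]], columns=['speed'])
# With bin array [5], values fall into (-inf, 5) or [5, inf).
binned = pdp.Bin({'speed': [5]}, drop=False).apply(df)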
From pdpipe/col_generation.py:
                inter_df = inter_df.drop(colname, axis=1)
                loc -= 1
            # Insert the mapped series at the source column's original slot.
            inter_df = out_of_place_col_insert(
                df=inter_df,
                series=source_col.map(self._value_map),
                loc=loc,
                column_name=new_name,
            )
        return inter_df


def _always_true(x):
    return True


class ApplyToRows(PdPipelineStage):
    """A pipeline stage generating columns by applying a function to each row.

    Parameters
    ----------
    func : function
        The function to be applied to each row of the processed DataFrame.
    colname : single label, default None
        The label of the new column resulting from the function application. If
        None, 'new_col' is used. Ignored if a DataFrame is generated by the
        function (i.e. each row generates a Series rather than a value), in
        which case the label of each column in the resulting DataFrame is used.
    follow_column : str, default None
        Resulting columns will be inserted after this column. If None, new
        columns are inserted at the end of the processed DataFrame.
    func_desc : str, default None
        A function description of the given function; e.g. 'normalizing revenue
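A usage sketch based on the parameters above (the column names and the revenue function are illustrative):

import pandas as pd
import pdpipe as pdp

df = pd.DataFrame([[3, 2143], [10, 1321]], columns=['years', 'avg_revenue'])
# Each row maps to a single value, so one new column, 'total_revenue', is added.
stage = pdp.ApplyToRows(lambda row: row['years'] * row['avg_revenue'],
                        colname='total_revenue')
result = stage.apply(df)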
From pdpipe/basic_stages.py:
            'exmsg': ColRename._DEF_COLDRENAME_EXC_MSG.format(columns_str),
            'appmsg': ColRename._DEF_COLDRENAME_APP_MSG.format(
                suffix, columns_str),
            'desc': "Rename column{} with {}".format(suffix, self._rename_map)
        }
        super_kwargs.update(**kwargs)
        super().__init__(**super_kwargs)

    def _prec(self, df):
        return set(self._rename_map.keys()).issubset(df.columns)

    def _transform(self, df, verbose):
        return df.rename(columns=self._rename_map)


class DropNa(PdPipelineStage):
    """A pipeline stage that drops null values.

    Supports all parameters supported by the pandas.dropna function.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> df = pd.DataFrame([[1,4],[4,None],[1,11]], [1,2,3], ['a','b'])
    >>> pdp.DropNa().apply(df)
       a     b
    1  1   4.0
    3  1  11.0
    """

    _DEF_DROPNA_EXC_MSG = "DropNa stage failed."
    _DEF_DROPNA_APP_MSG = "Dropping null values..."
From pdpipe/core.py:
    def _is_fittable(self):
        # A stage is fittable only if it overrides the default _fit_transform.
        if self.__class__._fit_transform == PdPipelineStage._fit_transform:
            return False
        return True
From pdpipe/core.py:
    def __init__(self, exraise=True, exmsg=None, appmsg=None, desc=None):
        if exmsg is None:
            exmsg = PdPipelineStage._DEF_EXC_MSG
        if appmsg is None:
            appmsg = PdPipelineStage._DEF_APPLY_MSG
        if desc is None:
            desc = PdPipelineStage._DEF_DESCRIPTION
        self._exraise = exraise
        self._exmsg = exmsg
        self._appmsg = appmsg
        self._desc = desc
        self.is_fitted = False
From pdpipe/basic_stages.py:
    def _prec(self, df):
        return True

    def _transform(self, df, verbose):
        before_count = len(df)
        ncols_before = len(df.columns)
        inter_df = df.dropna(**self.dropna_kwargs)
        if verbose:
            print("{} rows, {} columns dropped".format(
                before_count - len(inter_df),
                ncols_before - len(inter_df.columns),
            ))
        return inter_df


class FreqDrop(PdPipelineStage):
    """A pipeline stage that drops rows by value frequency.

    Parameters
    ----------
    threshold : int
        The minimum frequency required for a value to be kept.
    column : str
        The name of the column to check for the given value frequency.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> df = pd.DataFrame([[1,4],[4,5],[1,11]], [1,2,3], ['a','b'])
    >>> pdp.FreqDrop(2, 'a').apply(df)
       a   b
1  1   4
3  1  11
From pdpipe/col_generation.py:
            if self._drop:
                inter_df = inter_df.drop(colname, axis=1)
                new_name = colname
                loc -= 1
            inter_df = out_of_place_col_insert(
                df=inter_df,
                series=source_col.apply(
                    self._get_col_binner(self._bin_map[colname])
                ),
                loc=loc,
                column_name=new_name,
            )
        return inter_df


class OneHotEncode(PdPipelineStage):
    """A pipeline stage that one-hot-encodes categorical columns.

    By default only k-1 dummies are created for k categorical levels, so as
    to avoid perfect multicollinearity between the dummy features (also
    called the dummy variable trap). This is done since features are usually
    one-hot encoded for use with linear models, which require this behaviour.

    Parameters
    ----------
    columns : single label or list-like, default None
        Column labels in the DataFrame to be encoded. If columns is None then
        all the columns with object or category dtype will be converted, except
        those given in the exclude_columns parameter.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False, NaNs are ignored.
    exclude_columns : str or list-like, default None
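A usage sketch of the k-1 encoding described above (the 'Born' column is illustrative; the dummy column names follow pandas.get_dummies conventions and are indicative):

import pandas as pd
import pdpipe as pdp

df = pd.DataFrame([['USA'], ['UK'], ['Greece']], columns=['Born'])
# Three levels yield k-1 = 2 dummy columns, e.g. Born_UK and Born_USA.
encoded = pdp.OneHotEncode('Born').apply(df)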