How to use the pdpipe.col_generation.MapColVals function in pdpipe

To help you get started, we've selected a few pdpipe examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pdpipe / pdpipe / tests / col_generation / test_mapcolvals.py View on Github external
def test_mapcolvals():
    """MapColVals replaces medal codes with medal names in place."""
    df = _test_df()
    medal_names = {1: 'Gold', 2: 'Silver', 3: 'Bronze'}
    stage = MapColVals('Medal', medal_names)
    res_df = stage.apply(df)
    # Each country's numeric code should now read as the medal name.
    expected = {'UK': 'Gold', 'USSR': 'Bronze', 'US': 'Silver'}
    for country, medal in expected.items():
        assert res_df['Medal'][country] == medal
github pdpipe / pdpipe / tests / col_generation / test_mapcolvals.py View on Github external
def test_mapcolvals_with_res_name_no_drop():
    """With drop=False and result_columns, the source column is kept."""
    df = _test_df()
    medal_names = {1: 'Gold', 2: 'Silver', 3: 'Bronze'}
    stage = MapColVals(
        'Medal', medal_names, result_columns='Metal', drop=False)
    res_df = stage(df)
    # The original numeric codes survive untouched in 'Medal'...
    for country, code in (('UK', 1), ('USSR', 3), ('US', 2)):
        assert res_df['Medal'][country] == code
    # ...while the mapped names land in the new 'Metal' column.
    for country, name in (('UK', 'Gold'), ('USSR', 'Bronze'), ('US', 'Silver')):
        assert res_df['Metal'][country] == name
github pdpipe / pdpipe / tests / col_generation / test_mapcolvals.py View on Github external
def test_mapcolvals_bad_res_name_len():
    """MapColVals rejects result_columns whose length mismatches columns.

    Note: the previous version placed an ``assert isinstance(...)``
    inside the ``pytest.raises`` block *after* the raising constructor
    call, so that assertion could never execute (the raise aborts the
    block first). Only the statement expected to raise belongs inside
    the context manager.
    """
    value_map = {1: 'Gold', 2: 'Silver', 3: 'Bronze'}
    with pytest.raises(ValueError):
        # One source column ('Medal') but two result columns -> mismatch.
        MapColVals('Medal', value_map, result_columns=['A', 'B'])
github pdpipe / pdpipe / pdpipe / nltk_stages.py View on Github external
'value_map': nltk.word_tokenize,
            'drop': drop,
            'suffix': '_tok',
            'exmsg': TokenizeWords._DEF_TOKENIZE_EXC_MSG.format(col_str),
            'appmsg': TokenizeWords._DEF_TOKENIZE_APP_MSG.format(col_str),
            'desc': "Tokenize {}".format(col_str),
        }
        super_kwargs.update(**kwargs)
        super().__init__(**super_kwargs)

    def _prec(self, df):
        """Check the base precondition and that all target columns are object-typed.

        Columns must hold Python objects (strings, per the enclosing
        stage's purpose) rather than numeric dtypes.
        """
        if not super()._prec(df):
            return False
        return all(dtype == object for dtype in df.dtypes[self._columns])


class UntokenizeWords(MapColVals):
    """A pipeline stage that joins token lists to whitespace-separated strings.

    Parameters
    ----------
    columns : str or list-like
        Column names in the DataFrame to be untokenized.
    drop : bool, default True
        If set to True, the source columns are dropped after being untokenized,
        and the resulting columns retain the names of the source columns.
        Otherwise, untokenized columns gain the suffix '_untok'.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> data = [[3.2, ['Shake', 'and', 'bake!']]]
    >>> df = pd.DataFrame(data, [1], ['freq', 'content'])
github pdpipe / pdpipe / pdpipe / col_generation.py View on Github external
self._result_columns = [
                    col + self.suffix for col in self._columns
                ]
        else:
            self._result_columns = _interpret_columns_param(result_columns)
            if len(self._result_columns) != len(self._columns):
                raise ValueError(
                    "columns and result_columns parameters must"
                    " be string lists of the same length!"
                )
        col_str = _list_str(self._columns)
        sfx = "s" if len(self._columns) > 1 else ""
        self._drop = drop
        super_kwargs = {
            "exmsg": MapColVals._DEF_MAP_COLVAL_EXC_MSG.format(sfx, col_str),
            "appmsg": MapColVals._DEF_MAP_COLVAL_APP_MSG.format(
                sfx, col_str, self._value_map
            ),
            "desc": "Map values of column{} {} with {}.".format(
                sfx, col_str, self._value_map
            ),
        }
        super_kwargs.update(**kwargs)
        super().__init__(**super_kwargs)
github pdpipe / pdpipe / pdpipe / nltk_stages.py View on Github external
'value_map': self._stopwords_remover,
            'drop': drop,
            'suffix': '_nostop',
            'exmsg': RemoveStopwords._DEF_STOPWORDS_EXC_MSG.format(col_str),
            'appmsg': RemoveStopwords._DEF_STOPWORDS_APP_MSG.format(col_str),
            'desc': "Removing stopwords from {}".format(col_str),
        }
        super_kwargs.update(**kwargs)
        super().__init__(**super_kwargs)

    def _prec(self, df):
        """Check the base precondition and that all target columns are object-typed.

        Short-circuits on the base check so the column lookup only
        happens when the columns are known to be present.
        """
        if not super()._prec(df):
            return False
        return all(dtype == object for dtype in df.dtypes[self._columns])


class SnowballStem(MapColVals):
    """A pipeline stage that stems words in a list using the Snowball stemmer.

    Parameters
    ----------
    stemmer_name : str
        The name of the Snowball stemmer to use. Should be one of the Snowball
        stemmers implemented by nltk. E.g. 'EnglishStemmer'.
    columns : str or list-like
        Column names in the DataFrame to stem tokens in.
    drop : bool, default True
        If set to True, the source columns are dropped after stemming, and the
        resulting columns retain the names of the source columns. Otherwise,
        resulting columns gain the suffix '_stem'.

    Example
    -------
github pdpipe / pdpipe / pdpipe / col_generation.py View on Github external
else:
                self._result_columns = [
                    col + self.suffix for col in self._columns
                ]
        else:
            self._result_columns = _interpret_columns_param(result_columns)
            if len(self._result_columns) != len(self._columns):
                raise ValueError(
                    "columns and result_columns parameters must"
                    " be string lists of the same length!"
                )
        col_str = _list_str(self._columns)
        sfx = "s" if len(self._columns) > 1 else ""
        self._drop = drop
        super_kwargs = {
            "exmsg": MapColVals._DEF_MAP_COLVAL_EXC_MSG.format(sfx, col_str),
            "appmsg": MapColVals._DEF_MAP_COLVAL_APP_MSG.format(
                sfx, col_str, self._value_map
            ),
            "desc": "Map values of column{} {} with {}.".format(
                sfx, col_str, self._value_map
            ),
        }
        super_kwargs.update(**kwargs)
        super().__init__(**super_kwargs)
github pdpipe / pdpipe / pdpipe / nltk_stages.py View on Github external
'value_map': UntokenizeWords._untokenize_list,
            'drop': drop,
            'suffix': '_untok',
            'exmsg': UntokenizeWords._DEF_UNTOKENIZE_EXC_MSG.format(col_str),
            'appmsg': "Untokenizing {}".format(col_str),
            'desc': "Untokenize {}".format(col_str),
        }
        super_kwargs.update(**kwargs)
        super().__init__(**super_kwargs)

    def _prec(self, df):
        """Check the base precondition and that all target columns are object-typed.

        Token lists are stored as Python objects, so an object dtype is
        required for every column this stage operates on.
        """
        if not super()._prec(df):
            return False
        return all(dtype == object for dtype in df.dtypes[self._columns])


class RemoveStopwords(MapColVals):
    """A pipeline stage that removes stopwords from a tokenized list.

    Parameters
    ----------
    language : str or array-like
        If a string is given, interpreted as the language of the stopwords, and
        should then be one of the languages supported by the NLTK Stopwords
        Corpus. If a list is given, it is assumed to be the list of stopwords
        to remove.
    columns : str or list-like
        Column names in the DataFrame from which to remove stopwords.
    drop : bool, default True
        If set to True, the source columns are dropped after stopword removal,
        and the resulting columns retain the names of the source columns.
        Otherwise, resulting columns gain the suffix '_nostop'.
github pdpipe / pdpipe / pdpipe / nltk_stages.py View on Github external
import importlib
import collections

import nltk
import pandas as pd

from pdpipe.core import PdPipelineStage
from pdpipe.util import out_of_place_col_insert
from pdpipe.col_generation import MapColVals
from pdpipe.shared import (
    _interpret_columns_param,
    _list_str
)


class TokenizeWords(MapColVals):
    """A pipeline stage that tokenizes a sentence into words by whitespaces.

    Parameters
    ----------
    columns : str or list-like
        Column names in the DataFrame to be tokenized.
    drop : bool, default True
        If set to True, the source columns are dropped after being tokenized,
        and the resulting tokenized columns retain the names of the source
        columns. Otherwise, tokenized columns gain the suffix '_tok'.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> df = pd.DataFrame([[3.2, "Kick the baby!"]], [1], ['freq', 'content'])
    >>> tokenize_stage = pdp.TokenizeWords('content')