How to use the pingouin._check_dataframe function in pingouin

To help you get started, we’ve selected a few pingouin examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github raphaelvallat / pingouin / pingouin / nonparametric.py View on Github external
NaN values are automatically removed.

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject')
                      Source  ddof1      Q     p-unc
    Friedman  Disgustingness      1  9.228  0.002384
    """
    # Check data
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Collapse to the mean
    data = data.groupby([subject, within]).mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Extract number of groups and total sample size
    grp = data.groupby(within)[dv]
    rm = list(data[within].unique())
    k = len(rm)
    X = np.array([grp.get_group(r).values for r in rm]).T
    n = X.shape[0]
github raphaelvallat / pingouin / pingouin / parametric.py View on Github external
def anova2(data=None, dv=None, between=None, ss_type=2, export_filename=None):
    """Two-way balanced ANOVA in pure Python + Pandas.

    This is an internal function. The main call to this function should be done
    by the :py:func:`pingouin.anova` function.
    """
    # Validate the dataframe
    _check_dataframe(dv=dv, between=between, data=data, effects='between')

    assert len(between) == 2, 'Must have exactly two between-factors variables'
    fac1, fac2 = between

    # Drop missing values
    data = data[[dv, fac1, fac2]].dropna()
    assert data.shape[0] >= 5, 'Data must have at least 5 non-missing values.'

    # Reset index (avoid duplicate axis error)
    data = data.reset_index(drop=True)
    grp_both = data.groupby(between)[dv]

    if grp_both.count().nunique() == 1:
        # BALANCED DESIGN
        aov_fac1 = anova(data=data, dv=dv, between=fac1, detailed=True)
        aov_fac2 = anova(data=data, dv=dv, between=fac2, detailed=True)
github raphaelvallat / pingouin / pingouin / parametric.py View on Github external
Examples
    --------
    Compute a two-way mixed model ANOVA.

    >>> from pingouin import mixed_anova, read_dataset
    >>> df = read_dataset('mixed_anova')
    >>> aov = mixed_anova(dv='Scores', between='Group',
    ...                   within='Time', subject='Subject', data=df)
    >>> aov
            Source     SS  DF1  DF2     MS      F     p-unc    np2    eps
    0        Group  5.460    1   58  5.460  5.052  0.028420  0.080      -
    1         Time  7.628    2  116  3.814  4.027  0.020373  0.065  0.999
    2  Interaction  5.168    2  116  2.584  2.728  0.069530  0.045      -
    """
    # Check data
    _check_dataframe(dv=dv, within=within, between=between, data=data,
                     subject=subject, effects='interaction')

    # Collapse to the mean
    data = data.groupby([subject, within, between]).mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, between, dv]])

    # SUMS OF SQUARES
    grandmean = data[dv].mean()
    # Extract main effects of time and between
    mtime = rm_anova(dv=dv, within=within, subject=subject, data=data,
                     correction=correction, detailed=True)
    mbetw = anova(dv=dv, between=between, data=data, detailed=True)
github raphaelvallat / pingouin / pingouin / nonparametric.py View on Github external
.. [1] Cochran, W.G., 1950. The comparison of percentages in matched
       samples. Biometrika 37, 256-266.
       https://doi.org/10.1093/biomet/37.3-4.256

    Examples
    --------
    Compute the Cochran Q test for repeated measurements.

    >>> from pingouin import cochran, read_dataset
    >>> df = read_dataset('cochran')
    >>> cochran(data=df, dv='Energetic', within='Time', subject='Subject')
            Source  dof      Q     p-unc
    cochran   Time    2  6.706  0.034981
    """
    # Check data
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Groupby and extract size
    grp = data.groupby(within)[dv]
    grp_s = data.groupby(subject)[dv]
    k = data[within].nunique()
    dof = k - 1
    # n = grp.count().unique()[0]

    # Q statistic and p-value
    q = (dof * (k * np.sum(grp.sum()**2) - grp.sum().sum()**2)) / \
github raphaelvallat / pingouin / pingouin / parametric.py View on Github external
an alternative approach." Biometrika 38.3/4 (1951): 330-336.

    Examples
    --------
    1. One-way Welch ANOVA on the pain threshold dataset.

    >>> from pingouin import welch_anova, read_dataset
    >>> df = read_dataset('anova')
    >>> aov = welch_anova(dv='Pain threshold', between='Hair color',
    ...                   data=df, export_filename='pain_anova.csv')
    >>> aov
           Source  ddof1  ddof2     F     p-unc
    0  Hair color      3   8.33  5.89  0.018813
    """
    # Check data
    _check_dataframe(dv=dv, between=between, data=data, effects='between')

    # Reset index (avoid duplicate axis error)
    data = data.reset_index(drop=True)

    # Number of groups
    r = data[between].nunique()
    ddof1 = r - 1

    # Compute weights and adjusted means
    grp = data.groupby(between)[dv]
    weights = grp.count() / grp.var()
    adj_grandmean = (weights * grp.mean()).sum() / weights.sum()

    # Treatment sum of squares
    ss_tr = np.sum(weights * np.square(grp.mean() - adj_grandmean))
    ms_tr = ss_tr / ddof1
github raphaelvallat / pingouin / pingouin / nonparametric.py View on Github external
sample must have at least 5 measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Kruskal-Wallis H-test for independent samples.

    >>> from pingouin import kruskal, read_dataset
    >>> df = read_dataset('anova')
    >>> kruskal(data=df, dv='Pain threshold', between='Hair color')
                 Source  ddof1       H     p-unc
    Kruskal  Hair color      3  10.589  0.014172
    """
    # Check data
    _check_dataframe(dv=dv, between=between, data=data,
                     effects='between')

    # Remove NaN values
    data = data[[dv, between]].dropna()

    # Reset index (avoid duplicate axis error)
    data = data.reset_index(drop=True)

    # Extract number of groups and total sample size
    groups = list(data[between].unique())
    n_groups = len(groups)
    n = data[dv].size

    # Rank data, dealing with ties appropriately
    data['rank'] = scipy.stats.rankdata(data[dv])
github raphaelvallat / pingouin / pingouin / parametric.py View on Github external
aov : DataFrame
        ANOVA summary ::

        'Source' : Name of the within-group factors
        'ddof1' : Degrees of freedom (numerator)
        'ddof2' : Degrees of freedom (denominator)
        'F' : F-value
        'p-unc' : Uncorrected p-value
        'np2' : Partial eta-square effect size
        'eps' : Greenhouse-Geisser epsilon factor (= index of sphericity)
        'p-GG-corr' : Greenhouse-Geisser corrected p-value
    """
    a, b = within

    # Validate the dataframe
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Remove NaN
    if data[[subject, a, b, dv]].isnull().any().any():
        data = remove_rm_na(dv=dv, subject=subject, within=[a, b],
                            data=data[[subject, a, b, dv]])

    # Collapse to the mean (note that this is also done in remove_rm_na)
    data = data.groupby([subject, a, b]).mean().reset_index()

    assert not data[a].isnull().any(), 'Cannot have NaN in %s' % a
    assert not data[b].isnull().any(), 'Cannot have NaN in %s' % b
    assert not data[subject].isnull().any(), 'Cannot have NaN in %s' % subject

    # Group sizes and grandmean
    n_a = data[a].nunique()
github raphaelvallat / pingouin / pingouin / parametric.py View on Github external
def anovan(data=None, dv=None, between=None, ss_type=2, export_filename=None):
    """N-way ANOVA using statsmodels.

    This is an internal function. The main call to this function should be done
    by the :py:func:`pingouin.anova` function.
    """
    # Check that statsmodels is installed
    from pingouin.utils import _is_statsmodels_installed
    _is_statsmodels_installed(raise_error=True)
    from statsmodels.api import stats
    from statsmodels.formula.api import ols

    # Validate the dataframe
    _check_dataframe(dv=dv, between=between, data=data, effects='between')
    all_cols = _flatten_list([dv, between])
    bad_chars = [',', '(', ')', ':']
    if not all([c not in v for c in bad_chars for v in all_cols]):
        err_msg = "comma, bracket, and colon are not allowed in column names."
        raise ValueError(err_msg)

    # Drop missing values
    data = data[all_cols].dropna()
    assert data.shape[0] >= 5, 'Data must have at least 5 non-missing values.'

    # Reset index (avoid duplicate axis error)
    data = data.reset_index(drop=True)

    # Create R-like formula
    formula = dv + ' ~ '
    for fac in between: