Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
NaN values are automatically removed.
Examples
--------
Compute the Friedman test for repeated measurements.
>>> from pingouin import friedman, read_dataset
>>> df = read_dataset('rm_anova')
>>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
... subject='Subject')
Source ddof1 Q p-unc
Friedman Disgustingness 1 9.228 0.002384
"""
# Check data
_check_dataframe(dv=dv, within=within, data=data, subject=subject,
effects='within')
# Collapse to the mean
data = data.groupby([subject, within]).mean().reset_index()
# Remove NaN
if data[dv].isnull().any():
data = remove_rm_na(dv=dv, within=within, subject=subject,
data=data[[subject, within, dv]])
# Extract number of groups and total sample size
grp = data.groupby(within)[dv]
rm = list(data[within].unique())
k = len(rm)
X = np.array([grp.get_group(r).values for r in rm]).T
n = X.shape[0]
def anova2(data=None, dv=None, between=None, ss_type=2, export_filename=None):
"""Two-way balanced ANOVA in pure Python + Pandas.
This is an internal function. The main call to this function should be done
by the :py:func:`pingouin.anova` function.
"""
# Validate the dataframe
_check_dataframe(dv=dv, between=between, data=data, effects='between')
assert len(between) == 2, 'Must have exactly two between-factors variables'
fac1, fac2 = between
# Drop missing values
data = data[[dv, fac1, fac2]].dropna()
assert data.shape[0] >= 5, 'Data must have at least 5 non-missing values.'
# Reset index (avoid duplicate axis error)
data = data.reset_index(drop=True)
grp_both = data.groupby(between)[dv]
if grp_both.count().nunique() == 1:
# BALANCED DESIGN
aov_fac1 = anova(data=data, dv=dv, between=fac1, detailed=True)
aov_fac2 = anova(data=data, dv=dv, between=fac2, detailed=True)
Examples
--------
Compute a two-way mixed model ANOVA.
>>> from pingouin import mixed_anova, read_dataset
>>> df = read_dataset('mixed_anova')
>>> aov = mixed_anova(dv='Scores', between='Group',
... within='Time', subject='Subject', data=df)
>>> aov
Source SS DF1 DF2 MS F p-unc np2 eps
0 Group 5.460 1 58 5.460 5.052 0.028420 0.080 -
1 Time 7.628 2 116 3.814 4.027 0.020373 0.065 0.999
2 Interaction 5.168 2 116 2.584 2.728 0.069530 0.045 -
"""
# Check data
_check_dataframe(dv=dv, within=within, between=between, data=data,
subject=subject, effects='interaction')
# Collapse to the mean
data = data.groupby([subject, within, between]).mean().reset_index()
# Remove NaN
if data[dv].isnull().any():
data = remove_rm_na(dv=dv, within=within, subject=subject,
data=data[[subject, within, between, dv]])
# SUMS OF SQUARES
grandmean = data[dv].mean()
# Extract main effects of time and between
mtime = rm_anova(dv=dv, within=within, subject=subject, data=data,
correction=correction, detailed=True)
mbetw = anova(dv=dv, between=between, data=data, detailed=True)
.. [1] Cochran, W.G., 1950. The comparison of percentages in matched
samples. Biometrika 37, 256–266.
https://doi.org/10.1093/biomet/37.3-4.256
Examples
--------
Compute the Cochran Q test for repeated measurements.
>>> from pingouin import cochran, read_dataset
>>> df = read_dataset('cochran')
>>> cochran(data=df, dv='Energetic', within='Time', subject='Subject')
Source dof Q p-unc
cochran Time 2 6.706 0.034981
"""
# Check data
_check_dataframe(dv=dv, within=within, data=data, subject=subject,
effects='within')
# Remove NaN
if data[dv].isnull().any():
data = remove_rm_na(dv=dv, within=within, subject=subject,
data=data[[subject, within, dv]])
# Groupby and extract size
grp = data.groupby(within)[dv]
grp_s = data.groupby(subject)[dv]
k = data[within].nunique()
dof = k - 1
# n = grp.count().unique()[0]
# Q statistic and p-value
q = (dof * (k * np.sum(grp.sum()**2) - grp.sum().sum()**2)) / \
an alternative approach." Biometrika 38.3/4 (1951): 330-336.
Examples
--------
1. One-way Welch ANOVA on the pain threshold dataset.
>>> from pingouin import welch_anova, read_dataset
>>> df = read_dataset('anova')
>>> aov = welch_anova(dv='Pain threshold', between='Hair color',
... data=df, export_filename='pain_anova.csv')
>>> aov
Source ddof1 ddof2 F p-unc
0 Hair color 3 8.33 5.89 0.018813
"""
# Check data
_check_dataframe(dv=dv, between=between, data=data, effects='between')
# Reset index (avoid duplicate axis error)
data = data.reset_index(drop=True)
# Number of groups
r = data[between].nunique()
ddof1 = r - 1
# Compute weights and adjusted means
grp = data.groupby(between)[dv]
weights = grp.count() / grp.var()
adj_grandmean = (weights * grp.mean()).sum() / weights.sum()
# Treatment sum of squares
ss_tr = np.sum(weights * np.square(grp.mean() - adj_grandmean))
ms_tr = ss_tr / ddof1
sample must have at least 5 measurements.
NaN values are automatically removed.
Examples
--------
Compute the Kruskal-Wallis H-test for independent samples.
>>> from pingouin import kruskal, read_dataset
>>> df = read_dataset('anova')
>>> kruskal(data=df, dv='Pain threshold', between='Hair color')
Source ddof1 H p-unc
Kruskal Hair color 3 10.589 0.014172
"""
# Check data
_check_dataframe(dv=dv, between=between, data=data,
effects='between')
# Remove NaN values
data = data[[dv, between]].dropna()
# Reset index (avoid duplicate axis error)
data = data.reset_index(drop=True)
# Extract number of groups and total sample size
groups = list(data[between].unique())
n_groups = len(groups)
n = data[dv].size
# Rank data, dealing with ties appropriately
data['rank'] = scipy.stats.rankdata(data[dv])
aov : DataFrame
ANOVA summary ::
'Source' : Name of the within-group factors
'ddof1' : Degrees of freedom (numerator)
'ddof2' : Degrees of freedom (denominator)
'F' : F-value
'p-unc' : Uncorrected p-value
'np2' : Partial eta-square effect size
'eps' : Greenhouse-Geisser epsilon factor (= index of sphericity)
'p-GG-corr' : Greenhouse-Geisser corrected p-value
"""
a, b = within
# Validate the dataframe
_check_dataframe(dv=dv, within=within, data=data, subject=subject,
effects='within')
# Remove NaN
if data[[subject, a, b, dv]].isnull().any().any():
data = remove_rm_na(dv=dv, subject=subject, within=[a, b],
data=data[[subject, a, b, dv]])
# Collapse to the mean (note that this is also done in remove_rm_na)
data = data.groupby([subject, a, b]).mean().reset_index()
assert not data[a].isnull().any(), 'Cannot have NaN in %s' % a
assert not data[b].isnull().any(), 'Cannot have NaN in %s' % b
assert not data[subject].isnull().any(), 'Cannot have NaN in %s' % subject
# Group sizes and grandmean
n_a = data[a].nunique()
def anovan(data=None, dv=None, between=None, ss_type=2, export_filename=None):
"""N-way ANOVA using statsmodels.
This is an internal function. The main call to this function should be done
by the :py:func:`pingouin.anova` function.
"""
# Check that statsmodels is installed
from pingouin.utils import _is_statsmodels_installed
_is_statsmodels_installed(raise_error=True)
from statsmodels.api import stats
from statsmodels.formula.api import ols
# Validate the dataframe
_check_dataframe(dv=dv, between=between, data=data, effects='between')
all_cols = _flatten_list([dv, between])
bad_chars = [',', '(', ')', ':']
if not all([c not in v for c in bad_chars for v in all_cols]):
err_msg = "comma, bracket, and colon are not allowed in column names."
raise ValueError(err_msg)
# Drop missing values
data = data[all_cols].dropna()
assert data.shape[0] >= 5, 'Data must have at least 5 non-missing values.'
# Reset index (avoid duplicate axis error)
data = data.reset_index(drop=True)
# Create R-like formula
formula = dv + ' ~ '
for fac in between: