Expected output (Spearman partial correlation):
n r CI95% r2 adj_r2 p-val power
spearman 30 0.429 [0.08, 0.68] 0.184 0.123 0.018092 0.676
"""
# NOTE(review): this span is the interior of a function whose `def` line is
# outside this excerpt; `data`, `x`, `y`, `covar`, `x_covar`, `y_covar` and
# `pd` are assumed to come from the enclosing scope — confirm upstream.
from pingouin.utils import _flatten_list
# Check arguments
assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
# At least 3 rows are required for the downstream statistics.
assert data.shape[0] > 2, 'Data must have at least 3 samples.'
# x / y accept a single column label (str) or a tuple of labels.
assert isinstance(x, (str, tuple)), 'x must be a string.'
assert isinstance(y, (str, tuple)), 'y must be a string.'
# Covariates: single column name, list of names, or None.
assert isinstance(covar, (str, list, type(None)))
assert isinstance(x_covar, (str, list, type(None)))
assert isinstance(y_covar, (str, list, type(None)))
# `covar` is mutually exclusive with the one-sided x_covar / y_covar options.
if covar is not None and (x_covar is not None or y_covar is not None):
    raise ValueError('Cannot specify both covar and {x,y}_covar.')
# Check that columns exist
# Flatten the mix of str / tuple / list / None into one flat label list.
# Bare-string covariates are only list-wrapped *after* this call, so
# _flatten_list evidently accepts bare strings — TODO confirm.
col = _flatten_list([x, y, covar, x_covar, y_covar])
# Normalise single-string covariate arguments to one-element lists.
if isinstance(covar, str):
    covar = [covar]
if isinstance(x_covar, str):
    x_covar = [x_covar]
if isinstance(y_covar, str):
    y_covar = [y_covar]
assert all([c in data for c in col]), 'columns are not in dataframe.'
# Check that columns are numeric
# dtype.kind in 'bfi': boolean, float or signed-integer columns only.
assert all([data[c].dtype.kind in 'bfi' for c in col])
# Drop rows with NaN
data = data[col].dropna()
assert data.shape[0] > 2, 'Data must have at least 3 non-NAN samples.'
# Standardize (= no need for an intercept in least-square regression)
# Column-wise z-score; pandas .std() defaults to ddof=1 (sample std).
C = (data[col] - data[col].mean(axis=0)) / data[col].std(axis=0)
# NOTE(review): interior of a mediation-analysis routine; `y`, `m`, `x`,
# `covar`, `data`, `alpha` and the `_fl` helper (presumably a list
# flattener — verify against imports) are defined outside this excerpt.
assert isinstance(y, str), 'y must be a string.'
assert isinstance(m, (list, str)), 'Mediator(s) must be a list or string.'
assert isinstance(covar, (type(None), str, list))
# Normalise a single mediator name to a one-element list.
if isinstance(m, str):
    m = [m]
n_mediator = len(m)
assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame.'
# Check for duplicates
assert n_mediator == len(set(m)), 'Cannot have duplicates mediators.'
if isinstance(covar, str):
    covar = [covar]
if isinstance(covar, list):
    assert len(covar) == len(set(covar)), 'Cannot have duplicates covar.'
    # A column cannot serve as both mediator and covariate.
    assert set(m).isdisjoint(covar), 'Mediator cannot be in covar.'
# Check that columns are in dataframe
columns = _fl([x, m, y, covar])
keys = data.columns
assert all([c in keys for c in columns]), 'Column(s) are not in DataFrame.'
# Check that columns are numeric
err_msg = "Columns must be numeric or boolean."
assert all([data[c].dtype.kind in 'bfi' for c in columns]), err_msg
# Drop rows with NAN Values
data = data[columns].dropna()
n = data.shape[0]
assert n > 5, 'DataFrame must have at least 5 samples (rows).'
# Check if mediator is binary
# Logistic regression is used when *every* mediator column has exactly
# two unique values; linear regression otherwise.
mtype = 'logistic' if all(data[m].nunique() == 2) else 'linear'
# Name of CI
# Lower confidence-bound label, e.g. 'CI[2.5%]' for alpha=0.05.
ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
# NOTE(review): interior of a pairwise-comparison routine handling the
# interaction of two factors; `data`, `dv`, `factors`, `stats`, `nrows`,
# `paired` and `tail` come from the enclosing scope — confirm upstream.
grp_fac1 = data.groupby(factors[0], sort=False)[dv]
grp_fac2 = data.groupby(factors[1], sort=False)[dv]
grp_both = data.groupby(factors, sort=False)[dv]
labels_fac1 = grp_fac1.groups.keys()
labels_fac2 = grp_fac2.groups.keys()
# comb_fac1 = list(combinations(labels_fac1, 2))
# All unordered pairs of factor-2 levels.
comb_fac2 = list(combinations(labels_fac2, 2))
# Pairwise comparisons
# One contrast per (factor-1 level, factor-2 pair) combination.
combs_list = list(product(labels_fac1, comb_fac2))
ncombs = len(combs_list)
# np.array(combs_list) does not work because of tuples
# we therefore need to flatten the tupple
# Each row becomes [fac1_level, pair_member_a, pair_member_b]; dtype=object
# because levels may be non-numeric labels.
combs = np.zeros(shape=(ncombs, 3), dtype=object)
for i in range(ncombs):
    combs[i] = _flatten_list(combs_list[i], include_tuple=True)
# Append empty rows
idxiter = np.arange(nrows, nrows + ncombs)
# NOTE(review): DataFrame.append is deprecated since pandas 1.4 and removed
# in 2.0 — consider pd.concat when this code is next touched.
stats = stats.append(pd.DataFrame(columns=stats.columns,
                                  index=idxiter), ignore_index=True)
# Update other columns
stats.loc[idxiter, 'Contrast'] = factors[0] + ' * ' + factors[1]
stats.loc[idxiter, 'Time'] = combs[:, 0]
stats.loc[idxiter, 'Paired'] = paired
stats.loc[idxiter, 'Tail'] = tail
stats.loc[idxiter, 'A'] = combs[:, 1]
stats.loc[idxiter, 'B'] = combs[:, 2]
for i, comb in enumerate(combs):
    ic = nrows + i  # Take into account previous rows
    fac1, col1, col2 = comb
    # NOTE(review): loop body is truncated here in this excerpt.
"""
# NOTE(review): interior of a long-to-wide conversion helper (the `def`
# line and docstring open outside this excerpt); `data`, `dv`, `subject`,
# `within` and `_fl` come from the enclosing scope — confirm upstream.
# Check arguments
assert isinstance(dv, str), 'dv must be a string.'
assert isinstance(subject, str), 'subject must be a string.'
assert isinstance(within, (str, list)), 'within must be a string or list.'
# Check that all columns are present
assert dv in data.columns, '%s not in data' % dv
assert data[dv].dtype.kind in 'bfi', '%s must be numeric' % dv
assert subject in data.columns, '%s not in data' % subject
assert not data[subject].isnull().any(), 'Cannot have NaN in %s' % subject
if isinstance(within, str):
    within = [within]  # within = ['fac1'] or ['fac1', 'fac2']
for w in within:
    assert w in data.columns, '%s not in data' % w
# Keep all relevant columns and reset index
data = data[_fl([subject, within, dv])]
# Convert to wide-format + collapse to the mean
# One row per subject, one column per within-factor level (MultiIndex
# columns when several within factors); duplicates are averaged.
data = pd.pivot_table(data, index=subject, values=dv, columns=within,
                      aggfunc='mean', dropna=True)
return data
# NOTE(review): interior of a mediation-analysis routine; `data`, `x`, `y`,
# `m`, `covar`, `alpha`, `_fl`, `linear_regression` and
# `logistic_regression` are defined outside this excerpt — confirm upstream.
n = data.shape[0]
assert n > 5, 'DataFrame must have at least 5 samples (rows).'
# Check if mediator is binary
# Logistic path is taken only when *every* mediator has exactly 2 levels.
mtype = 'logistic' if all(data[m].nunique() == 2) else 'linear'
# Name of CI
# e.g. 'CI[2.5%]' / 'CI[97.5%]' for alpha=0.05.
ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))
# Compute regressions
cols = ['names', 'coef', 'se', 'pval', ll_name, ul_name]
# For speed, we pass np.array instead of pandas DataFrame
X_val = data[_fl([x, covar])].values  # X + covar as predictors
XM_val = data[_fl([x, m, covar])].values  # X + M + covar as predictors
M_val = data[m].values  # M as target (no covariates)
y_val = data[y].values  # y as target (no covariates)
# M(j) ~ X + covar
# Fit one regression per mediator; keep only row index 1 of the result
# (presumably the coefficient of X, the first non-intercept predictor —
# verify against the regression helpers' output layout).
sxm = {}
for idx, j in enumerate(m):
    if mtype == 'linear':
        sxm[j] = linear_regression(X_val, M_val[:, idx],
                                   alpha=alpha).loc[[1], cols]
    else:
        sxm[j] = logistic_regression(X_val, M_val[:, idx],
                                     alpha=alpha).loc[[1], cols]
    # Relabel the kept row so the summary table reads 'mediator ~ X'.
    sxm[j].at[1, 'names'] = '%s ~ X' % j
sxm = pd.concat(sxm, ignore_index=True)
# Y ~ M + covar