How to use statsmodels - 10 common examples

To help you get started, we’ve selected a few statsmodels examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github clicumu / doepipeline / doepipeline / model_utils.py View on Github external
def crossvalidate_formula(formula, data, response_column, k):
    PRESS = 0
    for i in range(k):
        start = i * (len(data) // k)
        end = (i + 1) * (len(data) // k) if i < k - 1 else len(data)

        to_drop = data.index[start: end]

        train = data.drop(to_drop)
        test = data.loc[to_drop]

        model = smf.ols(formula, train).fit()
        pred = model.predict(test)
        residuals = test[response_column] - pred
        PRESS += (residuals ** 2).sum()

    response = data[response_column]
    Q2 = 1 - PRESS / ((response - response.mean()) ** 2).sum()
    return Q2
github equinor / webviz-subsurface / webviz_subsurface / plugins / _test2_plug.py View on Github external
candidate_split = best_candidate.split(sep=":")
            if len(candidate_split) == 2:  
                if candidate_split[0] not in selected and candidate_split[0] in remaining: 
                    remaining.remove(candidate_split[0])
                    selected.append(candidate_split[0])
                    maxvars += 1
                if candidate_split[1] not in selected and candidate_split[1] in remaining:
                    remaining.remove(candidate_split[1])
                    selected.append(candidate_split[1])
                    maxvars += 1
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model
github sherpa-ai / sherpa / sherpa / algorithms / sequential_testing.py View on Github external
def _get_best_configs(self, parameters, results, configs, lower_is_better,
                          alpha=0.05):
        """
        Implements the testing procedure itself and returns the reduced set
        of parameter configurations.
        """
        df = self._prep_df_for_linreg(parameters, results,
                                      configs, lower_is_better)
        l = 1
        h = df.Rank.max()
        p = h
        while l != h:
            lm = ols('Objective ~ C(Rank)', data=df.loc[df.Rank <= p, :]).fit()

            p_value = sm.stats.anova_lm(lm, typ=2).loc[:, "PR(>F)"].ix["C(Rank)"]
            reject = p_value < alpha
            if reject:
                h = p - 1
            else:
                l = p
            p = math.ceil((l + h) / 2)

        return df.loc[df.Rank <= p, :].loc[:,
               [p.name for p in parameters]].drop_duplicates().to_dict(
            'records')
github dgwozdz / HN_SO_analysis / codes / grangercausalitytests_mod.py View on Github external
# create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both')
        dta = np.delete(dta, -1, axis = 1) # removal of the not lagged xs

        #add constant
        if addconst:
            dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False)
            dtajoint = add_constant(dta[:, 1:], prepend=False)
        else:
            raise NotImplementedError('Not Implemented')
            #dtaown = dta[:, 1:mxlg]
            #dtajoint = dta[:, 1:]

        # Run ols on both models without and with lags of second variable
        res2down = OLS(dta[:, 0], dtaown).fit()
        res2djoint = OLS(dta[:, 0], dtajoint).fit()

        #print results
        #for ssr based tests see:
        #http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        #the other tests are made-up

        # Granger Causality test using ssr (F statistic)
        fgc1 = ((res2down.ssr - res2djoint.ssr) /
                res2djoint.ssr / mxlg * res2djoint.df_resid)
        if verbose:
            print('ssr based F test:         F=%-8.4f, p=%-8.4f, df_denom=%d,'
                   ' df_num=%d' % (fgc1,
                                    stats.f.sf(fgc1, mxlg,
                                               res2djoint.df_resid),
                                    res2djoint.df_resid, mxlg))
github ratschlab / RiboDiff / src / ribodiff / testcnt.py View on Github external
method = 'bonferroni'
    elif opts.multiTest == 'Holm':
        method = 'holm'
    elif opts.multiTest == 'Hochberg':
        method = 'simes-hochberg'
    elif opts.multiTest == 'Hommel':
        method = 'hommel'
    elif opts.multiTest == 'BY':
        method = 'fdr_by'
    elif opts.multiTest == 'TSBH':
        method = 'tsbh'
    else:
        sys.stderr.write('ERROR: The methods for multiple test correction can only accept \'Bonferroni\', \'Holm\', \'Hochberg\', \'Hommel\', \'BH\', \'BY\' or \'TSBH\' as its input.\n')
        sys.exit()

    mtc = sms.stats.multicomp.multipletests(pval[idx], alpha=0.1, method=method, returnsorted=False)

    padj = pval.copy()
    padj[idx] = mtc[1]
    data.padj = padj

    return data
github ratschlab / RiboDiff / src / ribodiff / testcnt.py View on Github external
print '\r%i genes finished.' % num

        if opts.dispDiff and np.isnan(data.dispAdjRibo[i]):
            continue
        if not opts.dispDiff and np.isnan(data.dispAdj[i]):
            continue

        response = np.hstack([data.countRibo[i, :], data.countRna[i, :]])

        if opts.dispDiff:
            disp = np.hstack([np.repeat(data.dispAdjRibo[i], lenSampleRibo), np.repeat(data.dispAdjRna[i], lenSampleRna)])
        else:
            disp = data.dispAdj[i]

        try:
            modNB0 = sm.GLM(response, explanatory0, family=sm.families.NegativeBinomial(alpha=disp), offset=np.log(librarySizes))
            modNB1 = sm.GLM(response, explanatory1, family=sm.families.NegativeBinomial(alpha=disp), offset=np.log(librarySizes))
            result0 = modNB0.fit()
            result1 = modNB1.fit()
        except sm.tools.sm_exceptions.PerfectSeparationError:
            errorCnt += 1
        else:
            if not opts.dispDiff:
                pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, explanatory1.shape[1] - explanatory0.shape[1])
            elif opts.dispDiff:
                pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, (explanatory1.shape[1] - explanatory0.shape[1]) / 2.5)
            else:
                pass

    data.pval = pval

    sys.stdout.write('Warning: Failed to do test: %i genes. P value set to \'nan\'.\n' % errorCnt)
github h2oai / h2o-2 / py / testdir_single_jvm / test_GLM2_basic_cmp2.py View on Github external
n_samples, n_features = train.shape
    print "n_samples:", n_samples, "n_features:",  n_features

    print "histogram of target"
    print sp.histogram(target,3)

    print "len(train):",  len(train)
    print "len(target):", len(target)
    print "dataset shape:", dataset.shape

    if family!='gaussian':
        raise Exception("Only have gaussian logistic for scipy")

    # train the classifier
    gauss_log = sm_api.GLM(target, train, family=sm_api.families.Gaussian(sm_api.families.links.log))
    start = time.time()
    gauss_log_results = gauss_log.fit()
    print "sm_api.GLM took", time.time() - start, "seconds"
    print gauss_log_results.summary()
github akelleh / causality / causality / inference / independence_tests / __init__.py View on Github external
def __init__(self, y, x, z, data, alpha):
        self.regression = sm.RLM(data[y], data[x+z])
        self.result = self.regression.fit()
        self.coefficient = self.result.params[x][0]
        confidence_interval = self.result.conf_int(alpha=alpha/2.)
        self.upper = confidence_interval[1][x][0]
        self.lower = confidence_interval[0][x][0]
github selective-inference / Python-software / examples / bootstrap / test_boot.py View on Github external
try:
                df = pd.concat([df, pd.read_csv(csvfile)])
            except FileNotFoundError:
                pass

            if len(df['pivot']) > 0:

                print("selective:", np.mean(df['pivot']), np.std(df['pivot']), np.mean(df['length']), np.std(df['length']), np.mean(df['coverage']))
                print("naive:", np.mean(df['naive_pivot']), np.std(df['naive_pivot']), np.mean(df['naive_length']), np.std(df['naive_length']), np.mean(df['naive_coverage']))

                print("len ratio selective divided by naive:", np.mean(np.array(df['length']) / np.array(df['naive_length'])))

                plt.clf()
                U = np.linspace(0, 1, 101)
                plt.plot(U, sm.distributions.ECDF(df['pivot'])(U), 'r', label='Selective', linewidth=3)
                plt.plot(U, sm.distributions.ECDF(df['naive_pivot'])(U), 'b', label='Naive', linewidth=3)
                plt.legend()
                plt.plot([0,1], [0,1], 'k--', linewidth=2)
                plt.savefig(csvfile[:-4] + '.pdf')

                plt.clf()
                plt.scatter(df['naive_length'], df['length'])
                plt.savefig(csvfile[:-4] + '_lengths.pdf')

            df.to_csv(csvfile, index=False)