How to use the statsmodels.formula.api.ols function in statsmodels

To help you get started, we’ve selected a few statsmodels examples, based on popular ways it is used in public projects.

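All of the snippets below follow the same basic pattern: describe the model with an R-style formula, fit it, and inspect the results. Here is a minimal, self-contained sketch of that pattern (the data and column names are made up for illustration):

import pandas as pd
import statsmodels.formula.api as smf

# toy data; "y" and "x" are illustrative column names
df = pd.DataFrame({"y": [1.0, 2.1, 2.9, 4.2, 5.1],
                   "x": [1, 2, 3, 4, 5]})

# fit y as a linear function of x from the formula "y ~ x"
model = smf.ols("y ~ x", data=df).fit()
print(model.params)     # Intercept and x coefficients
print(model.summary())  # full regression table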

github clicumu / doepipeline / doepipeline / model_utils.py
import statsmodels.formula.api as smf

def crossvalidate_formula(formula, data, response_column, k):
    """k-fold cross-validation of an OLS formula, returning the Q2 statistic."""
    PRESS = 0  # predicted residual sum of squares
    for i in range(k):
        start = i * (len(data) // k)
        end = (i + 1) * (len(data) // k) if i < k - 1 else len(data)

        to_drop = data.index[start: end]

        train = data.drop(to_drop)
        test = data.loc[to_drop]

        model = smf.ols(formula, train).fit()
        pred = model.predict(test)
        residuals = test[response_column] - pred
        PRESS += (residuals ** 2).sum()

    response = data[response_column]
    Q2 = 1 - PRESS / ((response - response.mean()) ** 2).sum()
    return Q2
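
A minimal sketch of how crossvalidate_formula might be called, on synthetic data (the data here are made up; only the function above comes from the project):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame({"x": rng.normal(size=50)})
data["y"] = 2.0 * data["x"] + rng.normal(scale=0.1, size=50)

q2 = crossvalidate_formula("y ~ x", data, response_column="y", k=5)
print(q2)  # close to 1 for a model that predicts held-out folds well
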
github equinor / webviz-subsurface / webviz_subsurface / plugins / _test2_plug.py
            # tail of a forward-selection loop: best_candidate may be an
            # interaction term like "A:B"; add its components before the term itself
            candidate_split = best_candidate.split(sep=":")
            if len(candidate_split) == 2:  
                if candidate_split[0] not in selected and candidate_split[0] in remaining: 
                    remaining.remove(candidate_split[0])
                    selected.append(candidate_split[0])
                    maxvars += 1
                if candidate_split[1] not in selected and candidate_split[1] in remaining:
                    remaining.remove(candidate_split[1])
                    selected.append(candidate_split[1])
                    maxvars += 1
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model
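
The fragment above is the tail of a forward-selection loop: after the best-scoring candidate term is chosen, an interaction term "A:B" also pulls its component main effects into the model. For context, here is a minimal forward-selection sketch over main effects only (function and variable names are illustrative, not the plugin's actual API):

import statsmodels.formula.api as smf

def forward_select(data, response, candidates):
    """Greedy forward selection by adjusted R-squared (illustrative sketch)."""
    selected, remaining = [], list(candidates)
    current_score = -float("inf")
    while remaining:
        scores = []
        for cand in remaining:
            formula = "{} ~ {} + 1".format(response, " + ".join(selected + [cand]))
            scores.append((smf.ols(formula, data).fit().rsquared_adj, cand))
        best_new_score, best_candidate = max(scores)
        if best_new_score <= current_score:
            break  # no remaining candidate improves the fit
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    formula = "{} ~ {} + 1".format(response, " + ".join(selected))
    return smf.ols(formula, data).fit()
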
github sherpa-ai / sherpa / sherpa / algorithms / sequential_testing.py
def _get_best_configs(self, parameters, results, configs, lower_is_better,
                          alpha=0.05):
        """
        Implements the testing procedure itself and returns the reduced set
        of parameter configurations.
        """
        df = self._prep_df_for_linreg(parameters, results,
                                      configs, lower_is_better)
        low = 1
        high = df.Rank.max()
        p = high
        while low != high:
            # one-way ANOVA of the objective across the top-p ranks
            lm = ols('Objective ~ C(Rank)', data=df.loc[df.Rank <= p, :]).fit()
            p_value = sm.stats.anova_lm(lm, typ=2).loc["C(Rank)", "PR(>F)"]
            if p_value < alpha:
                high = p - 1  # ranks still differ significantly: shrink the set
            else:
                low = p
            p = math.ceil((low + high) / 2)

        keep_cols = [param.name for param in parameters]
        return df.loc[df.Rank <= p, keep_cols].drop_duplicates().to_dict('records')
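
The binary search above repeatedly fits a one-way ANOVA and halves the set of ranks until the top group is statistically homogeneous. The core ols + anova_lm step in isolation, on synthetic data (column names mirror the snippet; everything else is made up):

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

rng = np.random.default_rng(1)
df = pd.DataFrame({
    "Rank": np.repeat([1, 2, 3], 20),
    "Objective": np.concatenate([rng.normal(loc, 1.0, 20) for loc in (0.0, 0.1, 2.0)]),
})

lm = ols("Objective ~ C(Rank)", data=df).fit()
p_value = sm.stats.anova_lm(lm, typ=2).loc["C(Rank)", "PR(>F)"]
print(p_value)  # small: objective values differ across ranks
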
github microsoft / LongitudinalDifferenceInDifferencesPy / LongitudinalDifferenceInDifferencesPy / didDesigner.py
def calculateDifferenceInDifferences(self, df):
        # dropna removes rows that are neither pre- nor post-period observations
        reg = smf.ols('kpi ~ post + treat + treat*post', data=df.dropna()).fit()
        # params entry 3 is the treat:post interaction term, i.e. the DiD estimate
        differenceInDifferencesEstimateIndex = 3
        return {
            "estimate": reg.params.iloc[differenceInDifferencesEstimateIndex],
            "confidence interval": list(
                reg.conf_int(alpha=0.05).iloc[differenceInDifferencesEstimateIndex]
            ),
        }
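
A quick way to sanity-check the estimator is to simulate data with a known treatment effect; the interaction coefficient should recover it (synthetic data, mirroring the kpi/post/treat names above):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(2)
n = 200
df = pd.DataFrame({"post": rng.integers(0, 2, n),
                   "treat": rng.integers(0, 2, n)})
# build a kpi with a true difference-in-differences effect of 3.0
df["kpi"] = (1.0 + 0.5 * df["post"] + 0.2 * df["treat"]
             + 3.0 * df["post"] * df["treat"] + rng.normal(0.0, 0.5, n))

reg = smf.ols("kpi ~ post + treat + treat*post", data=df).fit()
print(reg.params["treat:post"])  # close to 3.0
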
github openeemeter / eemeter / eemeter / modeling / models / hourly_model.py
        # weekday filter reconstructed to mirror the weekend block below;
        # "self.weekdays" is assumed to exist alongside "self.weekends"
        weekday_df = train_df.loc[train_df['day_of_week'].isin(self.weekdays)]
        weekday_df = self.add_cdd(weekday_df)
        weekday_df = self.add_hdd(weekday_df)

        weekend_df = train_df.loc[train_df['day_of_week'].isin(self.weekends)]
        weekend_df = self.add_hdd(weekend_df)
        weekend_df = self.add_cdd(weekend_df)

        try:
            self.model_weekday = smf.ols(formula=self.formula, data=weekday_df)
            self.model_res_weekday = self.model_weekday.fit()
        except ValueError:
            self.model_weekday = None
            self.model_res_weekday = None

        try:
            self.model_weekend = smf.ols(formula=self.formula, data=weekend_df)
            self.model_res_weekend = self.model_weekend.fit()
        except ValueError:
            self.model_weekend = None
            self.model_res_weekend = None

        # note: assumes both fits above succeeded; if either model is None,
        # the attribute access below raises
        params = {
            "coefficients": self.model_res_weekday.params.to_dict(),
            "coefficients_weekend": self.model_res_weekend.params.to_dict(),
            "formula": self.formula,
            "cdd_bp": self.cdd_base_temp,
            "hdd_bp": self.hdd_base_temp,
            "X_design_info": ''
        }

        weekday_model_stats = self.get_model_stats(self.model_res_weekday, weekday_df)
        weekend_model_stats = self.get_model_stats(self.model_res_weekend, weekend_df)
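
The try/except pattern above degrades gracefully when one data subset cannot support a fit. A minimal sketch of the same split-then-fit idea (names are illustrative):

import statsmodels.formula.api as smf

def fit_or_none(frame, formula):
    """Fit an OLS model on a subset, returning None when fitting fails."""
    try:
        return smf.ols(formula=formula, data=frame).fit()
    except ValueError:
        return None
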
github TimDettmers / sparse_learning / mnist_cifar / plot_feature_histograms.py
        fig.subplots_adjust(wspace=0.0)
        title = '{0} Conv2D Layer {1}'.format(name, layer_id+1)

        plt.suptitle(title, x=0.55, y=1.0)
        if not os.path.exists('./feat_plots'):
            os.mkdir('feat_plots')
        plt.savefig('./feat_plots/{0}.png'.format(title))
        #fig.savefig("foo.pdf", bbox_inches='tight')
        plt.clf()

    anova_all = np.array(anova_all)
    # the header row supplies the column names ('y', 'layer_id', 'is_sparse');
    # the first column ('id') becomes the index
    df = pd.DataFrame(data=anova_all[1:, 1:], index=anova_all[1:, 0].tolist(),
                      columns=anova_all[0, 1:].tolist())
    df = df.astype({'y': 'float32', 'layer_id': 'int32', 'is_sparse': 'int32'})
    formula = 'y ~ C(layer_id) + C(is_sparse) + C(layer_id)*C(is_sparse)'
    model = ols(formula, df).fit()
    aov_table = anova_lm(model, typ=1)

    eta_squared(aov_table)
    omega_squared(aov_table)
    print(aov_table)
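
The snippet assumes "from statsmodels.formula.api import ols" and "from statsmodels.stats.anova import anova_lm" from earlier in the script. eta_squared and omega_squared are helpers defined elsewhere in that file; a common eta-squared implementation over an anova_lm table looks roughly like this (a sketch, not the script's exact code):

def eta_squared(aov):
    """Add an eta-squared effect-size column to an anova_lm table."""
    aov["eta_sq"] = aov["sum_sq"] / aov["sum_sq"].sum()
    return aov
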
github ekolik / -Python-Analysis_of_wine_quality / machine_learning.py
    clustergrp = merged_train.groupby('cluster').mean()
    print('\nClustering variable means by cluster:')
    print(clustergrp)

    # _________validate clusters in training data by examining cluster differences
    #               in wine quality (validation variable) using ANOVA_____________
    # merge wine quality with clustering variables and cluster assignment data
    qual = wine_set['quality']
    # split quality data into train and test sets
    qual_train, qual_test = train_test_split(qual, test_size=.3, random_state=123)
    qual_train1 = pd.DataFrame(qual_train)
    qual_train1.reset_index(level=0, inplace=True)
    merged_train_all = pd.merge(qual_train1, merged_train, on='index')
    sub1 = merged_train_all[['quality', 'cluster']]

    mod = smf.ols(formula='quality ~ C(cluster)', data=sub1).fit()
    print(mod.summary())

    print('\nMeans for wine quality by cluster:')
    print(sub1.groupby('cluster').mean())
    print('\nStandard deviations for wine quality by cluster:')
    print(sub1.groupby('cluster').std())

    # perform Post hoc test (using Tukey's Honestly Significant Difference Test)
    mc1 = multi.MultiComparison(sub1['quality'], sub1['cluster'])
    res1 = mc1.tukeyhsd()
    print(res1.summary())
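
The fragment above relies on a few imports from earlier in the script; these are the usual ones for this combination of tools (reconstructed, not copied from the source):

import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
from sklearn.model_selection import train_test_split
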
github hall-lab / svtools / svtools / sv_classifier.py
    df = pd.DataFrame(tSet, columns=CN_rec._fields)
    #exclude from training data, DELs and DUPs with CN in the tails of the distribution
    df.loc[:,'q_low']=df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(lowQuantile)
    df.loc[:,'q_high']=df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(highQuantile)
    df=df[(df.log2r>=df.q_low) & (df.log2r<=df.q_high)]
    #df.to_csv('./train.csv')
    #adjust copy number for small deletions (<1kb), no strong relationship b/w cn and size for dups evident so far
    small_het_dels = df[(df.svtype=="DEL") & (df.GT=="0/1") & (df.svlen<1000) & (df.svlen>=50)].copy()
    small_hom_dels = df[(df.svtype=="DEL") & (df.GT=="1/1") & (df.svlen<1000) & (df.svlen>=50)].copy()
    het_del_mean=np.mean(df[(df.svlen>1000) & (df.GT=="0/1") & (df.svtype=="DEL")]['log2r'])
    hom_del_mean=np.mean(df[(df.svlen>1000) & (df.GT=="1/1") & (df.svtype=="DEL")]['log2r'])
    small_het_dels.loc[:,'offset']=small_het_dels.loc[:,'log2r']-het_del_mean
    small_hom_dels.loc[:,'offset']=small_hom_dels.loc[:,'log2r']-hom_del_mean
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        hom_del_fit=smf.ols('offset~log_len',small_hom_dels).fit()
        het_del_fit=smf.ols('offset~log_len',small_het_dels).fit()
        #print hom_del_fit.summary()
        #print het_del_fit.summary()
        small_hom_dels.loc[:,'log2r_adj'] = small_hom_dels.loc[:,'log2r'] - hom_del_fit.predict(small_hom_dels)
        small_het_dels.loc[:,'log2r_adj'] = small_het_dels.loc[:,'log2r'] - het_del_fit.predict(small_het_dels)
    small_dels = pd.concat([small_hom_dels, small_het_dels])
    small_dels=small_dels[['var_id', 'sample', 'svtype', 'svlen', 'AF', 'GT', 'CN', 'log_len', 'log2r', 'q_low', 'q_high', 'log2r_adj']]
    # dels of length<100 bp are excluded here
    df1=df.loc[(df.svtype!="DEL") | (df.GT=="0/0") | (df.svlen>=1000), :].copy()
    df1.loc[:,'log2r_adj']=df1.loc[:,'log2r']
    df1 = pd.concat([df1, small_dels])
    params=df1.groupby(['sample', 'svtype', 'GT'])['log2r_adj'].aggregate([np.mean,np.var, len]).reset_index()
    params=pd.pivot_table(params, index=['sample', 'svtype'], columns='GT', values=['mean', 'var', 'len']).reset_index()
    params.columns=['sample', 'svtype', 'mean0', 'mean1', 'mean2', 'var0', 'var1', 'var2', 'len0', 'len1', 'len2']
    params['std_pooled'] = np.sqrt((params['var0']*params['len0']+params['var1']*params['len1']+params['var2']*params['len2'])/(params['len0']+params['len1']+params['len2']))
    #params.to_csv('./params.csv')
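
The interesting ols usage here is the bias adjustment: regress the copy-number offset on log length, then subtract the fitted trend. The pattern in isolation, on synthetic data (made-up values; column names mirror the snippet):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(3)
d = pd.DataFrame({"log_len": rng.uniform(1.7, 3.0, 100)})
d["offset"] = 0.8 * d["log_len"] - 2.0 + rng.normal(0.0, 0.05, 100)

fit = smf.ols("offset ~ log_len", d).fit()
d["adjusted"] = d["offset"] - fit.predict(d)  # length trend removed, mean near 0
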
github jseabold / statsmodels-tutorial / linear_models.py
min_lm = ols('JPERF ~ TEST', data=minority_table).fit()
print(min_lm.summary())

# 

fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
                marker=markers[factor], s=12**2)
ax.legend(['ETHN == 1', 'ETHN == 0'], scatterpoints=1, loc='upper left')
fig = abline_plot(model_results = min_lm, ax=ax)

# 

min_lm2 = ols('JPERF ~ TEST + TEST:ETHN', data=minority_table).fit()
print(min_lm2.summary())

# 

fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
                marker=markers[factor], s=12**2)

fig = abline_plot(intercept = min_lm2.params['Intercept'],
                 slope = min_lm2.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept = min_lm2.params['Intercept'],
        slope = min_lm2.params['TEST'] + min_lm2.params['TEST:ETHN'],
        ax=ax, color='green')
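
The tutorial snippet assumes these imports, plus the factor_group, colors, markers, and minority_table objects defined in earlier cells (reconstructed, not copied from the source):

import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from statsmodels.graphics.regressionplots import abline_plot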