How to use the statsmodels.formula.api function in statsmodels

To help you get started, we’ve selected a few statsmodels examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github equinor / webviz-subsurface / webviz_subsurface / plugins / _multiple_regression_sara.py View on Github external
while remaining and current_score == best_new_score and len(selected) < maxvars:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                        ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model
github openeemeter / eemeter / eemeter / features.py View on Github external
def _estimate_hour_of_week_occupancy(model_data, threshold):
    index = pd.CategoricalIndex(range(168))
    if model_data.dropna().empty:
        return pd.Series(np.nan, index=index, name="occupancy")

    usage_model = smf.wls(
        formula="meter_value ~ cdd_65 + hdd_50",
        data=model_data,
        weights=model_data.weight,
    )

    model_data_with_residuals = model_data.merge(
        pd.DataFrame({"residuals": usage_model.fit().resid}),
        left_index=True,
        right_index=True,
    )

    def _is_high_usage(df):
        if df.empty:
            return np.nan
        n_positive_residuals = sum(df.residuals > 0)
        n_residuals = float(len(df.residuals))
github justmarkham / DAT3 / code / 10_logistic_regression_class.py View on Github external
plt.scatter(train_d.balance, train_d.income, marker='o', 
            edgecolors = 'r', facecolors = 'none')
plt.ylim([0,80000]); plt.xlim([0, 2800])
plt.legend( ('default', 'no default'), loc='upper right')

# 6 - What can you infer from this plot?
# It appears that the balance is more correlated with default than income

'''
PART II - LOGISTIC REGRESSION
'''

# 1 - Run a logistic regression on the balance variable
# 2 - Is the beta  value associated with balance significant?
balance = smf.logit('default ~ balance', data = train).fit()
balance.summary()
np.exp(balance.params.balance)

# Beta is significant!
# 2 - Predict the probability of default for someone with a balance of $1.2k and $2k
prob = balance.predict({'balance': [1200, 2000]})

# What does beta mean? Let's create some plots to find out!
x = np.linspace(test.balance.min(), test.balance.max(),500)
beta = [balance.params.Intercept, balance.params.balance]

y = np.exp(beta[0] + beta[1]*x) / (1 + np.exp(beta[0] + beta[1]*x))
odds = np.exp(beta[0] + beta[1]*x)
log_odds = beta[0] + beta[1]*x

plt.figure(figsize=(7, 8))
github optilude / jira-cycle-extract / jira_cycle_extract / charting.py View on Github external
if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()
        
    if title is not None:
        ax.set_title(title)

    fig.autofmt_xdate()

    # Calculate zero-indexed days to allow linear regression calculation
    day_zero = throughput_data.index[0]
    throughput_data['day'] = (throughput_data.index - day_zero).days

    # Fit a linear regression (http://stackoverflow.com/questions/29960917/timeseries-fitted-values-from-trend-python)
    fit = sm.ols(formula="count ~ day", data=throughput_data).fit()
    throughput_data['fitted'] = fit.predict(throughput_data)

    # Plot

    ax.set_xlabel("Completed date")
    ax.set_ylabel("Number of items")

    ax.bar(throughput_data.index, throughput_data['count'])

    bottom, top = ax.get_ylim()
    ax.set_ylim(0, top + 1)

    for x, y in zip(throughput_data.index, throughput_data['count']):
        if y == 0:
            continue
        ax.annotate(
github pzivich / zEpid / zepid / causal / generalize / estimators.py View on Github external
"""
        if self.outcome_type == 'binary':
            linkdist = sm.families.family.Binomial()
        elif self.outcome_type == 'normal':
            linkdist = sm.families.family.Gaussian()
        elif self.outcome_type == 'poisson':
            linkdist = sm.families.family.Poisson()
        else:
            raise ValueError("Only 'binary', 'normal', and 'poisson' distributed outcomes are available")

        # Modeling the outcome
        if self.weight is None:
            m = smf.glm(self.outcome+' ~ '+model, self.sample, family=linkdist)
            self._outcome_model = m.fit()
        else:
            m = smf.glm(self.outcome+' ~ '+model, self.sample, family=linkdist,
                        freq_weights=self.sample[self.weight])
            self._outcome_model = m.fit()

        # Printing results of the model and if any observations were dropped
        if print_results:
            print(self._outcome_model.summary())
github thomas-haslwanter / dobson / dobson.py View on Github external
print(model_add.fittedvalues[0])

    # Saturated model
    # model_sat = smf.glm('frequency~site*type', family = sm_families.Poisson(), data=df).fit()
    #
    # The saturated model gives a perfect fit, and the fitted data are equal to
    # the original data. Statsmodels indicates a "PerfectSeparationError"

    # Ulcer and aspirin, p. 182 ------------------------------------- 
    inFile = r'GLM_data/Table 9.7 Ulcer and aspirin use.xls'
    df = get_data(inFile)
    df.columns = ['GD', 'CC', 'AP', 'freq']

    model1 = smf.glm('freq~GD+CC+GD*CC', family = sm_families.Poisson(), data=df).fit()
    model2 = smf.glm('freq~GD+CC+GD*CC + AP', family = sm_families.Poisson(), data=df).fit()
    model3 = smf.glm('freq~GD+CC+GD*CC + AP + AP*CC', family = sm_families.Poisson(), data=df).fit()
    model4 = smf.glm('freq~GD+CC+GD*CC + AP + AP*CC + AP*GD', family = sm_families.Poisson(), data=df).fit()
    
    print('Ulcer and aspirin')
    print(model4.fittedvalues)
github pzivich / zEpid / zepid / causal / ipw / IPTW.py View on Github external
self.average_treatment_effect = pd.DataFrame()
            self.average_treatment_effect['labels'] = np.asarray(fm.params.index)
            self.average_treatment_effect.set_index(keys=['labels'], inplace=True)
            self.average_treatment_effect['ATE'] = np.asarray(fm.params)
            self.average_treatment_effect['SE(ATE)'] = np.asarray(fm.bse)
            self.average_treatment_effect['95%LCL'] = np.asarray(fm.conf_int()[0])
            self.average_treatment_effect['95%UCL'] = np.asarray(fm.conf_int()[1])

        else:
            # Ignoring DomainWarnings from statsmodels
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DomainWarning)

                # Estimating Risk Difference
                f = sm.families.family.Binomial(sm.families.links.identity())
                fm = smf.gee(full_msm, df.index, df,
                             cov_struct=ind, family=f, weights=df['_ipfw_']).fit()
                self.risk_difference = pd.DataFrame()
                self.risk_difference['labels'] = np.asarray(fm.params.index)
                self.risk_difference.set_index(keys=['labels'], inplace=True)
                self.risk_difference['RD'] = np.asarray(fm.params)
                self.risk_difference['SE(RD)'] = np.asarray(fm.bse)
                self.risk_difference['95%LCL'] = np.asarray(fm.conf_int()[0])
                self.risk_difference['95%UCL'] = np.asarray(fm.conf_int()[1])

                # Estimating Risk Ratio
                f = sm.families.family.Binomial(sm.families.links.log())
                fm = smf.gee(full_msm, df.index, df,
                             cov_struct=ind, family=f, weights=df['_ipfw_']).fit()
                self.risk_ratio = pd.DataFrame()
                self.risk_ratio['labels'] = np.asarray(fm.params.index)
                self.risk_ratio.set_index(keys=['labels'], inplace=True)
github equinor / webviz-subsurface / webviz_subsurface / plugins / _multiple_regession_vegard.py View on Github external
--------
    model: an "optimal" fitted statsmodels linear model
        with an intercept
        selected by forward selection
        evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score and len(selected) < maxvars:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                        ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            candidate_split = best_candidate.split(sep=":")
            if len(candidate_split) == 2:  
                if candidate_split[0] not in selected and candidate_split[0] in remaining: 
                    remaining.remove(candidate_split[0])
                    selected.append(candidate_split[0])
                    maxvars += 1
                if candidate_split[1] not in selected and candidate_split[1] in remaining:
                    remaining.remove(candidate_split[1])
                    selected.append(candidate_split[1])
                    maxvars += 1
            remaining.remove(best_candidate)
            selected.append(best_candidate)
github statsmodels / statsmodels / examples / python / glm_formula.py View on Github external
# Then, we fit the GLM model:

mod1 = smf.glm(formula=formula, data=dta, family=sm.families.Binomial()).fit()
mod1.summary()


# Finally, we define a function to operate customized data transformation using the formula framework:

def double_it(x):
    return 2 * x


formula = 'SUCCESS ~ double_it(LOWINC) + PERASIAN + PERBLACK + PERHISP + PCTCHRT +            PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF'
mod2 = smf.glm(formula=formula, data=dta, family=sm.families.Binomial()).fit()
mod2.summary()


# As expected, the coefficient for ``double_it(LOWINC)`` in the second model is half the size of the ``LOWINC`` coefficient from the first model:

print(mod1.params[1])
print(mod2.params[1] * 2)