How to use the statsmodels.formula.api.glm function in statsmodels

To help you get started, we’ve selected a few statsmodels examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github thomas-haslwanter / dobson / dobson.py View on Github external
def general_logistic_regression():
    '''Example General Logistic Recression,
    Example 7.4.1, p. 135'''
    
    # Get the data
    inFile = r'GLM_data/Table 7.5 Embryogenic anthers.xls'
    df = get_data(inFile)
    
    # Define the variables so that they match Dobson
    df['n_y'] = df['n'] - df['y']
    df['newstor'] = df['storage']-1
    df['x'] = np.log(df['centrifuge'])
    
    # Model 1
    model1 = smf.glm('n_y + y ~ newstor*x', data=df, family=sm_families.Binomial()).fit()
    print(model1.summary())
    
    # Model 2
    model2 = smf.glm('n_y + y ~ newstor+x', data=df, family=sm_families.Binomial()).fit()
    print(model2.summary())
    
    # Model 3
    model3 = smf.glm('n_y + y ~ x', data=df, family=sm_families.Binomial()).fit()
    print(model3 .summary())
github thomas-haslwanter / dobson / dobson.py View on Github external
inFile = r"GLM_data/Table 9.1 British doctors' smoking and coronary death.xls"
    df = get_data(inFile)
    print(df)

    # Generate the required variables
    df['smoke'] = np.zeros(len(df))
    df['smoke'][df['smoking']=='smoker']=1

    df['agecat'] = np.array([1,2,3,4,5,1,2,3,4,5])
    df['agesq'] = df['agecat']**2

    df['smkage'] = df['agecat']
    df['smkage'][df['smoking']=='non-smoker']=0

    model = smf.glm('deaths~agecat+agesq+smoke+smkage',
            family=sm_families.Poisson(), data=df,
            exposure=df["person-years"]).fit()
    print(model.summary())
github pzivich / zEpid / zepid / zepid / base.py View on Github external
vb01 = cov_matrix.loc['E0M1']['E0M1']
        vb11 = cov_matrix.loc['E1M1']['E1M1']
        cvb10_01 = cov_matrix.loc['E1M0']['E0M1']
        cvb10_11 = cov_matrix.loc['E1M0']['E1M1']
        cvb01_11 = cov_matrix.loc['E0M1']['E1M1']
        varICR = (((em10**2)*vb10)+((em01**2)*vb01)+((em11**2)*vb11)+((em10*em01*2*cvb10_01))+(-1*em10*em11*2*cvb10_11)+(-1*em01*em11*2*cvb01_11))
        icr_lcl = icr - zalpha*math.sqrt(varICR)
        icr_ucl = icr + zalpha*math.sqrt(varICR)
    elif ci == 'bootstrap':
        bse_icr = []
        ul = 1 - alpha/2
        ll = 0 + alpha/2
        for i in range(b_sample):
            dfs = df.sample(n=len(df),replace=True)
            try:
                bmodel = smf.glm(eq,dfs,family=f).fit()
                em_bexpect = math.exp(bmodel.params['E1M0']) + math.exp(bmodel.params['E0M1']) - 1
                bicr = math.exp(bmodel.params['E1M1']) - em_bexpect
                sigma = bicr - icr
                bse_icr.append(sigma)
            except:
                bse_icr.append(np.nan)
        bsdf = pd.DataFrame()
        bsdf['sigma'] = bse_icr
        lsig,usig = bsdf['sigma'].dropna().quantile(q=[ll,ul])
        icr_lcl = lsig + icr
        icr_ucl = usig + icr
    else:
        raise ValueError('Please specify a supported confidence interval type')
    print('\n----------------------------------------------------------------------')
    if regression == 'logit':
        print('ICR based on Odds Ratio\t\tAlpha = '+str(alpha))
github pzivich / zEpid / zepid / causal / generalize / estimators.py View on Github external
pandas dataframe when initialized. Model form should contain the exposure, i.e. 'art + age + male'
        print_results : bool, optional
            Whether to print the logistic regression results to the terminal. Default is True
        """
        if self.outcome_type == 'binary':
            linkdist = sm.families.family.Binomial()
        elif self.outcome_type == 'normal':
            linkdist = sm.families.family.Gaussian()
        elif self.outcome_type == 'poisson':
            linkdist = sm.families.family.Poisson()
        else:
            raise ValueError("Only 'binary', 'normal', and 'poisson' distributed outcomes are available")

        # Modeling the outcome
        if self.weight is None:
            m = smf.glm(self.outcome+' ~ '+model, self.sample, family=linkdist)
            self._outcome_model = m.fit()
        else:
            m = smf.glm(self.outcome+' ~ '+model, self.sample, family=linkdist,
                        freq_weights=self.sample[self.weight])
            self._outcome_model = m.fit()

        # Printing results of the model and if any observations were dropped
        if print_results:
            print(self._outcome_model.summary())
github statsmodels / statsmodels / examples / python / glm_weights.py View on Github external
glm = smf.glm(
    'affairs ~ rate_marriage + yrs_married',
    data=dc,
    family=sm.families.Poisson(),
    freq_weights=np.asarray(dc['freq']))
res_f2 = glm.fit()
#print(res_f2.summary())
res_f2.pearson_chi2 - res_f.pearson_chi2, res_f2.deviance - res_f.deviance, res_f2.llf - res_f.llf

# #### aggregated data: ``exposure`` and ``var_weights``
#
# Note: LR test agrees with original observations, ``pearson_chi2``
# differs and has the wrong sign.

glm = smf.glm(
    'affairs_sum ~ rate_marriage + yrs_married',
    data=df_a,
    family=sm.families.Poisson(),
    exposure=np.asarray(df_a['affairs_count']))
res_e2 = glm.fit()
res_e2.pearson_chi2 - res_e.pearson_chi2, res_e2.deviance - res_e.deviance, res_e2.llf - res_e.llf

glm = smf.glm(
    'affairs_mean ~ rate_marriage + yrs_married',
    data=df_a,
    family=sm.families.Poisson(),
    var_weights=np.asarray(df_a['affairs_count']))
res_a2 = glm.fit()
res_a2.pearson_chi2 - res_a.pearson_chi2, res_a2.deviance - res_a.deviance, res_a2.llf - res_a.llf

# ### Investigating Pearson chi-square statistic
github statsmodels / statsmodels / examples / python / glm_weights.py View on Github external
'affairs_sum ~ rate_marriage + age + yrs_married',
    data=df_a,
    family=sm.families.Poisson(),
    exposure=np.asarray(df_a['affairs_count']))
res_e = glm.fit()
print(res_e.summary())

res_e.pearson_chi2 / res_e.df_resid

# #### using var_weights
#
# We can also use the mean of all combined values of the dependent
# variable. In this case the variance will be related to the inverse of the
# total exposure reflected by one combined observation.

glm = smf.glm(
    'affairs_mean ~ rate_marriage + age + yrs_married',
    data=df_a,
    family=sm.families.Poisson(),
    var_weights=np.asarray(df_a['affairs_count']))
res_a = glm.fit()
print(res_a.summary())

# ### Comparison
#
# We saw in the summary prints above that ``params`` and ``cov_params``
# with associated Wald inference agree across versions. We summarize this in
# the following comparing individual results attributes across versions.
#
# Parameter estimates `params`, standard errors of the parameters `bse`
# and `pvalues` of the parameters for the tests that the parameters are
# zeros all agree. However, the likelihood and goodness-of-fit statistics,
github pzivich / zEpid / zepid / causal / doublyrobust / AIPW.py View on Github external
self._out_model = self.outcome + ' ~ ' + model

        if self._continuous_outcome:
            self._continuous_type = continuous_distribution
            if (continuous_distribution == 'gaussian') or (continuous_distribution == 'normal'):
                f = sm.families.family.Gaussian()
            elif continuous_distribution == 'poisson':
                f = sm.families.family.Poisson()
            else:
                raise ValueError("Only 'gaussian' and 'poisson' distributions are supported")
        else:
            f = sm.families.family.Binomial()

        if self._weight_ is None:
            log = smf.glm(self._out_model, self.df, family=f).fit()
        else:
            log = smf.glm(self._out_model, self.df, freq_weights=self.df[self._weight_], family=f).fit()

        if print_results:
            print('\n----------------------------------------------------------------')
            print('MODEL: ' + self._out_model)
            print('-----------------------------------------------------------------')
            print(log.summary())

        # Generating predictions for observed variables
        self._predicted_y_ = log.predict(self.df)

        # Predicting under treatment strategies
        dfx = self.df.copy()
        dfx[self.exposure] = 1
        self.df['_pY1_'] = log.predict(dfx)
github pzivich / zEpid / zepid / causal / utils.py View on Github external
Whether to print the logistic regression results. Default is True

    Returns
    -------------
    Fitted statsmodels GLM object

    Example
    ------------
    >>>from zepid import load_sample_data
    >>>from zepid.causal.utils import propensity_score
    >>>df = load_sample_data(timevary=False)
    >>>propensity_score(df=df,model='dead ~ art0 + male + dvl0')
    """
    f = sm.families.family.Binomial()
    if weights is None:
        log = smf.glm(model, df, family=f).fit()
    else:
        log = smf.glm(model, df, freq_weights=df[weights], family=f).fit()

    if print_results:
        print('\n----------------------------------------------------------------')
        print('MODEL: ' + model)
        print('-----------------------------------------------------------------')
        print(log.summary())
    return log
github pzivich / zEpid / zepid / zepid / graphics / graphics.py View on Github external
'''
    rf = df.copy()
    rf = rf.dropna().sort_values(by=[var,outcome]).reset_index()
    print('Warning: missing observations of model variables are dropped')
    print(int(len(df)-len(rf)),' observations were dropped from the functional form assessment')
    if f_form == None:
        f_form = var 
    else:
        pass 
    if link_dist == None:
        link_dist = sm.families.family.Binomial(sm.families.links.logit)
    else:
        pass 
    if (loess == True) | (points == True):
        if outcome_type=='binary':
            djm = smf.glm(outcome+'~ C('+var+')',rf,family=link_dist).fit()
            djf = djm.get_prediction(rf).summary_frame()        
            dj = pd.concat([rf,djf],axis=1)
            dj.sort_values(var,inplace=True)
            if points == True:
                pf = dj.groupby(by=[var,'mean']).count().reset_index()
            if loess == True:
                yl = lowess(list(dj['mean']),list(dj[var]),frac=loess_value)
                lowess_x = list(zip(*yl))[0]
                lowess_y = list(zip(*yl))[1]
        elif outcome_type=='continuous':
            if points == True:
                pf = rf.groupby(by=[var,outcome]).count().reset_index()
            if loess == True:
                yl = lowess(list(rf[outcome]),list(rf[var]),frac=loess_value)
                lowess_x = list(zip(*yl))[0]
                lowess_y = list(zip(*yl))[1]
github jseabold / statsmodels-tutorial / discrete_choice.py View on Github external
# Toss a six-sided die 5 times, what's the probability of exactly 2 fours?

# 

stats.binom(5, 1./6).pmf(2)

# 

from scipy.misc import comb
comb(5,2) * (1/6.)**2 * (5/6.)**3

# 

from statsmodels.formula.api import glm
glm_mod = glm(formula, dta, family=sm.families.Binomial()).fit()

# 

print glm_mod.summary()

# 

# The number of trials 

# 

glm_mod.model.data.orig_endog.sum(1)

# 

glm_mod.fittedvalues * glm_mod.model.data.orig_endog.sum(1)