while remaining and current_score == best_new_score and len(selected) < maxvars:
    scores_with_candidates = []
    for candidate in remaining:
        # Score each candidate by the adjusted R-squared of the expanded model
        formula = "{} ~ {} + 1".format(response,
                                       ' + '.join(selected + [candidate]))
        score = smf.ols(formula, data).fit().rsquared_adj
        scores_with_candidates.append((score, candidate))
    scores_with_candidates.sort()
    best_new_score, best_candidate = scores_with_candidates.pop()
    if current_score < best_new_score:
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

# Refit the final model on the selected predictors
formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
model = smf.ols(formula, data).fit()
return model
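# A minimal usage sketch (added, not from the original snippet): the loop above
# presumably lives in a helper such as forward_selected(data, response, maxvars).
# The DataFrame, column names, and wrapper name below are hypothetical.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
demo = pd.DataFrame({'x1': rng.normal(size=100),
                     'x2': rng.normal(size=100),
                     'x3': rng.normal(size=100)})
demo['y'] = 2.0 * demo['x1'] - 1.0 * demo['x2'] + rng.normal(scale=0.5, size=100)

# The selection criterion used above, shown directly:
print(smf.ols('y ~ x1 + 1', demo).fit().rsquared_adj)
print(smf.ols('y ~ x1 + x2 + 1', demo).fit().rsquared_adj)
# model = forward_selected(demo, response='y', maxvars=2)  # hypothetical wrapper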
def _estimate_hour_of_week_occupancy(model_data, threshold):
    # One category per hour of the week (7 days * 24 hours = 168)
    index = pd.CategoricalIndex(range(168))
    if model_data.dropna().empty:
        return pd.Series(np.nan, index=index, name="occupancy")

    # Weighted least-squares usage model on cooling/heating degree days
    usage_model = smf.wls(
        formula="meter_value ~ cdd_65 + hdd_50",
        data=model_data,
        weights=model_data.weight,
    )
    model_data_with_residuals = model_data.merge(
        pd.DataFrame({"residuals": usage_model.fit().resid}),
        left_index=True,
        right_index=True,
    )

    def _is_high_usage(df):
        if df.empty:
            return np.nan
        n_positive_residuals = sum(df.residuals > 0)
        n_residuals = float(len(df.residuals))
        # Assumed completion of the truncated function: treat the hour as occupied
        # when the share of above-model (positive-residual) observations exceeds the threshold
        return int(n_positive_residuals / n_residuals > threshold)
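# A minimal standalone sketch (hypothetical data, added illustration) of the
# groupby/apply pattern the function above appears to build toward: classify
# each hour-of-week group by the share of positive residuals. The column name
# 'hour_of_week' and the threshold value are assumptions.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
demo = pd.DataFrame({
    'hour_of_week': rng.integers(0, 168, size=1000),
    'residuals': rng.normal(size=1000),
})
threshold = 0.65
occupancy = (
    demo.groupby('hour_of_week')['residuals']
    .apply(lambda r: int((r > 0).mean() > threshold))
    .rename('occupancy')
)
print(occupancy.head())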
plt.scatter(train_d.balance, train_d.income, marker='o',
            edgecolors='r', facecolors='none')
plt.ylim([0, 80000]); plt.xlim([0, 2800])
plt.legend(('default', 'no default'), loc='upper right')
# 6 - What can you infer from this plot?
# Balance appears to be more strongly associated with default than income is.
'''
PART II - LOGISTIC REGRESSION
'''
# 1 - Run a logistic regression on the balance variable
# 2 - Is the beta value associated with balance significant?
balance = smf.logit('default ~ balance', data=train).fit()
balance.summary()
np.exp(balance.params.balance)  # odds ratio for a one-unit increase in balance
# Beta is significant!
# 3 - Predict the probability of default for someone with a balance of $1.2k and $2k
prob = balance.predict({'balance': [1200, 2000]})
# What does beta mean? Let's create some plots to find out!
x = np.linspace(test.balance.min(), test.balance.max(), 500)
beta = [balance.params.Intercept, balance.params.balance]
y = np.exp(beta[0] + beta[1]*x) / (1 + np.exp(beta[0] + beta[1]*x))  # predicted probability
odds = np.exp(beta[0] + beta[1]*x)                                   # odds scale
log_odds = beta[0] + beta[1]*x                                       # linear (logit) scale
plt.figure(figsize=(7, 8))
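# A plausible plotting sketch (added illustration, assuming matplotlib.pyplot is
# imported as plt): the fitted relationship on the probability, odds, and
# log-odds scales. Only the log-odds panel is linear in balance, which is what
# the beta coefficient describes.
plt.subplot(311); plt.plot(x, y);        plt.ylabel('P(default)')
plt.subplot(312); plt.plot(x, odds);     plt.ylabel('odds')
plt.subplot(313); plt.plot(x, log_odds); plt.ylabel('log-odds'); plt.xlabel('balance')
plt.show()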
if ax is None:
    fig, ax = plt.subplots()
else:
    fig = ax.get_figure()

if title is not None:
    ax.set_title(title)

fig.autofmt_xdate()

# Calculate zero-indexed days to allow linear regression calculation
day_zero = throughput_data.index[0]
throughput_data['day'] = (throughput_data.index - day_zero).days

# Fit a linear regression with the statsmodels formula API
# (http://stackoverflow.com/questions/29960917/timeseries-fitted-values-from-trend-python)
fit = smf.ols(formula="count ~ day", data=throughput_data).fit()
throughput_data['fitted'] = fit.predict(throughput_data)

# Plot
ax.set_xlabel("Completed date")
ax.set_ylabel("Number of items")
ax.bar(throughput_data.index, throughput_data['count'])

bottom, top = ax.get_ylim()
ax.set_ylim(0, top + 1)

# Label each non-zero bar with its throughput count
for x, y in zip(throughput_data.index, throughput_data['count']):
    if y == 0:
        continue
    # Assumed completion of the truncated call: annotate the bar with its value
    ax.annotate(str(y), xy=(x, y), ha='center', va='bottom')
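# A minimal standalone sketch (hypothetical data, added illustration) of the
# same trend-fitting idea: convert a DatetimeIndex to integer day offsets so
# OLS can fit a linear trend, then overlay the fitted line on the bars.
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

dates = pd.date_range('2023-01-02', periods=12, freq='W-MON')
demo = pd.DataFrame({'count': [3, 5, 4, 6, 7, 5, 8, 9, 7, 10, 9, 11]}, index=dates)
demo['day'] = (demo.index - demo.index[0]).days
trend = smf.ols('count ~ day', data=demo).fit()
demo['fitted'] = trend.predict(demo)

fig, ax = plt.subplots()
ax.bar(demo.index, demo['count'], width=5)
ax.plot(demo.index, demo['fitted'], color='red')
fig.autofmt_xdate()
plt.show()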
"""
if self.outcome_type == 'binary':
    linkdist = sm.families.family.Binomial()
elif self.outcome_type == 'normal':
    linkdist = sm.families.family.Gaussian()
elif self.outcome_type == 'poisson':
    linkdist = sm.families.family.Poisson()
else:
    raise ValueError("Only 'binary', 'normal', and 'poisson' distributed outcomes are available")

# Modeling the outcome
if self.weight is None:
    m = smf.glm(self.outcome + ' ~ ' + model, self.sample, family=linkdist)
    self._outcome_model = m.fit()
else:
    m = smf.glm(self.outcome + ' ~ ' + model, self.sample, family=linkdist,
                freq_weights=self.sample[self.weight])
    self._outcome_model = m.fit()

# Printing results of the model and if any observations were dropped
if print_results:
    print(self._outcome_model.summary())
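# A minimal standalone sketch (hypothetical data, added illustration) of the
# pattern above: pick a GLM family for the outcome type and pass frequency
# weights when fitting through the formula API. The column names (outcome,
# exposure, age, w) are assumptions.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

rng = np.random.default_rng(2)
demo = pd.DataFrame({'exposure': rng.integers(0, 2, size=200),
                     'age': rng.normal(50, 10, size=200),
                     'w': rng.integers(1, 4, size=200)})
logit_p = -2.0 + 0.8 * demo['exposure'] + 0.01 * demo['age']
demo['outcome'] = rng.binomial(1, 1 / (1 + np.exp(-logit_p)))

family = sm.families.Binomial()  # family chosen for a 'binary' outcome
m = smf.glm('outcome ~ exposure + age', demo, family=family,
            freq_weights=demo['w'])
print(m.fit().summary())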
print(model_add.fittedvalues[0])
# Saturated model
# model_sat = smf.glm('frequency~site*type', family = sm_families.Poisson(), data=df).fit()
#
# The saturated model gives a perfect fit, and the fitted data are equal to
# the original data. Statsmodels indicates a "PerfectSeparationError"
# Ulcer and aspirin, p. 182 -------------------------------------
inFile = r'GLM_data/Table 9.7 Ulcer and aspirin use.xls'
df = get_data(inFile)
df.columns = ['GD', 'CC', 'AP', 'freq']
model1 = smf.glm('freq~GD+CC+GD*CC', family = sm_families.Poisson(), data=df).fit()
model2 = smf.glm('freq~GD+CC+GD*CC + AP', family = sm_families.Poisson(), data=df).fit()
model3 = smf.glm('freq~GD+CC+GD*CC + AP + AP*CC', family = sm_families.Poisson(), data=df).fit()
model4 = smf.glm('freq~GD+CC+GD*CC + AP + AP*CC + AP*GD', family = sm_families.Poisson(), data=df).fit()
print('Ulcer and aspirin')
print(model4.fittedvalues)
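# Added illustration (a standard check, not part of the original snippet):
# nested Poisson log-linear models such as model1 and model2 can be compared
# with a likelihood-ratio test based on the drop in deviance.
from scipy import stats
lr_stat = model1.deviance - model2.deviance
df_diff = model2.df_model - model1.df_model
print('LR test for adding AP: stat = {:.3f}, p = {:.4f}'.format(
    lr_stat, stats.chi2.sf(lr_stat, df_diff)))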
self.average_treatment_effect = pd.DataFrame()
self.average_treatment_effect['labels'] = np.asarray(fm.params.index)
self.average_treatment_effect.set_index(keys=['labels'], inplace=True)
self.average_treatment_effect['ATE'] = np.asarray(fm.params)
self.average_treatment_effect['SE(ATE)'] = np.asarray(fm.bse)
self.average_treatment_effect['95%LCL'] = np.asarray(fm.conf_int()[0])
self.average_treatment_effect['95%UCL'] = np.asarray(fm.conf_int()[1])
else:
    # Ignoring DomainWarnings from statsmodels
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', DomainWarning)

        # Estimating Risk Difference
        f = sm.families.family.Binomial(sm.families.links.identity())
        fm = smf.gee(full_msm, df.index, df,
                     cov_struct=ind, family=f, weights=df['_ipfw_']).fit()
        self.risk_difference = pd.DataFrame()
        self.risk_difference['labels'] = np.asarray(fm.params.index)
        self.risk_difference.set_index(keys=['labels'], inplace=True)
        self.risk_difference['RD'] = np.asarray(fm.params)
        self.risk_difference['SE(RD)'] = np.asarray(fm.bse)
        self.risk_difference['95%LCL'] = np.asarray(fm.conf_int()[0])
        self.risk_difference['95%UCL'] = np.asarray(fm.conf_int()[1])

        # Estimating Risk Ratio
        f = sm.families.family.Binomial(sm.families.links.log())
        fm = smf.gee(full_msm, df.index, df,
                     cov_struct=ind, family=f, weights=df['_ipfw_']).fit()
        self.risk_ratio = pd.DataFrame()
        self.risk_ratio['labels'] = np.asarray(fm.params.index)
        self.risk_ratio.set_index(keys=['labels'], inplace=True)
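# A minimal standalone check (hypothetical data, added illustration): for a
# saturated marginal structural model with a single binary treatment, the risk
# difference targeted by the identity-link binomial GEE above equals the
# difference in IP-weighted outcome means. The column names 'art' and 'dead'
# are assumptions; '_ipfw_' mirrors the weight column used above.
import numpy as np
import pandas as pd

demo = pd.DataFrame({
    'art':    [1, 1, 0, 0, 1, 0, 1, 0],
    'dead':   [0, 1, 1, 0, 0, 1, 0, 0],
    '_ipfw_': [1.2, 0.8, 1.1, 0.9, 1.0, 1.3, 0.7, 1.0],
})

def weighted_risk(d):
    return np.average(d['dead'], weights=d['_ipfw_'])

rd = weighted_risk(demo[demo.art == 1]) - weighted_risk(demo[demo.art == 0])
print('IP-weighted risk difference:', rd)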
Returns:
--------
model: an "optimal" fitted statsmodels linear model
    with an intercept
    selected by forward selection
    evaluated by adjusted R-squared
"""
remaining = set(data.columns)
remaining.remove(response)
selected = []
current_score, best_new_score = 0.0, 0.0
while remaining and current_score == best_new_score and len(selected) < maxvars:
    scores_with_candidates = []
    for candidate in remaining:
        formula = "{} ~ {} + 1".format(response,
                                       ' + '.join(selected + [candidate]))
        score = smf.ols(formula, data).fit().rsquared_adj
        scores_with_candidates.append((score, candidate))
    scores_with_candidates.sort()
    best_new_score, best_candidate = scores_with_candidates.pop()
    if current_score < best_new_score:
        # If the best candidate is an interaction term (e.g. "a:b"), also select
        # both main effects and raise maxvars accordingly
        candidate_split = best_candidate.split(sep=":")
        if len(candidate_split) == 2:
            if candidate_split[0] not in selected and candidate_split[0] in remaining:
                remaining.remove(candidate_split[0])
                selected.append(candidate_split[0])
                maxvars += 1
            if candidate_split[1] not in selected and candidate_split[1] in remaining:
                remaining.remove(candidate_split[1])
                selected.append(candidate_split[1])
                maxvars += 1
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        # As in the version above, record the improved score so the loop continues
        current_score = best_new_score
# Then, we fit the GLM model:
mod1 = smf.glm(formula=formula, data=dta, family=sm.families.Binomial()).fit()
mod1.summary()
# Finally, we define a function to apply a custom data transformation inside the formula framework:
def double_it(x):
    return 2 * x

formula = 'SUCCESS ~ double_it(LOWINC) + PERASIAN + PERBLACK + PERHISP + PCTCHRT + PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF'
mod2 = smf.glm(formula=formula, data=dta, family=sm.families.Binomial()).fit()
mod2.summary()
# As expected, the coefficient for ``double_it(LOWINC)`` in the second model is half the size of the ``LOWINC`` coefficient from the first model:
print(mod1.params[1])
print(mod2.params[1] * 2)
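# Added note (illustration): because double_it rescales the regressor by 2, its
# coefficient must shrink by 1/2 for the linear predictor beta*x to stay the
# same, so doubling the second model's coefficient recovers the first model's.
# The check below should print True.
print(abs(mod1.params[1] - mod2.params[1] * 2) < 1e-8)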