def __init__(self, y, x, z, data, alpha):
    self.regression = sm.RLM(data[y], data[x + z])
    self.result = self.regression.fit()
    self.coefficient = self.result.params[x][0]
    confidence_interval = self.result.conf_int(alpha=alpha / 2.0)
    self.upper = confidence_interval[1][x][0]
    self.lower = confidence_interval[0][x][0]
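# A minimal, self-contained sketch of the same pattern (synthetic data; the column
# names 'x0', 'z0', 'y' are illustrative assumptions, not from the original source):
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
frame = pd.DataFrame({'x0': rng.normal(size=100), 'z0': rng.normal(size=100)})
frame['y'] = 2.0 * frame['x0'] + 0.5 * frame['z0'] + rng.normal(size=100)
result = sm.RLM(frame['y'], frame[['x0', 'z0']]).fit()
coefficient = result.params['x0']          # robust point estimate for x0
ci = result.conf_int(alpha=0.05)           # per-parameter interval, columns 0 (lower) and 1 (upper)
lower, upper = ci[0]['x0'], ci[1]['x0']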
return valid_data
# If valid data hasn't yet been stored in dictionary, determine the valid data
df = self._monthly_daily.df
# First set of filters: check combined losses and whether the NaN data flag was on
df_sub = df.loc[
    ((df['availability_pct'] + df['curtailment_pct']) < comb_loss_thresh) & (df['nan_flag'] == False)]
# Now perform robust linear regression using Huber algorithm to flag outliers
X = sm.add_constant(df_sub[reanal]) # Reanalysis data with constant column
y = df_sub['gross_energy_gwh'] # Energy data
# Perform robust linear regression
rlm = sm.RLM(y, X, M=sm.robust.norms.HuberT(outlier_thresh))
rlm_results = rlm.fit()
# Define valid data as points to which the Huber algorithm assigned a weight of 1 (i.e. not downweighted as outliers)
if self.time_resolution == 'M':
    valid_data = df_sub.loc[rlm_results.weights == 1, [reanal, reanal + '_wd', reanal + '_temp',
                                                       reanal + '_u', reanal + '_v',
                                                       'energy_gwh', 'availability_gwh',
                                                       'curtailment_gwh', 'num_days_expected']]
elif self.time_resolution == 'D':
    valid_data = df_sub.loc[rlm_results.weights == 1, [reanal, reanal + '_wd', reanal + '_temp',
                                                       reanal + '_u', reanal + '_v',
                                                       'energy_gwh', 'availability_gwh',
                                                       'curtailment_gwh']]
# Update the dictionary
self.outlier_filtering[(reanal, outlier_thresh, comb_loss_thresh)] = valid_data
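# For intuition, a small synthetic check of the weight-based filter (not from the
# original source): points whose scaled residual stays inside the HuberT threshold
# keep weight 1, while gross outliers are downweighted below 1.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
wind = rng.uniform(4.0, 12.0, size=60)
energy = 1.5 * wind + rng.normal(scale=0.2, size=60)
energy[::15] += 8.0                        # inject a few gross outliers
fit = sm.RLM(energy, sm.add_constant(wind), M=sm.robust.norms.HuberT(1.345)).fit()
inliers = fit.weights == 1                 # boolean mask analogous to the filter above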
if len(zz) == 0:
    continue
indSel = where(zs == 1)[0]
zzs = ndimage.gaussian_filter(zz[indSel], sigma=1)
rrs = ndimage.gaussian_filter(rr[indSel], sigma=1)
divs = diff(zzs / rrs) / diff(bind[indSel])
indSel = indSel[where(abs(divs) < 0.5)[0] + 1]
if len(indSel) < minLen:
    indSel = array(range(len(zz)))
yy = zz[indSel]
xx = bind[indSel]
# Cubic polynomial design matrix; xx**0 already supplies the intercept column,
# so add_constant (default has_constant='skip') does not add a duplicate
X = sm.add_constant(column_stack((xx**0, xx, xx**2, xx**3)))
res1 = sm.RLM(yy, X).fit()
yy = rr[indSel]
#res2 = sm.RLM(yy,X).fit()
zsmooth = res1.fittedvalues
#rsmooth = res2.fittedvalues
fz = interp1d(bind[indSel], zsmooth)
znew = zeros(len(bind))
ii1 = indSel[0]
ii2 = indSel[-1]
znew[ii1:ii2+1] = fz(bind[ii1:ii2+1])
# Hold the boundary values constant outside the interpolated range
for ii in range(0, ii1):
    znew[ii] = znew[ii1]
for ii in range(ii2 + 1, len(znew)):
    znew[ii] = znew[ii2]
#fz2 = extrap1d(fz)
#znew = fz2(bind)
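# A condensed, self-contained sketch of the smoothing idea above (synthetic arrays;
# all names here are illustrative): robust cubic fit, then interpolation over the range.
import numpy as np
import statsmodels.api as sm
from scipy.interpolate import interp1d

bind = np.linspace(0.0, 10.0, 50)
zz = np.sin(bind) + np.random.default_rng(2).normal(scale=0.1, size=50)
X = sm.add_constant(np.column_stack((bind, bind**2, bind**3)))
zsmooth = sm.RLM(zz, X).fit().fittedvalues  # robust cubic polynomial fit
fz = interp1d(bind, zsmooth)                # evaluate the smooth curve inside [bind[0], bind[-1]]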
    prediction = results.fittedvalues
    difference = delta - prediction
    # TODO swap and clean the following lines.
    # z_score = (difference - difference.mean()) / difference.std()
    difference_median = np.median(difference)
    # 1.4826 scales the MAD so it consistently estimates sigma for normal residuals
    difference_mad = 1.4826 * np.median(np.absolute(difference - difference_median))
    z_score = (difference - difference_median) / difference_mad
    sub_indices = np.where(z_score >= z_score_threshold)[0]
    # Plot rho and delta values (if necessary).
    if self.debug_plots is not None:
        labels = z_score >= z_score_threshold
        self.plot_rho_delta(rho, delta - prediction, labels=labels, filename_suffix="modified")
elif smart_select_mode == 'ransac_bis':
    x = sm.add_constant(rho)
    model = sm.RLM(delta, x)
    results = model.fit()
    delta_mean = results.fittedvalues
    difference = delta - delta_mean
    # 1st solution.
    # sigma = difference.std()  # TODO this estimate is sensitive to outliers
    # 2nd solution.
    # difference_median = np.median(difference)
    # difference_mad = 1.4826 * np.median(np.absolute(difference - difference_median))
    # sigma = difference_mad
    # upper = + 3.0 * sigma  # + sigma * prediction  # TODO check/optimize this last term?
    # 3rd solution (fit the variance as well).
    sigma = 1.4826 * np.median(np.absolute(difference - np.median(difference)))
    variance_model = sm.RLM(np.square(difference), x)
    variance_result = variance_model.fit()
    variance_delta = variance_result.fittedvalues
    variance_delta = np.maximum(sigma ** 2.0, variance_delta)  # i.e. rectify values below the MAD floor
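    # A plausible use of the fitted variance (an assumption, not shown in this fragment):
    # normalize residuals by the local scale so one threshold works across all rho, e.g.
    # z_score = difference / np.sqrt(variance_delta)

# The MAD-based scale estimate recurs in several snippets here; as a stand-alone helper
# (a sketch, not from the original sources), with 1.4826 making the MAD a consistent
# estimator of the standard deviation under Gaussian noise:
import numpy as np

def robust_z_score(values):
    median = np.median(values)
    mad = 1.4826 * np.median(np.absolute(values - median))
    return (values - median) / mad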
ax.set_xlabel('Dim 0')
ax.set_ylabel('Dim 1')
ax.set_title(r'$\delta$')
ax = fig.add_subplot(245)
ax.set_xlabel(r'$\rho$')
ax.set_ylabel(r'$\delta$')
ax.set_title('Putative Centroids')
ax.scatter(rho, delta, c='k', s=def_size, linewidth=0)
idx = numpy.argsort(rho)
import statsmodels.api as sm
x = sm.add_constant(rho)
model = sm.RLM(delta, x)
results = model.fit()
difference = delta - results.fittedvalues
factor = numpy.median(numpy.abs(difference - numpy.median(difference)))
upper = results.fittedvalues + alpha*factor*(1 + results.fittedvalues)
z_score = difference - alpha*factor*(1 + results.fittedvalues)
mcenters = numpy.where(z_score >= 0)[0]
ax.fill_between(rho[idx], results.fittedvalues[idx], upper[idx], alpha=0.5, color='r')
ax.plot(rho[mcenters], delta[mcenters], 'r.')
ax2 = fig.add_subplot(246)
ax2.set_xlabel(r'$\rho$')
ax2.set_ylabel(r'$\epsilon$')
ax2.scatter(rho, z_score, c='k', s=def_size, linewidth=0)
ax2.scatter(rho[centers], z_score[centers], c='r', s=def_size, linewidth=0)
ax2.set_xlim(0.98*rmin, 1.02*rmax)
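# Note on the plot above: the shaded band is the robust fit plus a MAD-scaled margin that
# widens with the fitted value (upper = fit + alpha*factor*(1 + fit)); sorting by rho via
# argsort keeps fill_between's x-axis monotonic, and points with z_score >= 0 are marked
# as putative centroids.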
import numpy as np
import patsy
from statsmodels.api import GLS, RLM

# coef_sum and _combine_cluster are helpers defined elsewhere in the source module
def bump_cluster(model_str, methylations, covs, coef, nsims=1000,
                 value_fn=coef_sum, robust=False):
    orig = _combine_cluster(model_str, methylations, covs, coef, robust=robust)
    obs_coef = value_fn(orig)
    reduced_residuals, reduced_fitted = [], []
    # get the reduced residuals and models so we can shuffle
    for i, methylation in enumerate(methylations):
        y, X = patsy.dmatrices(model_str, covs, return_type='dataframe')
        idxs = [par for par in X.columns if par.startswith(coef)]
        assert len(idxs) == 1, ('expected exactly one coefficient matching', coef)
        X.pop(idxs[0])
        fitr = (RLM if robust else GLS)(y, X).fit()
        reduced_residuals.append(np.array(fitr.resid))
        reduced_fitted.append(np.array(fitr.fittedvalues))
    ngt, idxs = 0, np.arange(len(methylations[0]))
    for isim in range(nsims):
        np.random.shuffle(idxs)
        # add shuffled reduced-model residuals back onto the fitted values (zip replaces
        # the Python 2 izip used in the original)
        fakem = np.array([rf + rr[idxs] for rf, rr in zip(reduced_fitted,
                                                          reduced_residuals)])
        assert fakem.shape == methylations.shape
        sim = _combine_cluster(model_str, fakem, covs, coef, robust=robust)
        ccut = value_fn(sim)
        ngt += abs(ccut) > abs(obs_coef)
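# The permutation logic above, reduced to a self-contained single-response sketch
# (illustrative only; coef_sum and _combine_cluster are module helpers not shown here):
import numpy as np
import statsmodels.api as sm

def permutation_pvalue(y, X, coef_idx, nsims=1000, seed=0):
    rng = np.random.default_rng(seed)
    obs = sm.RLM(y, X).fit().params[coef_idx]     # observed robust coefficient
    X_reduced = np.delete(X, coef_idx, axis=1)    # reduced model without the tested column
    fit = sm.RLM(y, X_reduced).fit()
    fitted, resid = np.asarray(fit.fittedvalues), np.asarray(fit.resid)
    ngt = 0
    for _ in range(nsims):
        fake_y = fitted + rng.permutation(resid)  # shuffle reduced-model residuals
        sim = sm.RLM(fake_y, X).fit().params[coef_idx]
        ngt += abs(sim) > abs(obs)
    return ngt / nsims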
return valid_data
# If valid data hasn't yet been stored in dictionary, determine the valid data
df = self._monthly.df
# First set of filters: check combined losses and whether the NaN data flag was on
df_sub = df.loc[
    ((df['availability_pct'] + df['curtailment_pct']) < comb_loss_thresh) & (df['nan_flag'] == False)]
# Now perform robust linear regression using Huber algorithm to flag outliers
X = sm.add_constant(df_sub[reanal]) # Reanalysis data with constant column
y = df_sub['gross_energy_gwh'] # Energy data
# Perform robust linear regression
rlm = sm.RLM(y, X, M=sm.robust.norms.HuberT(outlier_thresh))
rlm_results = rlm.fit()
# Define valid data as points to which the Huber algorithm assigned a weight of 1 (i.e. not downweighted as outliers)
valid_data = df_sub.loc[rlm_results.weights == 1, [reanal, 'energy_gwh', 'availability_gwh',
                                                   'curtailment_gwh', 'num_days_expected']]
# Update the dictionary
self.outlier_filtering[(reanal, outlier_thresh, comb_loss_thresh)] = valid_data
# Return result
return valid_data
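# The caching pattern used by both variants of this method, in isolation (a sketch;
# _run_outlier_filter is a hypothetical stand-in for the RLM filtering step above):
def filter_outliers_cached(self, reanal, outlier_thresh, comb_loss_thresh):
    key = (reanal, outlier_thresh, comb_loss_thresh)
    if key in self.outlier_filtering:
        return self.outlier_filtering[key]        # reuse the previously computed result
    valid_data = self._run_outlier_filter(*key)   # hypothetical helper
    self.outlier_filtering[key] = valid_data
    return valid_data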
res2 = sm.OLS(y2, X).fit()
print("OLS: parameter estimates: slope, constant")
print(res2.params)
print("standard deviation of parameter estimates")
print(res2.bse)
prstd, iv_l, iv_u = wls_prediction_std(res2)
plt.plot(x1, res2.fittedvalues, 'r-')
plt.plot(x1, iv_u, 'r--')
plt.plot(x1, iv_l, 'r--')
# Compare with the robust estimator
resrlm2 = sm.RLM(y2, X).fit()
print("\nRLM: parameter estimates: slope, constant")
print(resrlm2.params)
print("standard deviation of parameter estimates")
print(resrlm2.bse)
plt.plot(x1, resrlm2.fittedvalues, 'g.-')
plt.title('Data with Outliers; blue: true, red: OLS, green: RLM')
# see also help(sm.RLM.fit) for more options and
# module sm.robust.scale for scale options
plt.show()
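# The comparison above assumes x1, y2, and X already exist; a minimal synthetic setup
# that makes it runnable could look like this (an assumption, not the original script):
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std

nsample = 50
x1 = np.linspace(0, 20, nsample)
X = sm.add_constant(x1, prepend=False)     # columns ordered (slope, constant) as printed above
y2 = 0.5 * x1 + 2.0 + np.random.normal(size=nsample)
y2[::10] += 10.0                           # a few large positive outliers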
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# ## Estimation
#
# Load data:
data = sm.datasets.stackloss.load()
data.exog = sm.add_constant(data.exog)
# Huber's T norm with the (default) median absolute deviation scaling
huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT())
hub_results = huber_t.fit()
print(hub_results.params)
print(hub_results.bse)
print(hub_results.summary(yname='y',
                          xname=['var_%d' % i for i in range(len(hub_results.params))]))
# Huber's T norm with 'H2' covariance matrix
hub_results2 = huber_t.fit(cov="H2")
print(hub_results2.params)
print(hub_results2.bse)
# Andrew's Wave norm with Huber's Proposal 2 scaling and 'H3' covariance matrix
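# In the statsmodels documentation this example continues roughly as follows
# (AndrewWave norm with HuberScale and the 'H3' covariance):
andrew_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.AndrewWave())
andrew_results = andrew_t.fit(scale_est=sm.robust.scale.HuberScale(), cov="H3")
print(andrew_results.params)
print(andrew_results.bse)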