How to use the statsmodels.api.RLM function in statsmodels

To help you get started, we've selected a few examples of statsmodels.api.RLM drawn from public projects, based on popular ways it is used. RLM fits a robust linear model by iteratively reweighted least squares, down-weighting outlying observations according to a chosen M-estimator norm (Huber's T by default).


github akelleh / causality / causality / inference / independence_tests / __init__.py
def __init__(self, y, x, z, data, alpha):
    # sm is statsmodels.api, imported at module level; regress y on the
    # test variables x plus the conditioning set z
    self.regression = sm.RLM(data[y], data[x + z])
    self.result = self.regression.fit()
    self.coefficient = self.result.params[x][0]
    # Note: alpha is halved here, so this is a (1 - alpha/2) interval
    confidence_interval = self.result.conf_int(alpha=alpha / 2.)
    self.upper = confidence_interval[1][x][0]
    self.lower = confidence_interval[0][x][0]
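A self-contained sketch of the same pattern, with made-up column names ('x1', 'x2', 'y') and synthetic data rather than the project's inputs:

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
data = pd.DataFrame({'x1': rng.normal(size=100), 'x2': rng.normal(size=100)})
data['y'] = 2.0 * data['x1'] - data['x2'] + rng.normal(size=100)

result = sm.RLM(data['y'], data[['x1', 'x2']]).fit()
coefficient = result.params['x1']        # robust point estimate for x1
conf_int = result.conf_int(alpha=0.05)   # one (lower, upper) row per regressor
lower, upper = conf_int.loc['x1']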
github NREL / OpenOA / operational_analysis / methods / plant_analysis_github.py
            # Early return: this filter combination was computed before (snippet starts mid-method)
            return valid_data

        # If valid data hasn't yet been stored in the dictionary, determine the valid data
        df = self._monthly_daily.df

        # First set of filters: check combined losses and whether the NaN data flag was on
        df_sub = df.loc[
            ((df['availability_pct'] + df['curtailment_pct']) < comb_loss_thresh) & ~df['nan_flag']]

        # Now perform robust linear regression using the Huber algorithm to flag outliers
        X = sm.add_constant(df_sub[reanal])  # Reanalysis data with constant column
        y = df_sub['gross_energy_gwh']  # Energy data

        # Fit the robust model; outlier_thresh is the Huber norm's tuning constant t
        rlm = sm.RLM(y, X, M=sm.robust.norms.HuberT(outlier_thresh))
        rlm_results = rlm.fit()

        # Define valid data as points to which the Huber algorithm assigned a weight of 1
        if self.time_resolution == 'M':
            valid_data = df_sub.loc[rlm_results.weights == 1, [reanal, reanal + '_wd', reanal + '_temp',
                                                               reanal + '_u', reanal + '_v',
                                                               'energy_gwh', 'availability_gwh',
                                                               'curtailment_gwh', 'num_days_expected']]
        elif self.time_resolution == 'D': 
            valid_data = df_sub.loc[rlm_results.weights == 1, [reanal, reanal + '_wd', reanal + '_temp',
                                                               reanal + '_u', reanal + '_v',
                                                               'energy_gwh', 'availability_gwh',
                                                               'curtailment_gwh']]
              
        # Update the dictionary
        self.outlier_filtering[(reanal, outlier_thresh, comb_loss_thresh)] = valid_data
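The same weight-based filtering idea in miniature, on synthetic data (the threshold value and variable names below are illustrative, not OpenOA's):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
x = rng.uniform(0, 10, 200)
y = 3.0 * x + rng.normal(scale=0.5, size=200)
y[:5] += 20.0                            # inject a few gross outliers

X = sm.add_constant(x)
results = sm.RLM(y, X, M=sm.robust.norms.HuberT(t=2.0)).fit()

# Huber leaves inliers at weight 1 and down-weights everything beyond t
inliers = results.weights == 1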
github janelia-flyem / NeuTu / neurolabi / python / stackImageProcess.py
		if len(zz) == 0:
			continue

		indSel = where(zs == 1)[0]
		zzs = ndimage.gaussian_filter(zz[indSel], sigma=1)
		rrs = ndimage.gaussian_filter(rr[indSel], sigma=1)
		divs = diff(zzs / rrs) / diff(bind[indSel])
		indSel = indSel[where(abs(divs) < 0.5)[0] + 1]

		if len(indSel) < minLen:
			indSel = array(range(len(zz)))

		yy = zz[indSel]
		xx = bind[indSel]
		# Cubic design matrix; xx**0 already supplies the intercept column,
		# so add_constant (default has_constant='skip') leaves it unchanged
		X = sm.add_constant(column_stack((xx**0, xx, xx**2, xx**3)))
		res1 = sm.RLM(yy, X).fit()
		yy = rr[indSel]
		#res2 = sm.RLM(yy,X).fit()
		zsmooth = res1.fittedvalues
		#rsmooth = res2.fittedvalues

		fz = interp1d(bind[indSel], zsmooth)
		znew = zeros(len(bind))
		ii1 = indSel[0]
		ii2 = indSel[-1]
		znew[ii1:ii2+1] = fz(bind[ii1:ii2+1])
		for ii in range(0,ii1):
			znew[ii] = znew[ii1]
		for ii in range(ii2+1,len(znew)):
			znew[ii] = znew[ii2]
		#fz2 = extrap1d(fz)
		#znew = fz2(bind)			
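A minimal standalone version of this robust cubic smoothing, on a synthetic profile (variable names here are placeholders):

import numpy as np
import statsmodels.api as sm

xx = np.linspace(0.0, 1.0, 50)
yy = 1.0 + 2.0 * xx - 3.0 * xx**2 + np.random.normal(scale=0.1, size=50)
yy[10] += 5.0                            # one gross outlier

# Same cubic design matrix as above; RLM ignores the outlier when smoothing
X = sm.add_constant(np.column_stack((xx**0, xx, xx**2, xx**3)))
smooth = sm.RLM(yy, X).fit().fittedvalues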
github spyking-circus / spyking-circus-ort / circusort / utils / clustering.py
            prediction = results.fittedvalues
            difference = delta - prediction
            # TODO swap and clean the following lines.
            # z_score = (difference - difference.mean()) / difference.std()
            difference_median = np.median(difference)
            difference_mad = 1.4826 * np.median(np.absolute(difference - difference_median))
            z_score = (difference - difference_median) / difference_mad
            sub_indices = np.where(z_score >= z_score_threshold)[0]
            # Plot rho and delta values (if necessary).
            if self.debug_plots is not None:
                labels = z_score >= z_score_threshold
                self.plot_rho_delta(rho, delta - prediction, labels=labels, filename_suffix="modified")

        elif smart_select_mode == 'ransac_bis':
            x = sm.add_constant(rho)
            model = sm.RLM(delta, x)
            results = model.fit()
            delta_mean = results.fittedvalues
            difference = delta - delta_mean
            # 1st solution.
            # sigma = difference.std()  # TODO this estimation is sensitive to outliers
            # 2nd solution.
            # difference_median = np.median(difference)
            # difference_mad = 1.4826 * np.median(np.absolute(difference - difference_median))
            # sigma = difference_mad
            # upper = + 3.0 * sigma  # + sigma * prediction  # TODO check/optimize this last term?
            # 3rd solution (fit variance also).
            sigma = 1.4826 * np.median(np.absolute(difference - np.median(difference)))
            variance_model = sm.RLM(np.square(difference), x)
            variance_result = variance_model.fit()
            variance_delta = variance_result.fittedvalues
            variance_delta = np.maximum(sigma ** 2.0, variance_delta)  # i.e. rectify negative values
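The robust z-score trick above, distilled (synthetic rho/delta values; the threshold of 3.0 is illustrative):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(3)
rho = rng.uniform(0, 1, 300)
delta = 0.5 * rho + rng.normal(scale=0.05, size=300)
delta[:3] += 1.0                         # a few points that should stand out

results = sm.RLM(delta, sm.add_constant(rho)).fit()
difference = delta - results.fittedvalues

# Center on the median and scale by 1.4826 * MAD, a robust sigma estimate
med = np.median(difference)
mad = 1.4826 * np.median(np.absolute(difference - med))
z_score = (difference - med) / mad
outliers = np.where(z_score >= 3.0)[0]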
github spyking-circus / spyking-circus / circus / shared / plot.py
    ax.set_xlabel('Dim 0')
    ax.set_ylabel('Dim 1')
    ax.set_title(r'$\delta$')

    ax = fig.add_subplot(245)
    ax.set_xlabel(r'$\rho$')
    ax.set_ylabel(r'$\delta$')
    ax.set_title('Putative Centroids')

    ax.scatter(rho, delta, c='k', s=def_size, linewidth=0)

    idx = numpy.argsort(rho)
            
    import statsmodels.api as sm
    x = sm.add_constant(rho)
    model = sm.RLM(delta, x)
    results = model.fit()
    difference = delta - results.fittedvalues
    # The (unscaled) MAD of the residuals sets the width of the acceptance band
    factor = numpy.median(numpy.abs(difference - numpy.median(difference)))
    upper = results.fittedvalues + alpha*factor*(1 + results.fittedvalues)
    z_score = difference - alpha*factor*(1 + results.fittedvalues)
    mcenters = numpy.where(z_score >= 0)[0]
    ax.fill_between(rho[idx], results.fittedvalues[idx], upper[idx], alpha=0.5, color='r')
    ax.plot(rho[mcenters], delta[mcenters], 'r.')
    
    ax2 = fig.add_subplot(246)
    ax2.set_xlabel(r'$\rho$')
    ax2.set_ylabel(r'$\epsilon$')
    ax2.scatter(rho, z_score, c='k', s=def_size, linewidth=0)
    ax2.scatter(rho[centers], z_score[centers], c='r', s=def_size, linewidth=0)
    ax2.set_xlim(0.98*rmin, 1.02*rmax)
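A compact sketch of the band plot above: a robust fit plus a MAD-scaled envelope drawn with fill_between (synthetic data; alpha is just a band-width constant here):

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

rng = np.random.default_rng(4)
rho = np.sort(rng.uniform(0, 1, 200))    # sorted so fill_between draws cleanly
delta = 0.4 * rho + rng.normal(scale=0.03, size=200)
alpha = 3.0                              # band-width factor

results = sm.RLM(delta, sm.add_constant(rho)).fit()
difference = delta - results.fittedvalues
factor = np.median(np.abs(difference - np.median(difference)))
upper = results.fittedvalues + alpha * factor * (1 + results.fittedvalues)

fig, ax = plt.subplots()
ax.scatter(rho, delta, c='k', s=5)
ax.fill_between(rho, results.fittedvalues, upper, alpha=0.5, color='r')
plt.show()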
github brentp / aclust / examples / crystal.py
def bump_cluster(model_str, methylations, covs, coef, nsims=1000,
        value_fn=coef_sum, robust=False):
    orig = _combine_cluster(model_str, methylations, covs, coef, robust=robust)
    obs_coef = value_fn(orig)

    reduced_residuals, reduced_fitted = [], []

    # get the reduced residuals and models so we can shuffle
    for i, methylation in enumerate(methylations):
        y, X = patsy.dmatrices(model_str, covs, return_type='dataframe')
        idxs = [par for par in X.columns if par.startswith(coef)]
        assert len(idxs) == 1, ('too many coefficients like', coef)
        X.pop(idxs[0])
        fitr = (RLM if robust else GLS)(y, X).fit()

        reduced_residuals.append(np.array(fitr.resid))
        reduced_fitted.append(np.array(fitr.fittedvalues))

    ngt, idxs = 0, np.arange(len(methylations[0]))

    for isim in range(nsims):
        np.random.shuffle(idxs)

        fakem = np.array([rf + rr[idxs] for rf, rr in zip(reduced_fitted,
            reduced_residuals)])
        assert fakem.shape == methylations.shape

        sim = _combine_cluster(model_str, fakem, covs, coef, robust=robust)
        ccut = value_fn(sim)
        ngt += abs(ccut) > abs(obs_coef)
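The (RLM if robust else GLS)(y, X) idiom above just swaps the model class; both result objects expose resid and fittedvalues. A standalone sketch with a toy design matrix:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(5)
X = sm.add_constant(rng.normal(size=(50, 2)))
y = X @ np.array([1.0, 2.0, -1.0]) + rng.normal(size=50)

for robust in (False, True):
    fit = (sm.RLM if robust else sm.GLS)(y, X).fit()
    residuals = np.array(fit.resid)      # same attributes on either result
    fitted = np.array(fit.fittedvalues)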
github NREL / OpenOA / operational_analysis / methods / plant_analysis.py
            # Early return: this filter combination was computed before (snippet starts mid-method)
            return valid_data

        # If valid data hasn't yet been stored in the dictionary, determine the valid data
        df = self._monthly.df

        # First set of filters: check combined losses and whether the NaN data flag was on
        df_sub = df.loc[
            ((df['availability_pct'] + df['curtailment_pct']) < comb_loss_thresh) & ~df['nan_flag']]

        # Now perform robust linear regression using the Huber algorithm to flag outliers
        X = sm.add_constant(df_sub[reanal])  # Reanalysis data with constant column
        y = df_sub['gross_energy_gwh']  # Energy data

        # Fit the robust model; outlier_thresh is the Huber norm's tuning constant t
        rlm = sm.RLM(y, X, M=sm.robust.norms.HuberT(outlier_thresh))
        rlm_results = rlm.fit()

        # Define valid data as points to which the Huber algorithm assigned a weight of 1
        valid_data = df_sub.loc[rlm_results.weights == 1, [reanal, 'energy_gwh', 'availability_gwh',
                                                           'curtailment_gwh', 'num_days_expected']]

        # Update the dictionary
        self.outlier_filtering[(reanal, outlier_thresh, comb_loss_thresh)] = valid_data

        # Return result
        return valid_data
github statsmodels / statsmodels / statsmodels / examples / tut_ols_rlm_short.py
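Note that this snippet starts mid-script: x1, X, and y2 are defined earlier in the example. A plausible stand-in setup (not the tutorial's exact data):

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

x1 = np.linspace(0, 20, 50)
X = sm.add_constant(x1)
y2 = 0.5 * x1 + 1.0 + np.random.normal(scale=0.5, size=50)
y2[45:] -= 5.0                           # plant outliers near the end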
res2 = sm.OLS(y2, X).fit()
print("OLS: parameter estimates: slope, constant")
print(res2.params)
print("standard deviation of parameter estimates")
print(res2.bse)
prstd, iv_l, iv_u = wls_prediction_std(res2)
plt.plot(x1, res2.fittedvalues, 'r-')
plt.plot(x1, iv_u, 'r--')
plt.plot(x1, iv_l, 'r--')


#compare with robust estimator

resrlm2 = sm.RLM(y2, X).fit()
print("\nRLM: parameter estimates: slope, constant")
print(resrlm2.params)
print("standard deviation of parameter estimates")
print(resrlm2.bse)
plt.plot(x1, resrlm2.fittedvalues, 'g.-')
plt.title('Data with Outliers; blue: true, red: OLS, green: RLM')


# see also help(sm.RLM.fit) for more options and
# module sm.robust.scale for scale options

plt.show()
github statsmodels / statsmodels / examples / python / robust_models_0.py
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std


# ## Estimation
# 
# Load data:

data = sm.datasets.stackloss.load()
data.exog = sm.add_constant(data.exog)


# Huber's T norm with the (default) median absolute deviation scaling

huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT())
hub_results = huber_t.fit()
print(hub_results.params)
print(hub_results.bse)
print(hub_results.summary(yname='y',
            xname=['var_%d' % i for i in range(len(hub_results.params))]))


# Huber's T norm with 'H2' covariance matrix

hub_results2 = huber_t.fit(cov="H2")
print(hub_results2.params)
print(hub_results2.bse)


# Andrew's Wave norm with Huber's Proposal 2 scaling and 'H3' covariance matrix
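The example is truncated here; a sketch of what that last fit looks like, using the AndrewWave norm with Huber's proposal 2 scale estimate and the 'H3' covariance option:

andrew_mod = sm.RLM(data.endog, data.exog, M=sm.robust.norms.AndrewWave())
andrew_results = andrew_mod.fit(scale_est=sm.robust.scale.HuberScale(), cov="H3")
print(andrew_results.params)
print(andrew_results.bse)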