How to use the patsy.dmatrices function in patsy

To help you get started, we’ve selected a few patsy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github hakyimlab / MetaXcan / software / metax / predixcan / Utilities.py View on Github external
def _get_residual(pheno, covariates):
    e = pandas.DataFrame(covariates)
    e["order"] = range(0, e.shape[0])
    e["pheno"] = pheno

    e_ = e.dropna()
    e_ = e_[e_.columns[~(e_ == 0).all()]]

    keys = list(e_.columns.values)
    keys.remove("pheno")
    keys.remove("order")

    y, X = dmatrices("pheno ~ {}".format(" + ".join(keys)), data=e_, return_type="dataframe")
    model = sm.OLS(y,X)
    result = model.fit()
    e_["residual"] = result.resid
    e = e.merge(e_[["order", "residual"]], on="order", how="left")
    return e["residual"]
github gtri / scrimmage / scripts / analysis.py View on Github external
def main():

    df = pd.DataFrame(data=np.hstack((np.load('xx.npy'), np.load('yy.npy'))),
                      columns=['greedy_b', 'greedy_r', 'score_b', 'score_r'])

    df = df[df['score_b'] >= 0]
    df['score_diff'] = -df['score_b'] + df['score_r']

    y, X = patsy.dmatrices('score_diff ~ '
                           'greedy_b + '
                           'greedy_r + '
                           'square(greedy_b) + '
                           'square(greedy_r) + '
                           'greedy_b:greedy_r',
                           data=df, return_type='dataframe')
    result = sm.OLS(y, X).fit()
    print result.summary()

    x = df['greedy_b']
    y = df['greedy_r']
    z = df['score_diff']

    '''
    xi = np.linspace(0, 1, 1000)
    yi = np.linspace(0, 1, 1000)
github dssg / bikeshare / web / poisson_web.py View on Github external
delta_weekday_dummy = delta_dayofweek.copy()
    delta_weekday_dummy[delta_dayofweek < 5] = 1
    delta_weekday_dummy[delta_dayofweek >= 5] = 0

    arrivals_departures['weekday_dummy'] = delta_weekday_dummy

    # print arrivals_departures
    # print arrivals_departures.head(20)
    
    # Create design matrix for months, hours, and weekday vs. weekend.
    # We can't just create a "month" column to toss into our model, because it doesnt
    # understand what "June" is. Instead, we need to create a column for each month
    # and code each row according to what month it's in. Ditto for hours and weekday (=1).
    
    y_arr, X_arr = patsy.dmatrices("arrivals ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)", arrivals_departures, return_type='dataframe')
    y_dep, X_dep = patsy.dmatrices("departures ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)", arrivals_departures, return_type='dataframe')

    y_dep[pd.isnull(y_dep)] = 0
    
    # Fit poisson distributions for arrivals and departures, print results
    arr_poisson_model = sm.Poisson(y_arr, X_arr)
    arr_poisson_results = arr_poisson_model.fit()
    
    dep_poisson_model = sm.Poisson(y_dep, X_dep)
    dep_poisson_results = dep_poisson_model.fit()
    
    # print arr_poisson_results.summary(), dep_poisson_results.summary()
    
    poisson_results = [arr_poisson_results, dep_poisson_results]
    
    return poisson_results
github statsmodels / statsmodels / statsmodels / sandbox / examples / example_nbin.py View on Github external
def test_nb2():
    y, X = patsy.dmatrices('los ~ C(type) + hmo + white', medpar)
    y = np.array(y)[:,0]
    nb2 = NBin(y,X,'nb2').fit(maxiter=10000, maxfun=5000)
    assert_almost_equal(nb2.params,
                        [2.31027893349935, 0.221248978197356, 0.706158824346228,
                         -0.067955221930748, -0.129065442248951, 0.4457567],
                        decimal=2)
github b45ch1 / algopy / documentation / sphinx / examples / neg_binom_regression.py View on Github external
def main():

    # read the data from the internet into numpy arrays
    medpar = pandas.read_csv(g_url)
    y_patsy, X_patsy = patsy.dmatrices('los~type2+type3+hmo+white', medpar)
    y = numpy.array(y_patsy).flatten()
    X = numpy.array(X_patsy)

    # define the objective function and the autodiff gradient and hessian
    f = functools.partial(get_neg_ll, y, X)
    g = functools.partial(eval_grad, f)
    h = functools.partial(eval_hess, f)

    # init the search for max likelihood parameters
    theta0 = numpy.array([
        numpy.log(numpy.mean(y)),
        0, 0, 0, 0,
        0.5,
        ], dtype=float)

    # do the max likelihood search
github statsmodels / statsmodels / statsmodels / imputation / mice.py View on Github external
exog_obs : DataFrame
            Current values of the predictors where the variable to be
            imputed is observed.
        exog_miss : DataFrame
            Current values of the predictors where the variable to be
            Imputed is missing.
        init_kwds : dict-like
            The init keyword arguments for `vname`, processed through Patsy
            as required.
        fit_kwds : dict-like
            The fit keyword arguments for `vname`, processed through Patsy
            as required.
        """

        formula = self.conditional_formula[vname]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")

        # Rows with observed endog
        ixo = self.ix_obs[vname]
        endog_obs = np.asarray(endog.iloc[ixo])
        exog_obs = np.asarray(exog.iloc[ixo, :])

        # Rows with missing endog
        ixm = self.ix_miss[vname]
        exog_miss = np.asarray(exog.iloc[ixm, :])

        predict_obs_kwds = {}
        if vname in self.predict_kwds:
            kwds = self.predict_kwds[vname]
            predict_obs_kwds = self._process_kwds(kwds, ixo)
github RJT1990 / pyflux / pyflux / garch / egarchmreg.py View on Github external
How many steps ahead would you like to forecast?

        oos_data : pd.DataFrame
            Data to use for the predictors in the forecast

        intervals : boolean (default: False)
            Whether to return prediction intervals

        Returns
        ----------
        - pd.DataFrame with predicted values
        """     
        if self.latent_variables.estimated is False:
            raise Exception("No latent variables estimated!")
        else:
            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            X_pred = X_oos[:h]
            lmda, Y, scores = self._model(self.latent_variables.get_z_values())         
            date_index = self.shift_dates(h)

            if self.latent_variables.estimation_method in ['M-H']:
                sim_vector = self._sim_prediction_bayes(h, 15000, X_pred)
                error_bars = []

                for pre in range(5,100,5):
                    error_bars.append(np.insert([np.percentile(i,pre) for i in sim_vector], 0, lmda[-1]))

                forecasted_values = np.array([np.mean(i) for i in sim_vector])
                prediction_01 = np.array([np.percentile(i, 1) for i in sim_vector])
                prediction_05 = np.array([np.percentile(i, 5) for i in sim_vector])
                prediction_95 = np.array([np.percentile(i, 95) for i in sim_vector])
github openeemeter / eemeter / eemeter / modeling / models / caltrack.py View on Github external
estimated = hdd_res.fittedvalues
            r2, rmse = hdd_rsquared, np.sqrt(hdd_res.ssr/hdd_res.nobs)
            model_obj, model_res, formula = hdd_mod, hdd_res, hdd_formula
            fit_bp_hdd = hdd_bp

        elif use_cdd_only:
            y, X = patsy.dmatrices(
                cdd_formula, df, return_type='dataframe')
            estimated = cdd_res.fittedvalues
            r2, rmse = cdd_rsquared, np.sqrt(cdd_res.ssr/cdd_res.nobs)
            model_obj, model_res, formula = cdd_mod, cdd_res, cdd_formula
            fit_bp_cdd = cdd_bp

        else:
            # Use Intercept-only
            y, X = patsy.dmatrices(
                int_formula, df, return_type='dataframe')
            estimated = int_res.fittedvalues
            r2, rmse = int_rsquared, np.sqrt(int_res.ssr/int_res.nobs)
            model_obj, model_res, formula = int_mod, int_res, int_formula

        if y.mean != 0:
            cvrmse = rmse / float(y.values.ravel().mean())
        else:
            cvrmse = np.nan

        n = estimated.shape[0]

        self.df = df
        self.y, self.X = y, X
        self.estimated = estimated
        self.r2, self.rmse = r2, rmse
github pymc-devs / pymc3 / pymc3 / glm / linear.py View on Github external
name='', model=None, offset=0., eval_env=0):
        """Creates linear component from `patsy` formula.

        Parameters
        ----------
        formula : str - a patsy formula
        data : a dict-like object that can be used to look up variables referenced
            in `formula`
        eval_env : either a `patsy.EvalEnvironment` or else a depth represented as
            an integer which will be passed to `patsy.EvalEnvironment.capture()`.
            See `patsy.dmatrix` and `patsy.EvalEnvironment` for details.
        Other arguments are documented in the constructor.
        """
        import patsy
        eval_env = patsy.EvalEnvironment.capture(eval_env, reference=1)
        y, x = patsy.dmatrices(formula, data, eval_env=eval_env)
        labels = x.design_info.column_names
        return cls(np.asarray(x), np.asarray(y)[:, -1], intercept=False,
                   labels=labels, priors=priors, vars=vars, name=name,
                   model=model, offset=offset)