Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
print '\r%i genes finished.' % num
if opts.dispDiff and np.isnan(data.dispAdjRibo[i]):
continue
if not opts.dispDiff and np.isnan(data.dispAdj[i]):
continue
response = np.hstack([data.countRibo[i, :], data.countRna[i, :]])
if opts.dispDiff:
disp = np.hstack([np.repeat(data.dispAdjRibo[i], lenSampleRibo), np.repeat(data.dispAdjRna[i], lenSampleRna)])
else:
disp = data.dispAdj[i]
try:
modNB0 = sm.GLM(response, explanatory0, family=sm.families.NegativeBinomial(alpha=disp), offset=np.log(librarySizes))
modNB1 = sm.GLM(response, explanatory1, family=sm.families.NegativeBinomial(alpha=disp), offset=np.log(librarySizes))
result0 = modNB0.fit()
result1 = modNB1.fit()
except sm.tools.sm_exceptions.PerfectSeparationError:
errorCnt += 1
else:
if not opts.dispDiff:
pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, explanatory1.shape[1] - explanatory0.shape[1])
elif opts.dispDiff:
pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, (explanatory1.shape[1] - explanatory0.shape[1]) / 2.5)
else:
pass
data.pval = pval
sys.stdout.write('Warning: Failed to do test: %i genes. P value set to \'nan\'.\n' % errorCnt)
n_samples, n_features = train.shape
print "n_samples:", n_samples, "n_features:", n_features
print "histogram of target"
print sp.histogram(target,3)
print "len(train):", len(train)
print "len(target):", len(target)
print "dataset shape:", dataset.shape
if family!='gaussian':
raise Exception("Only have gaussian logistic for scipy")
# train the classifier
gauss_log = sm_api.GLM(target, train, family=sm_api.families.Gaussian(sm_api.families.links.log))
start = time.time()
gauss_log_results = gauss_log.fit()
print "sm_api.GLM took", time.time() - start, "seconds"
print gauss_log_results.summary()
"""Build the g-transport model for the outcome. This is also referred to at the Q-model.
Parameters
----------
model : str
Variables to include in the model for predicting the outcome. Must be contained within the input
pandas dataframe when initialized. Model form should contain the exposure, i.e. 'art + age + male'
outcome_type : str, optional
Variable type for the outcome. Default is binary. Also supports 'normal' and 'poisson' distributed outcomes
print_results : bool, optional
Whether to print the logistic regression results to the terminal. Default is True
"""
if outcome_type == 'binary':
linkdist = sm.families.family.Binomial()
elif outcome_type == 'normal':
linkdist = sm.families.family.Gaussian()
elif outcome_type == 'poisson':
linkdist = sm.families.family.Poisson()
else:
raise ValueError("Only 'binary', 'normal', and 'poisson' distributed outcomes are available")
# Modeling the outcome
df = self.df[self.sample].copy()
m = smf.glm(self.outcome+' ~ '+model, df, family=linkdist)
self._outcome_model = m.fit()
# Printing results of the model and if any observations were dropped
if print_results:
print(self._outcome_model.summary())
dfa = self.df.copy()
dfn = self.df.copy()
-str object of the exposure variable name from pandas df. Ex) 'Exposure'
modifier:
-str object of the modifier variable name from pandas df. Ex) 'Modifier'
adjust:
-str object of other variables to adjust for in correct statsmodels format.
Note: variables can NOT be named {E1M0,E0M1,E1M1} since this function creates
variables with those names. Answers will be incorrect
Ex) '+ C1 + C2 + C3 + Z'
decimal:
-Number of decimals to display in result. Default is 3
'''
df.loc[((df[exposure]==1)&(df[modifier]==1)),'E1M1'] = 1
df.loc[((df[exposure]!=1)|(df[modifier]!=1)),'E1M1'] = 0
df.loc[((df[exposure].isnull())|(df[modifier].isnull())),'E1M1'] = np.nan
eq = outcome + '~'+exposure+'+'+modifier+'+E1M1' + adjust
f = sm.families.family.Binomial(sm.families.links.identity)
model = smf.glm(eq,df,family=f).fit()
print(model.summary())
ic = model.params['E1M1']
print(ic)
lcl = model.conf_int().loc['E1M1'][0]
ucl = model.conf_int().loc['E1M1'][1]
print('\n----------------------------------------------------------------------')
print('Interaction Contrast')
print('----------------------------------------------------------------------')
print('\nIC:\t\t'+str(round(ic,decimal)))
print('95% CI:\t\t('+str(round(lcl,decimal))+', '+str(round(ucl,decimal))+')')
print('----------------------------------------------------------------------')
subData['events'] = subData['events'].cat.add_categories(['Out'])
for out in outs:
subData.loc[subData['events'] == out, 'events'] = 'Out'
subData['events'] = subData['events'].cat.remove_unused_categories()
X1 = subData.loc[:, xLabels[:-1]]
X2 = subData.loc[:, xLabels]
y1 = subData[yLabel] == 'Home Run'
y2 = subData[yLabel]
skf = StratifiedKFold(n_splits=10, shuffle=True)
glm = BetterGLM(
SMParams={'family': sm.families.Binomial(sm.families.links.probit)})
mnLogit = BetterMNLogit()
y11p = cross_val_predict(glm, X1, y1, cv=skf, n_jobs=-1,
method='predict_proba')
y21p = cross_val_predict(glm, X2, y1, cv=skf, n_jobs=-1,
method='predict_proba')
y12p = cross_val_predict(mnLogit, X1, y2, cv=skf, n_jobs=-1,
method='predict_proba')
y22p = cross_val_predict(mnLogit, X2, y2, cv=skf, n_jobs=-1,
method='predict_proba')
# %% Log-loss
L11 = np.exp(-log_loss(y1, y11p))
L21 = np.exp(-log_loss(y1, y21p))
L12 = np.exp(-log_loss(y2, y12p))
sys.stderr.write("[status]\tstatsmodels version:\t" + str(statsmodels.__version__) + "\n")
# define model formula
self.df = self.df[['ribosome_count', 'gene', 'codon', 'avgProb_scaled', 'logTPM_scaled']]
formula = 'ribosome_count ~ C(gene) + C(codon) + avgProb_scaled'
sys.stderr.write("[status]\tFormula: " + str(formula) + "\n")
# define model fitting options
solver = "IRLS" # "lbfgs"
tolerance = 1e-4
numIter = 100
sys.stderr.write("[status]\tSolver: " + solver + "\n")
sys.stderr.write("[status]\tConvergence tolerance: " + str(tolerance) + "\n")
sys.stderr.write("[status]\tMaxiter: " + str(numIter) + "\n")
# model fitting NegativeBinomial GLM
sys.stderr.write("[status]\tModel: smf.glm(formula, self.df, family=sm.families.NegativeBinomial(), "
"offset=self.df['logTPM_scaled']" + "\n")
model = smf.glm(formula, self.df, family=sm.families.NegativeBinomial(), offset=self.df['logTPM_scaled'])
res = model.fit(method=solver, tol=tolerance, maxiter=numIter)
# print model output
print(res.summary(), flush=True)
g.fit(treatment='none')
r_none = g.marginal_outcome
rd_results.append(r_all - r_none)
rr_results.append(r_all / r_none)
print('RD 95% CI:',np.percentile(rd_results,q=[2.5,97.5]))
print('RR 95% CI:',np.percentile(rr_results,q=[2.5,97.5]))
#IPTW
model = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
df['iptw'] = ze.ipw.iptw(df,treatment='art',model_denominator=model,stabilized=True)
ind = sm.cov_struct.Independence()
f = sm.families.family.Binomial(sm.families.links.identity)
linrisk = smf.gee('dead ~ art',df['id'],df,cov_struct=ind,family=f,weights=df['iptw']).fit()
linrisk.summary()
f = sm.families.family.Binomial(sm.families.links.log)
log = smf.gee('dead ~ art',df['id'],df,cov_struct=ind,family=f,weights=df['iptw']).fit()
log.summary()
#Double-Robust
sdr = SimpleDoubleRobust(df,exposure='art',outcome='dead')
sdr.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
sdr.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
sdr.fit()
sdr.summary()
rd = []
rr = []
for i in range(500):
dfs = df.sample(n=df.shape[0],replace=True)
s = SimpleDoubleRobust(dfs,exposure='art',outcome='dead')
s.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False)
s.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False)
s.fit()
vec_pos = {leaf.name: i for i, leaf in enumerate(tree)}
count_fn = '.'.join(asg_fn.split('.')[:-1]+['npy'])
if os.path.isfile(count_fn):
print("Loading counts from existing npy file", file=sys.stderr)
map_counts = np.load(count_fn)
else:
map_counts = analyse_assignments(asg_fn, nodes2leaves, vec_pos)
np.save(count_fn, map_counts)
assert len(leaves) == len(map_counts), "Length of mappings different from #leaves...try to remove and analyse assignments again using the right tree!"
sim_mat = np.load(sim_mat_fn)
assert len(leaves) == len(sim_mat), "Size of similarity matrix different from #leaves...have you used the right index/tree?"
glm = sm.GLM(map_counts, sim_mat, family=sm.families.Poisson(link=sm.families.links.identity))
glm_results = glm.fit_regularized(alpha=alpha, L1_wt=l1_ratio, refit=True)
print(glm_results.summary())
with open(out_fn, 'w') as out_f:
for leaf, ab in zip(leaves, glm_results.params):
print(leaf, ab, sep='\t', file=out_f)
to calculate
Returns a pandas Series of calculated probabilities for all observations
within the dataframe
df:
-Dataframe for the model
model:
-Model to fit the logistic regression to. Example) 'y ~ var1 + var2'
mresult:
-Whether to print the logistic regression results. Default is True
Example)
>>>zepid.ipw.propensity_score(df=data,model='X ~ Z + var1 + var2')
'''
f = sm.families.family.Binomial(sm.families.links.logit)
log = smf.glm(model,df,family=f).fit()
if mresult == True:
print('\n----------------------------------------------------------------')
print('MODEL: '+model)
print('-----------------------------------------------------------------')
print(log.summary())
p = log.predict(df)
return p
self = Dummy()
# generate artificial data
np.random.seed(98765678)
nobs = 200
rvs = np.random.randn(nobs,6)
data_exog = rvs
data_exog = sm.add_constant(data_exog)
xbeta = 1 + 0.1*rvs.sum(1)
data_endog = np.random.poisson(np.exp(xbeta))
#estimate discretemod.Poisson as benchmark
from statsmodels.discrete.discrete_model import Poisson
res_discrete = Poisson(data_endog, data_exog).fit()
mod_glm = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
res_glm = mod_glm.fit()
#estimate generic MLE
self.mod = PoissonGMLE(data_endog, data_exog)
res = self.mod.fit()
offset = res.params[0] * data_exog[:,0] #1d ???
mod1 = PoissonOffsetGMLE(data_endog, data_exog[:,1:], offset=offset)
start_params = np.ones(6)/2.
start_params = res.params[1:]
res1 = mod1.fit(start_params=start_params, method='nm', maxiter=1000, maxfun=1000)
print 'mod2'
mod2 = PoissonZiGMLE(data_endog, data_exog[:,1:], offset=offset)
start_params = np.r_[np.ones(6)/2.,10]
start_params = np.r_[res.params[1:], 20.] #-100]