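# NOTE these excerpts come from several different modules of the same project; the
# imports below did not appear in the excerpts and are assumptions reconstructed
# from the names used in them (other helpers referenced below, e.g. plot_this_time,
# mtitlestr, add_fn, and plot_single_variable, are defined elsewhere in the project)
import glob
import os
from collections import OrderedDict
import numpy

import plotting  # project-local modules, assumed from the names used below
import utils
import glutils
from hist import Hist
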
def plot_bcr_phylo_target_attraction(plotdir, event):  # plots of which sequences are going toward which targets
    fig, ax = plotting.mpl_init()
    # affinity vs stuff:
    # xvals = [1. / af for line in mutated_events for af in line['affinities']]
    # yvals = [nm for line in mutated_events for nm in line['n_mutations']]
    # # min distance to target:
    # yvals = [hd for line in mutated_events for hd in get_min_target_hdists(line['input_seqs'], line['target_seqs'])]
    # ax.scatter(xvals, yvals, alpha=0.65)
    hist = Hist(len(event['target_seqs']), -0.5, len(event['target_seqs']) - 0.5, value_list=event['nearest_target_indices'])
    hist.mpl_plot(ax, alpha=0.7, ignore_overflows=True)
    plotname = 'nearest-target-identities'
    plotting.mpl_finish(ax, plotdir, plotname, xlabel='index (identity) of nearest target sequence', ylabel='counts')  #, xbounds=(minfrac*xmin, maxfrac*xmax), ybounds=(-0.05, 1.05), log='x', xticks=xticks, xticklabels=[('%d' % x) for x in xticks], leg_loc=(0.8, 0.55 + 0.05*(4 - len(plotvals))), leg_title=leg_title, title=title)
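
# a minimal sketch (not from the original) of the integer binning convention used
# above: Hist(n, -0.5, n - 0.5) centers one bin on each integer 0..n-1, so each
# target sequence index gets exactly one bin (value_list fills on construction,
# as in the call above)
demo_n_targets = 3
demo_hist = Hist(demo_n_targets, -0.5, demo_n_targets - 0.5, value_list=[0, 2, 2, 1, 2])  # bin centers 0, 1, 2 end up with contents 1, 1, 3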

def get_joint_key(k1, k2):  # NOTE the first half of this function was missing from the excerpt; the jk1 branch is reconstructed on the assumption that the joint log prob can be cached with the two keys in either order
    jk1 = k1 + ':' + k2
    jk2 = k2 + ':' + k1
    if jk1 in cachevals:
        return jk1
    elif jk2 in cachevals:
        return jk2
    else:
        return None

hstyles = ['plain']  #, 'zoom']  # 'zoom-logy',
if metric == 'logprob':
    nbins, xmin, xmax = 40, -55, 70
elif metric == 'naive_hfrac':
    nbins, xmin, xmax = 70, 0., 0.65
else:
    assert False
hists = OrderedDict()
htypes = ['nearest-clones', 'farthest-clones', 'all-clones', 'not']
for ht in htypes:
    hists[ht] = Hist(nbins, xmin, xmax)
bigvals, smallvals = {}, {}
for key_a, key_b in combinations:  # each key is a colon-separated string of sequence ids (not a list of ids)
    a_ids, b_ids = key_a.split(':'), key_b.split(':')
    # if not utils.from_same_event(reco_info, a_ids) or not utils.from_same_event(reco_info, b_ids):  # skip clusters that were erroneously merged -- i.e., in effect, assume the previous step didn't screw up at all
    #     raise Exception('woop')
    jk = get_joint_key(key_a, key_b)
    if jk is None:  # if we don't have the joint logprob cached
        continue
    if metric == 'logprob':
        # jk = get_joint_key(key_a, key_b)
        # if jk is None:  # if we don't have the joint logprob cached
        #     continue
        lratio = cachevals[jk] - cachevals[key_a] - cachevals[key_b]
        # print '%f - %f - %f = %f' % (cachevals[jk], cachevals[key_a], cachevals[key_b], lratio),
        mval = lratio
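
# a minimal sketch (not from the original) of what the log likelihood ratio above
# measures: with cached log probs for clusters 'a' and 'b' and for the merged
# cluster 'a:b', lratio = log[P(merged) / (P(a) * P(b))], so a positive lratio
# means the merged interpretation is more likely than keeping the clusters apart
demo_cachevals = {'a': -50.0, 'b': -60.0, 'a:b': -105.0}
demo_lratio = demo_cachevals['a:b'] - demo_cachevals['a'] - demo_cachevals['b']  # = 5.0, i.e. merging favored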

def make_hist(plotvals, n_total, n_skipped, iclust=None, affinities=None):
    if len(plotvals) == 0:
        return
    hist = Hist(30, min(plotvals), max(plotvals), value_list=plotvals)
    fig, ax = plotting.mpl_init()
    hist.mpl_plot(ax)  #, square_bins=True, errors=False)
    # fig.text(0.7, 0.8, 'mean %.3f' % numpy.mean(plotvals), fontsize=15)
    # fig.text(0.7, 0.75, 'max %.3f' % max(plotvals), fontsize=15)
    # if affinities is not None:
    #     fig.text(0.38, 0.88, 'mean/max affinity: %.4f/%.4f' % (numpy.mean(affinities), max(affinities)), fontsize=15)
    plotname = '%s-%s' % (lb_metric, str(iclust) if iclust is not None else 'all-clusters')
    leafskipstr = ', skipped %d leaves' % n_skipped if n_skipped > 0 else ''  # they're not necessarily all leaves, but almost all of them are (and it's not clear how a non-leaf could get a value of zero, but some of them seem to)
    title = '%s %s: %d entries%s (%s)' % ('true' if is_true_line else 'inferred', mtitlestr('per-seq', lb_metric, short=True), n_total, leafskipstr, 'all clusters' if iclust is None else 'iclust %d' % iclust)
    fn = plotting.mpl_finish(ax, plotdir, plotname, xlabel=mtitlestr('per-seq', lb_metric), log='y', ylabel='counts', title=title)
    if iclust is None or (iclust_fnames is not None and iclust < iclust_fnames):
        add_fn(fnames, fn=fn)

def get_hists_from_dir(dirname, histname, string_to_ignore=None):
    hists = {}
    for fname in glob.glob(dirname + '/*.csv'):
        varname = os.path.basename(fname).replace('.csv', '')
        if string_to_ignore is not None:
            varname = varname.replace(string_to_ignore, '')
        hists[varname] = Hist(fname=fname, title=histname)
    if len(hists) == 0:
        print '    no csvs found in %s' % dirname
    return hists
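
# hypothetical usage sketch (the path and suffix are made up): read every csv in
# a plot dir into a dict of hists keyed by variable name, stripping a common suffix
# dir_hists = get_hists_from_dir('/path/to/plots/hmm', 'run-a', string_to_ignore='-mean-bins')
# print 'read %d hists: %s' % (len(dir_hists), ' '.join(dir_hists))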

def get_hists_from_numpyhists(numpyhists):  # NOTE the def line was missing from the excerpt; the name and signature are assumed from the variables used below
    xmin, xmax = None, None
    hists, labels = [], []
    for ihist in range(len(numpyhists)):
        nphist = numpyhists[ihist]  # numpy.histogram() output is two arrays: [0] is the bin counts, [1] is the bin edges (the low edge of each bin plus the final high edge, so one longer than the counts)
        obs_time = ihist  # + 1  # I *think* it's right without the 1 (although I guess it's really a little arbitrary)
        if not plot_this_time(obs_time, numpyhists):
            continue
        if nphist is None:  # time points at which we didn't sample
            hists.append(None)
            labels.append(None)
            continue
        bin_contents, bin_edges = nphist
        assert len(bin_contents) == len(bin_edges) - 1
        # print '  '.join('%5.1f' % c for c in bin_contents)
        # print '  '.join('%5.1f' % c for c in bin_edges)
        hist = Hist(len(bin_edges) - 1, bin_edges[0], bin_edges[-1])
        for ibin in range(len(bin_edges) - 1):  # nphist indexing, not Hist indexing
            lo_edge = bin_edges[ibin]
            hi_edge = bin_edges[ibin + 1]
            bin_center = (hi_edge + lo_edge) / 2.
            for _ in range(int(bin_contents[ibin])):  # int() in case the counts come through as floats (range() requires an integer)
                hist.fill(bin_center)
            xmin = lo_edge if xmin is None else min(xmin, lo_edge)
            xmax = hi_edge if xmax is None else max(xmax, hi_edge)
        hists.append(hist)
        labels.append('%d (%.1f)' % (obs_time, hist.get_mean()))
    # hists = [Hist(1, xmin, xmax) if h is None else h for h in hists]  # replace the None values with empty hists
    hists, labels = zip(*[(h, l) for h, l in zip(hists, labels) if h is not None])
    return hists, labels, xmin, xmax
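
# a minimal, self-contained check (not from the original) of the numpy.histogram
# convention the conversion above relies on: the counts array is one shorter than
# the edges array, and the edges run from the low edge of the first bin to the
# high edge of the last
demo_counts, demo_edges = numpy.histogram([0.1, 0.2, 0.8], bins=4, range=(0., 1.))
assert len(demo_counts) == len(demo_edges) - 1  # 4 counts, 5 edges
assert demo_edges[0] == 0. and demo_edges[-1] == 1.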

# NOTE the following attribute initializations are excerpted from the middle of allelefinder's __init__() (the enclosing class and def lines weren't in the excerpt)
self.min_min_candidate_ratio_to_plot = 1.5  # don't plot positions whose candidate ratios are below this for every istart (i.e. at every mutation level)
self.n_warn_alleles_per_gene = 2
self.default_slope_bounds = (-0.1, 1.)  # fitting function needs some reasonable bounds from which to start (I could at some point make slope part of the criteria for candidacy, but it wouldn't add much sensitivity)
self.unbounded_y_icpt_bounds = (-1., 1.5)
self.seq_info = {}
self.counts, self.fitfos = {}, {}
self.inferred_allele_info = []  # new alleles with respect to the template genes from which we actually inferred them, for usage internal to allelefinder
self.new_allele_info = []  # new alleles with respect to the original template genes, for external use (the distinction is important if we infer a new allele from another previously-inferred new allele)
self.positions_to_plot = {}
self.n_seqs_too_highly_mutated = {}  # sequences (per-gene) that had more than self.args.n_max_mutations_per_segment mutations
self.gene_obs_counts = {}  # NOTE same as n_clonal_representatives
self.overall_mute_counts = Hist(self.args.n_max_mutations_per_segment - 1, 0.5, self.args.n_max_mutations_per_segment - 0.5)  # i.e. 0th (underflow) bin corresponds to zero mutations
self.per_gene_mute_counts = {}  # crappy name -- this is the denominator for each position in self.counts, which is usually the same for most positions, but it's cleaner to keep it separate than to choose one of 'em
self.n_big_del_skipped = {s : {} for s in self.n_bases_to_exclude}
self.n_fits = 0
self.default_initial_glfo = None
if self.args.default_initial_germline_dir is not None:  # if this is set, we want to take any new allele names from this directory's glfo if they're in there
    self.default_initial_glfo = glutils.read_glfo(self.args.default_initial_germline_dir, glfo['locus'])
self.n_excluded_clonal_queries = {}
self.n_clones = {}
self.n_clonal_representatives = {}  # NOTE same as gene_obs_counts
self.finalized = False
self.reflengths = {}
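
# a minimal sketch (not from the original) of the mutation-count binning
# convention noted above, assuming the Hist class keeps ROOT-style under/overflow
# bins: Hist(n - 1, 0.5, n - 0.5) puts 1 .. n-1 mutations in the regular bins,
# so a fill at 0 lands in the underflow bin, i.e. the underflow bin counts
# unmutated sequences
demo_n_max = 10
demo_mute_counts = Hist(demo_n_max - 1, 0.5, demo_n_max - 0.5)
demo_mute_counts.fill(0)  # zero mutations --> underflow bin
demo_mute_counts.fill(3)  # three mutations --> regular bin centered on 3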

def init_gene(self, gene):
    self.counts[gene] = {}
    for igl in range(len(self.glfo['seqs'][utils.get_region(gene)][gene])):
        self.counts[gene][igl] = {}
        for istart in range(self.args.n_max_mutations_per_segment + 1):  # istart and n_mutes are equivalent
            self.counts[gene][igl][istart] = {n : 0 for n in ['muted', 'total'] + utils.nukes}
    self.gene_obs_counts[gene] = 0
    self.per_gene_mute_counts[gene] = Hist(self.args.n_max_mutations_per_segment - 1, 0.5, self.args.n_max_mutations_per_segment - 0.5)  # i.e. 0th (underflow) bin corresponds to zero mutations
    for side in self.n_big_del_skipped:
        self.n_big_del_skipped[side][gene] = 0
    self.n_seqs_too_highly_mutated[gene] = 0
    self.n_excluded_clonal_queries[gene] = 0
    self.n_clones[gene] = 0
    self.n_clonal_representatives[gene] = 0
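
# sketch (not from the original) of the nested structure init_gene() builds,
# assuming utils.nukes is the list of the four nucleotides:
#   self.counts[gene][igl][istart] --> {'muted': 0, 'total': 0, 'A': 0, 'C': 0, 'G': 0, 'T': 0}
# i.e. for each gene, each germline position <igl>, and each per-sequence
# mutation count <istart>, we track mutated/total observations plus which
# nucleotide was observed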

def get_mute_hist(self, mtype):
    if self.args.mutate_from_scratch:
        mean_mute_val = self.args.scratch_mute_freq
        if self.args.same_mute_freq_for_all_seqs:
            hist = Hist(1, mean_mute_val - utils.eps, mean_mute_val + utils.eps)
            hist.fill(mean_mute_val)
        else:
            n_entries = 500
            length_vals = list(numpy.random.exponential(mean_mute_val, n_entries))
            max_val = 0.8  # this is arbitrary, but you shouldn't be calling this with anything that gets a significant number of draws anywhere near there, anyway
            n_over = sum(1 for v in length_vals if v > max_val)  # how many draws will get clipped below
            if n_over > 0:
                print '%s clipping %d really high mutation rates in treegenerator::get_mute_hist()' % (utils.color('yellow', 'warning'), n_over)
            length_vals = [min(v, max_val) for v in length_vals]
            hist = Hist(30, 0., max_val)
            for val in length_vals:
                hist.fill(val)
            hist.normalize()
    else:
        hist = Hist(fname=self.parameter_dir + '/' + mtype + '-mean-mute-freqs.csv')
    return hist
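
# a minimal sketch (not from the original) of the two scratch-mutation modes
# above: a single bin spanning (mean - eps, mean + eps) acts as a delta function
# (every draw returns the same mute freq), while the exponential mode spreads
# draws around the mean and clips them at max_val
demo_vals = [min(v, 0.8) for v in numpy.random.exponential(0.1, 500)]
assert max(demo_vals) <= 0.8 and min(demo_vals) >= 0.  # clipped to [0, max_val]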

def compare_directories(args, plotdirlist, outdir):
    utils.prep_dir(outdir, wildlings=['*.png', '*.svg', '*.csv'])

    # read the hists from each input plot dir
    allhists = OrderedDict()
    allvars = set()  # all variables that appeared in any dir
    for idir in range(len(plotdirlist)):
        dirhists = get_hists_from_dir(plotdirlist[idir], args.names[idir])
        allvars |= set(dirhists.keys())
        allhists[args.names[idir]] = dirhists

    # then loop over all the variables we found in any of 'em
    for varname in allvars:
        hlist = [allhists[dname].get(varname, Hist(1, 0, 1, title='null')) for dname in allhists]
        plot_single_variable(args, varname, hlist, outdir, pathnameclues=plotdirlist[0])

    plotting.make_html(outdir, n_columns=4)
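
# hypothetical invocation sketch (paths and names are made up): overlay the same
# variables from two runs' plot dirs into one comparison dir
# args.names = ['run-a', 'run-b']
# compare_directories(args, ['/plots/run-a/hmm', '/plots/run-b/hmm'], '/plots/run-a-vs-run-b')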