def clr(args):
    """
    %prog blastfile fastafiles

    Calculate the vector clear range file based on BLAST hits to the vectors.
    """
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    # Collect the sequence sizes for every query FASTA file.
    sizes = {}
    for fa in fastafiles:
        f = Fasta(fa)
        sizes.update(f.itersizes())

    b = Blast(blastfile)
    for query, hits in b.iter_hits():
        qsize = sizes[query]
        vectors = list((x.qstart, x.qstop) for x in hits)
        vmin, vmax = range_minmax(vectors)

        # Keep whichever side of the vector hit span is larger.
        left_size = vmin - 1
        right_size = qsize - vmax

        if left_size > right_size:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize
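# Illustrative example (not part of jcvi): the clear-range choice above keeps
# whichever side of the vector hit span is larger. A minimal, dependency-free
# sketch of that decision; the helper name `clear_range` and the toy
# coordinates below are made up for illustration.
def clear_range(qsize, vector_spans):
    """Return (start, end) of the larger vector-free side of the query."""
    vmin = min(start for start, _ in vector_spans)
    vmax = max(end for _, end in vector_spans)
    left_size = vmin - 1       # bases before the first vector hit
    right_size = qsize - vmax  # bases after the last vector hit
    if left_size > right_size:
        return 0, vmin
    return vmax, qsize


# A 1000 bp read with a vector hit at 950..990 keeps the left side.
assert clear_range(1000, [(950, 990)]) == (0, 950)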
def random(args):
    """
    Take a given number of records randomly from a FASTA file.
    """
    from random import sample

    p = OptionParser(random.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, N = args
    N = int(N)
    assert N > 0

    f = Fasta(fastafile)
    fw = must_open("stdout", "w")
    for key in sample(list(f.keys()), N):
        rec = f[key]
        SeqIO.write([rec], fw, "fasta")

    fw.close()
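# Illustrative example (not part of jcvi): the same random sampling of records
# can be done with Biopython alone. A minimal sketch; the file name
# "reads.fasta" and the count 100 are placeholders.
def random_records(fastafile, N):
    """Write N randomly chosen FASTA records to stdout."""
    import sys
    from random import sample
    from Bio import SeqIO

    records = list(SeqIO.parse(fastafile, "fasta"))
    assert 0 < N <= len(records)
    SeqIO.write(sample(records, N), sys.stdout, "fasta")


# Usage: random_records("reads.fasta", 100)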
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mcscanfile, cdsfile = args

    if opts.addtandem:
        # Merge tandem duplicates into the MCscan file before extracting sequences.
        tandemfile = opts.addtandem
        mcscanfile_with_tandems = add_tandems(mcscanfile, tandemfile)
        mcscanfile = mcscanfile_with_tandems

    seqdir = opts.outdir
    mkdir(seqdir)
    f = Fasta(cdsfile)
    fp = must_open(mcscanfile)
    if opts.writecolors:
        fc = must_open("leafcolors.txt", "w")

    n = 0
    for i, row in enumerate(fp):
        row = row.strip().split("\t")
        if i == 0:
            # Pick one distinct color per column; with more than 20 columns,
            # fall back to generated colors since the preset palette is limited.
            ncols = len(row)
            if ncols <= 20:
                colors = discrete_rainbow(ncols, shuffle=False)[1]
            else:
                colors = discrete_rainbow(ncols, usepreset=False, shuffle=False)[1]
                warnings.warn("*** WARNING ***\n"
                              "Too many columns. Colors may not all be distinctive.")
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placements in BED format into a TIGR .contig file.
    """
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."
    fw_ids = open(idsfile, "w")
    fw = open(contigfile, "w")

    for ctg, reads in bed.sub_beds():
        ctgseq = contigfasta[ctg]
        ctgline = "##{0} {1} {2} bases, {3}".format(
            ctg, len(reads), len(ctgseq), checksum)

        print(ctg, file=fw_ids)
        print(ctgline, file=fw)
        print(fill(ctgseq.seq), file=fw)

        for b in reads:
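# Illustrative example (not part of jcvi): what one "##" contig header line
# written above looks like. The contig name and counts are made up.
example_checksum = "00000000 checksum."
example_line = "##{0} {1} {2} bases, {3}".format("ctg1", 12, 4523, example_checksum)
# example_line == "##ctg1 12 4523 bases, 00000000 checksum."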
def count(args):
    """
    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    csv = open(opts.csv, "w") if opts.csv else None

    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue

        # consensus_for_cluster_0 with 63 sequences
        if "with" in desc:
            name, w, size, seqs = desc.split()
            if csv:
                print("\t".join(str(x) for x in (name, size, len(rec))), file=csv)
            assert w == "with"
            sizes.append(int(size))
        # MRD85:00603:02472;size=167;
        else:
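# Illustrative example (not part of jcvi): the loop above recognizes two header
# styles, "consensus_for_cluster_0 with 63 sequences" and
# "MRD85:00603:02472;size=167;". The original else-branch is truncated here, so
# the "size=" parsing below is an assumption about what it does; the helper
# name cluster_size is made up.
def cluster_size(desc):
    """Return the number of reads a FASTA description line represents."""
    if desc.startswith("singleton"):
        return 1
    if "with" in desc:
        name, w, size, seqs = desc.split()
        assert w == "with"
        return int(size)
    # Assumed handling of the ";size=NNN;" style headers.
    for field in desc.split(";"):
        if field.startswith("size="):
            return int(field[len("size="):])
    return 1


assert cluster_size("consensus_for_cluster_0 with 63 sequences") == 63
assert cluster_size("MRD85:00603:02472;size=167;") == 167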
def make_qual(fastafile, score=OKQUAL):
    """
    Generate a fake .qual file that assigns a uniform score to every base.
    """
    logging.warning("assume qual ({0})".format(score))
    qualfile = fastafile.rsplit(".", 1)[0] + ".qual"
    fw = open(qualfile, "w")
    fasta = Fasta(fastafile, lazy=True)
    score = str(score) + " "
    for entry, size in fasta.itersizes_ordered():
        print(">" + entry, file=fw)
        print(score * size, file=fw)
    fw.close()

    return qualfile
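# Usage sketch (hypothetical): assuming OKQUAL is a module-level default score
# (e.g. 15) and "reads.fasta" exists, every base gets the same quality value
# and the matching .qual file name is returned.
# qualfile = make_qual("reads.fasta", score=15)   # writes "reads.qual"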
    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    # Name the output after the library and the simulated depth.
    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print(headerTemplate.format(libID=libID), file=fw)
"""
Taken from runCA:
|*********|
|###################|
|--------------------------------------------------|
---------------1---------------
---------------2---------------
---------------3---------------
*** - center_increments
### - center_range_width
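# Illustrative example (not part of jcvi): the diagram above describes reads of
# a fixed length tiled across the contig at a fixed shift. A minimal sketch of
# that tiling; tile_reads and the numbers below are made up for illustration.
def tile_reads(seqlen, readlen, shift):
    """Yield (start, end) windows of length readlen, stepped by shift."""
    start = 0
    while start + readlen < seqlen:
        yield start, start + readlen
        start += shift
    # Anchor the last read to the end so the tail is covered.
    yield max(seqlen - readlen, 0), seqlen


assert list(tile_reads(100, 40, 15)) == [
    (0, 40), (15, 55), (30, 70), (45, 85), (60, 100)]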
def filter(args):
    """
    Filter consensus sequences, keeping only clusters with at least --minsize reads.
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue

            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue

            # Drop the leading cluster name from the description.
            rec.description = rec.description.split(None, 1)[-1]
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True,
                evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            # require the hit to cover at least P% of the shorter gene
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue
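# Illustrative example (not part of jcvi): Grouper performs single-linkage
# clustering, so every qualifying BLAST pair joins its two genes and connected
# genes end up in one tandem group. A minimal union-find stand-in (NOT jcvi's
# implementation); the gene names are made up.
class SimpleGrouper:
    """Union-find: join(a, b) merges the groups containing a and b."""

    def __init__(self):
        self.parent = {}

    def find(self, x):
        self.parent.setdefault(x, x)
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]  # path halving
            x = self.parent[x]
        return x

    def join(self, a, b):
        self.parent[self.find(a)] = self.find(b)


g = SimpleGrouper()
g.join("geneA", "geneB")
g.join("geneB", "geneC")  # A, B and C collapse into one tandem group
assert g.find("geneA") == g.find("geneC")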
    fw = open(finalfasta, "w")
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    f = Fasta(minimusfasta)
    for id, rec in f.iteritems_ordered():
        if id in excludecontigs:
            continue
        SeqIO.write([rec], fw, "fasta")

    singletonfile = op.join(closuredir, prefix + ".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs
    logging.debug("Pull leftover singletons: {0}".format(", ".join(sorted(leftovers))))

    f = Fasta(ctgfasta)
    for id, rec in f.iteritems_ordered():
        if id not in leftovers:
            continue
        SeqIO.write([rec], fw, "fasta")

    fw.close()

    # Renumber the merged FASTA sequentially and report N50 before and after.
    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([fastafile, finalfasta, "--sequential", "--pad0=3",
            "--prefix={0}_".format(rid)])
    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])
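# Illustrative example (not part of jcvi): the n50 calls above report the
# standard N50 statistic. For reference, a minimal N50 computation over a list
# of contig lengths (the lengths are made up); jcvi's n50 command operates on
# FASTA files instead.
def compute_n50(lengths):
    """Smallest length L such that contigs of length >= L cover half the total."""
    total = sum(lengths)
    running = 0
    for L in sorted(lengths, reverse=True):
        running += L
        if running * 2 >= total:
            return L
    return 0


assert compute_n50([80, 70, 50, 40, 30, 20]) == 70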