types = {"read": ["padded_start", "padded_end", "orient"],
"consensus": ["padded_consensus_start", "padded_consensus_end"],
"quality" : ["qual_clipping_start", "qual_clipping_end", "align_clipping_start", "align_clipping_end"]
}
valid_types = tuple(types.keys())
p.add_option("--type", default="read", choices=valid_types,
help="choose report type [default: %default]")
opts, args = p.parse_args(args)
if len(args) != 1:
sys.exit(not p.print_help())
acefile, = args
ace = Ace.read(must_open(acefile))
logging.debug('Loaded ace file {0}'.format(acefile))
for c in ace.contigs:
print(c.name)
table = dict()
if opts.type == "read":
pe = []
ps = [read.padded_start for read in c.af]
# estimate each read's padded_end as the distance to the next read's
# padded_start; the last read is assigned the contig length (c.nbases)
for i in range(1, len(ps)):
    pe.append(ps[i] - ps[i - 1])
pe.append(c.nbases)
spans = dict(zip(ps, pe))  # renamed from `map` to avoid shadowing the builtin
for read in c.af:
    values = [str(x) for x in (read.padded_start, spans[read.padded_start], read.coru)]
    for i, label in enumerate(types[opts.type]):
        table[(str(read.name), label)] = values[i]
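# Toy illustration of the estimate above (hypothetical values, not from a
# real ace file):
#   ps = [1, 50, 120], c.nbases = 200
#   pe = [50 - 1, 120 - 50] + [200] == [49, 70, 200]
#   spans == {1: 49, 50: 70, 120: 200}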
p = OptionParser(bed.__doc__)
p.add_option("--swap", default=False, action="store_true",
help="Write query positions [default: %default]")
p.add_option("--both", default=False, action="store_true",
help="Generate one line for each of query and subject")
opts, args = p.parse_args(args)
if len(args) != 1:
sys.exit(not p.print_help())
blastfile, = args
positive = (not opts.swap) or opts.both
negative = opts.swap or opts.both
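# positive -> write subject positions (the default); negative -> write query
# positions (--swap); --both enables both, one output line per orientation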
fp = must_open(blastfile)
bedfile = "{0}.bed".format(blastfile.rsplit(".", 1)[0]) \
if blastfile.endswith(".blast") \
else "{0}.bed".format(blastfile)
fw = open(bedfile, "w")
for row in fp:
b = BlastLine(row)
if positive:
print(b.bedline, file=fw)
if negative:
print(b.swapped.bedline, file=fw)
logging.debug("File written to `{0}`.".format(bedfile))
fw.close()
bed_sort([bedfile, "-i"])
return bedfile
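# Usage sketch (hypothetical file name): bed(["sample.blast", "--both"])
# writes one subject line and one query (swapped) line per hit to
# `sample.bed`, then sorts the resulting bed file.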
files = [""]
qrys = []
qry = " ".join(args)
if ";" in qry:
for q in qry.split(";"):
if q.strip():
    qrys.append(q)
else:
qrys.append(qry)
queries = set()
if files:
for datafile in files:
datafile = datafile.strip()
fp = must_open(datafile)
for row in fp:
for qry in qrys:
qry = qry.strip()
m = re.findall(r"\{\d+\}", qry)
if m:
mi = [int(x.strip("{}")) for x in m]
atoms = row.strip().split("\t")
assert max(mi) < len(atoms), \
    "Placeholder index ({0}) exceeds number of columns ({1})"\
    .format(max(mi), len(atoms))
natoms = [atoms[x] for x in mi]
for idx, (match, atom) in enumerate(zip(m, natoms)):
qry = qry.replace(match, atom)
queries.add(qry)
else:
for qry in qrys:
    queries.add(qry)
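# Example of the placeholder substitution above (hypothetical query and data
# row): with qry = "select * from genes where id='{0}' and chr='{2}'" and a
# tab-separated row "g001\t9999\tchr3", the expanded query becomes
# "select * from genes where id='g001' and chr='chr3'".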
help="Minimum feature length")
p.add_option("--maxsize", default=1000000000, type="int",
help="Minimum feature length")
p.add_option("--minaccn", type="int",
help="Minimum value of accn, useful to filter based on coverage")
p.add_option("--minscore", type="int", help="Minimum score")
p.set_outfile()
opts, args = p.parse_args(args)
if len(args) != 1:
sys.exit(not p.print_help())
bedfile, = args
fp = must_open(bedfile)
fw = must_open(opts.outfile, "w")
minsize, maxsize = opts.minsize, opts.maxsize
minaccn = opts.minaccn
minscore = opts.minscore
total = []
keep = []
for row in fp:
try:
b = BedLine(row)
except IndexError:
print(row.strip(), file=fw)
continue
span = b.span
total.append(span)
if not minsize <= span <= maxsize:
continue
if minaccn and int(b.accn) < minaccn:
    continue
def __init__(self, filename=None, key=None, sorted=True, juncs=False,
include=None):
super(Bed, self).__init__(filename)
# the sorting key provides some flexibility in ordering the features
# for example, user might not like the lexico-order of seqid
self.nullkey = lambda x: (natsort_key(x.seqid), x.start, x.accn)
self.key = key or self.nullkey
if not filename:
return
for line in must_open(filename):
if line[0] == "#" or (juncs and line.startswith('track name')):
continue
b = BedLine(line)
if include and b.accn not in include:
continue
self.append(b)
if sorted:
self.sort(key=self.key)
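# Note on the default sort key above: natsort_key orders seqids naturally,
# e.g. sorted(["chr10", "chr2"], key=natsort_key) -> ["chr2", "chr10"],
# before tie-breaking on start coordinate and accn.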
def print_to_anchors(self, outfile):
fw = must_open(outfile, "w")
for row in self:
print(row.anchorline, file=fw)
fw.close()
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
agp = AGP(agpfile)
scaffolds_seen = set(x.component_id for x in agp)
sizes = Sizes(scaffolds).mapping
fwagp = must_open(unplaced_agp, "w")
for s in natsorted(sizes.keys()):
if s in scaffolds_seen:
continue
order_to_agp(s, [(s, "?")], sizes, fwagp)
logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
def parse_fasta(infile, upper=False):
'''
Parse a FASTA-formatted file and yield (header, sequence) tuples.
The file may contain multiple records.
'''
try:
    fp = must_open(infile)
except Exception:
    # infile may already be an open file handle rather than a path
    fp = infile
# groupby flags each line as header ('>') or not, so the iterator
# alternates between header groups and sequence groups
fa_iter = (x[1] for x in groupby(fp, lambda row: row[0] == '>'))
for header in fa_iter:
header = next(header)
if header[0] != '>':
continue
# drop '>'
header = header.strip()[1:]
# stitch the sequence lines together
seq = "".join(s.strip() for s in next(fa_iter))
if upper:
seq = seq.upper()
yield header, seq
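# Usage sketch (hypothetical file name):
#   for name, seq in parse_fasta("records.fa", upper=True):
#       print("{0}: {1} bp".format(name, len(seq)))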
def iteritems_ordered(self):
for rec in SeqIO.parse(must_open(self.filename), "fasta"):
yield rec.name, rec
if b.hitlen < min(query_len, subject_len)*P/100.:
continue
query = gene_name(b.query, strip_name)
subject = gene_name(b.subject, strip_name)
homologs.join(query, subject)
g = Grouper()
for i, atom in enumerate(bed):
for x in range(1, N+1):
if i - x >= 0 and bed[i - x].seqid == atom.seqid and \
        homologs.joined(bed[i - x].accn, atom.accn):
    g.join(bed[i - x].accn, atom.accn)
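# Genes joined above are BLAST homologs that lie within N positions of each
# other on the same seqid; each connected component in `g` therefore forms
# one proximal-paralogue family.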
# dump the grouper
fw = must_open(ofile, "w")
ngenes, nfamilies = 0, 0
families = []
for group in sorted(g):
if len(group) >= 2:
print(",".join(sorted(group)), file=fw)
ngenes += len(group)
nfamilies += 1
families.append(sorted(group))
longest_family = max(families, key=len)
# generate reports
print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
print("Total %d genes in %d families" % (ngenes, nfamilies), file=sys.stderr)
print("Longest family (%d): %s" % (len(longest_family),
                                   ",".join(longest_family)), file=sys.stderr)