How to use the jcvi.formats.base.must_open function in jcvi

To help you get started, we’ve selected a few jcvi examples that show common ways must_open is used in public projects.
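
Based on how the examples below call it, must_open(filename) returns a readable, line-iterable handle and must_open(filename, "w") returns a writable one, so it can stand in wherever open() would otherwise be used. A minimal sketch with placeholder file names:

from jcvi.formats.base import must_open

# Reading: iterate the handle line by line ("sample.bed" is a placeholder path).
fp = must_open("sample.bed")
for row in fp:
    print(row.strip())

# Writing: pass mode "w", as the examples do with must_open(opts.outfile, "w").
fw = must_open("output.txt", "w")
print("hello", file=fw)
fw.close()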


github tanghaibao / jcvi / jcvi / formats / ace.py
types = {"read":      ["padded_start", "padded_end", "orient"],
             "consensus": ["padded_consensus_start", "padded_consensus_end"],
             "quality"  : ["qual_clipping_start", "qual_clipping_end", "align_clipping_start", "align_clipping_end"]
            }
    valid_types = tuple(types.keys())
    p.add_option("--type", default="read", choices=valid_types,
            help="choose report type [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    acefile, = args
    ace = Ace.read(must_open(acefile))
    logging.debug('Loaded ace file {0}'.format(acefile))

    for c in ace.contigs:
        print(c.name)
        table = dict()
        if opts.type == "read":
            ps, pe = [], []
            ps = [read.padded_start for read in c.af]
            for i in range(1, len(ps)):
                pe.append(ps[i] - ps[i-1])
            pe.append(c.nbases)
            map = dict(zip(ps, pe))
            for i, read in enumerate(c.af):
                values = [str(x) for x in (read.padded_start, map[read.padded_start], read.coru)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(read.name), label)] = values[i]
github tanghaibao / jcvi / jcvi / formats / blast.py
    p = OptionParser(bed.__doc__)
    p.add_option("--swap", default=False, action="store_true",
                 help="Write query positions [default: %default]")
    p.add_option("--both", default=False, action="store_true",
                 help="Generate one line for each of query and subject")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    positive = (not opts.swap) or opts.both
    negative = opts.swap or opts.both

    fp = must_open(blastfile)
    bedfile =  "{0}.bed".format(blastfile.rsplit(".", 1)[0]) \
            if blastfile.endswith(".blast") \
            else "{0}.bed".format(blastfile)
    fw = open(bedfile, "w")
    for row in fp:
        b = BlastLine(row)
        if positive:
            print(b.bedline, file=fw)
        if negative:
            print(b.swapped.bedline, file=fw)

    logging.debug("File written to `{0}`.".format(bedfile))
    fw.close()
    bed_sort([bedfile, "-i"])

    return bedfile
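
The snippet begins after the def line, so the exact signature is not shown; assuming it is the usual jcvi-style entry point that takes a list of arguments, a hypothetical call could look like this ("sample.blast" is a placeholder input):

from jcvi.formats.blast import bed

# must_open(blastfile) supplies the handle that the row loop reads from, and
# the function returns the path of the BED file it wrote (here "sample.bed").
bedfile = bed(["sample.blast", "--both"])
print(bedfile)
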
github tanghaibao / jcvi / jcvi / utils / db.py
    files = [""]

    qrys = []
    qry = " ".join(args)
    if ";" in qry:
        for q in qry.split(";"):
            if len(q.strip()) > 0:
                qrys.append(q)
    else:
        qrys.append(qry)

    queries = set()
    if files:
        for datafile in files:
            datafile = datafile.strip()
            fp = must_open(datafile)
            for row in fp:
                for qry in qrys:
                    qry = qry.strip()
                    m = re.findall(r"\{\d+\}", qry)
                    if m:
                        mi = [int(x.strip("{}")) for x in m]
                        atoms = row.strip().split("\t")
                        assert max(mi) <= len(atoms), \
                                "Number of columns in `datafile`({0})".format(len(atoms)) + \
                                " != number of `placeholders`({0})".format(len(m))
                        natoms = [atoms[x] for x in mi]
                        for idx, (match, atom) in enumerate(zip(m, natoms)):
                            qry = qry.replace(match, atom)
                    queries.add(qry)
    else:
        for qry in qrys:
github tanghaibao / jcvi / jcvi / formats / bed.py
help="Minimum feature length")
    p.add_option("--maxsize", default=1000000000, type="int",
                 help="Minimum feature length")
    p.add_option("--minaccn", type="int",
                 help="Minimum value of accn, useful to filter based on coverage")
    p.add_option("--minscore", type="int", help="Minimum score")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    fp = must_open(bedfile)
    fw = must_open(opts.outfile, "w")
    minsize, maxsize = opts.minsize, opts.maxsize
    minaccn = opts.minaccn
    minscore = opts.minscore
    total = []
    keep = []
    for row in fp:
        try:
            b = BedLine(row)
        except IndexError:
            print(row.strip(), file=fw)
            continue
        span = b.span
        total.append(span)
        if not minsize <= span <= maxsize:
            continue
        if minaccn and int(b.accn) < minaccn:
github tanghaibao / jcvi / jcvi / formats / bed.py
    def __init__(self, filename=None, key=None, sorted=True, juncs=False,
                       include=None):
        super(Bed, self).__init__(filename)

        # the sorting key provides some flexibility in ordering the features
        # for example, user might not like the lexico-order of seqid
        self.nullkey = lambda x: (natsort_key(x.seqid), x.start, x.accn)
        self.key = key or self.nullkey

        if not filename:
            return

        for line in must_open(filename):
            if line[0] == "#" or (juncs and line.startswith('track name')):
                continue
            b = BedLine(line)
            if include and b.accn not in include:
                continue
            self.append(b)

        if sorted:
            self.sort(key=self.key)
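
A small usage sketch for the class above, assuming Bed behaves like the list it appends BedLine objects to ("genes.bed" is a placeholder path):

from jcvi.formats.bed import Bed

# must_open(filename) supplies the lines; comment lines are skipped and the
# features come back sorted by (seqid, start, accn) unless sorted=False.
bed = Bed("genes.bed")
for b in bed:
    print(b.seqid, b.start, b.accn)
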
github tanghaibao / jcvi / jcvi / apps / ks.py
    def print_to_anchors(self, outfile):
        fw = must_open(outfile, "w")
        for row in self:
            print(row.anchorline, file=fw)
        fw.close()
github tanghaibao / jcvi / jcvi / assembly / allmaps.py
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
    agp = AGP(agpfile)
    scaffolds_seen = set(x.component_id for x in agp)
    sizes = Sizes(scaffolds).mapping
    fwagp = must_open(unplaced_agp, "w")
    for s in natsorted(sizes.keys()):
        if s in scaffolds_seen:
            continue
        order_to_agp(s, [(s, "?")], sizes, fwagp)
    logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
github tanghaibao / jcvi / jcvi / formats / fasta.py
def parse_fasta(infile, upper=False):
    '''
    Parse a FASTA-formatted file and yield (header, sequence) tuples.
    The input can be a FASTA file that contains multiple records.
    '''
    try:
        fp = must_open(infile)
    except:
        fp = infile
    # keep header
    fa_iter = (x[1] for x in groupby(fp, lambda row: row[0] == '>'))
    for header in fa_iter:
        header = next(header)
        if header[0] != '>':
            continue
        # drop '>'
        header = header.strip()[1:]
        # stitch the sequence lines together and make into upper case
        seq = "".join(s.strip() for s in next(fa_iter))
        if upper:
            seq = seq.upper()
        yield header, seq
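
The generator above pairs naturally with a simple loop; a hypothetical call, with "proteins.fasta" as a placeholder path:

from jcvi.formats.fasta import parse_fasta

# parse_fasta yields (header, sequence) pairs, with must_open supplying the
# underlying file handle; upper=True upper-cases each sequence.
for name, seq in parse_fasta("proteins.fasta", upper=True):
    print(name, len(seq))
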
github tanghaibao / jcvi / jcvi / formats / fasta.py
    def iteritems_ordered(self):
        for rec in SeqIO.parse(must_open(self.filename), "fasta"):
            yield rec.name, rec
github tanghaibao / jcvi / algorithms / tandem.py
            if b.hitlen < min(query_len, subject_len)*P/100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        g = Grouper()
        for i, atom in enumerate(bed):
            for x in range(1, N+1):
                if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \
                    homologs.joined(bed[i-x].accn, atom.accn)]):
                    g.join(bed[i-x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print(",".join(sorted(group)), file=fw)
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
    print("Total %d genes in %d families" % (ngenes, nfamilies), file=sys.stderr)
    print("Longest families (%d): %s" % (len(longest_family),
          ",".join(longest_family)), file=sys.stderr)