How to use the jcvi.formats.base.DictFile function in jcvi

To help you get started, we’ve selected a few jcvi examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github tanghaibao / jcvi / jcvi / compara / catalog.py View on Github external
def make_ortholog(blocksfile, rbhfile, orthofile):
    from jcvi.formats.base import DictFile

    # Generate mapping both ways
    adict = DictFile(rbhfile)
    bdict = DictFile(rbhfile, keypos=1, valuepos=0)
    adict.update(bdict)

    fp = open(blocksfile)
    fw = open(orthofile, "w")
    nrecruited = 0
    for row in fp:
        a, b = row.split()
        if b == '.':
            if a in adict:
                b = adict[a]
                nrecruited += 1
                b += "'"
        print("\t".join((a, b)), file=fw)

    logging.debug("Recruited {0} pairs from RBH.".format(nrecruited))
github tanghaibao / jcvi / jcvi / formats / fasta.py View on Github external
noversion = opts.noversion
    sequential = opts.sequential
    sequentialoffset = opts.sequentialoffset
    sep = opts.sep
    idx = opts.index
    mapfile = opts.switch
    annotfile = opts.annotation
    desc = not opts.nodesc
    idsfile = opts.ids
    idsfile = open(idsfile, "w") if idsfile else None
    upper = opts.upper

    if mapfile:
        mapping = DictFile(mapfile, delimiter="\t")
    if annotfile:
        annotation = DictFile(annotfile, delimiter="\t")

    fp = SeqIO.parse(must_open(infasta), "fasta")
    fw = must_open(outfasta, "w")
    for i, rec in enumerate(fp):
        origid = rec.id
        description = rec.description.replace(origid, "").strip()
        if sep:
            rec.id = rec.description.split(sep)[idx].strip()
        if gb:
            # gi|262233616|gb|GU123895.1| Coffea arabica clone BAC
            atoms = rec.id.split("|")
            if len(atoms) >= 3:
                rec.id = atoms[3]
            elif len(atoms) == 2:
                rec.id = atoms[1]
        if pairs:
github tanghaibao / jcvi / jcvi / formats / blast.py View on Github external
"""
    from jcvi.formats.base import DictFile

    p = OptionParser(annotation.__doc__)
    p.add_option("--queryids", help="Query IDS file to switch [default: %default]")
    p.add_option("--subjectids", help="Subject IDS file to switch [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    d = "\t"
    qids = DictFile(opts.queryids, delimiter=d) if opts.queryids else None
    sids = DictFile(opts.subjectids, delimiter=d) if opts.subjectids else None
    blast = Blast(blastfile)
    for b in blast:
        query, subject = b.query, b.subject
        if qids:
            query = qids[query]
        if sids:
            subject = sids[subject]
        print("\t".join((query, subject)))
github tanghaibao / jcvi / jcvi / formats / gff.py View on Github external
def rename(args):
    """
    %prog rename in.gff3 switch.ids > reindexed.gff3

    Change the IDs within the gff3.
    """
    p = OptionParser(rename.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ingff3, switch = args
    switch = DictFile(switch)

    gff = Gff(ingff3)
    for g in gff:
        id, = g.attributes["ID"]
        newname = switch.get(id, id)
        g.attributes["ID"] = [newname]

        if "Parent" in g.attributes:
            parents = g.attributes["Parent"]
            g.attributes["Parent"] = [switch.get(x, x) for x in parents]

        g.update_attributes()
        print(g)
github tanghaibao / jcvi / jcvi / formats / gff.py View on Github external
invent_name_attr = opts.invent_name_attr
    invent_protein_feat = opts.invent_protein_feat
    compute_signature = False

    outfile = opts.outfile

    mapping = None
    mod_attrs = set()
    if mapfile and op.isfile(mapfile):
        mapping = DictFile(mapfile, delimiter="\t", strict=strict)
        mod_attrs.add("ID")
    if note:
        note = DictFile(note, delimiter="\t", strict=strict)
        mod_attrs.add("Note")
    if source and op.isfile(source):
        source = DictFile(source, delimiter="\t", strict=strict)
    if ftype and op.isfile(ftype):
        ftype = DictFile(ftype, delimiter="\t", strict=strict)
    if names:
        names = DictFile(names, delimiter="\t", strict=strict)
        mod_attrs.add("Name")

    if attrib_files:
        attr_values = {}
        for fn in attrib_files:
            attr_name = op.basename(fn).rsplit(".", 1)[0]
            if attr_name not in reserved_gff_attributes:
                attr_name = attr_name.lower()
            attr_values[attr_name] = DictFile(fn, delimiter="\t", strict=strict)
            mod_attrs.add(attr_name)
    if dbxref_files:
        dbxref_values = {}
github tanghaibao / jcvi / jcvi / assembly / goldenpath.py View on Github external
certificates based on agpfile. At first, it seems counter-productive to
    convert first agp to certificates then certificates back to agp.

    The certificates provide a way to edit the overlap information, so that the
    agpfile can be corrected (without changing agpfile directly).
    """
    from jcvi.formats.base import DictFile

    p = OptionParser(agp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    tpffile, certificatefile, agpfile = args
    orientationguide = DictFile(tpffile, valuepos=2)
    cert = Certificate(certificatefile)
    cert.write_AGP(agpfile, orientationguide=orientationguide)
github tanghaibao / jcvi / jcvi / formats / gff.py View on Github external
JCVI.Medtr.v4.fasta -o features.fasta
    """
    from jcvi.formats.fasta import longestorf

    p = OptionParser(orient.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ingff3, fastafile = args
    idsfile = fastafile.rsplit(".", 1)[0] + ".orf.ids"
    if need_update(fastafile, idsfile):
        longestorf([fastafile, "--ids"])

    orientations = DictFile(idsfile)
    gff = Gff(ingff3)
    flipped = 0
    for g in gff:
        id = None
        for tag in ("ID", "Parent"):
            if tag in g.attributes:
                id, = g.attributes[tag]
                break
        assert id

        orientation = orientations.get(id, "+")
        if orientation == '-':
            g.strand = {"+": "-", "-": "+"}[g.strand]
            flipped += 1

        print(g)
github tanghaibao / jcvi / jcvi / graphics / tree.py View on Github external
num_leaves = len(t.get_leaf_names())
    yinterval = canvas / (num_leaves + 1)

    # get exons structures, if any
    structures = {}
    if gffdir:
        gffiles = glob("{0}/*.gff*".format(gffdir))
        setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True)
        structures = dict((a, (b, c)) for a, b, c in setups)

    if sizes:
        sizes = Sizes(sizes).mapping

    if barcodefile:
        barcodemap = DictFile(barcodefile, delimiter="\t")

    if leafcolorfile:
        leafcolors = DictFile(leafcolorfile, delimiter="\t")

    coords = {}
    i = 0
    for n in t.traverse("postorder"):
        dist = n.get_distance(t)
        xx = xstart + scale * dist

        if n.is_leaf():
            yy = ystart - i * yinterval
            i += 1

            if trunc_name:
                name = truncate_name(n.name, rule=trunc_name)
github tanghaibao / jcvi / jcvi / annotation / stats.py View on Github external
gb = opts.groupby
    g = make_index(gff_file)

    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        fw = open(tf, "w")
        for feat in g.features_of_type("mRNA"):
            fid = feat.id
            conf_class = feat.attributes.get(gb, "all")
            tsize = sum((c.stop - c.start + 1) for c in g.children(fid, 1) \
                             if c.featuretype == "exon")
            print("\t".join((fid, str(tsize), conf_class)), file=fw)
        fw.close()

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1) \
                         if c.featuretype == "mRNA"]
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2) \
                         if c.featuretype == "exon")
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)

    r = {}  # Report
    distinct_groups = set(conf_classes.values())