How to use the jcvi.formats.fasta.Fasta function in jcvi

To help you get started, we’ve selected a few jcvi examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github tanghaibao / jcvi / jcvi / assembly / ca.py View on Github external
%prog blastfile fastafiles

    Calculate the vector clear range file based BLAST to the vectors.
    """
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    sizes = {}
    for fa in fastafiles:
        f = Fasta(fa)
        sizes.update(f.itersizes())

    b = Blast(blastfile)
    for query, hits in b.iter_hits():

        qsize = sizes[query]
        vectors = list((x.qstart, x.qstop) for x in hits)
        vmin, vmax = range_minmax(vectors)

        left_size = vmin - 1
        right_size = qsize - vmax

        if left_size > right_size:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize
github tanghaibao / jcvi / jcvi / formats / fasta.py View on Github external
Take number of records randomly from fasta
    """
    from random import sample

    p = OptionParser(random.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, N = args
    N = int(N)
    assert N > 0

    f = Fasta(fastafile)
    fw = must_open("stdout", "w")

    for key in sample(f.keys(), N):
        rec = f[key]
        SeqIO.write([rec], fw, "fasta")

    fw.close()
github tanghaibao / jcvi / jcvi / apps / phylo.py View on Github external
opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mcscanfile, cdsfile = args

    if opts.addtandem:
        tandemfile = opts.addtandem
        mcscanfile_with_tandems = add_tandems(mcscanfile, tandemfile)
        mcscanfile = mcscanfile_with_tandems

    seqdir = opts.outdir
    mkdir(seqdir)
    f = Fasta(cdsfile)
    fp = must_open(mcscanfile)
    if opts.writecolors:
        fc = must_open("leafcolors.txt", "w")

    n = 0
    for i, row in enumerate(fp):
        row = row.strip().split("\t")
        if i == 0:
            l = len(row)
            if l <= 20:
                colors = discrete_rainbow(l, shuffle=False)[1]
            else:
                colors = discrete_rainbow(l, usepreset=False, shuffle=False)[1]
                warnings.warn("*** WARNING ***\n" \
                    "Too many columns. Colors may not be all distinctive.")
github tanghaibao / jcvi / jcvi / formats / contig.py View on Github external
from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."
    fw_ids = open(idsfile, "w")
    fw = open(contigfile, "w")

    for ctg, reads in bed.sub_beds():
        ctgseq = contigfasta[ctg]
        ctgline = "##{0} {1} {2} bases, {3}".format(\
                ctg, len(reads), len(ctgseq), checksum)

        print(ctg, file=fw_ids)
        print(ctgline, file=fw)
        print(fill(ctgseq.seq), file=fw)

        for b in reads:
github tanghaibao / jcvi / jcvi / projects / tgbs.py View on Github external
Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    csv = open(opts.csv, "w") if opts.csv else None

    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue

        # consensus_for_cluster_0 with 63 sequences
        if "with" in desc:
            name, w, size, seqs = desc.split()
            if csv:
                print("\t".join(str(x)
                                for x in (name, size, len(rec))), file=csv)
            assert w == "with"
            sizes.append(int(size))
        # MRD85:00603:02472;size=167;
        else:
github tanghaibao / jcvi / jcvi / formats / fasta.py View on Github external
def make_qual(fastafile, score=OKQUAL):
    logging.warning("assume qual ({0})".format(score))
    qualfile = fastafile.rsplit(".", 1)[0] + ".qual"
    fw = open(qualfile, "w")
    fasta = Fasta(fastafile, lazy=True)
    score = str(score) + " "
    for entry, size in fasta.itersizes_ordered():
        print(">" + entry, file=fw)
        print(score * size, file=fw)
    fw.close()
    return qualfile
github tanghaibao / jcvi / jcvi / assembly / ca.py View on Github external
if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
       print(headerTemplate.format(libID=libID), file=fw)

    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
github tanghaibao / jcvi / jcvi / apps / cdhit.py View on Github external
p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            rec.description = rec.description.split(None, 1)[-1]
github tanghaibao / jcvi / jcvi / compara / catalog.py View on Github external
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len)*P/100.:
                continue
github tanghaibao / jcvi / jcvi / assembly / postprocess.py View on Github external
fw = open(finalfasta, "w")
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    f = Fasta(minimusfasta)
    for id, rec in f.iteritems_ordered():
        if id in excludecontigs:
            continue
        SeqIO.write([rec], fw, "fasta")

    singletonfile = op.join(closuredir, prefix + ".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs

    logging.debug("Pull leftover singletons: {0}".\
            format(", ".join(sorted(leftovers))))

    f = Fasta(ctgfasta)
    for id, rec in f.iteritems_ordered():
        if id not in leftovers:
            continue
        SeqIO.write([rec], fw, "fasta")

    fw.close()

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])