How to use the pysam.Tabixfile function in pysam

To help you get started, we’ve selected a few pysam examples, based on popular ways it is used in public projects.
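
Before the project examples, here is a minimal sketch of the basic pattern: open a bgzip-compressed, tabix-indexed file and iterate over the records in a region. The file name regions.bed.gz is a placeholder; in current pysam releases the same class is also exposed under the newer name pysam.TabixFile.

import pysam

# Open a bgzip-compressed file; a matching .tbi index must already exist.
tbx = pysam.Tabixfile("regions.bed.gz")

# Contigs covered by the index.
print(tbx.contigs)

# Iterate over the raw tab-delimited records overlapping chr1:1,000,000-2,000,000
# (coordinates are 0-based, half-open).
for row in tbx.fetch("chr1", 1000000, 2000000):
    fields = row.split("\t")
    print(fields[0], fields[1], fields[2])

tbx.close()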


github svviz / svviz / src / svviz / tabix.py View on Github external
if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath+".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath+".gz")
            if not os.path.exists(bedPath+".gz"):
                raise Exception("Failed to create compressed {preset} file for {file}; make sure the {preset} file is "
                    "sorted and the directory is writeable".format(preset=preset, file=bedPath))
        bedPath += ".gz"
    if not os.path.exists(bedPath+".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath+".tbi"):
            raise Exception("Failed to create tabix index file for {file}; make sure the {preset} file is "
                "sorted and the directory is writeable".format(preset=preset, file=bedPath))

    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError("BED files need to have at least 6 (tab-delimited) fields (including "
            "chrom, start, end, name, score, strand; score is unused)")
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath
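
The helper above wraps two module-level pysam functions. A minimal standalone sketch of the same preparation step, with placeholder file names, might look like this:

import pysam

# bgzip-compress a coordinate-sorted, plain-text BED file.
pysam.tabix_compress("annotations.bed", "annotations.bed.gz", force=True)

# Build the .tbi index; preset can be "bed", "gff" or "vcf".
pysam.tabix_index("annotations.bed.gz", preset="bed", force=True)

# The compressed, indexed file can now be opened and queried by region.
first_record = next(pysam.Tabixfile("annotations.bed.gz").fetch())
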
github arq5x / gemini / gemini / gemini_annotate.py View on Github external
def _annotate_variants(args, conn, metadata, get_val_fn, col_names=None, col_types=None, col_ops=None):
    """Generalized annotation of variants with a new column.

    get_val_fn takes a list of annotations in a region and returns
    the value for that region to update the database with.

    Separates selection and identification of values from update,
    to avoid concurrent database access errors from sqlite, especially on
    NFS systems. The retained to_update list is small, but batching
    could help if memory issues emerge.
    """
    # For each, use Tabix to detect overlaps with the user-defined
    # annotation file.  Update the variant row with T/F if overlaps found.
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)
    cursor = conn.bind.connect()
    add_requested_columns(args, cursor, col_names, col_types)
    conn.commit()
    cursor.close()

    conn, metadata = database.get_session_metadata(str(conn.bind.url))
    cursor = conn.bind.connect()

    total = 0
    update_size = 5000
    to_update = []

    select_res = cursor.execution_options(stream_results=True).execute('''SELECT chrom, start, end, ref, alt, variant_id FROM variants''')
    for row in select_res:
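
The snippet is cut off inside the SELECT loop. A hedged sketch of the per-row overlap check described in the comment above, reusing the variables already defined in the snippet but with simplified column handling that is not gemini's actual code, could look like:

for row in select_res:
    # The real code first normalizes the contig name using `naming` from
    # guess_contig_naming(); that step is omitted here for brevity.
    try:
        hits = list(anno.fetch(row['chrom'], row['start'], row['end']))
    except ValueError:
        # Contig not present in the annotation file.
        hits = []
    to_update.append((get_val_fn(hits), row['variant_id']))
    if len(to_update) >= update_size:
        # Flush the accumulated batch of values to the variants table here.
        to_update = []
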
github secastel / allelecounter / allelecounter.py View on Github external
print("Generating pileup...");
	
	# instantiate temp file to dump to
	pileup_out = tempfile.NamedTemporaryFile(delete=False);
	
	#A unzip the VCF, convert to BED, and use samtools to produce a read pileup
	if args.vcf.endswith(".gz"):
		subprocess.check_output("gunzip -c "+args.vcf+" | awk 'BEGIN { OFS=\"\t\"; FS=\"\t\"; } { if (index($0, \"#\") == 0) { print($1,$2-1,$2,$3,$6,$4,$5,$7,$8,$9); } }' | samtools mpileup -d "+str(args.max_depth)+" -I -B -q 0 -Q 0 -s -l - -f "+args.ref+" "+args.bam+" > "+pileup_out.name, shell=True);
	else:
		print("FATAL ERROR: input VCF must be gzipped and indexed.")
		quit();
	
	# 2 process the mpileup result
	# A load the VCF
	#vcf_reader = vcf.Reader(filename=args.vcf);
	vcf_reader = pysam.Tabixfile(args.vcf,"r");
	vcf_map = sample_column_map(args.vcf);
	
	# B go through the pileup result line by line
	out_stream = open(args.o, "w");
	
	print("Processing pileup...");
	out_stream.write("contig	position	variantID	refAllele	altAllele	refCount	altCount	totalCount	lowMAPQDepth	lowBaseQDepth	rawDepth	otherBases\n");
	
	stream_in = open(pileup_out.name, "r");
	
	for line in stream_in:
		cols = line.replace("\n","").split("\t");
		#chr	pos	REF	count	reads
		chr = cols[0];
		pos = int(cols[1]);
		ref = "";
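
The loop is truncated at this point; the remainder of the original presumably matches each pileup position against the VCF opened above. A minimal, hedged sketch of such a point lookup with Tabixfile, reusing the snippet's chr and pos variables and standard VCF column order, is:

# Fetch VCF records overlapping the single base at 1-based position `pos`
# (fetch takes 0-based, half-open coordinates).
for record in vcf_reader.fetch(chr, pos - 1, pos):
    fields = record.split("\t")
    if int(fields[1]) == pos:
        ref = fields[3]
        alt = fields[4]
        break
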
github biothings / myvariant.info / src / hub / dataload / sources / cadd / cadd_parser.py View on Github external
def load_contig(contig):
    """contig is #chrm, from 1 to 23, X and Y"""
    tabix = pysam.Tabixfile(cadd_file_path)
    data = fetch_generator(tabix, contig)
    for doc in data:
        yield doc
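
fetch_generator and cadd_file_path are defined elsewhere in the parser module. A hedged sketch of what a per-contig generator over a Tabixfile might look like, with illustrative field handling rather than the real CADD schema parsing:

def fetch_generator(tabix, contig):
    # Iterate every record on the given contig and yield a small dict per row.
    for line in tabix.fetch(contig):
        fields = line.rstrip("\n").split("\t")
        yield {"chrom": fields[0], "pos": int(fields[1]), "fields": fields}
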
github phenopolis / phenopolis / views / __init__.py View on Github external
def parse_tabix_file_subset(tabix_filenames, subset_i, subset_n, record_parser):
    """
    Returns a generator of parsed record objects (as returned by record_parser) for the i'th of n subsets of records
    across all the given tabix_file(s). The records are split by files and contigs within files, with 1/n of all contigs
    from all files being assigned to this (the i'th) subset.
    Args:
        tabix_filenames: a list of one or more tabix-indexed files. These will be opened using pysam.Tabixfile
        subset_i: zero-based number
        subset_n: total number of subsets
        record_parser: a function that takes a file-like object and returns a generator of parsed records
    """
    start_time = time.time()
    print(tabix_filenames)
    open_tabix_files = [pysam.Tabixfile(tabix_filename) for tabix_filename in tabix_filenames]
    tabix_file_contig_pairs = [(tabix_file, contig) for tabix_file in open_tabix_files for contig in tabix_file.contigs]
    # get every n'th tabix_file/contig pair
    tabix_file_contig_subset = tabix_file_contig_pairs[subset_i : : subset_n]
    short_filenames = ", ".join(map(os.path.basename, tabix_filenames))
    print(short_filenames)
    num_file_contig_pairs = len(tabix_file_contig_subset)
    print(("Loading subset %(subset_i)s of %(subset_n)s total: %(num_file_contig_pairs)s contigs from %(short_filenames)s") % locals())
    counter = 0
    for tabix_file, contig in tabix_file_contig_subset:
        header_iterator = tabix_file.header
        records_iterator = tabix_file.fetch(contig, 0, 10**9, multiple_iterators=True)
        for parsed_record in record_parser(itertools.chain(header_iterator, records_iterator)):
            counter += 1
            yield parsed_record
            if counter % 100000 == 0:
                seconds_elapsed = int(time.time()-start_time)
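
A hypothetical invocation, splitting the load of one tabix-indexed VCF across four workers; the file name and the toy record_parser below are placeholders, not phenopolis code:

def parse_lines(file_like):
    # Minimal record_parser: skip header lines and yield the split fields.
    for line in file_like:
        if line.startswith("#"):
            continue
        yield line.rstrip("\n").split("\t")

# subset_i is zero-based, so this worker handles every 4th contig starting at index 2.
for record in parse_tabix_file_subset(["sites.vcf.gz"], 2, 4, parse_lines):
    pass  # e.g. insert `record` into a database
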
github RahmanTeam / CAVA / cava_ / data.py View on Github external
def __init__(self, options):
        # Opening tabix file representing the dbSNP database
        self.tabixfile = pysam.Tabixfile(options.args['dbsnp'])
github arq5x / gemini / gemini / annotations.py View on Github external
access a given handle and fetch data from it
    as follows:

    dbsnp_handle = annotations.annos['dbsnp']
    hits = dbsnp_handle.fetch(chrom, start, end)
    """
    anno_files = get_anno_files(args)
    for anno in anno_files:
        try:
            # .gz denotes Tabix files.
            if anno_files[anno].endswith(".gz"):
                if anno == "clinvar":
                    annos[anno] = pysam.Tabixfile(anno_files[anno],
                                                  encoding='utf8')
                else:
                    annos[anno] = pysam.Tabixfile(anno_files[anno])

            elif anno_files[anno].endswith(".bcf"):
                annos[anno] = cyvcf2.VCF(anno_files[anno])
            # .bw denotes BigWig files.
            elif anno_files[anno].endswith(".bw"):
                from bx.bbi.bigwig_file import BigWigFile
                annos[anno] = BigWigFile(open(anno_files[anno]))

        except IOError:
            raise IOError("Gemini cannot open this annotation file: %s. \n"
                          "Have you installed the annotation files?  If so, "
                          "have they been moved or deleted? Exiting...\n\n"
                          "For more details:\n\t"
                          "http://gemini.readthedocs.org/en/latest/content/"
                          "#installation.html\#installing-annotation-files\n"
                          % anno_files[anno])
github adamewing / bamsurgeon / bin / addsnv.py View on Github external
return None

        try:
            mutbase = mut(refbase, altbase)
            mutbase_list.append(mutbase)

        except ValueError as e:
            logger.warning(mutid_list[n] + " " + ' '.join(("skipped site:",chrom,str(hc[n]['start']),str(hc[n]['end']),"due to N base:",str(e),"\n")))
            return None

        mutstr_list.append(refbase + "-->" + str(mutbase))

    # optional CNV file
    cnv = None
    if (args.cnvfile):
        cnv = pysam.Tabixfile(args.cnvfile, 'r')

    hapstr = "_".join(('haplo',chrom,str(min(mutpos_list)),str(max(mutpos_list))))
    log = open('addsnv_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + "." + hapstr + ".log",'w')

    tmpoutbamname = args.tmpdir + "/" + hapstr + ".tmpbam." + str(uuid4()) + ".bam"
    logger.info("%s creating tmp bam: %s" % (hapstr, tmpoutbamname))
    outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

    mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(args, log, bamfile, bammate, chrom, min(mutpos_list), max(mutpos_list)+1, mutpos_list, avoid=avoid, mutid_list=mutid_list, is_snv=True, mutbase_list=mutbase_list, reffile=reffile)

    if mutfail:
        outbam_muts.close()
        os.remove(tmpoutbamname)
        return None

    # pick reads to change
github 10XGenomics / cellranger / tenkit / lib / python / tenkit / tabix.py View on Github external
def create_tabix_infile(file_name):
    return pysam.Tabixfile(file_name)
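
One way such a thin wrapper might be used, here with pysam's tuple parser so each record is addressable by column index; the fragments file name is a placeholder:

import pysam

tbx = create_tabix_infile("fragments.tsv.gz")
for rec in tbx.fetch("chr1", 0, 100000, parser=pysam.asTuple()):
    chrom, start, end = rec[0], int(rec[1]), int(rec[2])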