How to use the pysam.idxstats function in pysam

To help you get started, we’ve selected a few pysam examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pysam-developers / pysam / save / pysam_test2.6.py View on Github external
("pysam_ex1.depth", (pysam.depth, "ex1.bam" ) ),
                ),
          "faidx" : 
              ( 
                ("ex1.fa.fai", "faidx ex1.fa"), 
                ("pysam_ex1.fa.fai", (pysam.faidx, "ex1.fa") ),
                ),
          "index":
              (
                ("ex1.bam.bai", "index ex1.bam" ),
                ("pysam_ex1.bam.bai", (pysam.index, "pysam_ex1.bam" ) ),
                ),
          "idxstats" :
              ( 
                ("ex1.idxstats", "idxstats ex1.bam > ex1.idxstats" ),
                ("pysam_ex1.idxstats", (pysam.idxstats, "pysam_ex1.bam" ) ),
                ),
          "fixmate" :
              (
                ("ex1.fixmate", "fixmate ex1.bam ex1.fixmate" ),
                ("pysam_ex1.fixmate", (pysam.fixmate, "pysam_ex1.bam pysam_ex1.fixmate") ),
                ),
          "flagstat" :
              (
                ("ex1.flagstat", "flagstat ex1.bam > ex1.flagstat" ),
                ("pysam_ex1.flagstat", (pysam.flagstat, "pysam_ex1.bam") ),
                ),
          "calmd" :
              (
                ("ex1.calmd", "calmd ex1.bam ex1.fa > ex1.calmd" ),
                ("pysam_ex1.calmd", (pysam.calmd, "pysam_ex1.bam ex1.fa") ),
                ),
github CostaLab / reg-gen / rgt / CoverageSet.py View on Github external
def _init_read_number(self, bamFile):
        """Compute number of reads and number of mapped reads for CoverageSet"""
        # XXX ToDo add number of mapped reads in all cases
        # try:
        from distutils.version import LooseVersion
        if LooseVersion("0.9.0") <= LooseVersion(pysam.__version__):
            a = pysam.idxstats(bamFile)
            mapped_reads = sum([int(el.split('\t')[2]) for el in a.split('\n')[:len(a.split('\n')) - 1]])
            unmapped_read = sum([int(el.split('\t')[3]) for el in a.split('\n')[:len(a.split('\n')) - 1]])
            self.reads = mapped_reads + unmapped_read
            self.mapped_reads = mapped_reads
        else:
            self.reads = reduce(lambda x, y: x + y,
                                [eval('+'.join(l.rstrip('\n').split('\t')[2:])) for l in pysam.idxstats(bamFile)])
            self.mapped_reads = None
        # except:
github CostaLab / reg-gen / rgt / THOR / get_statistics.py View on Github external
def get_read_statistics(fname, chroms):
    """ return how many chromosomes are in each files and how many reads are correspondes to each file
    it should return a dictionary, mapped information
    use pysam 0.12. there is function,
    get_index_statistics(self): self, means the BAM file,
    return statistics about mapped/unmapped reads per chromosome as they are stored in the index.
    Returns: list  mapped, unmapped and total.
    Return type: a list of records for each chromosome. Each record has the attributes contig,
    """
    # in pysam 0.9, it uses idxstats...
    # f = pysam.Samfile(fname, "rb")
    stats_string = pysam.idxstats(fname)  # return in a string form and each info (chrom_name, total_length, mapped, unmapped) separated by '\n'
    stats_tmp = stats_string.split('\n')  # separate each info
    # using stats, we initialize the distribution of sample data, all we use is read_mapped data
    # change stats into dictionary list with only chromosome in chroms
    stats_total = [] # whole information in file including 0 reads

    isspatial = False
    isspatial_max = 0
    isspatial_len = 0

    for chrom in chroms:
        for each_stats in stats_tmp:
            tmp = each_stats.split()  # transform string into separate list
            # tmp in format [[chrom],[gene_len],[mapped_read_num],[unmapped_read_nums]]
            tmp[1:] = map(int, tmp[1:]) # change num from string to int
            if chrom in tmp:
                if tmp[2] > 0: # read_mapped data >0 will be recorded in stats_data
github CostaLab / reg-gen / rgt / ODIN / dpc_help.py View on Github external
# load the reference chromosomes
    genome_chromosomes = set()
    for s in FastaReader(genome_file):
        if s.name in genome_chromosomes:
            raise ValueError("{} is duplicate in {}".format(s.name, genome_file))

        genome_chromosomes.add(s.name)

    # check that the reference is compliant with the chromosome size file
    if genome_chromosomes != set(chrom_sizes):
        raise ValueError("reference and chromosome size file do not agree")

    # load bam files chromosome information
    if pysam.__version__.split(".")[1] > 8:
        idxstats_1 = filter(None, pysam.idxstats(bamfile1).split("\n"))
        idxstats_2 = filter(None, pysam.idxstats(bamfile2).split("\n"))
    else:
        idxstats_1 = pysam.idxstats(bamfile1)
        idxstats_2 = pysam.idxstats(bamfile2)

    chrom_bams_1 = dict(map(lambda x: x.split('\t')[0:2], idxstats_1))
    chrom_bams_2 = dict(map(lambda x: x.split('\t')[0:2], idxstats_2))

    # now check the chromosomes

    for chrom, end in chrom_bams_1.items():
        if chrom == "*":
            continue
        if chrom not in chrom_sizes:
            raise ValueError("{} has chromosome {} but it's not in reference".format(bamfile1, chrom))
        if chrom_sizes[chrom] != end:
github CostaLab / reg-gen / rgt / HINT / Footprinting.py View on Github external
tag_count = reads_file.get_tag_count(ref=f.chrom, start=p1, end=p2,
                                                 downstream_ext=downstream_ext, upstream_ext=upstream_ext,
                                                 forward_shift=forward_shift, reverse_shift=reverse_shift,
                                                 initial_clip=initial_clip)
        except Exception:
            tag_count = 0
        f.data = str(int(tag_count))

    ###################################################################################################
    # Writing output
    ###################################################################################################
    output_file_name = os.path.join(output_location, "{}.bed".format(output_prefix))
    footprints_overlap.write(output_file_name)

    # the number of reads
    lines = pysam.idxstats(reads_file.file_name).splitlines()
    num_reads = sum(map(int, [x.split("\t")[2] for x in lines if not x.startswith("#")]))

    # the number of peaks and tag count within peaks
    num_peaks = 0
    num_tc = 0
    for r in original_regions:
        num_peaks += 1
        try:
            tag_count = reads_file.get_tag_count(r.chrom, r.initial, r.final, downstream_ext, upstream_ext,
                                                 forward_shift, reverse_shift, initial_clip)
        except Exception:
            tag_count = 0
        num_tc += tag_count

    # the number of footprints
    num_fp = len(footprints_overlap)
github BuysDB / SingleCellMultiOmics / singlecellmultiomics / bamProcessing / bamFunctions.py View on Github external
def get_contigs_with_reads(bam_path: str, with_length: bool = False)  -> Generator :
    """
    Get all contigs with reads mapped to them

    Args:
        bam_path(str): path to bam file

        with_length(bool): also yield the length of the contig

    Yields:
        contig(str)

    """
    for line in pysam.idxstats(bam_path).split('\n'):
        try:
            contig, contig_len, mapped_reads, unmapped_reads = line.strip().split()
            mapped_reads, unmapped_reads = int(mapped_reads), int(unmapped_reads)
            if mapped_reads>0 or unmapped_reads>0:
                if with_length:
                    yield contig, int(contig_len)
                else:
                    yield contig
        except ValueError:
            pass
github CostaLab / reg-gen / rgt / ODIN / dpc_help.py View on Github external
for s in FastaReader(genome_file):
        if s.name in genome_chromosomes:
            raise ValueError("{} is duplicate in {}".format(s.name, genome_file))

        genome_chromosomes.add(s.name)

    # check that the reference is compliant with the chromosome size file
    if genome_chromosomes != set(chrom_sizes):
        raise ValueError("reference and chromosome size file do not agree")

    # load bam files chromosome information
    if pysam.__version__.split(".")[1] > 8:
        idxstats_1 = filter(None, pysam.idxstats(bamfile1).split("\n"))
        idxstats_2 = filter(None, pysam.idxstats(bamfile2).split("\n"))
    else:
        idxstats_1 = pysam.idxstats(bamfile1)
        idxstats_2 = pysam.idxstats(bamfile2)

    chrom_bams_1 = dict(map(lambda x: x.split('\t')[0:2], idxstats_1))
    chrom_bams_2 = dict(map(lambda x: x.split('\t')[0:2], idxstats_2))

    # now check the chromosomes

    for chrom, end in chrom_bams_1.items():
        if chrom == "*":
            continue
        if chrom not in chrom_sizes:
            raise ValueError("{} has chromosome {} but it's not in reference".format(bamfile1, chrom))
        if chrom_sizes[chrom] != end:
            raise ValueError("{} has chromosome {} but size is wrong".format(bamfile1, chrom))

    for chrom, end in chrom_bams_2.items():
github etal / cnvkit / cnvlib / samutil.py View on Github external
def idxstats(bam_fname, drop_unmapped=False):
    """Get chromosome names, lengths, and number of mapped/unmapped reads.

    Use the BAM index (.bai) to get the number of reads and size of each
    chromosome. Contigs with no mapped reads are skipped.
    """
    handle = StringIO(pysam.idxstats(bam_fname, split_lines=False))
    table = pd.read_csv(handle, sep='\t', header=None,
                        names=['chromosome', 'length', 'mapped', 'unmapped'])
    if drop_unmapped:
        table = table[table.mapped != 0].drop('unmapped', axis=1)
    return table