Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
("pysam_ex1.depth", (pysam.depth, "ex1.bam" ) ),
),
"faidx" :
(
("ex1.fa.fai", "faidx ex1.fa"),
("pysam_ex1.fa.fai", (pysam.faidx, "ex1.fa") ),
),
"index":
(
("ex1.bam.bai", "index ex1.bam" ),
("pysam_ex1.bam.bai", (pysam.index, "pysam_ex1.bam" ) ),
),
"idxstats" :
(
("ex1.idxstats", "idxstats ex1.bam > ex1.idxstats" ),
("pysam_ex1.idxstats", (pysam.idxstats, "pysam_ex1.bam" ) ),
),
"fixmate" :
(
("ex1.fixmate", "fixmate ex1.bam ex1.fixmate" ),
("pysam_ex1.fixmate", (pysam.fixmate, "pysam_ex1.bam pysam_ex1.fixmate") ),
),
"flagstat" :
(
("ex1.flagstat", "flagstat ex1.bam > ex1.flagstat" ),
("pysam_ex1.flagstat", (pysam.flagstat, "pysam_ex1.bam") ),
),
"calmd" :
(
("ex1.calmd", "calmd ex1.bam ex1.fa > ex1.calmd" ),
("pysam_ex1.calmd", (pysam.calmd, "pysam_ex1.bam ex1.fa") ),
),
def _init_read_number(self, bamFile):
"""Compute number of reads and number of mapped reads for CoverageSet"""
# XXX ToDo add number of mapped reads in all cases
# try:
from distutils.version import LooseVersion
if LooseVersion("0.9.0") <= LooseVersion(pysam.__version__):
a = pysam.idxstats(bamFile)
mapped_reads = sum([int(el.split('\t')[2]) for el in a.split('\n')[:len(a.split('\n')) - 1]])
unmapped_read = sum([int(el.split('\t')[3]) for el in a.split('\n')[:len(a.split('\n')) - 1]])
self.reads = mapped_reads + unmapped_read
self.mapped_reads = mapped_reads
else:
self.reads = reduce(lambda x, y: x + y,
[eval('+'.join(l.rstrip('\n').split('\t')[2:])) for l in pysam.idxstats(bamFile)])
self.mapped_reads = None
# except:
def get_read_statistics(fname, chroms):
""" return how many chromosomes are in each files and how many reads are correspondes to each file
it should return a dictionary, mapped information
use pysam 0.12. there is function,
get_index_statistics(self): self, means the BAM file,
return statistics about mapped/unmapped reads per chromosome as they are stored in the index.
Returns: list mapped, unmapped and total.
Return type: a list of records for each chromosome. Each record has the attributes contig,
"""
# in pysam 0.9, it uses idxstats...
# f = pysam.Samfile(fname, "rb")
stats_string = pysam.idxstats(fname) # return in a string form and each info (chrom_name, total_length, mapped, unmapped) separated by '\n'
stats_tmp = stats_string.split('\n') # separate each info
# using stats, we initialize the distribution of sample data, all we use is read_mapped data
# change stats into dictionary list with only chromosome in chroms
stats_total = [] # whole information in file including 0 reads
isspatial = False
isspatial_max = 0
isspatial_len = 0
for chrom in chroms:
for each_stats in stats_tmp:
tmp = each_stats.split() # transform string into separate list
# tmp in format [[chrom],[gene_len],[mapped_read_num],[unmapped_read_nums]]
tmp[1:] = map(int, tmp[1:]) # change num from string to int
if chrom in tmp:
if tmp[2] > 0: # read_mapped data >0 will be recorded in stats_data
# load the reference chromosomes
genome_chromosomes = set()
for s in FastaReader(genome_file):
if s.name in genome_chromosomes:
raise ValueError("{} is duplicate in {}".format(s.name, genome_file))
genome_chromosomes.add(s.name)
# check that the reference is compliant with the chromosome size file
if genome_chromosomes != set(chrom_sizes):
raise ValueError("reference and chromosome size file do not agree")
# load bam files chromosome information
if pysam.__version__.split(".")[1] > 8:
idxstats_1 = filter(None, pysam.idxstats(bamfile1).split("\n"))
idxstats_2 = filter(None, pysam.idxstats(bamfile2).split("\n"))
else:
idxstats_1 = pysam.idxstats(bamfile1)
idxstats_2 = pysam.idxstats(bamfile2)
chrom_bams_1 = dict(map(lambda x: x.split('\t')[0:2], idxstats_1))
chrom_bams_2 = dict(map(lambda x: x.split('\t')[0:2], idxstats_2))
# now check the chromosomes
for chrom, end in chrom_bams_1.items():
if chrom == "*":
continue
if chrom not in chrom_sizes:
raise ValueError("{} has chromosome {} but it's not in reference".format(bamfile1, chrom))
if chrom_sizes[chrom] != end:
tag_count = reads_file.get_tag_count(ref=f.chrom, start=p1, end=p2,
downstream_ext=downstream_ext, upstream_ext=upstream_ext,
forward_shift=forward_shift, reverse_shift=reverse_shift,
initial_clip=initial_clip)
except Exception:
tag_count = 0
f.data = str(int(tag_count))
###################################################################################################
# Writing output
###################################################################################################
output_file_name = os.path.join(output_location, "{}.bed".format(output_prefix))
footprints_overlap.write(output_file_name)
# the number of reads
lines = pysam.idxstats(reads_file.file_name).splitlines()
num_reads = sum(map(int, [x.split("\t")[2] for x in lines if not x.startswith("#")]))
# the number of peaks and tag count within peaks
num_peaks = 0
num_tc = 0
for r in original_regions:
num_peaks += 1
try:
tag_count = reads_file.get_tag_count(r.chrom, r.initial, r.final, downstream_ext, upstream_ext,
forward_shift, reverse_shift, initial_clip)
except Exception:
tag_count = 0
num_tc += tag_count
# the number of footprints
num_fp = len(footprints_overlap)
def get_contigs_with_reads(bam_path: str, with_length: bool = False) -> Generator :
"""
Get all contigs with reads mapped to them
Args:
bam_path(str): path to bam file
with_length(bool): also yield the length of the contig
Yields:
contig(str)
"""
for line in pysam.idxstats(bam_path).split('\n'):
try:
contig, contig_len, mapped_reads, unmapped_reads = line.strip().split()
mapped_reads, unmapped_reads = int(mapped_reads), int(unmapped_reads)
if mapped_reads>0 or unmapped_reads>0:
if with_length:
yield contig, int(contig_len)
else:
yield contig
except ValueError:
pass
for s in FastaReader(genome_file):
if s.name in genome_chromosomes:
raise ValueError("{} is duplicate in {}".format(s.name, genome_file))
genome_chromosomes.add(s.name)
# check that the reference is compliant with the chromosome size file
if genome_chromosomes != set(chrom_sizes):
raise ValueError("reference and chromosome size file do not agree")
# load bam files chromosome information
if pysam.__version__.split(".")[1] > 8:
idxstats_1 = filter(None, pysam.idxstats(bamfile1).split("\n"))
idxstats_2 = filter(None, pysam.idxstats(bamfile2).split("\n"))
else:
idxstats_1 = pysam.idxstats(bamfile1)
idxstats_2 = pysam.idxstats(bamfile2)
chrom_bams_1 = dict(map(lambda x: x.split('\t')[0:2], idxstats_1))
chrom_bams_2 = dict(map(lambda x: x.split('\t')[0:2], idxstats_2))
# now check the chromosomes
for chrom, end in chrom_bams_1.items():
if chrom == "*":
continue
if chrom not in chrom_sizes:
raise ValueError("{} has chromosome {} but it's not in reference".format(bamfile1, chrom))
if chrom_sizes[chrom] != end:
raise ValueError("{} has chromosome {} but size is wrong".format(bamfile1, chrom))
for chrom, end in chrom_bams_2.items():
def idxstats(bam_fname, drop_unmapped=False):
"""Get chromosome names, lengths, and number of mapped/unmapped reads.
Use the BAM index (.bai) to get the number of reads and size of each
chromosome. Contigs with no mapped reads are skipped.
"""
handle = StringIO(pysam.idxstats(bam_fname, split_lines=False))
table = pd.read_csv(handle, sep='\t', header=None,
names=['chromosome', 'length', 'mapped', 'unmapped'])
if drop_unmapped:
table = table[table.mapped != 0].drop('unmapped', axis=1)
return table