Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_all_records(self):
for datum in test_data:
vcf_reader = vcf.Reader(filename=datum.vcf_file)
bcf_file = pysam.VariantFile(datum.vcf_file)
pyvcf_records = list(vcf_reader)
pysam_records = list(bcf_file)
self.verify_records(datum, pyvcf_records, pysam_records)
bcf_file.close()
def test_vcf_gz_set_meta():
vcf_gz = VcfGz()
with get_input_files('1.vcf_bgzip') as input_files, get_dataset(input_files[0], index_attr='tabix_index') as dataset:
vcf_gz.set_meta(dataset)
f = pysam.VariantFile(dataset.file_name, index_filename=dataset.metadata.tabix_index.file_name)
assert isinstance(f.index, pysam.libcbcf.TabixIndex) is True
else:
raise ValueError("Unexpected variant caller for PhyloWGS prep: %s" % vcaller)
out_file = os.path.join(work_dir, "%s-%s-prep.vcf" % (utils.splitext_plus(os.path.basename(in_file))[0],
vcaller))
if not utils.file_uptodate(out_file, in_file):
check_fn = _min_sample_pass(ignore_file)
with file_transaction(somatic_info.tumor_data, out_file) as tx_out_file:
tx_out_file_raw = "%s-raw%s" % utils.splitext_plus(tx_out_file)
# Filter inputs
with VariantFile(in_file) as bcf_in:
depths = [_sample_depth(rec, somatic_info.tumor_name) for rec in
filter(check_fn, bcf_in)]
depths.sort(reverse=True)
depth_thresh = depths[:config["sample_size"]][-1] if depths else 0
with VariantFile(in_file) as bcf_in:
with VariantFile(tx_out_file_raw, "w", header=bcf_in.header) as bcf_out:
for rec in bcf_in:
if (check_fn(rec) and
(depth_thresh < 5 or _sample_depth(rec, somatic_info.tumor_name) >= depth_thresh)):
bcf_out.write(rec)
# Fix potential chromosome issues
with open(tx_out_file_raw) as in_handle:
with open(tx_out_file, "w") as out_handle:
for line in in_handle:
if not line.startswith("#"):
parts = line.split("\t")
parts[0] = _phylowgs_compatible_chroms(parts[0])
line = "\t".join(parts)
out_handle.write(line)
return variant_type, out_file
if x.split()[0] == chromosome:
lines |= {y for y in range(int(x.split()[1]), int(x.split()[2]))}
# loop through allsites for chromosome
counter = 0
fasta_string = '>{}:{}-{}'.format(chromosome, start, stop)
if pol != 'None':
wga_bed = pysam.TabixFile(pol)
else:
wga_bed = None
# move through vcf
print(fasta_string)
fasta_string = ''
prev_position = start
for line in VariantFile(all_sites).fetch(chromosome, start, stop):
# catch indels that overlap start and trim
position = int(line.pos)
if position < start + 1:
continue
# catch missing sites in allsites (new gatk3.7 feature)
diff = position - prev_position
if diff != 1:
missed_bases = ''.join(['1' for i in range(0, diff-1)])
fasta_string += missed_bases
prev_position = position
# add line break every 60 bases
if len(fasta_string) >= 60:
if len(fasta_string) == 60:
:param vcf_in_fname:
:param vcf_out_fname:
:param ref_fname:
:param bg_fname:
:param d:
:return:
"""
measures = [ShannonEntropy(k=k) for k in [1, 2, 4]] + [LinguisticComplexity(), Mappability(bed_graph=bg_fname)]
info_tags = [m.header_info()[0] for m in measures]
logger.debug('Starting annotation ...')
t0 = time.time()
mode = 'rb' if vcf_in_fname.endswith('bcf') else 'r'
vcf_in = pysam.VariantFile(vcf_in_fname, mode)
for m in measures:
vcf_in.header.info.add(*m.header_info())
vcf_out = pysam.VariantFile(vcf_out_fname, mode='w', header=vcf_in.header)
ref_fp = pysam.FastaFile(ref_fname)
d_2 = int(window_size / 2)
v_cnt = 0
for v_cnt, v in enumerate(vcf_in):
for tag, m in zip(info_tags, measures):
v.info[tag] = m.complexity(
s=ref_fp.fetch(reference=v.contig, start=v.start - d_2, end=v.stop + d_2).upper(),
chrom=v.contig, pos=v.pos)
vcf_out.write(v)
if v_cnt % 10000 == 9999:
t1 = time.time()
def filter_somatic(in_vcf_path, out_vcf_path):
in_vcf = VariantFile(in_vcf_path)
out_vcf = VariantFile(out_vcf_path, 'w', header=in_vcf.header)
num_skipped_records = 0
for rec in in_vcf:
if is_somatic(rec, in_vcf.header):
try:
out_vcf.write(rec)
except OSError:
num_skipped_records += 1
print("Skipped " + str(num_skipped_records) + " bad records")
in_vcf.close()
out_vcf.close()
def extract_fp_fn(fname_in, prefix_out):
"""
:param fname_in:
:param prefix_out:
:return:
"""
logger.debug('Starting filtering ...')
t0 = time.time()
mode = 'rb' if fname_in.endswith('bcf') else 'r'
vcf_in = pysam.VariantFile(fname_in, mode)
fp_vcf_out = pysam.VariantFile(prefix_out + '-fp.vcf', mode='w', header=vcf_in.header)
fp_roi_bed = open(prefix_out + '-fp-roi.bed', 'w')
fn_vcf_out = pysam.VariantFile(prefix_out + '-fn.vcf', mode='w', header=vcf_in.header)
fn_roi_bed = open(prefix_out + '-fn-roi.bed', 'w')
bam_in = pysam.AlignmentFile(fname_in[:-3] + 'bam', "rb")#caner
fp_bam_out = pysam.AlignmentFile(prefix_out+'-fp.bam', "wb", template=bam_in) #caner
fn_bam_out = pysam.AlignmentFile(prefix_out+'-fn.bam', "wb", template=bam_in) #caner
n, fp_cnt, fn_cnt = -1, 0, 0
#CANER - distribution of variant sizes
fp_ins_size_in_curr_bin=0
fp_del_size_in_curr_bin=0
fn_ins_size_in_curr_bin=0
fn_del_size_in_curr_bin=0
fp_snp_size_in_curr_bin=0
fn_snp_size_in_curr_bin=0
fp_subst_size_in_curr_bin=0
fn_subst_size_in_curr_bin=0
e1 = (1.0 / a1) * (((n + 1.0) / (3.0 * (n - 1.0))) - (1.0 / a1))
e2 = (1.0 / (a1**2 + a2)) * \
(((2.0 * (n**2 + n + 3.0)) / ((9.0 * n) * (n - 1.0))) -
((n + 2.0) / (n * a1)) +
(a2 / a1**2))
# print(e2)
vd = (e1 * segsites) + ((e2 * segsites) * (segsites - 1.0))
big_d = little_d / math.sqrt(vd)
return big_d
# get variant site info
allele_freqs = {}
for line in pysam.VariantFile(vcf_file).fetch():
chromo = line.contig
# overcomes lack of z chromosome in zebrafinch data
if skipZ is True and chromo == 'chrZ':
continue
else:
alt_allele_freq = round(line.info['AF'][0], 2)
ref_seq = line.ref
alt_seq = line.alts[0]
# if INDEL mode works out if polarised and if so if del or ins
if mode == 'INDEL':
variant_type = 'Unpolarised'
try:
ancestral_sequence = line.info['AA']
except KeyError:
def loadSNPSFromVcf(self, vcfFilePath, locations=None):
for rec in pysam.VariantFile(vcfFilePath):
if locations is None or (rec.chrom, rec.pos) in locations:
for sample in rec.samples:
# get genotypes:
for allele in rec.samples[sample].alleles:
try:
self.addVariant(
chromosome=rec.chrom,
start=rec.pos,
value=allele,
name='SNP',
variantType='SNP',
end=None)
except BaseException:
pass
def max_normal_germline_depth(in_file, params, somatic_info):
"""Calculate threshold for excluding potential heterozygotes based on normal depth.
"""
bcf_in = pysam.VariantFile(in_file)
depths = []
for rec in bcf_in:
stats = _is_possible_loh(rec, bcf_in, params, somatic_info)
if tz.get_in(["normal", "depth"], stats):
depths.append(tz.get_in(["normal", "depth"], stats))
if depths:
return np.median(depths) * NORMAL_FILTER_PARAMS["max_depth_percent"]