Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def process_vcf(self, inputfile):
"""
Main function for parsing VCF
"""
# initialize reference genome
fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)
# initialize vcf reader
if self.args.samplefile:
keep_samples = parseSampleFile(self.args.samplefile)
vcf_reader = VCF(
inputfile,
mode='rb',
gts012=True,
lazy=True,
samples=keep_samples)
else:
vcf_reader = VCF(inputfile, mode='rb', gts012=True, lazy=True)
nbp = (self.args.length - 1) // 2
# index samples
if (self.args.samplefile and self.args.groupvar):
all_samples = vcf_reader.samples
sg_dict = indexGroups(self.args.samplefile, self.args.groupvar)
samples = sorted(list(set(sg_dict.values())))
def variants():
vcf = VCF(str(HERE.parent.joinpath('examples', 'sample.vcf')), gts012 = True)
return list(vcf)
This implies that some compounds will have the status 'not_loaded'=True.
When loading all variants for a region then all variants should
have status 'not_loaded'=False.
"""
adapter = real_populated_database
variant_type = "clinical"
category = "snv"
## GIVEN a database without any variants
assert adapter.variant_collection.find_one() is None
institute_obj = adapter.institute_collection.find_one()
institute_id = institute_obj["_id"]
## WHEN loading variants into the database without updating compound information
vcf_obj = VCF(variant_clinical_file)
rank_results_header = parse_rank_results_header(vcf_obj)
vep_header = parse_vep_header(vcf_obj)
individual_positions = {}
for i, ind in enumerate(vcf_obj.samples):
individual_positions[ind] = i
variants = []
for i, variant in enumerate(vcf_obj):
parsed_variant = parse_variant(
variant=variant,
case=case_obj,
variant_type="clinical",
rank_results_header=rank_results_header,
vep_header=vep_header,
individual_positions=individual_positions,
v.alt = 'asd'
# non-fixed arguments
v.id = 'asd'
v.qual = 10
v.filter = 'asd'
v.source = 2
assert isinstance(Variant("chr1", '10', 'C', 'T').pos, int)
# from cyvcf2
vcf = cyvcf2.VCF('tests/data/test.vcf.gz')
cv = list(vcf)[0]
v2 = Variant.from_cyvcf(cv)
assert isinstance(v2.source, cyvcf2.Variant)
def __init__(self, filename, reference=None):
if not os.path.isfile(filename) and filename != "-":
exit(message("Error: " + filename + " does not exist"))
self.filename = filename
if reference:
self.reference = reference
self.reference_file = resolve_reference_genome(reference)
cyvcf2.__init__(self, self.filename)
# Check if file exists
self.n = len(self.samples) # Number of Samples
# Meta Data
comp = re.compile(r'''^##(?P[^<#]+?)=(?P[^<#]+$)''', re.M)
self.metadata = OrderedDict(comp.findall(self.raw_header))
# Contigs
self.contigs = OrderedDict(zip(
re.compile("##contig=]*?)>").findall(self.raw_header))
))
self.info_set = [x for x in self.header_iter() if x.type == "INFO"]
self.filter_set = [x for x in self.header_iter() if x.type == "FILTER"]
self.format_set = [x for x in self.header_iter() if x.type == "FORMAT"]
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
"""Filter VCF to only those with a strict PASS/REJECT: somatic + germline.
Removes low quality calls filtered but also labeled with REJECT.
"""
from bcbio.heterogeneity import bubbletree
out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
if out_dir:
out_file = os.path.join(out_dir, os.path.basename(out_file))
if not utils.file_uptodate(out_file, in_file):
with file_transaction(paired.tumor_data, out_file) as tx_out_file:
max_depth = bubbletree.max_normal_germline_depth(in_file, bubbletree.PARAMS, paired)
tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
reader = _add_db_to_header(reader)
with contextlib.closing(cyvcf2.Writer(tx_out_plain, reader)) as writer:
for rec in reader:
filters = rec.FILTER.split(";") if rec.FILTER else []
other_filters = [x for x in filters if x not in ["PASS", ".", "REJECT"]]
if len(other_filters) == 0 or bubbletree.is_info_germline(rec):
# Germline, check if we should include based on frequencies
if "REJECT" in filters or bubbletree.is_info_germline(rec):
stats = bubbletree._is_possible_loh(rec, reader, bubbletree.PARAMS, paired,
use_status=True, max_normal_depth=max_depth)
if stats:
rec.FILTER = "PASS"
rec.INFO["DB"] = True
writer.write_record(rec)
# Somatic, always include
else:
writer.write_record(rec)
vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
import sys
import re
from cyvcf2 import VCF, Writer
date = sys.argv[2]
vcf = VCF(sys.argv[1])
fhs = {'benign': Writer('clinvar-benign.%s.vcf' % date, vcf),
'benign-likely_benign': Writer('clinvar-benign-likely_benign.%s.vcf' % date, vcf),
'pathogenic': Writer('clinvar-pathogenic.%s.vcf' % date, vcf),
'expert': Writer('clinvar-pathogenic-expert.%s.vcf' % date, vcf),
'likely_pathogenic-pathogenic': Writer('clinvar-pathogenic-likely_pathogenic.%s.vcf' % date, vcf),
}
wtr = fhs['pathogenic']
fhs['likely_benign'] = fhs['benign-likely_benign']
fhs['likely_pathogenic'] = fhs['likely_pathogenic-pathogenic']
# benigns also get written just to the joint file
lookup = {}
lookup['benign'] = 'benign-likely_benign'
lookup['pathogenic'] = 'likely_pathogenic-pathogenic'
exclude = "criteria_provided,_conflicting_interpretations no_assertion_criteria_provided no_assertion_provided no_interpretation_for_the_single_variant".split()
for v in vcf:
return smax
if __name__ == "__main__":
import doctest
doctest.testmod()
vcf = VCF(sys.argv[1])
vcf.add_info_to_header({'ID': 'grantham', 'Description': 'grantham score',
'Type':'Integer', 'Number': '1'})
vcf.add_info_to_header({'ID': 'mis_badness', 'Description': 'missense badness score',
'Type':'Float', 'Number': '1'})
vcf.add_info_to_header({'ID': 'blosum', 'Description': 'blosum score',
'Type':'Integer', 'Number': '1'})
wtr = Writer("-", vcf)
for v in vcf:
csqs = v.INFO.get("BCSQ", "").split(",")
smax = None
for csq in csqs:
smax = score(csq, smax)
if not smax:
continue
for k in ('Grantham', 'mis_badness', 'BLOSUM'):
v.INFO[k.lower()] = smax[k]
wtr.write_record(v)
wtr.close()
def __init__(self, filename, reference=None, use_template="ALT", polymorphic=True):
if not os.path.isfile(filename) and filename != "-":
exit(message("Error: " + filename + " does not exist"))
self.filename = filename
self.use_template = use_template
self.use_polymorphic = polymorphic
if reference:
self.reference = reference
self.reference_file = resolve_reference_genome(reference)
cyvcf2.__init__(self, self.filename)
# Check if file exists
self.n = len(self.samples) # Number of Samples
# Meta Data
comp = re.compile(r'''^##(?P[^<#]+?)=(?P[^<#]+$)''', re.M)
self.metadata = OrderedDict(comp.findall(self.raw_header))
# Contigs
self.contigs = OrderedDict(zip(
re.compile("##contig=").findall(self.raw_header))
))
self.info_set = [x for x in self.header_iter() if x.type == "INFO"]
self.filter_set = [x for x in self.header_iter() if x.type == "FILTER"]
self.format_set = [x for x in self.header_iter() if x.type == "FORMAT"]
def __init__(self, filename, reference=None):
if not os.path.isfile(filename) and filename != "-":
with indent(4):
exit(puts(colored.red("\nError: " + filename + " does not exist\n")))
self.filename = filename
if reference:
self.reference = reference
cyvcf2.__init__(self, self.filename)
# Check if file exists
self.n = len(self.samples) # Number of Samples
# Meta Data
comp = re.compile(r'''^##(?P[^<#]+?)=(?P[^<#]+$)''', re.M)
self.metadata = OrderedDict(comp.findall(self.raw_header))
# Contigs
self.contigs = OrderedDict(zip(
re.compile("##contig=").findall(self.raw_header))
))
# Info
r = re.compile(r'''\#\#INFO=<
ID=(?P[^,]+),
Number=(?P-?\d+|\.|[AG]),