How to use cyvcf2 - 10 common examples

To help you get started, we’ve selected a few cyvcf2 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github carjed / helmsman / util.py View on Github external
def process_vcf(self, inputfile):
        """
        Main function for parsing VCF

        """
        # initialize reference genome
        fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)

        # initialize vcf reader
        if self.args.samplefile:
            keep_samples = parseSampleFile(self.args.samplefile)

            vcf_reader = VCF(
                inputfile,
                mode='rb',
                gts012=True,
                lazy=True,
                samples=keep_samples)
        else:
            vcf_reader = VCF(inputfile, mode='rb', gts012=True, lazy=True)

        nbp = (self.args.length - 1) // 2

        # index samples
        if (self.args.samplefile and self.args.groupvar):
            all_samples = vcf_reader.samples

            sg_dict = indexGroups(self.args.samplefile, self.args.groupvar)
            samples = sorted(list(set(sg_dict.values())))
github pwwang / vcfstats / tests / test_formula.py View on Github external
def variants():
	vcf = VCF(str(HERE.parent.joinpath('examples', 'sample.vcf')), gts012 = True)
	return list(vcf)
github Clinical-Genomics / scout / tests / update / test_update_compounds.py View on Github external
This implies that some compounds will have the status 'not_loaded'=True.
       When loading all variants for a region then all variants should 
       have status 'not_loaded'=False.
    """
    adapter = real_populated_database
    variant_type = "clinical"
    category = "snv"
    ## GIVEN a database without any variants
    assert adapter.variant_collection.find_one() is None

    institute_obj = adapter.institute_collection.find_one()
    institute_id = institute_obj["_id"]

    ## WHEN loading variants into the database without updating compound information

    vcf_obj = VCF(variant_clinical_file)
    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    individual_positions = {}
    for i, ind in enumerate(vcf_obj.samples):
        individual_positions[ind] = i

    variants = []
    for i, variant in enumerate(vcf_obj):
        parsed_variant = parse_variant(
            variant=variant,
            case=case_obj,
            variant_type="clinical",
            rank_results_header=rank_results_header,
            vep_header=vep_header,
            individual_positions=individual_positions,
github kipoi / kipoiseq / tests / test_dataclasses.py View on Github external
v.alt = 'asd'

    # non-fixed arguments
    v.id = 'asd'
    v.qual = 10
    v.filter = 'asd'
    v.source = 2

    assert isinstance(Variant("chr1", '10', 'C', 'T').pos, int)

    # from cyvcf2
    vcf = cyvcf2.VCF('tests/data/test.vcf.gz')
    cv = list(vcf)[0]

    v2 = Variant.from_cyvcf(cv)
    assert isinstance(v2.source, cyvcf2.Variant)
github AndersenLab / VCF-kit / vcfkit / utils / vcf.py View on Github external
def __init__(self, filename, reference=None):
        if not os.path.isfile(filename) and filename != "-":
            exit(message("Error: " + filename + " does not exist"))
        self.filename = filename
        if reference:
            self.reference = reference
            self.reference_file = resolve_reference_genome(reference)

        cyvcf2.__init__(self, self.filename)
        # Check if file exists
        self.n = len(self.samples)  # Number of Samples

        # Meta Data
        comp = re.compile(r'''^##(?P[^<#]+?)=(?P[^<#]+$)''', re.M)
        self.metadata = OrderedDict(comp.findall(self.raw_header))

        # Contigs
        self.contigs = OrderedDict(zip(
            re.compile("##contig=]*?)>").findall(self.raw_header))
        ))

        self.info_set = [x for x in self.header_iter() if x.type == "INFO"]
        self.filter_set = [x for x in self.header_iter() if x.type == "FILTER"]
        self.format_set = [x for x in self.header_iter() if x.type == "FORMAT"]
github bcbio / bcbio-nextgen / bcbio / variation / germline.py View on Github external
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    Removes low quality calls filtered but also labeled with REJECT.
    """
    from bcbio.heterogeneity import bubbletree
    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            max_depth = bubbletree.max_normal_germline_depth(in_file, bubbletree.PARAMS, paired)
            tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
            with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
                reader = _add_db_to_header(reader)
                with contextlib.closing(cyvcf2.Writer(tx_out_plain, reader)) as writer:
                    for rec in reader:
                        filters = rec.FILTER.split(";") if rec.FILTER else []
                        other_filters = [x for x in filters if x not in ["PASS", ".", "REJECT"]]
                        if len(other_filters) == 0 or bubbletree.is_info_germline(rec):
                            # Germline, check if we should include based on frequencies
                            if "REJECT" in filters or bubbletree.is_info_germline(rec):
                                stats = bubbletree._is_possible_loh(rec, reader, bubbletree.PARAMS, paired,
                                                                    use_status=True, max_normal_depth=max_depth)
                                if stats:
                                    rec.FILTER = "PASS"
                                    rec.INFO["DB"] = True
                                    writer.write_record(rec)
                            # Somatic, always include
                            else:
                                writer.write_record(rec)
            vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
github quinlan-lab / pathoscore / truth-sets / GRCh37 / clinvar / make.py View on Github external
import sys
import re
from cyvcf2 import VCF, Writer

date = sys.argv[2]

vcf = VCF(sys.argv[1])

fhs = {'benign': Writer('clinvar-benign.%s.vcf' % date, vcf),
       'benign-likely_benign': Writer('clinvar-benign-likely_benign.%s.vcf' % date, vcf),
       'pathogenic': Writer('clinvar-pathogenic.%s.vcf' % date, vcf),
       'expert': Writer('clinvar-pathogenic-expert.%s.vcf' % date, vcf),
       'likely_pathogenic-pathogenic': Writer('clinvar-pathogenic-likely_pathogenic.%s.vcf' % date, vcf),
       }
wtr = fhs['pathogenic']

fhs['likely_benign'] = fhs['benign-likely_benign']
fhs['likely_pathogenic'] = fhs['likely_pathogenic-pathogenic']

# benigns also get written just to the joint file
lookup = {}
lookup['benign'] = 'benign-likely_benign'
lookup['pathogenic'] = 'likely_pathogenic-pathogenic'

exclude = "criteria_provided,_conflicting_interpretations no_assertion_criteria_provided no_assertion_provided no_interpretation_for_the_single_variant".split()

for v in vcf:
github quinlan-lab / pathoscore / truth-sets / score.py View on Github external
return smax

if __name__ == "__main__":
    import doctest
    doctest.testmod()

    vcf = VCF(sys.argv[1])
    vcf.add_info_to_header({'ID': 'grantham', 'Description': 'grantham score',
                            'Type':'Integer', 'Number': '1'})
    vcf.add_info_to_header({'ID': 'mis_badness', 'Description': 'missense badness score',
                            'Type':'Float', 'Number': '1'})
    vcf.add_info_to_header({'ID': 'blosum', 'Description': 'blosum score',
                            'Type':'Integer', 'Number': '1'})


    wtr = Writer("-", vcf)

    for v in vcf:
        csqs = v.INFO.get("BCSQ", "").split(",")
        smax = None
        for csq in csqs:
            smax = score(csq, smax)
        if not smax:
            continue

        for k in ('Grantham', 'mis_badness', 'BLOSUM'):
            v.INFO[k.lower()] = smax[k]

        wtr.write_record(v)
    wtr.close()
github AndersenLab / VCF-kit / vcfkit / utils / primer_vcf.py View on Github external
def __init__(self, filename, reference=None, use_template="ALT", polymorphic=True):
        if not os.path.isfile(filename) and filename != "-":
            exit(message("Error: " + filename + " does not exist"))
        self.filename = filename
        self.use_template = use_template
        self.use_polymorphic = polymorphic
        if reference:
            self.reference = reference
            self.reference_file = resolve_reference_genome(reference)

        cyvcf2.__init__(self, self.filename)
        # Check if file exists
        self.n = len(self.samples)  # Number of Samples

        # Meta Data
        comp = re.compile(r'''^##(?P[^<#]+?)=(?P[^<#]+$)''', re.M)
        self.metadata = OrderedDict(comp.findall(self.raw_header))

        # Contigs
        self.contigs = OrderedDict(zip(
            re.compile("##contig=").findall(self.raw_header))
        ))

        self.info_set = [x for x in self.header_iter() if x.type == "INFO"]
        self.filter_set = [x for x in self.header_iter() if x.type == "FILTER"]
        self.format_set = [x for x in self.header_iter() if x.type == "FORMAT"]
github AndersenLab / VCF-kit / tb / utils / vcf.py View on Github external
def __init__(self, filename, reference=None):
        if not os.path.isfile(filename) and filename != "-":
            with indent(4):
                exit(puts(colored.red("\nError: " + filename + " does not exist\n")))
        self.filename = filename
        if reference:
            self.reference = reference

        cyvcf2.__init__(self, self.filename)
        # Check if file exists
        self.n = len(self.samples)  # Number of Samples

        # Meta Data
        comp = re.compile(r'''^##(?P[^<#]+?)=(?P[^<#]+$)''', re.M)
        self.metadata = OrderedDict(comp.findall(self.raw_header))

        # Contigs
        self.contigs = OrderedDict(zip(
            re.compile("##contig=").findall(self.raw_header))
        ))
        # Info
        r = re.compile(r'''\#\#INFO=<
            ID=(?P[^,]+),
            Number=(?P-?\d+|\.|[AG]),

cyvcf2

fast vcf parsing with cython + htslib

MIT
Latest version published 6 months ago

Package Health Score

63 / 100
Full package analysis

Similar packages