How to use the cyvcf2.VCF function in cyvcf2

To help you get started, we’ve selected a few cyvcf2 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github carjed / helmsman / util.py View on Github external
def process_vcf(self, inputfile):
        """
        Main function for parsing VCF

        """
        # initialize reference genome
        fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)

        # initialize vcf reader
        if self.args.samplefile:
            keep_samples = parseSampleFile(self.args.samplefile)

            vcf_reader = VCF(
                inputfile,
                mode='rb',
                gts012=True,
                lazy=True,
                samples=keep_samples)
        else:
            vcf_reader = VCF(inputfile, mode='rb', gts012=True, lazy=True)

        nbp = (self.args.length - 1) // 2

        # index samples
        if (self.args.samplefile and self.args.groupvar):
            all_samples = vcf_reader.samples

            sg_dict = indexGroups(self.args.samplefile, self.args.groupvar)
            samples = sorted(list(set(sg_dict.values())))
github pwwang / vcfstats / tests / test_formula.py View on Github external
def variants():
	vcf = VCF(str(HERE.parent.joinpath('examples', 'sample.vcf')), gts012 = True)
	return list(vcf)
github Clinical-Genomics / scout / tests / update / test_update_compounds.py View on Github external
This implies that some compounds will have the status 'not_loaded'=True.
       When loading all variants for a region then all variants should 
       have status 'not_loaded'=False.
    """
    adapter = real_populated_database
    variant_type = "clinical"
    category = "snv"
    ## GIVEN a database without any variants
    assert adapter.variant_collection.find_one() is None

    institute_obj = adapter.institute_collection.find_one()
    institute_id = institute_obj["_id"]

    ## WHEN loading variants into the database without updating compound information

    vcf_obj = VCF(variant_clinical_file)
    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    individual_positions = {}
    for i, ind in enumerate(vcf_obj.samples):
        individual_positions[ind] = i

    variants = []
    for i, variant in enumerate(vcf_obj):
        parsed_variant = parse_variant(
            variant=variant,
            case=case_obj,
            variant_type="clinical",
            rank_results_header=rank_results_header,
            vep_header=vep_header,
            individual_positions=individual_positions,
github mcveanlab / treeseq-inference / human-data / tsutil.py View on Github external
def run_benchmark_vcf(args):

    before = time.perf_counter()
    records = cyvcf2.VCF(args.input)
    duration = time.perf_counter() - before
    print("Read BCF header in {:.2f} seconds".format(duration))
    before = time.perf_counter()
    count = 0
    for record in records:
        count += 1
    duration = time.perf_counter() - before
    print("Read {} VCF records in {:.2f} seconds".format(count, duration))
github mcveanlab / treeseq-inference / human-data / convert_1kg.py View on Github external
def convert(
        vcf_file, ancestral_states_file, pedigree_file, output_file, max_variants=None,
        show_progress=False):

    git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
    git_provenance = {
        "repo": "git@github.com:mcveanlab/treeseq-inference.git",
        "hash": git_hash.decode().strip(),
        "dir": "human-data",
        "notes:": (
            "Use the Makefile to download and process the upstream data files")}

    with tsinfer.SampleData(path=output_file, num_flush_threads=2) as sample_data:
        pop_id_map = add_populations(sample_data)

        vcf = cyvcf2.VCF(vcf_file)
        individual_names = list(vcf.samples)
        vcf.close()

        with open(pedigree_file, "r") as ped_file:
            add_samples(ped_file, pop_id_map, individual_names, sample_data)

        ancestral_states = read_ancestral_states(ancestral_states_file, show_progress)

        iterator = variants(vcf_file, ancestral_states, show_progress, max_variants)
        for site in iterator:
            sample_data.add_site(
                position=site.position, genotypes=site.genotypes,
                alleles=site.alleles, metadata=site.metadata)
        sample_data.record_provenance(
            command=sys.argv[0], args=sys.argv[1:], git=git_provenance)
github mcveanlab / treeseq-inference / human-data / convert_1kg.py View on Github external
def variants(vcf_path, ancestral_states, show_progress=False, max_sites=None):
    """
    Yield a tuple of position, alleles, genotypes, metadata. Ancestral_states is a
    dictionary mapping RSID to the ancestral allele.
    """
    tot_sites = vcf_num_rows(vcf_path)
    progress = tqdm.tqdm(
        total=tot_sites, desc="Read genotypes", disable=not show_progress)

    vcf = cyvcf2.VCF(vcf_path)

    sites_used = 0
    non_biallelic = 0
    no_ancestral_state = 0
    missing_data = 0
    indels = 0
    unphased = 0
    invariant = 0

    num_diploids = len(vcf.samples)
    num_samples = 2 * num_diploids
    j = 0
    for row in filter_duplicates(vcf):
        progress.update()
        # only use biallelic sites with data for all samples, where ancestral state is known
        # and we have an RSID.
github hakyimlab / MetaXcan / software / metax / genotype / CYVCF2Genotype.py View on Github external
def vcf_file_geno_lines(path, mode="genotyped", variant_mapping=None, whitelist=None, skip_palindromic=False, liftover_conversion=None):
    logging.log(9, "Processing vcf %s", path)
    vcf_reader = VCF(path)

    is_dict_mapping = variant_mapping is not None and type(variant_mapping) == dict

    for variant in vcf_reader:
        chr = variant.CHROM
        pos = variant.POS
        variant_id = variant.ID
        ref = variant.REF
        alts = variant.ALT

        if liftover_conversion:
            chr_, pos_ = chr, pos
            chr, pos = liftover_conversion(chr, pos)
            if chr == "NA" or pos == "NA":
                continue
github arq5x / gemini / gemini / annotations.py View on Github external
def _get_hits(coords, annotation, parser_type, _parsers=PARSERS):
    """Retrieve BED information, recovering if BED annotation file does have a chromosome.
    """
    try:
        parser = _parsers[parser_type]
    except KeyError:
        raise ValueError("Unexpected parser type: %s" % parser)
    chrom, start, end = coords
    if isinstance(annotation, pysam.VariantFile):
        return annotation.fetch(chrom, start, end)
    elif isinstance(annotation, cyvcf2.VCF):
        return annotation("%s:%d-%d" % (chrom, start - 1, end))
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
    # catch invalid region errors raised by ctabix
    except ValueError:
        hit_iter = []
    # recent versions of pysam return KeyError
    except KeyError:
        hit_iter = []
    except:
        print(annotation.__class__, file=sys.stderr)
        raise
    return hit_iter
github robinandeer / puzzle / puzzle / plugins / vcf / mixins / variant_mixin.py View on Github external
if filters.get('sv_types'):
            sv_types = set(filters['sv_types'])

        logger.info("Get variants from {0}".format(vcf_file_path))

        if filters.get('range'):
            range_str = "{0}:{1}-{2}".format(
                filters['range']['chromosome'],
                filters['range']['start'],
                filters['range']['end'])

            vcf = VCF(vcf_file_path)
            handle = vcf(range_str)
        else:
            handle = VCF(vcf_file_path)

        for variant in handle:
            variant_line = str(variant)
            keep_variant = True

            if genes and keep_variant:
                keep_variant = False
                for gene in genes:
                    if "{0}".format(gene) in variant_line:
                        keep_variant = True
                        break

            if consequences and keep_variant:
                keep_variant = False
                for consequence in consequences:
                    if consequence in variant_line:

cyvcf2

fast vcf parsing with cython + htslib

MIT
Latest version published 4 months ago

Package Health Score

68 / 100
Full package analysis

Similar packages