How to use the pysam.VariantFile class in pysam

To help you get started, we’ve selected a few pysam examples, based on popular ways it is used in public projects.
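
Before the project examples, here is a minimal sketch of the basic pattern, assuming a placeholder path "example.vcf" (VariantFile also reads bgzipped VCF and BCF):

import pysam

# Minimal usage sketch; "example.vcf" is a placeholder path.
with pysam.VariantFile("example.vcf") as vcf:
    print(list(vcf.header.samples))  # sample names from the header
    for rec in vcf:
        # rec.pos is 1-based; rec.start/rec.stop are 0-based half-open.
        print(rec.chrom, rec.pos, rec.ref, rec.alts)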

From tskit-dev/msprime on GitHub: tskit_tests/test_vcf.py
def test_all_records(self):
    for datum in test_data:
        # Read the same file with both PyVCF and pysam, then compare records.
        vcf_reader = vcf.Reader(filename=datum.vcf_file)
        bcf_file = pysam.VariantFile(datum.vcf_file)
        pyvcf_records = list(vcf_reader)
        pysam_records = list(bcf_file)
        self.verify_records(datum, pyvcf_records, pysam_records)
        bcf_file.close()
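
verify_records is defined elsewhere in the test class; a hypothetical cross-check of the two libraries' records could compare coordinates and alleles (attribute names follow each library's API):

def verify_record_pair(pyvcf_rec, pysam_rec):
    # PyVCF exposes uppercase attributes; pysam uses lowercase ones.
    assert pyvcf_rec.CHROM == pysam_rec.chrom
    assert pyvcf_rec.POS == pysam_rec.pos
    assert pyvcf_rec.REF == pysam_rec.ref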

From galaxyproject/galaxy on GitHub: test/unit/datatypes/test_vcf.py
def test_vcf_gz_set_meta():
    vcf_gz = VcfGz()
    with get_input_files('1.vcf_bgzip') as input_files, get_dataset(input_files[0], index_attr='tabix_index') as dataset:
        vcf_gz.set_meta(dataset)
        f = pysam.VariantFile(dataset.file_name, index_filename=dataset.metadata.tabix_index.file_name)
        assert isinstance(f.index, pysam.libcbcf.TabixIndex)
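
The index_filename argument lets the tabix index live apart from the data file; once opened this way, region queries work as usual. A sketch with placeholder paths:

import pysam

vcf = pysam.VariantFile("calls.vcf.gz", index_filename="calls.tbi")
for rec in vcf.fetch("chr1", 100000, 200000):  # 0-based half-open region
    print(rec.pos)
vcf.close()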

From bcbio/bcbio-nextgen on GitHub: bcbio/heterogeneity/phylowgs.py
    else:
        raise ValueError("Unexpected variant caller for PhyloWGS prep: %s" % vcaller)
    out_file = os.path.join(work_dir, "%s-%s-prep.vcf" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        check_fn = _min_sample_pass(ignore_file)
        with file_transaction(somatic_info.tumor_data, out_file) as tx_out_file:
            tx_out_file_raw = "%s-raw%s" % utils.splitext_plus(tx_out_file)
            # Filter inputs
            with VariantFile(in_file) as bcf_in:
                depths = [_sample_depth(rec, somatic_info.tumor_name) for rec in
                          filter(check_fn, bcf_in)]
                depths.sort(reverse=True)
                depth_thresh = depths[:config["sample_size"]][-1] if depths else 0
            with VariantFile(in_file) as bcf_in:
                with VariantFile(tx_out_file_raw, "w", header=bcf_in.header) as bcf_out:
                    for rec in bcf_in:
                        if (check_fn(rec) and
                              (depth_thresh < 5 or _sample_depth(rec, somatic_info.tumor_name) >= depth_thresh)):
                            bcf_out.write(rec)
            # Fix potential chromosome issues
            with open(tx_out_file_raw) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for line in in_handle:
                        if not line.startswith("#"):
                            parts = line.split("\t")
                            parts[0] = _phylowgs_compatible_chroms(parts[0])
                            line = "\t".join(parts)
                        out_handle.write(line)
    return variant_type, out_file
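
_sample_depth is bcbio-internal and not shown here; a stand-in reading a sample's DP FORMAT value might look like this (hypothetical helper):

def sample_depth(rec, sample_name):
    # Treat a missing or unset DP field as zero depth.
    try:
        depth = rec.samples[sample_name]["DP"]
    except KeyError:
        return 0
    return depth if depth is not None else 0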

From henryjuho/parus_indel on GitHub: summary_analyses/callable_sites_from_vcf_regional.py
            if x.split()[0] == chromosome:
                lines |= {y for y in range(int(x.split()[1]), int(x.split()[2]))}

    # loop through allsites for chromosome
    counter = 0
    fasta_string = '>{}:{}-{}'.format(chromosome, start, stop)
    if pol != 'None':
        wga_bed = pysam.TabixFile(pol)
    else:
        wga_bed = None

    # move through vcf
    print(fasta_string)
    fasta_string = ''
    prev_position = start
    for line in VariantFile(all_sites).fetch(chromosome, start, stop):

        # catch indels that overlap start and trim
        position = int(line.pos)
        if position < start + 1:
            continue

        # catch missing sites in allsites (new gatk3.7 feature)
        diff = position - prev_position
        if diff != 1:
            missed_bases = ''.join(['1' for i in range(0, diff-1)])
            fasta_string += missed_bases
        prev_position = position

        # add line break every 60 bases
        if len(fasta_string) >= 60:
            if len(fasta_string) == 60:
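
The excerpt cuts off inside the 60-column wrapping logic; a self-contained equivalent of that wrapping, for illustration only:

def wrap_fasta(seq, width=60):
    # Break a sequence string into fixed-width FASTA lines.
    return "\n".join(seq[i:i + width] for i in range(0, len(seq), width))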

From sbg/Mitty on GitHub: mitty/benchmarking/complexity.py
  :param vcf_in_fname:
  :param vcf_out_fname:
  :param ref_fname:
  :param bg_fname:
  :param d:
  :return:
  """
  measures = [ShannonEntropy(k=k) for k in [1, 2, 4]] + [LinguisticComplexity(), Mappability(bed_graph=bg_fname)]
  info_tags = [m.header_info()[0] for m in measures]

  logger.debug('Starting annotation ...')
  t0 = time.time()

  mode = 'rb' if vcf_in_fname.endswith('bcf') else 'r'
  vcf_in = pysam.VariantFile(vcf_in_fname, mode)
  for m in measures:
    vcf_in.header.info.add(*m.header_info())
  vcf_out = pysam.VariantFile(vcf_out_fname, mode='w', header=vcf_in.header)

  ref_fp = pysam.FastaFile(ref_fname)

  d_2 = int(window_size / 2)
  v_cnt = 0
  for v_cnt, v in enumerate(vcf_in):
    for tag, m in zip(info_tags, measures):
      v.info[tag] = m.complexity(
        s=ref_fp.fetch(reference=v.contig, start=v.start - d_2, end=v.stop + d_2).upper(),
        chrom=v.contig, pos=v.pos)
    vcf_out.write(v)
    if v_cnt % 10000 == 9999:
      t1 = time.time()
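
The pattern above, registering INFO fields on the input header before writing records that carry them, is the standard pysam idiom. A pared-down sketch with placeholder paths and a made-up CPLX tag:

import pysam

vcf_in = pysam.VariantFile("in.vcf")  # placeholder path
# Declare the new INFO field before writing: add(id, number, type, description).
vcf_in.header.info.add("CPLX", 1, "Float", "Local sequence complexity")
vcf_out = pysam.VariantFile("out.vcf", mode="w", header=vcf_in.header)
for rec in vcf_in:
    rec.info["CPLX"] = 0.5  # dummy value for illustration
    vcf_out.write(rec)
vcf_out.close()
vcf_in.close()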

From luntergroup/octopus on GitHub: scripts/train_somatic_random_forest.py
from pysam import VariantFile

def filter_somatic(in_vcf_path, out_vcf_path):
    # is_somatic is defined elsewhere in the same script.
    in_vcf = VariantFile(in_vcf_path)
    out_vcf = VariantFile(out_vcf_path, 'w', header=in_vcf.header)
    num_skipped_records = 0
    for rec in in_vcf:
        if is_somatic(rec, in_vcf.header):
            try:
                out_vcf.write(rec)
            except OSError:
                num_skipped_records += 1
    print("Skipped " + str(num_skipped_records) + " bad records")
    in_vcf.close()
    out_vcf.close()
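
is_somatic comes from the Octopus training script and is not shown here (note it also receives the header); a hypothetical predicate keyed on an INFO flag could be:

def is_somatic_sketch(rec):
    # Hypothetical: treat the presence of a SOMATIC INFO flag as the marker.
    return "SOMATIC" in rec.info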

From sbg/Mitty on GitHub: mitty/benchmarking/filtereval_caner.py
def extract_fp_fn(fname_in, prefix_out):
  """

  :param fname_in:
  :param prefix_out:
  :return:
  """
  logger.debug('Starting filtering ...')
  t0 = time.time()

  mode = 'rb' if fname_in.endswith('bcf') else 'r'
  vcf_in = pysam.VariantFile(fname_in, mode)

  fp_vcf_out = pysam.VariantFile(prefix_out + '-fp.vcf', mode='w', header=vcf_in.header)
  fp_roi_bed = open(prefix_out + '-fp-roi.bed', 'w')
  fn_vcf_out = pysam.VariantFile(prefix_out + '-fn.vcf', mode='w', header=vcf_in.header)
  fn_roi_bed = open(prefix_out + '-fn-roi.bed', 'w')
  bam_in = pysam.AlignmentFile(fname_in[:-3] + 'bam', "rb")  # caner
  fp_bam_out = pysam.AlignmentFile(prefix_out + '-fp.bam', "wb", template=bam_in)  # caner
  fn_bam_out = pysam.AlignmentFile(prefix_out + '-fn.bam', "wb", template=bam_in)  # caner

  n, fp_cnt, fn_cnt = -1, 0, 0
  # CANER: distribution of variant sizes
  fp_ins_size_in_curr_bin = 0
  fp_del_size_in_curr_bin = 0
  fn_ins_size_in_curr_bin = 0
  fn_del_size_in_curr_bin = 0
  fp_snp_size_in_curr_bin = 0
  fn_snp_size_in_curr_bin = 0
  fp_subst_size_in_curr_bin = 0
  fn_subst_size_in_curr_bin = 0
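
The size-binning counters above suggest classifying variants by length; an illustrative classifier from REF/ALT lengths (first ALT only):

def variant_kind_and_size(rec):
    delta = len(rec.alts[0]) - len(rec.ref)
    if delta > 0:
        return "ins", delta
    if delta < 0:
        return "del", -delta
    # Same length: single-base records are SNPs, longer ones substitutions.
    return ("snp" if len(rec.ref) == 1 else "subst"), 0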

From henryjuho/parus_indel on GitHub: summarise_vcf.py
    e1 = (1.0 / a1) * (((n + 1.0) / (3.0 * (n - 1.0))) - (1.0 / a1))
    e2 = (1.0 / (a1**2 + a2)) * \
         (((2.0 * (n**2 + n + 3.0)) / ((9.0 * n) * (n - 1.0))) -
          ((n + 2.0) / (n * a1)) +
          (a2 / a1**2))
    # print(e2)
    vd = (e1 * segsites) + ((e2 * segsites) * (segsites - 1.0))
    big_d = little_d / math.sqrt(vd)

    return big_d

# get variant site info
allele_freqs = {}

for line in pysam.VariantFile(vcf_file).fetch():
    chromo = line.contig

    # overcomes lack of z chromosome in zebrafinch data
    if skipZ is True and chromo == 'chrZ':
        continue
    else:
        alt_allele_freq = round(line.info['AF'][0], 2)
        ref_seq = line.ref
        alt_seq = line.alts[0]

        # if INDEL mode works out if polarised and if so if del or ins
        if mode == 'INDEL':
            variant_type = 'Unpolarised'
            try:
                ancestral_sequence = line.info['AA']
            except KeyError:
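
The excerpt ends inside the AA lookup. pysam raises KeyError for absent INFO fields, which the try/except handles; rec.info.get offers a non-raising alternative:

ancestral = line.info.get("AA")  # None when the field is absent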

From BuysDB/SingleCellMultiOmics on GitHub: singlecellmultiomics/features/features.py
def loadSNPSFromVcf(self, vcfFilePath, locations=None):
    for rec in pysam.VariantFile(vcfFilePath):
        if locations is None or (rec.chrom, rec.pos) in locations:
            for sample in rec.samples:
                # get genotypes:
                for allele in rec.samples[sample].alleles:
                    try:
                        self.addVariant(
                            chromosome=rec.chrom,
                            start=rec.pos,
                            value=allele,
                            name='SNP',
                            variantType='SNP',
                            end=None)
                    except BaseException:
                        # Ignore any error from addVariant and continue.
                        pass
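
Per-sample genotype access follows the same mapping pattern; a minimal sketch with a placeholder path:

import pysam

with pysam.VariantFile("snps.vcf") as vcf:  # placeholder path
    for rec in vcf:
        for name, call in rec.samples.items():
            # call["GT"] gives allele indices; call.alleles gives the bases.
            print(name, call["GT"], call.alleles)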

From bcbio/bcbio-nextgen on GitHub: bcbio/heterogeneity/bubbletree.py
def max_normal_germline_depth(in_file, params, somatic_info):
    """Calculate threshold for excluding potential heterozygotes based on normal depth.
    """
    bcf_in = pysam.VariantFile(in_file)
    depths = []
    for rec in bcf_in:
        stats = _is_possible_loh(rec, bcf_in, params, somatic_info)
        depth = tz.get_in(["normal", "depth"], stats)
        if depth:
            depths.append(depth)
    if depths:
        return np.median(depths) * NORMAL_FILTER_PARAMS["max_depth_percent"]