How to use the pysam.faidx function in pysam

To help you get started, we’ve selected a few pysam examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github ngsutils / ngsutils / ngsutils / bed / tofasta.py View on Github external
def bed_tofasta(bed, ref_fasta, min_size=50, stranded=True, include_name=False, out=sys.stdout):
    if not os.path.exists('%s.fai' % ref_fasta):
        pysam.faidx(ref_fasta)

    fasta = pysam.Fastafile(ref_fasta)

    refs = set()
    with open('%s.fai' % ref_fasta) as f:
        for line in f:
            refs.add(line.split('\t')[0].strip())

    name = ''
    for region in bed:
        if include_name:
            name = '%s|' % (region.name.strip())

        if region.end - region.start >= min_size and region.chrom in refs:
            seq = fasta.fetch(region.chrom, region.start, region.end)
            if stranded and region.strand:
github YangLab / CIRCexplorer / circ / CIRCexplorer.py View on Github external
if not options[arg]:
            miss_parameters.append(arg)
    if miss_parameters:
        sys.exit('Miss required option: ' + ' '.join(miss_parameters))
    if options['--fusion'] and not options['--junc']:
        try:
            fusion_bam = pysam.AlignmentFile(options['--fusion'], 'rb')
        except:
            sys.exit('Please make sure %s is BAM file!' % options['--fusion'])
    elif not options['--junc'] and not options['--fusion']:
        sys.exit('--fusion or --junc should be used!')
    elif options['--junc'] and options['--fusion']:
        sys.exit('Could not use --fusion and --junc simultaneously!')

    if not os.path.isfile(options['--genome'] + '.fai'):
        pysam.faidx(options['--genome'])
    genome_fa = pysam.FastaFile(options['--genome'])
    ref_f = options['--ref']
    output_prefix = options['--output']
    output = output_prefix + '_circ.txt'
    print('Start CIRCexplorer %s' % __version__)
    if options['--junc']:
        temp1 = options['--junc']
        temp_dir, temp2 = create_temp(options['--tmp'], output_prefix, 0)
    else:
        temp_dir, temp1, temp2 = create_temp(options['--tmp'], output_prefix)
        convert_fusion(fusion_bam, temp1)
    annotate_fusion(ref_f, temp1, temp2)
    fix_fusion(ref_f, genome_fa, temp2, output, options['--no-fix'])
    if not options['--tmp']:
        if options['--junc']:
            delete_temp(temp_dir, temp1, temp2, 0)
github Genomon-Project / GenomonSV / genomon_sv / utils.py View on Github external
def get_seq(reference, chr, start, end):

    seq = ""
    for item in pysam.faidx(reference, chr + ":" + str(start) + "-" + str(end)):
        # if item[0] == ">": continue
        seq = seq + item.rstrip('\n')
    seq = seq.replace('>', '')
    seq = seq.replace(chr + ":" + str(start) + "-" + str(end), '')

    if re.search(r'[^ACGTUWSMKRYBDHVNacgtuwsmkrybdhvn]', seq) is not None:
        print("The return value in get_seq function includes non-nucleotide characters:", file = sys.stderr)
        print(seq, file = sys.stderr)
        sys.exit(1)

    return seq
github YangLab / CIRCexplorer / circ / fetch_ucsc.py View on Github external
for line in ens_id_f:
                iso, gene = line.split()
                ens_iso[iso] = gene
        with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
            with open(sys.argv[3], 'w') as outf:
                for line in ens_f:
                    entry = line.split()
                    iso = entry[1]
                    outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
    elif sys.argv[2] == 'fa':  # Genome sequences
        urllib.urlretrieve(path + 'bigZips/chromFa.tar.gz', 'chromFa.tar.gz')
        with tarfile.open('chromFa.tar.gz', 'r:gz') as seq:
            with open(sys.argv[3], 'w') as outf:
                for f in seq:
                    outf.write(seq.extractfile(f).read())
        pysam.faidx(sys.argv[3])
    else:
        sys.exit('Only support ref/kg/ens/fa!')
github JoseBlanca / seq_crumbs / crumbs / bam / bam_tools.py View on Github external
def _create_sam_reference_index(fpath):
    'It creates a sam index for a reference sequence file'
    index_fpath = fpath + '.fai'
    if os.path.exists(index_fpath):
        return
    pysam.faidx(fpath)
github hyeshik / tailseeker / tailseeker / sequtils.py View on Github external
def __init__(self, filename):
        if not os.path.exists(filename + '.fai'):
            import pysam
            pysam.faidx(filename)

        self.fasta = open(filename)
        self.index = self.load_index(filename + '.fai')
github CGATOxford / cgat / pipeline_snps.py View on Github external
@files( "%s.fasta" % PARAMS["genome"], "%s.fa" % PARAMS["genome"])
def indexGenome( infile, outfile ):
    '''index the genome for samtools.

    Samtools does not like long lines, so create a new file
    with split lines (what a waste).
    '''

    # statement = '''fold %(infile)s | perl -p -e "s/chr//" > %(outfile)s'''
    statement = '''fold %(infile)s > %(outfile)s'''
    P.run()
    
    pysam.faidx( outfile )
github churchill-lab / g2gtools / g2gtools / g2g_utils.py View on Github external
def index_file(original_file, file_format="vcf", overwrite=False):
    """

    :param original_file:
    :param new_file:
    :param file_format:
    :return:
    """
    if overwrite or not has_index_file(original_file, file_format=file_format):
        if file_format.lower() == 'fa':
            pysam.faidx(original_file)
        elif file_format.lower() == 'vcf':
            pysam.tabix_index(original_file, preset="vcf", force=True)
        elif file_format.lower() == 'vci':
            pysam.tabix_index(original_file, seq_col=0, start_col=1, end_col=1, force=True)
        else:
            raise G2GValueError("Unknown file format: {0}".format(file_format))
github mdshw5 / pyfaidx / scripts / benchmark.py View on Github external
def pysam_faidx(n):
        print('timings for pysam.faidx')
        ti = []
        tf = []
        for _ in range(n):
            t = time.time()
            pysam.faidx(fa_file.name)
            ti.append(time.time() - t)

            t = time.time()
            read_pysam(fa_file.name, headers)
            tf.append(time.time() - t)
            os.remove(index)
        # profile memory usage and report timings
        tracemalloc.start()
        pysam.faidx(fa_file.name)
        read_pysam(fa_file.name, headers)
        os.remove(index)
        print(tracemalloc.get_traced_memory())
        print(mean(ti))
        print(mean(tf)/nreads/10*1000*1000)
        tracemalloc.stop()
github CGATOxford / cgat / CGATPipelines / pipeline_variants.py View on Github external
@files("%s.fasta" % PARAMS["genome"], "%s.fa" % PARAMS["genome"])
def indexGenome(infile, outfile):
    '''index the genome for samtools.

    Samtools does not like long lines, so create a new file
    with split lines (what a waste).
    '''

    # statement = '''fold %(infile)s | perl -p -e "s/chr//" > %(outfile)s'''
    statement = '''fold %(infile)s > %(outfile)s'''
    P.run()

    pysam.faidx(outfile)