How to use the pysam.sort function in pysam

To help you get started, we’ve selected a few pysam examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github roryk / spp-idr / spp_idr / idr.py View on Github external
def bam_split(in_file, nfiles=2):
    out_files = _get_split_files(in_file, nfiles)
    out_bases = [os.path.splitext(x)[0] for x in out_files]
    tmp_files = [x + "tmp" for x in out_files]
    samfile = pysam.Samfile(in_file, "rb")
    out_handles = [pysam.Samfile(x, "wb", template=samfile) for x in tmp_files]
    for read in samfile:
        out_handles[random.randint(0, nfiles-1)].write(read)
    samfile.close()
    [x.close() for x in out_handles]
    [pysam.sort(tmp_files[x], out_bases[x]) for x in range(nfiles)]
    [pysam.index(x) for x in out_files]
    return out_files
github rhshah / iCallSV / iCallSV / sortbamByCoordinate.py View on Github external
def sortBam(inputBam, outputBamName, outputDir):
    logger.info("sortbamByCoordinate: Trying to sort BAM file by Coordinate")
    if(os.path.isdir(outputDir)):
        logger.info("sortbamByCoordinate: The output directory %s exists", outputDir)
        outputFile = outputDir + "/" + outputBamName
    else:
        logger.info("sortbamByCoordinate:The output directory %s does not exists !!", outputDir)
        sys.exit()

    if(os.path.isfile(inputBam)):
        try:
            pysam.sort(inputBam, outputFile)
        except IndexError as err:
            exception = "Index error({0}): {1}".format(err.errno, err.strerror)
            logger.info("%s", exception)
        except IOError as err:
            exception = "I/O error({0}): {1}".format(err.errno, err.strerror)
            logger.info("%s", exception)
    else:
        logger.info("sortbamByCoordinate: bam File %s does not exists !!", inputBam)
        sys.exit()
    logger.info("sortbamByCoordinate: Finished sorting BAM file by Coordinate.")
    return(outputFile)
github mitenjain / nanopore / nanopore / metaAnalyses / customTrackAssemblyHub.py View on Github external
outFolderReferenceFiles = parentFolder + hubFastaDir + "/"
			outFolderBamFiles = outFolderReferenceFiles + "bamFiles/"

			# Create hierarchical reference and bamfile folders
			if not os.path.exists(outFolderReferenceFiles):
				os.mkdir(outFolderReferenceFiles)
			if not os.path.exists(outFolderBamFiles):
				os.mkdir(outFolderBamFiles)
			
			# Check and create bam, sorted bam, and indexed bam files
			bamFile = os.path.join(resultsDir, "mapping.bam")
			sortedbamFile = os.path.join(resultsDir, "mapping.sorted")
			if not os.path.isfile(os.path.join(resultsDir, "mapping.bam")):
				try:
					samToBamFile(os.path.join(resultsDir, "mapping.sam"), bamFile)
					pysam.sort(bamFile, sortedbamFile)
					pysam.index(sortedbamFile + ".bam")
				except:
					continue

			# Copy files
			try:
				system("cp %s %s" % (os.path.join(resultsDir, "mapping.sorted.bam"), outFolderBamFiles + experiment + ".sorted.bam"))
				system("cp %s %s" % (os.path.join(resultsDir, "mapping.sorted.bam.bai"), outFolderBamFiles + experiment + ".sorted.bam.bai"))
			except:
				continue

		genomes = open(parentFolder + "genomes.txt", "a")
		for referenceFastaFile in self.referenceFastaFiles:
			if referenceFastaFile.endswith(".fa") or referenceFastaFile.endswith(".fasta"):
				header = referenceFastaFile.split("/")[-1].split(".fa")[0]
				system("cp %s %s" % (referenceFastaFile, parentFolder + header + "/"))
github CGATOxford / UMI-tools / umi_tools / dedup.py View on Github external
stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if not options.no_sort_output:
        # sort the output
        pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        os.unlink(out_name)  # delete the tempfile

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()
        with U.openFile(options.stats + "_per_umi_per_position.tsv", "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
github CGATOxford / cgat / CGATPipelines / PipelinePeakcalling.py View on Github external
nwithout_dups = ninput - nduplicates
    logs.write("duplicates\t%i\t%5.2f%%\n" %
               (nduplicates, 100.0 * nduplicates / ninput))
    logs.write("without duplicates\t%i\t%5.2f%%\n" %
               (nwithout_dups, 100.0 * nwithout_dups / ninput))
    logs.write("target\t%i\t%5.2f%%\n" %
               (min_reads, 100.0 * min_reads / nwithout_dups))
    logs.write("noutput\t%i\t%5.2f%%\n" %
               (noutput, 100.0 * noutput / nwithout_dups))

    logs.close()

    # if more than one samfile: sort
    if len(samfiles) > 1:
        tmpfilename = P.getTempFilename()
        pysam.sort(outfile, tmpfilename)
        shutil.move(tmpfilename + ".bam", outfile)
        os.unlink(tmpfilename)

    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, min_reads))
github pughlab / bamgineer / src / qsub / ruffus / implementGain.py View on Github external
gain_HET_ALT_RE_PAIR_SAMPLED =  "/".join([splittmpbams_hap, chr + event +'_HET_ALT_RE_PAIR_SAMPLED.bam'])
gain_HET_ALT_RE_PAIR_SAMPLED_RENAMED =  "/".join([splittmpbams_hap ,  chr +'_GAIN_HET_ALT_RE_PAIR_SAMPLED_RENAMED.bam'])

gain_NONHET_RE_PAIR = "/".join([splittmpbams_hap, chr + event +'_NONHET_REPAIR.bam'])
gain_NONHET_RE_PAIR_SAMPLED = "/".join([splittmpbams_hap, chr + event +'_NONHET_SAMPLED.bam'])
gain_NONHET_RE_PAIR_SAMPLED_RENAMED = "/".join([splittmpbams_hap, chr + '_GAIN_NONHET_RE_PAIR_SAMPLED_RENAMED.bam'])

gain_NONHET_FINAL = "/".join([finalbams_hap,  chr.upper() +'_GAIN_NH'])
gain_HET_FINAL = "/".join([finalbams_hap,  chr.upper() +'_GAIN_H'])

if(os.path.isfile(HET_ALT)):
   
    samapleratehet = splitAndRePairHET(HET_ALT,  gain_HET_ALT_RE_PAIR, chr )
    subsample(gain_HET_ALT_RE_PAIR,gain_HET_ALT_RE_PAIR_SAMPLED, str(samapleratehet)) # we need to keep a bit more (by 15-20%)
    renamereads(gain_HET_ALT_RE_PAIR_SAMPLED, gain_HET_ALT_RE_PAIR_SAMPLED_RENAMED )
    pysam.sort(gain_HET_ALT_RE_PAIR_SAMPLED_RENAMED,  gain_HET_FINAL)
    #os.remove(gain_HET_ALT_RE_PAIR_SAMPLED_RENAMED)
    #logger.debug('sampling rate for HET'+chr +' = '+ str(samapleratehet))
else:
    print('HET_ALT FILE NOT EXISTING')
       
if(os.path.isfile(NONHET)):
    samapleratenonhet = splitAndRePairNONHET(NONHET, gain_NONHET_RE_PAIR, chr)
    subsample(gain_NONHET_RE_PAIR, gain_NONHET_RE_PAIR_SAMPLED,  str(samapleratenonhet))
    renamereads(gain_NONHET_RE_PAIR_SAMPLED, gain_NONHET_RE_PAIR_SAMPLED_RENAMED)
    pysam.sort(gain_NONHET_RE_PAIR_SAMPLED_RENAMED, gain_NONHET_FINAL)
    #os.remove(gain_NONHET_RE_PAIR_SAMPLED_RENAMED)
else:
    print('NON_HET FILE NOT EXISTING')
github nspies / svviz2 / src / svviz2 / utility / bam.py View on Github external
def bam_sort_index(unsorted_bam_path):
    sorted_bam_path = unsorted_bam_path.replace(".bam", "") + ".sorted.bam"
    pysam.sort("-o", sorted_bam_path, unsorted_bam_path)
    pysam.index(sorted_bam_path)

    os.remove(unsorted_bam_path)

    return sorted_bam_path
github luntergroup / octopus / scripts / split_realigned_bam.py View on Github external
def index_bam(bam_fname, sort_if_needed=False):
    ps.index(str(bam_fname))
    if not bam_index_exists(bam_fname):
        sorted_bam_fname = str(bam_fname).replace('.bam', '.sorted.tmp.bam')
        ps.sort('-o', sorted_bam_fname, str(bam_fname))
        rename(sorted_bam_fname, bam_fname)
        ps.index(str(bam_fname))
github CGATOxford / cgat / pipeline_vitaminD_intervals.py View on Github external
logs.write("# min_reads=%i, threshold= %5.2f\n" % \
                   (min_reads, threshold))
    logs.write("set\tcounts\tpercent\n")
    logs.write("ninput\t%i\t%5.2f%%\n" % (ninput, 100.0) )
    nwithout_dups = ninput - nduplicates
    logs.write("duplicates\t%i\t%5.2f%%\n" % (nduplicates,100.0*nduplicates/ninput))
    logs.write("without duplicates\t%i\t%5.2f%%\n" % (nwithout_dups,100.0*nwithout_dups/ninput))
    logs.write("target\t%i\t%5.2f%%\n" %   (min_reads,100.0*min_reads/nwithout_dups))
    logs.write("noutput\t%i\t%5.2f%%\n" % (noutput,100.0*noutput/nwithout_dups))
    
    logs.close()
    
    # if more than one samfile: sort
    if len(samfiles) > 1:
        tmpfilename = P.getTempFilename()
        pysam.sort( outfile, tmpfilename )
        shutil.move( tmpfilename + ".bam", outfile )
        os.unlink( tmpfilename )

    pysam.index( outfile )

    P.info( "buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" % (ninput, noutput, 100.0*noutput/ninput, min_reads ))
github sbg / Mitty / mitty / benchmarking / god_aligner.py View on Github external
:return:
  """
  logger.debug('Writing to {} ...'.format(bam_fname))
  t0 = time.time()
  fp = pysam.AlignmentFile(bam_fname, 'wb', header=bam_hdr)
  ref_dict = {k['SN']: n for n, k in enumerate(bam_hdr['SQ'])}
  cnt = 0
  for cnt, (qname, read_data) in enumerate(iter(in_queue.get, __process_stop_code__)):
    write_perfect_reads(qname, rg_id, long_qname_table, ref_dict, read_data, cigar_v2, fp)
  fp.close()
  t1 = time.time()
  logger.debug('... {}: {} reads in {:0.2f}s ({:0.2f} t/s)'.format(bam_fname, cnt, t1 - t0, cnt/(t1 - t0)))

  logger.debug('Sorting {} -> {}'.format(bam_fname, bam_fname + '.sorted'))
  t0 = time.time()
  pysam.sort('-m', '1G', '-o', bam_fname + '.sorted', bam_fname)
  os.remove(bam_fname)
  t1 = time.time()
  logger.debug('... {:0.2f}s'.format(t1 - t0))

  logger.debug('Shutting down thread for {}'.format(bam_fname))