How to use the biolib.seq_io function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / deprecated / taxonomic_profile.py View on Github external
diamond_output_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(diamond_output_dir)

        fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
        fragment_out = open(fragment_file, 'w')
        contig_id_to_genome_id = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            self.profiles[genome_id] = Profile(genome_id, taxonomy)
            self._fragment_genomes(genome_file,
                                  window_size,
                                  step_size,
                                  self.profiles[genome_id],
                                  fragment_out)

            for seq_id, _seq in seq_io.read_seq(genome_file):
                contig_id_to_genome_id[seq_id] = genome_id

        # run diamond
        self.logger.info('')
        self.logger.info('  Running diamond blastx with %d processes (be patient!)' % self.cpus)

        diamond = Diamond(self.cpus)
        diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
        diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

        diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
        diamond.view(diamond_daa_out + '.daa', diamond_table_out)

        self.logger.info('')
        self.logger.info('  Creating taxonomic profile for each genome.')
        self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)
github dparks1134 / RefineM / refinem / unbinned.py View on Github external
Returns
        -------
        dict : d[seq_id] -> seq
            Dictionary of unbinned sequences.
        """

        check_file_exists(scaffold_file)

        # get list of sequences in bins
        self.logger.info('Reading binned scaffolds.')

        binned_seq_ids = set()
        total_binned_bases = 0
        for genome_file in genome_files:
            for seq_id, seq in seq_io.read_seq(genome_file):
                binned_seq_ids.add(seq_id)
                total_binned_bases += len(seq)

        self.logger.info('Read %d (%.2f Mbp) binned scaffolds.' % (len(binned_seq_ids), float(total_binned_bases) / 1e6))

        # write all unbinned sequences
        self.logger.info('Identifying unbinned scaffolds >= %d bp.' % min_seq_len)

        unbinned_bases = 0
        unbinned_seqs = {}
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id not in binned_seq_ids and len(seq) >= min_seq_len:
                unbinned_seqs[seq_id] = seq
                unbinned_bases += len(seq)

        self.logger.info('Identified %d (%.2f Mbp) unbinned scaffolds.' % (len(unbinned_seqs), float(unbinned_bases) / 1e6))
github dparks1134 / RefineM / refinem / bin_comparer.py View on Github external
Second set of genome files in fasta format.
        seq_file : str
            Scaffolds/contigs binned to create genomes.
        output_file : str
            Desire file to write results.
        """

        # determine total number of sequences
        self.logger.info('Reading sequences.')

        seq_lens = {}
        total_bases = 0
        num_seqs_over_length = defaultdict(int)
        total_bases_over_length = defaultdict(int)
        lengths_to_check = [1000, 5000, 10000, 20000, 50000]
        for seq_id, seq in seq_io.read_seq(seq_file):
            seq_len = len(seq)
            seq_lens[seq_id] = seq_len
            total_bases += seq_len

            for length in lengths_to_check:
                if seq_len >= length:
                    num_seqs_over_length[length] += 1
                    total_bases_over_length[length] += seq_len

        # determine sequences in each bin
        genome_seqs1 = self._genome_seqs(genome_files1)
        genome_seqs2 = self._genome_seqs(genome_files2)

        # determine bin stats
        genome_stats1, total_uniq_binned_seqs1, total_uniq_binned_bases1, num_repeats1 = self._genome_stats(genome_seqs1, seq_lens)
        genome_stats2, total_uniq_binned_seqs2, total_uniq_binned_bases2, num_repeats2 = self._genome_stats(genome_seqs2, seq_lens)