How to use the biolib.taxonomy.Taxonomy function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / ssu.py View on Github external
Taxonomic classifications of SSU sequences for each genome.
        """
        
        blast = Blast(self.cpus)
        
        self.logger.info('Classifying SSU rRNA genes.')
        classifications = defaultdict(dict)
        for genome_id, seq_file in seq_files.items():
            genome_dir = os.path.join(output_dir, genome_id)

            # blast sequences against 16S database
            blast_file = os.path.join(genome_dir, 'ssu.blastn.tsv')
            blast.blastn(seq_file, ssu_db, blast_file, evalue=evalue_threshold, max_matches=1, output_fmt='custom')

            # read taxonomy file
            taxonomy = Taxonomy().read(ssu_taxonomy_file)

            # write out classification file
            classification_file = os.path.join(genome_dir, 'ssu.taxonomy.tsv')
            fout = open(classification_file, 'w')
            fout.write('query_id\tssu_taxonomy\tssu_length\tssu_blast_subject_id\tssu_blast_evalue\tssu_blast_bitscore\tssu_blast_align_len\tssu_blast_perc_identity\n')

            processed_query_ids = set()
            for line in open(blast_file):
                line_split = [x.strip() for x in line.split('\t')]
                query_id = line_split[0]

                if query_id in processed_query_ids:
                    # A query may have multiple hits to different sections
                    # of a gene. Blast results are organized by e-value so
                    # only the first hit is considered. The subject gene
                    # is the same in all cases so the taxonomy string will
github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
fout.write(rank + ': taxon')
            fout.write('\t' + rank + ': % scaffolds')
            fout.write('\t' + rank + ': % genes')
            fout.write('\t' + rank + ': % coding bps')
            fout.write('\t' + rank + ': avg. e-value')
            fout.write('\t' + rank + ': avg. % identity')
            fout.write('\t' + rank + ': avg. align. length (aa)')
        fout.write('\n')

        # sort profiles in descending order of abundance
        profile, stats = self.profile()

        sorted_profiles = {}
        max_taxa = 0
        for r in range(0, len(Taxonomy.rank_labels)):
            sorted_profile = sorted(profile[r].items(), key=operator.itemgetter(1))
            sorted_profile.reverse()

            sorted_profiles[r] = sorted_profile

            if len(sorted_profiles) > max_taxa:
                max_taxa = len(sorted_profiles)

        # write out table
        total_seqs = len(self.genes_in_scaffold)
        total_genes = sum(self.genes_in_scaffold.values())
        total_coding_bases = sum(self.coding_bases.values())

        for i in range(0, max_taxa):
            for r in range(0, len(Taxonomy.rank_labels)):
                if r != 0:
github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
Scaffold are classified using a majority vote
        over all genes with a valid hit. If less than 20% of
        genes have a valid hit, the scaffold is considered unclassified.
        Classification is performed from the highest (domain)
        to lowest (species) rank. If a rank is taxonomically
        inconsistent with a higher ranks classification, this
        rank and all lower ranks are set to unclassified.

        Returns
        -------
        dict : d[scaffold_id][rank] -> [taxon, HitInfo]
            Classification of each scaffold along with summary statistics
            of hits to the specified taxon.
        """

        expected_parent = Taxonomy().taxonomic_consistency(self.taxonomy)

        # classify each scaffold using a majority vote
        seq_assignments = defaultdict(lambda: defaultdict(list))
        for seq_id, rank_hits in self.hits.items():
            parent_taxa = None
            for rank in range(0, len(Taxonomy.rank_prefixes)):
                taxa = max(rank_hits[rank], key=lambda x: len(rank_hits[rank][x]))
                count = len(rank_hits[rank][taxa])
                
                if (taxa != Taxonomy.rank_prefixes[rank]
                    and (count >= self.percent_to_classify * self.genes_in_scaffold[seq_id])
                    and (rank == 0 or expected_parent[taxa] == parent_taxa)):
                        seq_assignments[seq_id][rank] = [taxa, rank_hits[rank][taxa]]
                        parent_taxa = taxa
                else:
                    # set to unclassified at all lower ranks
github dparks1134 / RefineM / refinem / deprecated / taxonomic_profile.py View on Github external
over all fragments originating from the sequence
        with a valid hit. If less than 20% of fragments have
        a valid hit, the sequence is considered unclassified.
        Classification is performed from the highest (domain)
        to lowest (species) rank. If a rank is taxonomically
        inconsistent with a higher ranks classification, this
        rank and all lower ranks are set to unclassified.

        Returns
        -------
        dict : d[contig_id][rank] -> [taxa, HitInfo]
            Classification of each sequence along with summary statistics
            of hits to the specified taxa.
        """

        expected_parent = Taxonomy().taxonomic_consistency(self.taxonomy)

        # classify each sequence using a majority vote
        seq_assignments = defaultdict(lambda: defaultdict(list))
        for seq_id, rank_hits in self.hits.iteritems():
            parent_taxa = None
            for rank in xrange(0, len(self.rank_prefixes)):
                taxa = max(rank_hits[rank], key=lambda x: len(rank_hits[rank][x]))
                count = len(rank_hits[rank][taxa])

                if count >= self.percent_to_classify * self.fragments_from_seq[seq_id]:
                    if rank == 0 or expected_parent[taxa] == parent_taxa:
                        seq_assignments[seq_id][rank] = [taxa, rank_hits[rank][taxa]]
                        parent_taxa = taxa
                else:
                    # set to  unclassified at all lower ranks
                    for r in xrange(rank, len(self.rank_prefixes)):
github Ecogenomics / GTDBTk / src / gtdbtk / gtdbtk.py View on Github external
def root(self, options):
        """Root tree using outgroup."""
        
        check_file_exists(options.input_tree)
        
        gtdb_taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)
        
        self.logger.info('Identifying genomes from the specified outgroup.')
        outgroup = set()
        for genome_id, taxa in gtdb_taxonomy.iteritems():
            if options.outgroup_taxon in taxa:
                outgroup.add(genome_id)

        reroot = RerootTree()
        reroot.root_with_outgroup(options.input_tree, 
                                    options.output_tree, 
                                    outgroup)
        
        self.logger.info('Done.')
github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
def write_genome_summary(self, output_file):
        """Summarize classification of each genome.

        Parameters
        ----------
        output_file : str
            Output file.
        """

        fout = open(output_file, 'w')
        fout.write('Genome id\t# scaffolds\t# genes\tCoding bases')
        for rank in Taxonomy.rank_labels:
            fout.write('\t' + rank + ': taxon')
            fout.write('\t' + rank + ': % of scaffolds')
            fout.write('\t' + rank + ': % of genes')
            fout.write('\t' + rank + ': % of coding bases')
            fout.write('\t' + rank + ': avg. e-value')
            fout.write('\t' + rank + ': avg. % identity')
            fout.write('\t' + rank + ': avg. align. length (aa)')
        fout.write('\n')

        sorted_genome_ids = alphanumeric_sort(self.profiles.keys())
        for genome_id in sorted_genome_ids:
            self.profiles[genome_id].write_genome_summary(fout)

        fout.close()
github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
def __init__(self, genome_id, percent_to_classify, taxonomy):
        """Initialization.

        Parameters
        ----------
        genome_id : str
            Unique identify of genome.
        percent_to_classify : float
            Minimum percentage of genes to assign scaffold to a taxon [0, 100].
        taxonomy : d[ref_genome_id] -> [domain, phylum, ..., species]
            Taxonomic assignment of each reference genome.
        """

        self.percent_to_classify = percent_to_classify / 100.0

        self.unclassified = Taxonomy.unclassified_rank

        self.genome_id = genome_id
        self.taxonomy = taxonomy

        self.TaxaInfo = namedtuple('TaxaInfo', """evalue
                                                perc_identity
                                                aln_length
                                                num_seqs
                                                num_genes
                                                num_basepairs""")

        # track hits at each rank: dict[scaffold_id][rank][taxa] -> [HitInfo, ...]
        self.HitInfo = namedtuple('HitInfo', """subject_genome_id
                                                subject_gene_id
                                                evalue
                                                perc_identity
github dparks1134 / RefineM / scripts / make_database.py View on Github external
def __init__(self):
        """Initialize."""

        check_dependencies(['comparem', 'diamond', 'makeblastdb'])

        self.underclassified = 'underclassified'

        self.rank_prefixes = Taxonomy.rank_prefixes
        self.rank_index = Taxonomy.rank_index
        self.rank_labels = Taxonomy.rank_labels

        self.time_keeper = TimeKeeper()