How to use the gtdbtk.biolib_lite.taxonomy.Taxonomy.rank_prefixes function in gtdbtk

To help you get started, we’ve selected a few gtdbtk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Ecogenomics / GTDBTk / gtdbtk / infer_ranks.py View on Github external
def _get_median_reds(self, ingroup_domain: str):
        """Get median RED values for domain of ingroup taxon."""

        # get median RED values for domain
        if ingroup_domain == 'd__Bacteria':
            median_reds = RED_DIST_BAC_DICT
        elif ingroup_domain == 'd__Archaea':
            median_reds = RED_DIST_ARC_DICT
        else:
            raise GTDBTkExit(f'Unrecognized GTDB domain: {ingroup_domain}.')

        # report median values
        domain = ingroup_domain.replace('d__', '')
        self.logger.info('Median RED values for {}:'.format(domain))
        for idx, rank_prefix in enumerate(Taxonomy.rank_prefixes):
            if idx != Taxonomy.DOMAIN_IDX and idx != Taxonomy.SPECIES_IDX:
                self.logger.info('  {}\t{:.3f}'.format(
                    Taxonomy.rank_labels[idx].capitalize(),
                    median_reds[rank_prefix]))

        return median_reds
github Ecogenomics / GTDBTk / gtdbtk / biolib_lite / taxonomy.py View on Github external
dict : d[taxon_id] -> [taxon, taxonomy]
            Taxa with invalid rank prefixes.
        dict: d[taxon_id] -> [species name, error message]
            Taxa with invalid species names.
        dict: d[child_taxon_id] -> two or more parent taxon ids
            Taxa with invalid hierarchies.
        """

        # check for incomplete taxonomy strings or unexpected rank prefixes
        invalid_ranks = {}
        invalid_prefixes = {}
        invalid_species_name = {}
        invalid_group_name = {}
        for taxon_id, taxa in taxonomy.items():
            if check_ranks:
                if len(taxa) != len(Taxonomy.rank_prefixes):
                    invalid_ranks[taxon_id] = ';'.join(taxa)
                    continue

            if check_prefixes:
                for r, taxon in enumerate(taxa):
                    if taxon[0:3] != Taxonomy.rank_prefixes[r]:
                        invalid_prefixes[taxon_id] = [taxon, ';'.join(taxa)]
                        break

            if check_group_names:
                for taxon in taxa:
                    canonical_taxon = ' '.join(
                        [t.strip() for t in re.split('_[A-Z]+(?= |$)', taxon[3:])]).strip()
                    if canonical_taxon and re.match('^[a-zA-Z0-9- ]+$', canonical_taxon) is None:
                        invalid_group_name[taxon_id] = [
                            taxon, 'Taxon contains invalid characters']
github Ecogenomics / GTDBTk / gtdbtk / decorate.py View on Github external
parent = leaf
        while parent:
            _support, taxon, _aux_info = parse_label(parent.label)

            if taxon:
                for t in taxon.split(';')[::-1]:
                    leaf_taxa.append(t.strip())

            parent = parent.parent_node

        ordered_taxa = leaf_taxa[::-1]

        # fill in missing ranks
        last_rank = ordered_taxa[-1][0:3]
        for i in range(Taxonomy.rank_prefixes.index(last_rank) + 1, len(Taxonomy.rank_prefixes)):
            ordered_taxa.append(Taxonomy.rank_prefixes[i])

        return ordered_taxa
github Ecogenomics / GTDBTk / gtdbtk / decorate.py View on Github external
extent_taxa_with_label = {}
        for i, rank in enumerate(Taxonomy.rank_labels):
            extent_taxa_with_label[i] = Taxonomy().extant_taxa_for_rank(rank, taxonomy)

        # get parent taxon for each taxon:
        taxon_parents = Taxonomy().parents(taxonomy)

        # get number of leaves and taxon in each lineage
        self.logger.info('Calculating taxa within each lineage.')
        for node in tree.preorder_node_iter():
            num_leaves = 0
            taxa_count = defaultdict(lambda: defaultdict(int))
            for leaf in node.leaf_iter():
                num_leaves += 1
                for rank_index, taxon in enumerate(taxonomy[leaf.taxon.label]):
                    if taxon != Taxonomy.rank_prefixes[rank_index]:
                        taxa_count[rank_index][taxon] += 1

            node.num_leaves = num_leaves
            node.taxa_count = taxa_count

        taxa_in_tree = defaultdict(int)
        for leaf in tree.leaf_node_iter():
            for taxon in taxonomy[leaf.taxon.label]:
                taxa_in_tree[taxon] += 1

        # find node with best F-measure for each taxon
        fmeasure_for_taxa = {}
        for rank_index in range(0, len(Taxonomy.rank_labels)):
            # if rank_index == 6: #*** skip species
            #    continue 
            self.logger.info('Processing {:,} taxa at {} rank.'.format(
github Ecogenomics / GTDBTk / gtdbtk / biolib_lite / taxonomy.py View on Github external
Parameters
        ----------
        taxa : [d__, ..., s__]
            List of taxa.

        Returns
        -------
        list
            List of taxa with filled trailing ranks.
        """

        if not taxa:
            return ';'.join(Taxonomy.rank_prefixes)

        last_rank = Taxonomy.rank_prefixes.index(taxa[-1][0:3])

        for i in range(last_rank + 1, len(Taxonomy.rank_prefixes)):
            taxa.append(Taxonomy.rank_prefixes[i])

        return taxa
github Ecogenomics / GTDBTk / gtdbtk / biolib_lite / taxonomy.py View on Github external
def sort_taxa(self, taxa, reverse=False):
        """Sort taxa by rank and then alphabetically.

        Parameters
        ----------
        taxa : iterable
            Taxa with rank prefixes.

        Returns
        -------
        list
            Taxa sorted by rank and alphabetically within each rank.
        """

        ordered_taxa = []
        for rank_prefix in Taxonomy.rank_prefixes:
            rank_taxa = []
            for taxon in taxa:
                if taxon.startswith(rank_prefix):
                    rank_taxa.append(taxon)

            ordered_taxa.extend(sorted(rank_taxa))

        if reverse:
            ordered_taxa = ordered_taxa[::-1]

        return ordered_taxa
github Ecogenomics / GTDBTk / gtdbtk / biolib_lite / taxonomy.py View on Github external
def fill_trailing_ranks(self, taxa):
        """Fill in missing trailing ranks in a taxonomy string.

        Parameters
        ----------
        taxa : [d__, ..., s__]
            List of taxa.

        Returns
        -------
        list
            List of taxa with filled trailing ranks.
        """

        if not taxa:
            return ';'.join(Taxonomy.rank_prefixes)

        last_rank = Taxonomy.rank_prefixes.index(taxa[-1][0:3])

        for i in range(last_rank + 1, len(Taxonomy.rank_prefixes)):
            taxa.append(Taxonomy.rank_prefixes[i])

        return taxa
github Ecogenomics / GTDBTk / gtdbtk / biolib_lite / taxonomy.py View on Github external
Parameters
        ----------
        taxonomy : d[unique_id] -> [d__; ...; s__]
            Taxonomy strings indexed by unique ids.

        Returns
        -------
        dict : d[taxon] -> lineages
            List of lineages for duplicate taxa.
        """

        # get lineages for each taxon name
        taxon_lineages = defaultdict(set)
        for taxa in taxonomy.values():
            for i, taxon in enumerate(taxa):
                if taxon != Taxonomy.rank_prefixes[i]:
                    taxon_lineages[taxon].add(';'.join(taxa[0:i + 1]))

        # identify taxon belonging to multiple lineages
        duplicates = {}
        for taxon, lineages in taxon_lineages.items():
            if len(lineages) >= 2:
                duplicates[taxon] = lineages

        return duplicates
github Ecogenomics / GTDBTk / gtdbtk / decorate.py View on Github external
for node in taxon_parent_node.preorder_iter():
                    taxa_in_lineage = node.taxa_count[rank_index][taxon]
                    num_leaves_with_taxa = sum(node.taxa_count[rank_index].values())

                    if taxa_in_lineage != 0 and num_leaves_with_taxa != 0:
                        precision = float(taxa_in_lineage) / num_leaves_with_taxa
                        recall = float(taxa_in_lineage) / total_taxa
                        fmeasure = (2 * precision * recall) / (precision + recall)

                        if fmeasure >= cur_taxon_fmeasure:
                            node_taxa = set([l.taxon.label for l in node.leaf_iter()])
                            rogue_out = cur_taxa - node_taxa
                            rogue_in = []
                            for gid in node_taxa - cur_taxa:
                                if taxonomy[gid][rank_index] != Taxonomy.rank_prefixes[rank_index]:
                                    rogue_in.append(gid)

                            stat_table = self.StatsTable(node=node,
                                                         fmeasure=fmeasure,
                                                         precision=precision,
                                                         recall=recall,
                                                         taxa_in_lineage=taxa_in_lineage,
                                                         total_taxa=total_taxa,
                                                         num_leaves_with_taxa=num_leaves_with_taxa,
                                                         rogue_out=rogue_out,
                                                         rogue_in=rogue_in)

                            if fmeasure > cur_taxon_fmeasure:
                                cur_taxon_fmeasure = fmeasure
                                fmeasure_for_taxa[taxon] = [stat_table]
                            elif fmeasure == cur_taxon_fmeasure: