Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _get_median_reds(self, ingroup_domain: str):
"""Get median RED values for domain of ingroup taxon."""
# get median RED values for domain
if ingroup_domain == 'd__Bacteria':
median_reds = RED_DIST_BAC_DICT
elif ingroup_domain == 'd__Archaea':
median_reds = RED_DIST_ARC_DICT
else:
raise GTDBTkExit(f'Unrecognized GTDB domain: {ingroup_domain}.')
# report median values
domain = ingroup_domain.replace('d__', '')
self.logger.info('Median RED values for {}:'.format(domain))
for idx, rank_prefix in enumerate(Taxonomy.rank_prefixes):
if idx != Taxonomy.DOMAIN_IDX and idx != Taxonomy.SPECIES_IDX:
self.logger.info(' {}\t{:.3f}'.format(
Taxonomy.rank_labels[idx].capitalize(),
median_reds[rank_prefix]))
return median_reds
dict : d[taxon_id] -> [taxon, taxonomy]
Taxa with invalid rank prefixes.
dict: d[taxon_id] -> [species name, error message]
Taxa with invalid species names.
dict: d[child_taxon_id] -> two or more parent taxon ids
Taxa with invalid hierarchies.
"""
# check for incomplete taxonomy strings or unexpected rank prefixes
invalid_ranks = {}
invalid_prefixes = {}
invalid_species_name = {}
invalid_group_name = {}
for taxon_id, taxa in taxonomy.items():
if check_ranks:
if len(taxa) != len(Taxonomy.rank_prefixes):
invalid_ranks[taxon_id] = ';'.join(taxa)
continue
if check_prefixes:
for r, taxon in enumerate(taxa):
if taxon[0:3] != Taxonomy.rank_prefixes[r]:
invalid_prefixes[taxon_id] = [taxon, ';'.join(taxa)]
break
if check_group_names:
for taxon in taxa:
canonical_taxon = ' '.join(
[t.strip() for t in re.split('_[A-Z]+(?= |$)', taxon[3:])]).strip()
if canonical_taxon and re.match('^[a-zA-Z0-9- ]+$', canonical_taxon) is None:
invalid_group_name[taxon_id] = [
taxon, 'Taxon contains invalid characters']
parent = leaf
while parent:
_support, taxon, _aux_info = parse_label(parent.label)
if taxon:
for t in taxon.split(';')[::-1]:
leaf_taxa.append(t.strip())
parent = parent.parent_node
ordered_taxa = leaf_taxa[::-1]
# fill in missing ranks
last_rank = ordered_taxa[-1][0:3]
for i in range(Taxonomy.rank_prefixes.index(last_rank) + 1, len(Taxonomy.rank_prefixes)):
ordered_taxa.append(Taxonomy.rank_prefixes[i])
return ordered_taxa
extent_taxa_with_label = {}
for i, rank in enumerate(Taxonomy.rank_labels):
extent_taxa_with_label[i] = Taxonomy().extant_taxa_for_rank(rank, taxonomy)
# get parent taxon for each taxon:
taxon_parents = Taxonomy().parents(taxonomy)
# get number of leaves and taxon in each lineage
self.logger.info('Calculating taxa within each lineage.')
for node in tree.preorder_node_iter():
num_leaves = 0
taxa_count = defaultdict(lambda: defaultdict(int))
for leaf in node.leaf_iter():
num_leaves += 1
for rank_index, taxon in enumerate(taxonomy[leaf.taxon.label]):
if taxon != Taxonomy.rank_prefixes[rank_index]:
taxa_count[rank_index][taxon] += 1
node.num_leaves = num_leaves
node.taxa_count = taxa_count
taxa_in_tree = defaultdict(int)
for leaf in tree.leaf_node_iter():
for taxon in taxonomy[leaf.taxon.label]:
taxa_in_tree[taxon] += 1
# find node with best F-measure for each taxon
fmeasure_for_taxa = {}
for rank_index in range(0, len(Taxonomy.rank_labels)):
# if rank_index == 6: #*** skip species
# continue
self.logger.info('Processing {:,} taxa at {} rank.'.format(
Parameters
----------
taxa : [d__, ..., s__]
List of taxa.
Returns
-------
list
List of taxa with filled trailing ranks.
"""
if not taxa:
return ';'.join(Taxonomy.rank_prefixes)
last_rank = Taxonomy.rank_prefixes.index(taxa[-1][0:3])
for i in range(last_rank + 1, len(Taxonomy.rank_prefixes)):
taxa.append(Taxonomy.rank_prefixes[i])
return taxa
def sort_taxa(self, taxa, reverse=False):
"""Sort taxa by rank and then alphabetically.
Parameters
----------
taxa : iterable
Taxa with rank prefixes.
Returns
-------
list
Taxa sorted by rank and alphabetically within each rank.
"""
ordered_taxa = []
for rank_prefix in Taxonomy.rank_prefixes:
rank_taxa = []
for taxon in taxa:
if taxon.startswith(rank_prefix):
rank_taxa.append(taxon)
ordered_taxa.extend(sorted(rank_taxa))
if reverse:
ordered_taxa = ordered_taxa[::-1]
return ordered_taxa
def fill_trailing_ranks(self, taxa):
"""Fill in missing trailing ranks in a taxonomy string.
Parameters
----------
taxa : [d__, ..., s__]
List of taxa.
Returns
-------
list
List of taxa with filled trailing ranks.
"""
if not taxa:
return ';'.join(Taxonomy.rank_prefixes)
last_rank = Taxonomy.rank_prefixes.index(taxa[-1][0:3])
for i in range(last_rank + 1, len(Taxonomy.rank_prefixes)):
taxa.append(Taxonomy.rank_prefixes[i])
return taxa
Parameters
----------
taxonomy : d[unique_id] -> [d__; ...; s__]
Taxonomy strings indexed by unique ids.
Returns
-------
dict : d[taxon] -> lineages
List of lineages for duplicate taxa.
"""
# get lineages for each taxon name
taxon_lineages = defaultdict(set)
for taxa in taxonomy.values():
for i, taxon in enumerate(taxa):
if taxon != Taxonomy.rank_prefixes[i]:
taxon_lineages[taxon].add(';'.join(taxa[0:i + 1]))
# identify taxon belonging to multiple lineages
duplicates = {}
for taxon, lineages in taxon_lineages.items():
if len(lineages) >= 2:
duplicates[taxon] = lineages
return duplicates
for node in taxon_parent_node.preorder_iter():
taxa_in_lineage = node.taxa_count[rank_index][taxon]
num_leaves_with_taxa = sum(node.taxa_count[rank_index].values())
if taxa_in_lineage != 0 and num_leaves_with_taxa != 0:
precision = float(taxa_in_lineage) / num_leaves_with_taxa
recall = float(taxa_in_lineage) / total_taxa
fmeasure = (2 * precision * recall) / (precision + recall)
if fmeasure >= cur_taxon_fmeasure:
node_taxa = set([l.taxon.label for l in node.leaf_iter()])
rogue_out = cur_taxa - node_taxa
rogue_in = []
for gid in node_taxa - cur_taxa:
if taxonomy[gid][rank_index] != Taxonomy.rank_prefixes[rank_index]:
rogue_in.append(gid)
stat_table = self.StatsTable(node=node,
fmeasure=fmeasure,
precision=precision,
recall=recall,
taxa_in_lineage=taxa_in_lineage,
total_taxa=total_taxa,
num_leaves_with_taxa=num_leaves_with_taxa,
rogue_out=rogue_out,
rogue_in=rogue_in)
if fmeasure > cur_taxon_fmeasure:
cur_taxon_fmeasure = fmeasure
fmeasure_for_taxa[taxon] = [stat_table]
elif fmeasure == cur_taxon_fmeasure: