Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _get_fastani_genome_path(self, fastani_verification, genomes):
"""Generates a queue of comparisons to be made and the paths to
the corresponding genome id."""
dict_compare, dict_paths = dict(), dict()
for qry_node, qry_dict in fastani_verification.items():
user_label = qry_node.taxon.label
dict_paths[user_label] = genomes[user_label]
dict_compare[user_label] = set()
for node in qry_dict.get('potential_g'):
leafnode = node[0]
shortleaf = leafnode.taxon.label
if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'):
shortleaf = leafnode.taxon.label[3:]
ref_path = os.path.join(
Config.FASTANI_GENOMES, shortleaf + Config.FASTANI_GENOMES_EXT)
if not os.path.isfile(ref_path):
raise GTDBTkExit(f'Reference genome missing from FastANI database: {ref_path}')
dict_compare[user_label].add(shortleaf)
dict_paths[shortleaf] = ref_path
return dict_compare, dict_paths
def parser_marker_summary_file(self, marker_summary_file, marker_set_id):
results = {}
with open(marker_summary_file, 'r') as msf:
msf.readline()
for line in msf:
infos = line.strip().split('\t')
if marker_set_id == "bac120":
multi_hits_percent = (100 * float(infos[2])) / \
Config.BAC_MARKER_COUNT
elif marker_set_id == "ar122":
multi_hits_percent = (100 * float(infos[2])) / \
Config.AR_MARKER_COUNT
# print (marker_set_id, float(infos[3]), multi_hits_percent)
if multi_hits_percent >= Config.DEFAULT_MULTIHIT_THRESHOLD:
results[infos[0]] = round(multi_hits_percent, 1)
return results
self.logger.info('Reading tree.')
tree = dendropy.Tree.get_from_path(input_tree,
schema='newick',
rooting='force-rooted',
preserve_underscores=True)
self.logger.info('Reading taxonomy from file.')
taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)
# determine taxa to be used for inferring distribution
trusted_taxa = None
taxa_for_dist_inference = self._filter_taxa_for_dist_inference(tree,
taxonomy,
trusted_taxa,
Config.RED_MIN_CHILDREN,
Config.RED_MIN_SUPPORT)
phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
taxa_for_dist_inference,
taxonomy)
# set edge lengths to median value over all rootings
tree.seed_node.rel_dist = 0.0
for n in tree.preorder_node_iter(lambda x: x != tree.seed_node):
n.rel_dist = np_median(rel_node_dists[n.id])
rd_to_parent = n.rel_dist - n.parent_node.rel_dist
if rd_to_parent < 0:
# This can occur since we are setting all nodes
# to their median RED value.
# self.logger.warning('Not all branches are positive after scaling.')
pass
n.edge_length = rd_to_parent
# read tree
self.logger.info('Reading tree.')
tree = dendropy.Tree.get_from_path(input_tree,
schema='newick',
rooting='force-rooted',
preserve_underscores=True)
self.logger.info('Reading taxonomy from file.')
taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)
# determine taxa to be used for inferring distribution
trusted_taxa = None
taxa_for_dist_inference = self._filter_taxa_for_dist_inference(tree,
taxonomy,
trusted_taxa,
Config.RED_MIN_CHILDREN,
Config.RED_MIN_SUPPORT)
phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
taxa_for_dist_inference,
taxonomy)
# set edge lengths to median value over all rootings
tree.seed_node.rel_dist = 0.0
for n in tree.preorder_node_iter(lambda x: x != tree.seed_node):
n.rel_dist = np_median(rel_node_dists[n.id])
rd_to_parent = n.rel_dist - n.parent_node.rel_dist
if rd_to_parent < 0:
# This can occur since we are setting all nodes
# to their median RED value.
# self.logger.warning('Not all branches are positive after scaling.')
pass
Parameters
----------
out_dir : output directory
prefix : desired prefix for output files
marker_set_id : bacterial or archeal id (bac120 or ar122)
Returns
-------
dictionary
dictionary[rank_prefix] = red_value
"""
if marker_set_id == 'bac120':
marker_dict = Config.RED_DIST_BAC_DICT
out_path = os.path.join(
out_dir, PATH_BAC120_RED_DICT.format(prefix=prefix))
elif marker_set_id == 'ar122':
marker_dict = Config.RED_DIST_ARC_DICT
out_path = os.path.join(
out_dir, PATH_AR122_RED_DICT.format(prefix=prefix))
else:
self.logger.error('There was an error determining the marker set.')
raise GenomeMarkerSetUnknown
make_sure_path_exists(os.path.dirname(out_path))
with open(out_path, 'w') as reddictfile:
reddictfile.write('Phylum\t{}\n'.format(marker_dict.get('p__')))
reddictfile.write('Class\t{}\n'.format(marker_dict.get('c__')))
reddictfile.write('Order\t{}\n'.format(marker_dict.get('o__')))
def _check_package_compatibility(self):
"""Check that GTDB-Tk is using the most up-to-date reference package."""
pkg_ver = float(Config.VERSION_DATA.replace('r', ''))
min_ver = float(Config.MIN_REF_DATA_VERSION.replace('r', ''))
self.logger.info('Using GTDB-Tk reference data version {}: {}'
.format(Config.VERSION_DATA, Config.GENERIC_PATH))
if pkg_ver < min_ver:
self.logger.warning(colour('You are not using the reference data '
'intended for this release: {}'
.format(Config.MIN_REF_DATA_VERSION),
['bright'], fg='yellow'))
# check if a scratch file is to be created
pplacer_mmap_file = None
if scratch_dir:
self.logger.info('Using a scratch file for pplacer allocations. '
'This decreases memory usage and performance.')
pplacer_mmap_file = os.path.join(
scratch_dir, prefix + ".pplacer.scratch")
make_sure_path_exists(scratch_dir)
# get path to pplacer reference package
if marker_set_id == 'bac120':
if levelopt is None:
self.logger.info(
f'Placing {num_genomes} bacterial genomes into reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
pplacer_ref_pkg = os.path.join(
Config.PPLACER_DIR, Config.PPLACER_BAC120_REF_PKG)
elif levelopt == 'high':
self.logger.info(
f'Placing {num_genomes} bacterial genomes into high reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
pplacer_ref_pkg = os.path.join(
Config.HIGH_PPLACER_DIR, Config.HIGH_PPLACER_REF_PKG)
elif levelopt == 'low':
self.logger.info(
f'Placing {num_genomes} bacterial genomes into low reference tree {tree_iter} with pplacer using {self.pplacer_cpus} cpus (be patient).')
pplacer_ref_pkg = os.path.join(
Config.LOW_PPLACER_DIR, Config.LOW_PPLACER_REF_PKG.format(iter=tree_iter))
elif marker_set_id == 'ar122':
self.logger.info(
f'Placing {num_genomes} archaeal genomes into reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
pplacer_ref_pkg = os.path.join(
Config.PPLACER_DIR, Config.PPLACER_AR122_REF_PKG)
else:
ar122_marker_file = CopyNumberFileAR122(identity_dir, prefix)
ar122_marker_file.read()
bac120_marker_file = CopyNumberFileBAC120(identity_dir, prefix)
bac120_marker_file.read()
# Get the number of single copy markers for each domain
for out_d, marker_summary in ((ar_count, ar122_marker_file),
(bac_count, bac120_marker_file)):
for genome_id in marker_summary.genomes:
out_d[genome_id] = len(marker_summary.get_single_copy_hits(genome_id))
bac_gids = set()
ar_gids = set()
bac_ar_diff = {}
for gid in bac_count:
arc_aa_per = (ar_count[gid] * 100.0 / Config.AR_MARKER_COUNT)
bac_aa_per = (bac_count[gid] * 100.0 / Config.BAC_MARKER_COUNT)
if bac_aa_per >= arc_aa_per:
bac_gids.add(gid)
else:
ar_gids.add(gid)
if abs(bac_aa_per - arc_aa_per) <= 10:
bac_ar_diff[gid] = {'bac120': round(
bac_aa_per, 1), 'ar122': round(arc_aa_per, 1)}
return bac_gids, ar_gids, bac_ar_diff
marker_set_id : bacterial or archeal id (bac120 or ar122)
Returns
-------
tree: pplacer tree with RED value added to nodes of interest
"""
self.logger.info('Calculating RED values based on reference tree.')
tree = dendropy.Tree.get_from_path(input_tree,
schema='newick',
rooting='force-rooted',
preserve_underscores=True)
if levelopt is None:
red_file = Config.MRCA_RED_BAC120
elif levelopt == 'high':
red_file = Config.HIGH_RED_FILE
elif levelopt == 'low':
red_file = Config.LOW_RED_FILE.format(iter=tree_iter)
if marker_set_id == 'ar122':
red_file = Config.MRCA_RED_AR122
# create map from leave labels to tree nodes
leaf_node_map = {}
for leaf in tree.leaf_node_iter():
leaf_node_map[leaf.taxon.label] = leaf
# parse RED file and associate reference RED value to reference node in
# the tree
reference_nodes = set()
with open(red_file) as rf:
pplacer_mmap_file = os.path.join(
scratch_dir, prefix + ".pplacer.scratch")
make_sure_path_exists(scratch_dir)
# get path to pplacer reference package
if marker_set_id == 'bac120':
if levelopt is None:
self.logger.info(
f'Placing {num_genomes} bacterial genomes into reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
pplacer_ref_pkg = os.path.join(
Config.PPLACER_DIR, Config.PPLACER_BAC120_REF_PKG)
elif levelopt == 'high':
self.logger.info(
f'Placing {num_genomes} bacterial genomes into high reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
pplacer_ref_pkg = os.path.join(
Config.HIGH_PPLACER_DIR, Config.HIGH_PPLACER_REF_PKG)
elif levelopt == 'low':
self.logger.info(
f'Placing {num_genomes} bacterial genomes into low reference tree {tree_iter} with pplacer using {self.pplacer_cpus} cpus (be patient).')
pplacer_ref_pkg = os.path.join(
Config.LOW_PPLACER_DIR, Config.LOW_PPLACER_REF_PKG.format(iter=tree_iter))
elif marker_set_id == 'ar122':
self.logger.info(
f'Placing {num_genomes} archaeal genomes into reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
pplacer_ref_pkg = os.path.join(
Config.PPLACER_DIR, Config.PPLACER_AR122_REF_PKG)
else:
self.logger.error('Unknown marker set: {}'.format(marker_set_id))
raise GenomeMarkerSetUnknown(
'Unknown marker set: {}'.format(marker_set_id))
# create pplacer output directory