How to use the gtdbtk.config.config function in gtdbtk

To help you get started, we’ve selected a few gtdbtk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github

def _get_fastani_genome_path(self, fastani_verification, genomes):
        """Generates a queue of comparisons to be made and the paths to
        the corresponding genome id."""
        dict_compare, dict_paths = dict(), dict()

        for qry_node, qry_dict in fastani_verification.items():
            user_label = qry_node.taxon.label
            dict_paths[user_label] = genomes[user_label]
            dict_compare[user_label] = set()
            for node in qry_dict.get('potential_g'):
                leafnode = node[0]
                shortleaf = leafnode.taxon.label
                if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'):
                    shortleaf = leafnode.taxon.label[3:]
                ref_path = os.path.join(
                    Config.FASTANI_GENOMES, shortleaf + Config.FASTANI_GENOMES_EXT)
                if not os.path.isfile(ref_path):
                    raise GTDBTkExit(f'Reference genome missing from FastANI database: {ref_path}')

                dict_compare[user_label].add(shortleaf)
                dict_paths[shortleaf] = ref_path

        return dict_compare, dict_paths

Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github

def parser_marker_summary_file(self, marker_summary_file, marker_set_id):
        results = {}
        with open(marker_summary_file, 'r') as msf:
            msf.readline()
            for line in msf:
                infos = line.strip().split('\t')
                if marker_set_id == "bac120":
                    multi_hits_percent = (100 * float(infos[2])) / \
                        Config.BAC_MARKER_COUNT
                elif marker_set_id == "ar122":
                    multi_hits_percent = (100 * float(infos[2])) / \
                        Config.AR_MARKER_COUNT
                # print (marker_set_id, float(infos[3]), multi_hits_percent)
                if multi_hits_percent >= Config.DEFAULT_MULTIHIT_THRESHOLD:
                    results[infos[0]] = round(multi_hits_percent, 1)
        return results

Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github

self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        # determine taxa to be used for inferring distribution
        trusted_taxa = None
        taxa_for_dist_inference = self._filter_taxa_for_dist_inference(tree,
                                                                       taxonomy,
                                                                       trusted_taxa,
                                                                       Config.RED_MIN_CHILDREN,
                                                                       Config.RED_MIN_SUPPORT)

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda x: x != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent &lt; 0:
                # This can occur since we are setting all nodes
                # to their median RED value.
                # self.logger.warning('Not all branches are positive after scaling.')
                pass
            n.edge_length = rd_to_parent

Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github

# read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        # determine taxa to be used for inferring distribution
        trusted_taxa = None
        taxa_for_dist_inference = self._filter_taxa_for_dist_inference(tree,
                                                                       taxonomy,
                                                                       trusted_taxa,
                                                                       Config.RED_MIN_CHILDREN,
                                                                       Config.RED_MIN_SUPPORT)

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda x: x != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent &lt; 0:
                # This can occur since we are setting all nodes
                # to their median RED value.
                # self.logger.warning('Not all branches are positive after scaling.')
                pass

Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github

Parameters
        ----------
        out_dir : output directory
        prefix : desired prefix for output files
        marker_set_id : bacterial or archeal id (bac120 or ar122)

        Returns
        -------
        dictionary
            dictionary[rank_prefix] = red_value

        """

        if marker_set_id == 'bac120':
            marker_dict = Config.RED_DIST_BAC_DICT
            out_path = os.path.join(
                out_dir, PATH_BAC120_RED_DICT.format(prefix=prefix))
        elif marker_set_id == 'ar122':
            marker_dict = Config.RED_DIST_ARC_DICT
            out_path = os.path.join(
                out_dir, PATH_AR122_RED_DICT.format(prefix=prefix))
        else:
            self.logger.error('There was an error determining the marker set.')
            raise GenomeMarkerSetUnknown

        make_sure_path_exists(os.path.dirname(out_path))

        with open(out_path, 'w') as reddictfile:
            reddictfile.write('Phylum\t{}\n'.format(marker_dict.get('p__')))
            reddictfile.write('Class\t{}\n'.format(marker_dict.get('c__')))
            reddictfile.write('Order\t{}\n'.format(marker_dict.get('o__')))

Ecogenomics / GTDBTk / gtdbtk / main.py View on Github

def _check_package_compatibility(self):
        """Check that GTDB-Tk is using the most up-to-date reference package."""
        pkg_ver = float(Config.VERSION_DATA.replace('r', ''))
        min_ver = float(Config.MIN_REF_DATA_VERSION.replace('r', ''))
        self.logger.info('Using GTDB-Tk reference data version {}: {}'
                         .format(Config.VERSION_DATA, Config.GENERIC_PATH))
        if pkg_ver &lt; min_ver:
            self.logger.warning(colour('You are not using the reference data '
                                       'intended for this release: {}'
                                       .format(Config.MIN_REF_DATA_VERSION),
                                       ['bright'], fg='yellow'))

Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github

# check if a scratch file is to be created
        pplacer_mmap_file = None
        if scratch_dir:
            self.logger.info('Using a scratch file for pplacer allocations. '
                             'This decreases memory usage and performance.')
            pplacer_mmap_file = os.path.join(
                scratch_dir, prefix + ".pplacer.scratch")
            make_sure_path_exists(scratch_dir)

        # get path to pplacer reference package
        if marker_set_id == 'bac120':
            if levelopt is None:
                self.logger.info(
                    f'Placing {num_genomes} bacterial genomes into reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
                pplacer_ref_pkg = os.path.join(
                    Config.PPLACER_DIR, Config.PPLACER_BAC120_REF_PKG)
            elif levelopt == 'high':
                self.logger.info(
                    f'Placing {num_genomes} bacterial genomes into high reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
                pplacer_ref_pkg = os.path.join(
                    Config.HIGH_PPLACER_DIR, Config.HIGH_PPLACER_REF_PKG)
            elif levelopt == 'low':
                self.logger.info(
                    f'Placing {num_genomes} bacterial genomes into low reference tree {tree_iter} with pplacer using {self.pplacer_cpus} cpus (be patient).')
                pplacer_ref_pkg = os.path.join(
                    Config.LOW_PPLACER_DIR, Config.LOW_PPLACER_REF_PKG.format(iter=tree_iter))
        elif marker_set_id == 'ar122':
            self.logger.info(
                f'Placing {num_genomes} archaeal genomes into reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
            pplacer_ref_pkg = os.path.join(
                Config.PPLACER_DIR, Config.PPLACER_AR122_REF_PKG)
        else:

Ecogenomics / GTDBTk / gtdbtk / markers.py View on Github

ar122_marker_file = CopyNumberFileAR122(identity_dir, prefix)
        ar122_marker_file.read()
        bac120_marker_file = CopyNumberFileBAC120(identity_dir, prefix)
        bac120_marker_file.read()

        # Get the number of single copy markers for each domain
        for out_d, marker_summary in ((ar_count, ar122_marker_file),
                                      (bac_count, bac120_marker_file)):
            for genome_id in marker_summary.genomes:
                out_d[genome_id] = len(marker_summary.get_single_copy_hits(genome_id))

        bac_gids = set()
        ar_gids = set()
        bac_ar_diff = {}
        for gid in bac_count:
            arc_aa_per = (ar_count[gid] * 100.0 / Config.AR_MARKER_COUNT)
            bac_aa_per = (bac_count[gid] * 100.0 / Config.BAC_MARKER_COUNT)
            if bac_aa_per &gt;= arc_aa_per:
                bac_gids.add(gid)
            else:
                ar_gids.add(gid)
            if abs(bac_aa_per - arc_aa_per) &lt;= 10:
                bac_ar_diff[gid] = {'bac120': round(
                    bac_aa_per, 1), 'ar122': round(arc_aa_per, 1)}

        return bac_gids, ar_gids, bac_ar_diff

Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github

marker_set_id : bacterial or archeal id (bac120 or ar122)

        Returns
        -------
        tree: pplacer tree with RED value added to nodes of interest

        """

        self.logger.info('Calculating RED values based on reference tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        if levelopt is None:
            red_file = Config.MRCA_RED_BAC120
        elif levelopt == 'high':
            red_file = Config.HIGH_RED_FILE
        elif levelopt == 'low':
            red_file = Config.LOW_RED_FILE.format(iter=tree_iter)
        if marker_set_id == 'ar122':
            red_file = Config.MRCA_RED_AR122

        # create map from leave labels to tree nodes
        leaf_node_map = {}
        for leaf in tree.leaf_node_iter():
            leaf_node_map[leaf.taxon.label] = leaf

        # parse RED file and associate reference RED value to reference node in
        # the tree
        reference_nodes = set()
        with open(red_file) as rf:

Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github

pplacer_mmap_file = os.path.join(
                scratch_dir, prefix + ".pplacer.scratch")
            make_sure_path_exists(scratch_dir)

        # get path to pplacer reference package
        if marker_set_id == 'bac120':
            if levelopt is None:
                self.logger.info(
                    f'Placing {num_genomes} bacterial genomes into reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
                pplacer_ref_pkg = os.path.join(
                    Config.PPLACER_DIR, Config.PPLACER_BAC120_REF_PKG)
            elif levelopt == 'high':
                self.logger.info(
                    f'Placing {num_genomes} bacterial genomes into high reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
                pplacer_ref_pkg = os.path.join(
                    Config.HIGH_PPLACER_DIR, Config.HIGH_PPLACER_REF_PKG)
            elif levelopt == 'low':
                self.logger.info(
                    f'Placing {num_genomes} bacterial genomes into low reference tree {tree_iter} with pplacer using {self.pplacer_cpus} cpus (be patient).')
                pplacer_ref_pkg = os.path.join(
                    Config.LOW_PPLACER_DIR, Config.LOW_PPLACER_REF_PKG.format(iter=tree_iter))
        elif marker_set_id == 'ar122':
            self.logger.info(
                f'Placing {num_genomes} archaeal genomes into reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
            pplacer_ref_pkg = os.path.join(
                Config.PPLACER_DIR, Config.PPLACER_AR122_REF_PKG)
        else:
            self.logger.error('Unknown marker set: {}'.format(marker_set_id))
            raise GenomeMarkerSetUnknown(
                'Unknown marker set: {}'.format(marker_set_id))

        # create pplacer output directory

How to use the gtdbtk.config.config function in gtdbtk

To help you get started, we’ve selected a few gtdbtk examples, based on popular ways it is used in public projects.

gtdbtk

Package Health Score

Popular gtdbtk functions

Similar packages