How to use gtdbtk - 10 common examples

To help you get started, we’ve selected a few gtdbtk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Ecogenomics / GTDBTk / tests / test_gtdbtk / test_tools.py View on Github external
def test_add_ncbi_prefix(self):
        """Check the prefix added to NCBI accessions and the pass-through of other names."""
        # (input name, expected result) pairs, asserted in this order:
        # 'GCF_' names gain 'RS_', 'GCA_' names gain 'GB_', anything else is unchanged.
        cases = (
            ('GCF_123.1', 'RS_GCF_123.1'),
            ('GCA_456.1', 'GB_GCA_456.1'),
            ('genome_1', 'genome_1'),
        )
        for name, expected in cases:
            self.assertEqual(tools.add_ncbi_prefix(name), expected)
github Ecogenomics / GTDBTk / tests / test_gtdbtk / test_tools.py View on Github external
def test_add_ncbi_prefix(self):
        """Check the prefix added to NCBI accessions and the pass-through of other names."""
        # (input name, expected result) pairs, asserted in this order:
        # 'GCF_' names gain 'RS_', 'GCA_' names gain 'GB_', anything else is unchanged.
        cases = (
            ('GCF_123.1', 'RS_GCF_123.1'),
            ('GCA_456.1', 'GB_GCA_456.1'),
            ('genome_1', 'genome_1'),
        )
        for name, expected in cases:
            self.assertEqual(tools.add_ncbi_prefix(name), expected)
github Ecogenomics / GTDBTk / tests / test_gtdbtk / test_tools.py View on Github external
def test_add_ncbi_prefix(self):
        """Check the prefix added to NCBI accessions and the pass-through of other names."""
        # (input name, expected result) pairs, asserted in this order:
        # 'GCF_' names gain 'RS_', 'GCA_' names gain 'GB_', anything else is unchanged.
        cases = (
            ('GCF_123.1', 'RS_GCF_123.1'),
            ('GCA_456.1', 'GB_GCA_456.1'),
            ('genome_1', 'genome_1'),
        )
        for name, expected in cases:
            self.assertEqual(tools.add_ncbi_prefix(name), expected)
github Ecogenomics / GTDBTk / tests / test_classify.py View on Github external
def setUp(self):
        """Prepare the Classify instance, output directory and reference inputs for a test."""
        self.classify = Classify()

        # mkdtemp creates a new, uniquely named directory on every call.
        self.out_dir = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
        self.prefix = 'gtdbtk'

        # Reference data locations, relative to the working directory.
        align_dir = 'tests/data/align_dir_reference/align'
        self.pplacer_dir_reference = 'tests/data/pplacer_dir_reference'
        self.aln_dir_ref = align_dir
        self.user_msa_file = os.path.join(align_dir, 'gtdbtk.ar122.user_msa.fasta')

        # Load the taxonomy from the configured file.
        taxonomy_path = Config.TAXONOMY_FILE
        self.taxonomy_file = taxonomy_path
        self.gtdb_taxonomy = Taxonomy().read(taxonomy_path)
github Ecogenomics / GTDBTk / tests / test_cli.py View on Github external
self.options.min_perc_aa = 50
        self.options.rnd_seed = 42

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None
        self.options.pplacer_cpus = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        # self.generic_out_path = 'tests/data/results'
        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
github Ecogenomics / GTDBTk / gtdbtk / infer_ranks.py View on Github external
def _get_ingroup_domain(self, ingroup_taxon) -> str:
        """Look up the domain of the ingroup taxon in the GTDB taxonomy.

        The taxonomy is read from TAXONOMY_FILE and every lineage that
        contains `ingroup_taxon` is considered; the domain entry
        (index Taxonomy.DOMAIN_IDX) of the last such lineage is returned.

        Raises
        ------
        GTDBTkExit
          If no lineage in the taxonomy contains the ingroup taxon.
        """
        lineages = Taxonomy().read(TAXONOMY_FILE).values()
        matching_domains = [
            lineage[Taxonomy.DOMAIN_IDX]
            for lineage in lineages
            if ingroup_taxon in lineage
        ]

        # When several lineages contain the taxon, the last one iterated decides.
        domain = matching_domains[-1] if matching_domains else None
        if domain is None:
            raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                             f'the GTDB taxonomy.')

        return domain
github Ecogenomics / GTDBTk / gtdbtk / decorate.py View on Github external
fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
          Node with highest F-measure for each taxon.
        taxonomy : d[unique_id] -> [d__; ...; s__]
          Taxonomic information for taxa in tree of interest.
        out_table : str
          Output table to write statistics for assigned labels.  
        """

        # get extent taxa
        extant_taxa = Taxonomy().extant_taxa(taxonomy)

        fout_table = open(out_table, 'w')
        fout_table.write('Taxon\tNo. Expected in Tree\tF-measure\tPrecision\tRecall')
        fout_table.write('\tNo. Genomes from Taxon\tNo. Genome In Lineage')
        fout_table.write('\tRogue out\tRogue in\n')
        for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
            if len(fmeasure_for_taxa[taxon]) != 1:
                self.logger.error('Multiple positions specified for taxon label.')
                sys.exit()

            num_genomes = len(extant_taxa[taxon])

            stat_table = fmeasure_for_taxa[taxon][0]
            fout_table.write('%s\t%d\t%.4f\t%.4f\t%.4f\t%d\t%d\t%s\t%s\n' % (
                taxon,
                num_genomes,
                stat_table.fmeasure,
                stat_table.precision,
                stat_table.recall,
                stat_table.taxa_in_lineage,
                stat_table.num_leaves_with_taxa,
                ','.join(stat_table.rogue_out),
github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
def _get_fastani_genome_path(self, fastani_verification, genomes):
        """Generates a queue of comparisons to be made and the paths to
        the corresponding genome id."""
        dict_compare, dict_paths = dict(), dict()

        for qry_node, qry_dict in fastani_verification.items():
            user_label = qry_node.taxon.label
            dict_paths[user_label] = genomes[user_label]
            dict_compare[user_label] = set()
            for node in qry_dict.get('potential_g'):
                leafnode = node[0]
                shortleaf = leafnode.taxon.label
                if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'):
                    shortleaf = leafnode.taxon.label[3:]
                ref_path = os.path.join(
                    Config.FASTANI_GENOMES, shortleaf + Config.FASTANI_GENOMES_EXT)
                if not os.path.isfile(ref_path):
                    raise GTDBTkExit(f'Reference genome missing from FastANI database: {ref_path}')

                dict_compare[user_label].add(shortleaf)
                dict_paths[shortleaf] = ref_path

        return dict_compare, dict_paths
github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
def parser_marker_summary_file(self, marker_summary_file, marker_set_id):
        """Collect rows of a marker summary file that reach the multi-hit threshold.

        The file is read as tab-separated text and its first line is skipped
        as a header. For every remaining non-blank row, the value in the
        third column is expressed as a percentage of the marker count of the
        selected marker set (presumably the number of markers with multiple
        hits; confirm against the code that writes the file).

        Parameters
        ----------
        marker_summary_file : str
          Path to the tab-separated marker summary file.
        marker_set_id : str
          Either "bac120" or "ar122"; selects the marker count used as the
          denominator.

        Returns
        -------
        dict
          d[first column value] -> percentage rounded to one decimal, for rows
          whose percentage is >= Config.DEFAULT_MULTIHIT_THRESHOLD.

        Raises
        ------
        ValueError
          If `marker_set_id` is not a recognised marker set.
        """
        # The denominator is the same for every row, so resolve it once and
        # fail with a clear message instead of an UnboundLocalError in the loop.
        if marker_set_id == "bac120":
            marker_count = Config.BAC_MARKER_COUNT
        elif marker_set_id == "ar122":
            marker_count = Config.AR_MARKER_COUNT
        else:
            raise ValueError(f'Unknown marker set: {marker_set_id!r} '
                             f'(expected "bac120" or "ar122").')

        results = {}
        with open(marker_summary_file, 'r') as msf:
            msf.readline()  # skip the header line
            for line in msf:
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                infos = stripped.split('\t')
                multi_hits_percent = (100 * float(infos[2])) / marker_count
                if multi_hits_percent >= Config.DEFAULT_MULTIHIT_THRESHOLD:
                    results[infos[0]] = round(multi_hits_percent, 1)
        return results
github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
the corresponding genome id."""
        dict_compare, dict_paths = dict(), dict()

        for qry_node, qry_dict in fastani_verification.items():
            user_label = qry_node.taxon.label
            dict_paths[user_label] = genomes[user_label]
            dict_compare[user_label] = set()
            for node in qry_dict.get('potential_g'):
                leafnode = node[0]
                shortleaf = leafnode.taxon.label
                if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'):
                    shortleaf = leafnode.taxon.label[3:]
                ref_path = os.path.join(
                    Config.FASTANI_GENOMES, shortleaf + Config.FASTANI_GENOMES_EXT)
                if not os.path.isfile(ref_path):
                    raise GTDBTkExit(f'Reference genome missing from FastANI database: {ref_path}')

                dict_compare[user_label].add(shortleaf)
                dict_paths[shortleaf] = ref_path

        return dict_compare, dict_paths