How to use the gtdbtk.config.config.TAXONOMY_FILE function in gtdbtk

To help you get started, we’ve selected a few gtdbtk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Ecogenomics / GTDBTk / tests / test_classify.py View on Github external
def setUp(self):
        self.classify = Classify()
        self.out_dir = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
        self.prefix = 'gtdbtk'
        self.pplacer_dir_reference = 'tests/data/pplacer_dir_reference'
        self.aln_dir_ref = 'tests/data/align_dir_reference/align'
        self.user_msa_file = os.path.join(self.aln_dir_ref, 'gtdbtk.ar122.user_msa.fasta')
        self.taxonomy_file = Config.TAXONOMY_FILE
        self.gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
github Ecogenomics / GTDBTk / gtdbtk / infer_ranks.py View on Github external
def _get_ingroup_domain(self, ingroup_taxon) -> str:
        """Get domain on ingroup taxon."""

        # read GTDB taxonomy in order to establish domain on ingroup taxon
        gtdb_taxonomy = Taxonomy().read(TAXONOMY_FILE)
        ingroup_domain = None
        for taxa in gtdb_taxonomy.values():
            if ingroup_taxon in taxa:
                ingroup_domain = taxa[Taxonomy.DOMAIN_IDX]

        if ingroup_domain is None:
            raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                             f'the GTDB taxonomy.')

        return ingroup_domain
github Ecogenomics / GTDBTk / gtdbtk / tools.py View on Github external
def get_reference_ids():
    results = []
    with open(Config.TAXONOMY_FILE) as tf:
        for line in tf:
            raw_id = line.split('\t')[0]
            results.append(raw_id)
            if raw_id[0:4] in ['GCF_', 'GCA_']:
                results.append(add_ncbi_prefix(raw_id))
            elif raw_id[0:3] in ['RS_', 'GB_']:
                results.append(raw_id[3:])

    return results
github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
Returns
        -------
        dendropy.Tree
            Taxonomy string.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        # determine taxa to be used for inferring distribution
        trusted_taxa = None
        taxa_for_dist_inference = self._filter_taxa_for_dist_inference(tree,
                                                                       taxonomy,
                                                                       trusted_taxa,
                                                                       Config.RED_MIN_CHILDREN,
                                                                       Config.RED_MIN_SUPPORT)

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda x: x != tree.seed_node):
github Ecogenomics / GTDBTk / gtdbtk / main.py View on Github external
def _read_taxonomy_files(self, options) -> Dict[str, Tuple[str, str, str, str, str, str, str]]:
        """Read and merge taxonomy files."""

        self.logger.info('Reading GTDB taxonomy for representative genomes.')
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        if options.gtdbtk_classification_file:
            # add and overwrite taxonomy for genomes specified in the
            # GTDB-Tk classification file
            check_file_exists(options.gtdbtk_classification_file)

            self.logger.info('Reading GTDB-Tk classification file.')
            gtdbtk_taxonomy = Taxonomy().read(options.gtdbtk_classification_file)
            del gtdbtk_taxonomy['user_genome']
            num_reassigned = 0
            for gid, taxa in gtdbtk_taxonomy.items():
                if gid in taxonomy:
                    num_reassigned += 1
                taxonomy[gid] = taxa

            self.logger.info(f'Read GTDB-Tk classifications for {len(gtdbtk_taxonomy):,} genomes.')
github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
def __init__(self, cpus=1, pplacer_cpus=None):
        """Initialize."""

        check_dependencies(['pplacer', 'guppy', 'fastANI'])

        self.taxonomy_file = Config.TAXONOMY_FILE
        self.af_threshold = Config.AF_THRESHOLD
        self.gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

        self.order_rank = ["d__", "p__", "c__", "o__", 'f__', 'g__', 's__']

        self.logger = logging.getLogger('timestamp')
        self.cpus = max(cpus, 1)
        self.pplacer_cpus = max(pplacer_cpus if pplacer_cpus else cpus, 1)

        self.species_radius = self.parse_radius_file()

        self.reference_ids = get_reference_ids()

        # rank_of_interest determine the rank in the tree_mapping file for
        # lower classification
        self.rank_of_interest = "o__"
github Ecogenomics / GTDBTk / gtdbtk / misc.py View on Github external
def check_install(self):
        """Check that all reference files exist.

        Returns
        -------
        bool
            True if the installation is complete, False otherwise.
        """
        ok = True
        ok = ok and self.checkfile(Config.TAXONOMY_FILE, 'Taxonomy')
        ok = ok and self.checkfile(Config.CONCAT_BAC120, 'concat_bac120')
        ok = ok and self.checkfile(Config.CONCAT_AR122, 'concat_ar122')
        ok = ok and self.checkfile(os.path.join(Config.MASK_DIR,  Config.MASK_BAC120), 'mask_bac120')
        ok = ok and self.checkfile(os.path.join(Config.MASK_DIR,  Config.MASK_AR122), 'mask_ar122')
        ok = ok and self.checkfile(Config.TIGRFAM_HMMS, 'tirgfam_hmms')
        ok = ok and self.checkfile(os.path.join(Config.PFAM_HMM_DIR, 'Pfam-A.hmm'), 'pfam_hmms')

        ok = ok and self.checkfolder(Config.FASTANI_GENOMES, 'fastani_genomes')
        ok = ok and self.checkfolder(os.path.join(Config.PPLACER_DIR,  Config.PPLACER_BAC120_REF_PKG), 'pplacer_bac120')
        ok = ok and self.checkfolder(os.path.join(Config.PPLACER_DIR, Config.PPLACER_AR122_REF_PKG), 'pplacer_ar122')

        if not ok:
            self.logger.error('One or more reference files are malformed.')
        return ok
github Ecogenomics / GTDBTk / gtdbtk / markers.py View on Github external
"""Initialize."""

        self.logger = logging.getLogger('timestamp')
        self.warnings = logging.getLogger('warnings')

        self.cpus = cpus
        self.debug = debug
        self.marker_gene_dir = None

        self.genome_file_suffix = GENOME_FILE_SUFFIX
        self.protein_file_suffix = PROTEIN_FILE_SUFFIX
        self.nt_gene_file_suffix = NT_GENE_FILE_SUFFIX
        self.gff_file_suffix = GFF_FILE_SUFFIX
        self.checksum_suffix = CHECKSUM_SUFFIX

        self.taxonomy_file = Config.TAXONOMY_FILE

        self.pfam_hmm_dir = Config.PFAM_HMM_DIR
        self.pfam_suffix = PFAM_SUFFIX
        self.pfam_top_hit_suffix = PFAM_TOP_HIT_SUFFIX

        self.tigrfam_hmms = Config.TIGRFAM_HMMS
        self.tigrfam_suffix = TIGRFAM_SUFFIX
        self.tigrfam_top_hit_suffix = TIGRFAM_TOP_HIT_SUFFIX
github Ecogenomics / GTDBTk / gtdbtk / ani_rep.py View on Github external
mash = Mash(self.cpus, dir_mash, prefix)
            self.logger.info(f'Using Mash version {mash.version()}')
            mash_results = mash.run(genomes, ref_genomes, max_d, mash_k, mash_v, mash_s)
            for qry_gid, ref_hits in mash_results.items():
                d_compare[qry_gid] = d_compare[qry_gid].union(set(ref_hits.keys()))

        # Compare against all reference genomes.
        else:
            for qry_gid in genomes:
                d_compare[qry_gid] = set(ref_genomes.keys())

        self.logger.info(f'Calculating ANI with FastANI v{FastANI._get_version()}.')
        fastani = FastANI(self.cpus, force_single=True)
        fastani_results = fastani.run(d_compare, d_paths)

        taxonomy = Taxonomy().read(TAXONOMY_FILE, canonical_ids=True)
        ANISummaryFile(out_dir, prefix, fastani_results, taxonomy)
        ANIClosestFile(out_dir,
                       prefix,
                       fastani_results,
                       genomes,
                       min_af,
                       taxonomy)