How to use the gtdbtk.io.marker.tophit.TopHitTigrFile function in gtdbtk

To help you get started, we’ve selected a few gtdbtk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Ecogenomics / GTDBTk / gtdbtk / external / tigrfam_search.py View on Github external
def _topHit(self, tigrfam_file):
        """Determine top hits to TIGRFAMs.

        A gene is assigned to a single TIGRFAM
        family. This will be the top hit among
        all TIGRFAM HMMs and pass the threshold
        for the HMM.

        Parameters
        ----------
        tigrfam_file : str
            Name of file containing hits to TIGRFAM HMMs.
        """
        assembly_dir, filename = os.path.split(tigrfam_file)
        genome_id = filename.replace(self.tigrfam_suffix, '')
        tophit_file = TopHitTigrFile(self.output_dir, genome_id)

        # Populate the top-hit file.
        with open(tigrfam_file, 'r') as fh_tigrfam:
            for line in fh_tigrfam:
                if line[0] == '#':
                    continue

                line_split = line.split()
                gene_id = line_split[0]
                hmm_id = line_split[3]
                evalue = float(line_split[4])
                bitscore = float(line_split[5])
                tophit_file.add_hit(gene_id, hmm_id, evalue, bitscore)

        # Write the top-hit file to disk and calculate checksum.
        tophit_file.write()
github Ecogenomics / GTDBTk / gtdbtk / external / tigrfam_search.py View on Github external
while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            assembly_dir, filename = os.path.split(gene_file)
            genome_id = filename.replace(self.protein_file_suffix, '')
            genome_dir = os.path.join(self.output_dir, genome_id)
            output_hit_file = os.path.join(genome_dir, filename.replace(self.protein_file_suffix,
                                                                        self.tigrfam_suffix))

            hmmsearch_out = os.path.join(genome_dir, filename.replace(self.protein_file_suffix,
                                                                      '_tigrfam.out'))

            # Check if this has already been processed.
            out_files = (output_hit_file, hmmsearch_out, TopHitTigrFile.get_path(self.output_dir, genome_id))
            if all([file_has_checksum(x) for x in out_files]):
                self.warnings.info(f'Skipped TIGRFAM processing for: {genome_id}')
                with n_skipped.get_lock():
                    n_skipped.value += 1

            else:
                cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu %d %s %s' % (hmmsearch_out,
                                                                                                 output_hit_file,
                                                                                                 self.cpus_per_genome,
                                                                                                 self.tigrfam_hmms,
                                                                                                 gene_file)
                os.system(cmd)

                # calculate checksum
                for out_file in [output_hit_file, hmmsearch_out]:
                    checksum = sha256(out_file)
github Ecogenomics / GTDBTk / gtdbtk / markers.py View on Github external
def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
        """Report statistics for identified marker genes."""

        # Summarise the copy number of each AR122 and BAC120 markers.
        tln_summary_file = TlnTableSummaryFile(outdir, prefix)
        ar122_copy_number_file = CopyNumberFileAR122(outdir, prefix)
        bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

        # Process each genome.
        for db_genome_id, info in sorted(gene_dict.items()):
            cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
            pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
            tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
            pfam_tophit_file.read()
            tigr_tophit_file.read()

            # Summarise each of the markers for this genome.
            ar122_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                              pfam_tophit_file, tigr_tophit_file)
            bac120_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                               pfam_tophit_file, tigr_tophit_file)

            # Write the best translation table to disk for this genome.
            tln_summary_file.add_genome(db_genome_id, info.get("best_translation_table"))

        # Write each of the summary files to disk.
        ar122_copy_number_file.write()
        bac120_copy_number_file.write()
        tln_summary_file.write()
github Ecogenomics / GTDBTk / gtdbtk / external / hmm_aligner.py View on Github external
def _run_multi_align(self, db_genome_id, path, marker_set_id):
        """
        Returns the concatenated marker sequence for a specific genome
        :param db_genome_id: Selected genome
        :param path: Path to the genomic fasta file for the genome
        :param marker_set_id: Unique ID of marker set to use for alignment
        """

        cur_marker_dir = os.path.dirname(os.path.dirname(path))
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        if marker_set_id == 'bac120':
            copy_number_file = CopyNumberFileBAC120('/dev/null', None)
        elif marker_set_id == 'ar122':
            copy_number_file = CopyNumberFileAR122('/dev/null', None)
        else:
            raise GTDBTkException('Unknown marker set.')

        copy_number_file.add_genome(db_genome_id, path, pfam_tophit_file, tigr_tophit_file)
        single_copy_hits = copy_number_file.get_single_copy_hits(db_genome_id)

        # gather information for all marker genes
        marker_paths = {"PFAM": os.path.join(self.pfam_hmm_dir, 'individual_hmms'),
                        "TIGRFAM": os.path.join(os.path.dirname(self.tigrfam_hmm_dir), 'individual_hmms')}