How to use the gtdbtk.io.marker.tophit.TopHitPfamFile class in gtdbtk

To help you get started, we’ve selected a few gtdbtk examples based on popular ways TopHitPfamFile is used in public projects.

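Taken together, the examples below use TopHitPfamFile through a small, consistent surface: construct it with an output directory and a genome ID, record per-gene hits with add_hit(), persist them with write(), and reload them later with read(); get_path() reports where the file lives on disk. Below is a minimal write-side sketch of that pattern; the directory, genome ID, gene ID, Pfam accession, and scores are placeholders rather than GTDB-Tk values, and the directory handling is an assumption.

import os
from gtdbtk.io.marker.tophit import TopHitPfamFile

# Placeholder output directory and genome ID for illustration only.
out_dir = '/tmp/gtdbtk_example'
genome_id = 'genome_1'

# get_path() reports where the per-genome file will live; make sure its
# parent directory exists before writing (this may be unnecessary if
# write() creates it itself).
os.makedirs(os.path.dirname(TopHitPfamFile.get_path(out_dir, genome_id)),
            exist_ok=True)

# Record hits as (gene ID, HMM accession, E-value, bit score), then persist.
tophit_file = TopHitPfamFile(out_dir, genome_id)
tophit_file.add_hit('gene_1', 'PF00001.21', 1e-10, 150.0)
tophit_file.write()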

Example from Ecogenomics/GTDBTk: gtdbtk/external/pfam_search.py
    def _workerThread(self, queueIn, queueOut, n_skipped):
        """Process each data item in parallel."""
        try:
            while True:
                gene_file = queueIn.get(block=True, timeout=None)
                if gene_file is None:
                    break

                genome_dir, filename = os.path.split(gene_file)
                genome_id = filename.replace(self.protein_file_suffix, '')
                output_hit_file = os.path.join(self.output_dir, genome_id, filename.replace(self.protein_file_suffix,
                                                                                            self.pfam_suffix))

                # Check if this has already been processed.
                out_files = (output_hit_file, TopHitPfamFile.get_path(self.output_dir, genome_id))
                if all([file_has_checksum(x) for x in out_files]):
                    self.warnings.info(f'Skipped Pfam processing for: {genome_id}')
                    with n_skipped.get_lock():
                        n_skipped.value += 1
                else:
                    pfam_scan = PfamScan(cpu=self.cpus_per_genome, fasta=gene_file, dir=self.pfam_hmm_dir)
                    pfam_scan.search()
                    pfam_scan.write_results(output_hit_file, None, None, None, None)

                    # calculate checksum
                    with open(output_hit_file + self.checksum_suffix, 'w') as fh:
                        fh.write(sha256(output_hit_file))

                    # identify top hit for each gene
                    self._topHit(output_hit_file)
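The worker above follows the standard queue-and-sentinel pattern: each worker pulls gene files from an input queue until it receives a None sentinel, processing one genome per item. Below is a small, self-contained sketch of that pattern on its own, outside GTDB-Tk; the worker body and file names are stand-ins, not GTDB-Tk code.

import multiprocessing as mp

def worker(queue_in, queue_out):
    # Consume items until the None sentinel arrives, mirroring the loop above.
    while True:
        item = queue_in.get(block=True, timeout=None)
        if item is None:
            break
        queue_out.put(item.upper())  # stand-in for the per-genome Pfam search

if __name__ == '__main__':
    queue_in, queue_out = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=worker, args=(queue_in, queue_out))
               for _ in range(2)]
    for p in workers:
        p.start()
    gene_files = ['genome_1.faa', 'genome_2.faa']  # placeholder file names
    for gene_file in gene_files:
        queue_in.put(gene_file)
    for _ in workers:
        queue_in.put(None)  # one sentinel per worker so every loop terminates
    results = [queue_out.get() for _ in gene_files]
    for p in workers:
        p.join()
    print(results)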
Example from Ecogenomics/GTDBTk: gtdbtk/external/pfam_search.py
    def _topHit(self, pfam_file):
        """Identify the top Pfam hit for each gene.

        A gene may be assigned to multiple PFAM families from the same
        clan. The search_pfam.pl script takes care of most of these
        issues and here the results are simply parsed.

        Parameters
        ----------
        pfam_file : str
            Name of file containing hits to PFAM HMMs.
        """

        assembly_dir, filename = os.path.split(pfam_file)
        genome_id = filename.replace(self.pfam_suffix, '')
        tophit_file = TopHitPfamFile(self.output_dir, genome_id)

        with open(pfam_file, 'r') as fh_pfam:
            for line in fh_pfam:
                if line[0] == '#' or not line.strip():
                    continue

                line_split = line.split()
                gene_id = line_split[0]
                hmm_id = line_split[5]
                evalue = float(line_split[12])
                bitscore = float(line_split[11])
                tophit_file.add_hit(gene_id, hmm_id, evalue, bitscore)

        tophit_file.write()
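The parsing above maps fixed whitespace-delimited columns of the Pfam hit table onto add_hit(): the gene ID is column 0, the HMM accession column 5, the bit score column 11 and the E-value column 12, with comment and blank lines skipped. Below is a sketch of the same logic lifted into a standalone helper; the suffix constant and file names are placeholders, and only the column layout shown above is assumed.

import os
from gtdbtk.io.marker.tophit import TopHitPfamFile

PFAM_SUFFIX = '_pfam.tsv'  # placeholder; GTDB-Tk configures its own suffix

def write_top_hits(pfam_file, output_dir):
    """Parse a Pfam hit table and persist the top hit per gene."""
    genome_id = os.path.basename(pfam_file).replace(PFAM_SUFFIX, '')
    tophit_file = TopHitPfamFile(output_dir, genome_id)
    with open(pfam_file) as fh:
        for line in fh:
            if line.startswith('#') or not line.strip():
                continue  # skip comments and blank lines
            cols = line.split()
            # Same columns as the parser above: gene ID, HMM accession,
            # E-value (col 12), bit score (col 11).
            tophit_file.add_hit(cols[0], cols[5], float(cols[12]), float(cols[11]))
    tophit_file.write()
    return tophit_file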
Example from Ecogenomics/GTDBTk: gtdbtk/markers.py
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
        """Report statistics for identified marker genes."""

        # Summarise the copy number of each of the AR122 and BAC120 markers.
        tln_summary_file = TlnTableSummaryFile(outdir, prefix)
        ar122_copy_number_file = CopyNumberFileAR122(outdir, prefix)
        bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

        # Process each genome.
        for db_genome_id, info in sorted(gene_dict.items()):
            cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
            pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
            tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
            pfam_tophit_file.read()
            tigr_tophit_file.read()

            # Summarise each of the markers for this genome.
            ar122_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                              pfam_tophit_file, tigr_tophit_file)
            bac120_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                               pfam_tophit_file, tigr_tophit_file)

            # Write the best translation table to disk for this genome.
            tln_summary_file.add_genome(db_genome_id, info.get("best_translation_table"))

        # Write each of the summary files to disk.
        ar122_copy_number_file.write()
        bac120_copy_number_file.write()
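The read side of the same API appears above: constructing TopHitPfamFile with the marker directory and genome ID and calling read() reloads the hits written during the identify step, so they can be handed to the copy-number summaries. A minimal sketch of just that reload step, with placeholder directory and genome ID:

from gtdbtk.io.marker.tophit import TopHitPfamFile

marker_dir = '/tmp/gtdbtk_example/markers'  # placeholder marker gene directory
genome_id = 'genome_1'                      # placeholder genome ID

# Re-open the per-genome top-hit file written in an earlier step.
pfam_tophit_file = TopHitPfamFile(marker_dir, genome_id)
pfam_tophit_file.read()

# The loaded object is then passed to CopyNumberFileAR122/BAC120.add_genome(),
# as in the example above.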
Example from Ecogenomics/GTDBTk: gtdbtk/external/hmm_aligner.py
    def _run_multi_align(self, db_genome_id, path, marker_set_id):
        """
        Returns the concatenated marker sequence for a specific genome
        :param db_genome_id: Selected genome
        :param path: Path to the genomic fasta file for the genome
        :param marker_set_id: Unique ID of marker set to use for alignment
        """

        cur_marker_dir = os.path.dirname(os.path.dirname(path))
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        if marker_set_id == 'bac120':
            copy_number_file = CopyNumberFileBAC120('/dev/null', None)
        elif marker_set_id == 'ar122':
            copy_number_file = CopyNumberFileAR122('/dev/null', None)
        else:
            raise GTDBTkException('Unknown marker set.')

        copy_number_file.add_genome(db_genome_id, path, pfam_tophit_file, tigr_tophit_file)
        single_copy_hits = copy_number_file.get_single_copy_hits(db_genome_id)

        # gather information for all marker genes
        marker_paths = {"PFAM": os.path.join(self.pfam_hmm_dir, 'individual_hmms'),
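One detail worth noting in this example is the throwaway copy-number file constructed with '/dev/null': it is never written to disk and exists only so get_single_copy_hits() can report which markers occur exactly once in the genome. Below is a hedged sketch of that pattern reusing the names from the snippet; the import paths for TopHitTigrFile and CopyNumberFileBAC120 are assumptions, and the directories, genome ID and protein file are placeholders.

# Assumed import paths; only TopHitPfamFile's module is confirmed by this page.
from gtdbtk.io.marker.tophit import TopHitPfamFile, TopHitTigrFile
from gtdbtk.io.marker.copy_number import CopyNumberFileBAC120

marker_dir = '/tmp/gtdbtk_example/markers'         # placeholder marker directory
genome_id = 'genome_1'                             # placeholder genome ID
aa_gene_path = '/tmp/gtdbtk_example/genome_1.faa'  # placeholder protein FASTA

# Reload the top-hit files written during the identify step.
pfam_tophit_file = TopHitPfamFile(marker_dir, genome_id)
tigr_tophit_file = TopHitTigrFile(marker_dir, genome_id)
pfam_tophit_file.read()
tigr_tophit_file.read()

# A throwaway summary ('/dev/null', no prefix) used purely in memory to
# recover the markers present exactly once in this genome.
copy_number_file = CopyNumberFileBAC120('/dev/null', None)
copy_number_file.add_genome(genome_id, aa_gene_path,
                            pfam_tophit_file, tigr_tophit_file)
single_copy_hits = copy_number_file.get_single_copy_hits(genome_id)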