How to use the gtdbtk.biolib_lite.seq_io.read_seq function in gtdbtk

To help you get started, we’ve selected a few gtdbtk examples based on popular ways read_seq is used in public projects.
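Judging from the calls in the excerpts below, read_seq(seq_file) lazily yields (seq_id, seq) tuples from a FASTA/FASTQ file, which makes it handy for iterating over or counting sequences without loading the whole file into memory. A minimal sketch (the file path is a placeholder):

from gtdbtk.biolib_lite.seq_io import read_seq

msa_file = 'user_msa.fasta'  # placeholder path to a FASTA alignment

# count the sequences by streaming through the file once
num_genomes = sum(1 for _seq_id, _seq in read_seq(msa_file))
print(f'{num_genomes} sequences in {msa_file}')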

Ecogenomics / GTDBTk / gtdbtk / classify.py

        # rename user MSA file for compatibility with pplacer
        if not user_msa_file.endswith('.fasta'):
            if marker_set_id == 'bac120':
                t = PATH_BAC120_USER_MSA.format(prefix=prefix)
            elif marker_set_id == 'ar122':
                t = PATH_AR122_USER_MSA.format(prefix=prefix)
            else:
                self.logger.error(
                    'There was an error determining the marker set.')
                raise GenomeMarkerSetUnknown

            shutil.copyfile(user_msa_file, t)
            user_msa_file = t

        # run pplacer to place bins in reference genome tree
        num_genomes = sum([1 for _seq_id, _seq in read_seq(user_msa_file)])

        # check if a scratch file is to be created
        pplacer_mmap_file = None
        if scratch_dir:
            self.logger.info('Using a scratch file for pplacer allocations. '
                             'This decreases memory usage and performance.')
            pplacer_mmap_file = os.path.join(
                scratch_dir, prefix + ".pplacer.scratch")
            make_sure_path_exists(scratch_dir)

        # get path to pplacer reference package
        if marker_set_id == 'bac120':
            if levelopt is None:
                self.logger.info(
                    f'Placing {num_genomes} bacterial genomes into reference tree with pplacer using {self.pplacer_cpus} cpus (be patient).')
                pplacer_ref_pkg = os.path.join(

Ecogenomics / GTDBTk / gtdbtk / biolib_lite / parallel.py

        # populate producer queue with data to process
        seq_iter = read_seq(seq_file)
        producer_queue = mp.Queue()
        read_all_seqs = False
        for _ in range(self.cpus):
            try:
                seq_data = next(seq_iter)
                producer_queue.put(seq_data)
            except StopIteration:
                read_all_seqs = True
                for _ in range(self.cpus):
                    producer_queue.put(None)  # signal processes to terminate
                break

        data_items = sum(1 for _ in read_seq(seq_file))
        try:
            consumer_queue = mp.Queue()
            manager_proc = mp.Process(target=self.__process_manager, args=(
                producer, producer_queue, consumer_queue))

            manager_proc.start()

            # process items produced by workers
            items_processed = 0
            consumer_data = None
            while True:
                if progress:
                    status = progress(items_processed, data_items)
                    sys.stdout.write('\r%s' % status)
                    sys.stdout.flush()
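The parallel.py excerpt above seeds a multiprocessing work queue with one sequence per CPU and pushes None sentinels once read_seq runs out of records. A condensed sketch of that prefill idiom (prefill_queue is a hypothetical helper, not part of gtdbtk):

import multiprocessing as mp

from gtdbtk.biolib_lite.seq_io import read_seq


def prefill_queue(seq_file, n_workers):
    # seed the work queue with one record per worker; if the file has
    # fewer records than workers, add None sentinels so they terminate
    queue = mp.Queue()
    seq_iter = read_seq(seq_file)
    for _ in range(n_workers):
        try:
            queue.put(next(seq_iter))
        except StopIteration:
            for _ in range(n_workers):
                queue.put(None)
            break
    return queue, seq_iter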

Ecogenomics / GTDBTk / gtdbtk / classify.py

                # given domain
                continue

            percent_multihit_dict = self.parser_marker_summary_file(
                marker_summary_file, marker_set_id)

            trans_table_file = os.path.join(
                align_dir, PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))
            trans_table_dict = self.parse_trans_table_file(trans_table_file)

            msa_dict = read_fasta(user_msa_file)

            if splittreeopt is True:
                # run pplacer to place bins in reference genome tree
                num_genomes = sum(
                    [1 for _seq_id, _seq in read_seq(user_msa_file)])
                summaryfout, debugfile, conflict_file, marker_dict = self._generate_summary_file(
                    marker_set_id, prefix, out_dir, debugopt, splittreeopt)

                high_classify_tree = self.place_genomes(user_msa_file,
                                                        marker_set_id,
                                                        out_dir,
                                                        prefix,
                                                        scratch_dir,
                                                        'high')
                tree = self._assign_mrca_red(
                    high_classify_tree, marker_set_id, 'high')

                high_classification = self._get_high_pplacer_taxonomy(
                    out_dir, marker_set_id, prefix, user_msa_file, tree)

                tree_mapping_dict = {}
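Note the two access patterns in this excerpt: read_fasta(user_msa_file) loads the whole MSA at once (into a dict keyed by sequence ID, judging by the msa_dict name), while read_seq streams one record at a time and is only used here to count genomes. A small sketch of the difference, assuming read_fasta lives in the same seq_io module:

from gtdbtk.biolib_lite.seq_io import read_fasta, read_seq

msa_path = 'user_msa.fasta'  # placeholder path

# whole alignment in memory: {seq_id: aligned_sequence}
msa_dict = read_fasta(msa_path)

# streaming pass: count records without keeping them around
num_genomes = sum(1 for _seq_id, _seq in read_seq(msa_path))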

Ecogenomics / GTDBTk / gtdbtk / biolib_lite / parallel.py

            Function to process data items.
        consumer : queue
            Function to consume processed data items.
        seq_file : str
            Name of fasta/q file to read.
        progress : function
            Function to report progress string.

        Returns
        -------
        
            Set by caller in the consumer function.
        """

        # populate producer queue with data to process
        seq_iter = read_seq(seq_file)
        producer_queue = mp.Queue()
        read_all_seqs = False
        for _ in range(self.cpus):
            try:
                seq_data = next(seq_iter)
                producer_queue.put(seq_data)
            except StopIteration:
                read_all_seqs = True
                for _ in range(self.cpus):
                    producer_queue.put(None)  # signal processes to terminate
                break

        data_items = sum(1 for _ in read_seq(seq_file))
        try:
            consumer_queue = mp.Queue()
            manager_proc = mp.Process(target=self.__process_manager, args=(
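Based on the docstring and the status loop above, progress is called as progress(items_processed, data_items) and should return a one-line string that the caller rewrites in place with '\r'. A possible callback (an illustration, not part of gtdbtk):

def progress(processed, total):
    # return a single-line status string for the '\r' console display
    pct = 100.0 * processed / total if total else 0.0
    return f'Processed {processed:,} of {total:,} ({pct:.1f}%) sequences.'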