How to use the biolib.external.diamond.Diamond function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
sys.exit(-1)
            
        # record length and number of genes in each scaffold
        for aa_file in gene_files:
            genome_id = remove_extension(aa_file)
            self.profiles[genome_id] = Profile(genome_id, percent_to_classify, taxonomy)

            for seq_id, seq in seq_io.read_seq(aa_file):
                scaffold_id = seq_id[0:seq_id.rfind('_')]
                self.profiles[genome_id].genes_in_scaffold[scaffold_id] += 1
                self.profiles[genome_id].coding_bases[scaffold_id] += len(seq) * 3  # length in nucleotide space

        # run diamond and create taxonomic profile for each genome
        self.logger.info('Running DIAMOND blastp with {:,} processes (be patient!)'.format(self.cpus))

        diamond = Diamond(self.cpus)
        diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
        if not os.path.exists(diamond_table_out):
            diamond.blastp(gene_file, 
                            db_file, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            1, 
                            False,
                            diamond_table_out, 
                            output_fmt='standard', 
                            tmp_dir=tmpdir)
        else:
            self.logger.warning('Using previously generated DIAMOND results: {}'.format(
                                    diamond_table_out))
github dparks1134 / CompareM / comparem / similarity_search.py View on Github external
evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequences.
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """
        
        self.logger.info('Creating DIAMOND database of query proteins (be patient!).')
        diamond = Diamond(self.cpus)
        query_diamond_db = os.path.join(output_dir, 'query_genes')
        diamond.create_db(query_gene_file, query_diamond_db)
        
        self.logger.info('Creating DIAMOND database of target proteins (be patient!).')
        target_diamond_db = os.path.join(output_dir, 'target_genes')
        diamond.create_db(target_gene_file, target_diamond_db)

        # blast query genes against target proteins
        self.logger.info('Performing similarity sequence between query and target proteins (be patient!).')
        
        if tmp_dir:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_query_hits_table.close()
github dparks1134 / RefineM / refinem / reference.py View on Github external
Percent identity threshold of valid hits [0,100].
        per_aln_len : float
            Percent query coverage of valid hits [0, 100].
        """

        # read statistics file
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('Creating DIAMOND database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.create_db(ref_gene_file, ref_diamond_db)

        self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)

        self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

github dparks1134 / CompareM / comparem / reciprocal_diamond.py View on Github external
Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """

        # concatenate all gene files and create a single diamond database
        self.logger.info('Creating diamond database (be patient!).')
        gene_file = os.path.join(output_dir, 'all_genes.faa')
        concatenate_files(aa_gene_files, gene_file)
        diamond_db = os.path.join(output_dir, 'all_genes')

        diamond = Diamond(self.cpus)
        diamond.make_database(gene_file, diamond_db)

        # blast all genes against the database
        self.logger.info('Identifying hits between all pairs of genomes (be patient!).')
        hits_daa_file = os.path.join(output_dir, 'all_hits')
        diamond.blastp(gene_file, diamond_db, evalue, per_identity, per_aln_len, len(aa_gene_files) * 10, hits_daa_file, tmp_dir)

        # create flat hits table
        if tmp_dir:
            tmp_hits_table = tempfile.NamedTemporaryFile(prefix='diamond_hits_', dir=tmp_dir, delete=False)
        else:
            if os.path.isdir('/dev/shm'):
                tmp_hits_table = tempfile.NamedTemporaryFile(prefix='diamond_hits_', dir='/dev/shm', delete=False)
            else:
                tmp_hits_table = tempfile.NamedTemporaryFile(prefix='diamond_hits_', delete=False)
        tmp_hits_table.close()
github dparks1134 / CompareM / comparem / similarity_search.py View on Github external
per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequences.
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """
        
        self.logger.info('Creating DIAMOND database (be patient!).')
        
        diamond_db = os.path.join(output_dir, 'query_genes')
        diamond = Diamond(self.cpus)
        diamond.create_db(query_gene_file, diamond_db)
            
        # create flat hits table
        if tmp_dir:
            tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_hits_table.close()

        # blast all genes against the database
        self.logger.info('Performing self similarity sequence between genomes (be patient!).')

        if high_mem:
            diamond.blastp(query_gene_file, 
                            diamond_db, 
                            evalue, 
github dparks1134 / RefineM / refinem / deprecated / taxonomic_profile.py View on Github external
genome_id = remove_extension(genome_file)
            self.profiles[genome_id] = Profile(genome_id, taxonomy)
            self._fragment_genomes(genome_file,
                                  window_size,
                                  step_size,
                                  self.profiles[genome_id],
                                  fragment_out)

            for seq_id, _seq in seq_io.read_seq(genome_file):
                contig_id_to_genome_id[seq_id] = genome_id

        # run diamond
        self.logger.info('')
        self.logger.info('  Running diamond blastx with %d processes (be patient!)' % self.cpus)

        diamond = Diamond(self.cpus)
        diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
        diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

        diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
        diamond.view(diamond_daa_out + '.daa', diamond_table_out)

        self.logger.info('')
        self.logger.info('  Creating taxonomic profile for each genome.')
        self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

        self.logger.info('')
        self.logger.info('  Writing taxonomic profile for each genome.')

        report_dir = os.path.join(self.output_dir, 'bin_reports')
        make_sure_path_exists(report_dir)