How to use the refinem.scaffold_stats.ScaffoldStats function in refinem

To help you get started, we’ve selected a few refinem examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / main.py View on Github external
def compatible(self, options):
        """Compatible command"""

        check_file_exists(options.reference_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        # read scaffold statistics and calculate genome stats
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats = genome_stats.run(scaffold_stats)

        # identify putative homologs to reference genomes
        reference = Reference(1, None)
        putative_homologs = reference.homology_check(options.reference_file,
                                                         options.min_genes,
                                                         float(options.perc_genes))

        # identify scaffolds compatible with bins
        outliers = Outliers()
        output_file = os.path.join(options.output_dir, 'compatible.tsv')
        outliers.compatible(putative_homologs, scaffold_stats, genome_stats,
                                      options.gc_perc, options.td_perc,
github dparks1134 / RefineM / refinem / main.py View on Github external
def genome_stats(self, options):
        """Genomes statistics command"""
        
        check_file_exists(options.scaffold_stats_file)

        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats(options.cpus)
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats.run(scaffold_stats)
        genome_stats.write(options.output_file)

        self.logger.info('Genome statistic written to: %s' % options.output_file)
github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
File containing GreenGenes taxonomy strings for reference genomes.
        percent_to_classify : float
            Minimum percentage of genes to assign scaffold to a taxon [0, 100].
        evalue : float
            E-value threshold used to identify homologs.
        per_identity: float
            Percent identity threshold used to identify homologs [0, 100].
        per_aln_len : float
            Percent coverage of query sequence used to identify homologs [0, 100].
        tmpdir : str
            Directory to use for temporary files.
        """

        # read statistics file
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # concatenate gene files
        self.logger.info('Appending genome identifiers to all gene identifiers.')
        diamond_output_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(diamond_output_dir)

        gene_file = os.path.join(diamond_output_dir, 'genes.faa')
        concatenate_gene_files(gene_files, gene_file)

        # read taxonomy file
        self.logger.info('Reading taxonomic assignment of reference genomes.')

        t = Taxonomy()
        taxonomy = t.read(taxonomy_file)
        if not t.validate(taxonomy, 
github dparks1134 / RefineM / refinem / main.py View on Github external
def kmeans(self, options):
        """kmeans command"""
        
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        cluster = Cluster(options.cpus)
        cluster.kmeans(scaffold_stats,
                        options.num_clusters,
                        options.num_components,
                        options.K,
                        options.no_coverage,
                        options.no_pca,
                        options.iterations,
                        options.genome_file,
                        options.output_dir)

        self.logger.info('Partitioned sequences written to: ' + options.output_dir)
github dparks1134 / RefineM / refinem / main.py View on Github external
def dbscan(self, options):
        """dbscan command"""
        
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        cluster = Cluster(options.cpus)
        cluster.dbscan(scaffold_stats,
                        options.num_clusters,
                        options.num_components,
                        options.min_pts,
                        options.dist_frac,
                        options.no_coverage,
                        options.no_pca,
                        options.genome_file,
                        options.output_dir)

        self.logger.info('Partitioned sequences written to: ' + options.output_dir)
github dparks1134 / RefineM / refinem / reference.py View on Github external
File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold of valid hits.
        per_identity : float
            Percent identity threshold of valid hits [0,100].
        per_aln_len : float
            Percent query coverage of valid hits [0, 100].
        """

        # read statistics file
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('Creating DIAMOND database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.create_db(ref_gene_file, ref_diamond_db)

        self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)
github dparks1134 / RefineM / refinem / main.py View on Github external
def outliers(self, options):
        """Outlier command"""

        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats = genome_stats.run(scaffold_stats)

        # identify outliers
        outliers = Outliers()
        outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
        outliers.identify(scaffold_stats, genome_stats,
                                      options.gc_perc, options.td_perc,
                                      options.cov_corr, options.cov_perc,
                                      options.report_type, outlier_file)
        self.logger.info('Outlier information written to: ' + outlier_file)

        # create outlier plots
        if options.create_plots: