Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def compatible(self, options):
"""Compatible command"""
check_file_exists(options.reference_file)
check_file_exists(options.scaffold_stats_file)
make_sure_path_exists(options.output_dir)
# read scaffold statistics and calculate genome stats
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(options.scaffold_stats_file)
genome_stats = GenomeStats()
genome_stats = genome_stats.run(scaffold_stats)
# identify putative homologs to reference genomes
reference = Reference(1, None)
putative_homologs = reference.homology_check(options.reference_file,
options.min_genes,
float(options.perc_genes))
# identify scaffolds compatible with bins
outliers = Outliers()
output_file = os.path.join(options.output_dir, 'compatible.tsv')
outliers.compatible(putative_homologs, scaffold_stats, genome_stats,
options.gc_perc, options.td_perc,
def genome_stats(self, options):
"""Genomes statistics command"""
check_file_exists(options.scaffold_stats_file)
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats(options.cpus)
scaffold_stats.read(options.scaffold_stats_file)
genome_stats = GenomeStats()
genome_stats.run(scaffold_stats)
genome_stats.write(options.output_file)
self.logger.info('Genome statistic written to: %s' % options.output_file)
File containing GreenGenes taxonomy strings for reference genomes.
percent_to_classify : float
Minimum percentage of genes to assign scaffold to a taxon [0, 100].
evalue : float
E-value threshold used to identify homologs.
per_identity: float
Percent identity threshold used to identify homologs [0, 100].
per_aln_len : float
Percent coverage of query sequence used to identify homologs [0, 100].
tmpdir : str
Directory to use for temporary files.
"""
# read statistics file
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(stat_file)
# concatenate gene files
self.logger.info('Appending genome identifiers to all gene identifiers.')
diamond_output_dir = os.path.join(self.output_dir, 'diamond')
make_sure_path_exists(diamond_output_dir)
gene_file = os.path.join(diamond_output_dir, 'genes.faa')
concatenate_gene_files(gene_files, gene_file)
# read taxonomy file
self.logger.info('Reading taxonomic assignment of reference genomes.')
t = Taxonomy()
taxonomy = t.read(taxonomy_file)
if not t.validate(taxonomy,
def kmeans(self, options):
"""kmeans command"""
check_file_exists(options.scaffold_stats_file)
check_file_exists(options.genome_file)
make_sure_path_exists(options.output_dir)
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(options.scaffold_stats_file)
cluster = Cluster(options.cpus)
cluster.kmeans(scaffold_stats,
options.num_clusters,
options.num_components,
options.K,
options.no_coverage,
options.no_pca,
options.iterations,
options.genome_file,
options.output_dir)
self.logger.info('Partitioned sequences written to: ' + options.output_dir)
def dbscan(self, options):
"""dbscan command"""
check_file_exists(options.scaffold_stats_file)
check_file_exists(options.genome_file)
make_sure_path_exists(options.output_dir)
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(options.scaffold_stats_file)
cluster = Cluster(options.cpus)
cluster.dbscan(scaffold_stats,
options.num_clusters,
options.num_components,
options.min_pts,
options.dist_frac,
options.no_coverage,
options.no_pca,
options.genome_file,
options.output_dir)
self.logger.info('Partitioned sequences written to: ' + options.output_dir)
File with statistics for individual scaffolds.
ref_genome_gene_files : list of str
Fasta files of called genes on reference genomes of interest.
db_file : str
Database of competing reference genes.
evalue : float
E-value threshold of valid hits.
per_identity : float
Percent identity threshold of valid hits [0,100].
per_aln_len : float
Percent query coverage of valid hits [0, 100].
"""
# read statistics file
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(stat_file)
# perform homology searches
self.logger.info('Creating DIAMOND database for reference genomes.')
ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
concatenate_gene_files(ref_genome_gene_files, ref_gene_file)
diamond = Diamond(self.cpus)
ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
diamond.create_db(ref_gene_file, ref_diamond_db)
self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
self.diamond_dir = os.path.join(self.output_dir, 'diamond')
make_sure_path_exists(self.diamond_dir)
hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)
def outliers(self, options):
"""Outlier command"""
check_file_exists(options.scaffold_stats_file)
make_sure_path_exists(options.output_dir)
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(options.scaffold_stats_file)
genome_stats = GenomeStats()
genome_stats = genome_stats.run(scaffold_stats)
# identify outliers
outliers = Outliers()
outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
outliers.identify(scaffold_stats, genome_stats,
options.gc_perc, options.td_perc,
options.cov_corr, options.cov_perc,
options.report_type, outlier_file)
self.logger.info('Outlier information written to: ' + outlier_file)
# create outlier plots
if options.create_plots: