Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
signature_file : str
Name of file to read.
Returns
-------
dict : d[seq_id] -> tetranucleotide signature in canonical order
Count of each kmer.
"""
try:
sig = {}
with open(signature_file) as f:
header = f.readline().split('\t')
kmer_order = [x.strip().upper() for x in header[1:]]
if len(kmer_order) != len(self.canonical_order()):
raise ParsingError("[Error] Tetranucleotide file must contain exactly {:,} tetranucleotide columns.".format(len(self.canonical_order())))
canonical_order_index = np.argsort(kmer_order)
canonical_order = [kmer_order[i] for i in canonical_order_index]
if canonical_order != self.canonical_order():
raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)
for line in f:
line_split = line.split('\t')
sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]
return sig
except IOError:
print('[Error] Failed to open signature file: %s' % signature_file)
sys.exit(-1)
except ParsingError:
"""Read statistics for scaffolds.
Parameters
----------
stats_file : str
File with statistics for individual scaffolds.
"""
try:
sig = {}
self.genome_ids = set()
with open(stats_file) as f:
header = f.readline().split('\t')
if 'AAAA' not in header:
raise ParsingError("[Error] Statistics file is missing tetranucleotide signature data: %s" % stats_file)
tetra_index = header.index('AAAA')
self.signature_headers = [x.strip() for x in header[tetra_index:]]
self.coverage_headers = [x.strip() for x in header[4:tetra_index]]
self.scaffolds_in_genome = defaultdict(set)
self.stats = {}
for line in f:
line_split = line.split('\t')
scaffold_id = line_split[0]
genome_id = line_split[1]
gc = float(line_split[2])
scaffold_len = int(line_split[3])
coverage = []
for cov in line_split[4:tetra_index]:
for line in f:
line_split = line.split('\t')
scaffold_id = line_split[0]
scaffold_len = int(line_split[1])
length[scaffold_id] = scaffold_len
for i, cov in enumerate(line_split[2:]):
coverage[scaffold_id][bam_ids[i]] = float(cov)
except IOError:
self.logger.error('Failed to open signature file: %s' % coverage_file)
sys.exit(-1)
except:
print(traceback.format_exc())
print('')
raise ParsingError("[Error] Failed to process coverage file: " + coverage_file)
sys.exit(-1)
return coverage, length
coverage.append(float(cov))
signature = []
for freq in line_split[tetra_index:]:
signature.append(float(freq))
self.stats[scaffold_id] = self.ScaffoldStats(genome_id, gc, scaffold_len, coverage, signature)
if genome_id != self.unbinned:
self.scaffolds_in_genome[genome_id].add(scaffold_id)
return sig
except IOError:
print('[Error] Failed to open scaffold statistics file: %s' % stats_file)
sys.exit(-1)
except ParsingError:
sys.exit(-1)
Count of each kmer.
"""
try:
sig = {}
with open(signature_file) as f:
header = f.readline().split('\t')
kmer_order = [x.strip().upper() for x in header[1:]]
if len(kmer_order) != len(self.canonical_order()):
raise ParsingError("[Error] Tetranucleotide file must contain exactly {:,} tetranucleotide columns.".format(len(self.canonical_order())))
canonical_order_index = np.argsort(kmer_order)
canonical_order = [kmer_order[i] for i in canonical_order_index]
if canonical_order != self.canonical_order():
raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)
for line in f:
line_split = line.split('\t')
sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]
return sig
except IOError:
print('[Error] Failed to open signature file: %s' % signature_file)
sys.exit(-1)
except ParsingError:
sys.exit(-1)
def __init__(self, message):
    """Create the error and echo its message to standard output.

    Parameters
    ----------
    message
        Description of the parsing failure. It is forwarded to the
        parent class initializer and also printed.
    """
    super(ParsingError, self).__init__(message)

    # The `except ParsingError` handlers visible in this file call
    # sys.exit(-1) without printing, so the message is reported here:
    # an empty line first, then the message itself.
    for text in ('', message):
        print(text)