Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_matrix_str():
"""
Test conversion of substitution matrix to string.
"""
alph1 = seq.Alphabet("abc")
alph2 = seq.Alphabet("def")
score_matrix = np.arange(9).reshape((3,3))
matrix = align.SubstitutionMatrix(alph1, alph2, score_matrix)
assert str(matrix) == "\n".join(
[" d e f",
"a 0 1 2",
"b 3 4 5",
"c 6 7 8"]
)
def test_scoring(sequences, gap_penalty, term, seq_indices):
"""
Test `score()` function.
"""
matrix = align.SubstitutionMatrix.std_protein_matrix()
index1, index2 = seq_indices
seq1 = sequences[index1]
seq2 = sequences[index2]
alignment = align.align_optimal(
seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term,
max_number=1
)[0]
try:
assert align.score(alignment, matrix, gap_penalty, term) \
== alignment.score
except AssertionError:
print(alignment)
raise
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
# Download and parse protein sequences of avidin and streptavidin
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
biotite.temp_file("sequences.fasta"),
"protein", "fasta")
file = fasta.FastaFile.read(file_name)
for name, sequence in file.items():
if "CAC34569" in name:
avidin_seq = seq.ProteinSequence(sequence)
elif "ACL82594" in name:
streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
gap_penalty=(-10, -1), terminal_penalty=False)
# Draw first and only alignment
# The color intensity indicates the similiarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
ax, alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"],
show_numbers=True, show_line_position=True
)
fig.tight_layout()
plt.show()
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
# Download and parse protein sequences of avidin and streptavidin
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
biotite.temp_file("sequences.fasta"),
"protein", "fasta")
file = fasta.FastaFile()
file.read(file_name)
for name, sequence in file:
if "CAC34569" in name:
avidin_seq = seq.ProteinSequence(sequence)
elif "ACL82594" in name:
streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
gap_penalty=(-10, -1), terminal_penalty=False)
# Draw first and only alignment
# The color intensity indicates the similiarity
vis = graphics.AlignmentSimilarityVisualizer(
alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"]
)
fig = vis.generate()
plt.show()
for i in range(trace_code.shape[0]):
code2 = trace_code[i, pos_i]
if code2 == -1:
similarities[i] = 0
else:
sim = matrix[code1, code2]
# Normalize (range 0.0 - 1.0)
min_sim = np.min(matrix[code1])
max_sim = np.max(matrix[code1])
sim = (sim - min_sim) / (max_sim - min_sim)
similarities[i] = sim
# Delete self-similarity
similarities = np.delete(similarities, seq_i)
return np.average(similarities)
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Get the alignment columns as symbols codes (-1 for gaps)
trace_code = align.get_codes(alignment)
similarities = np.zeros(trace_code.shape)
for i in range(similarities.shape[0]):
for j in range(similarities.shape[1]):
similarities[i,j] = get_average_normalized_similarity(
trace_code, matrix.score_matrix(), i, j
)
figure = plt.figure(figsize=(8.0, 3.0))
ax = figure.add_subplot(111)
heatmap = ax.pcolor(
similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0
)
cbar = figure.colorbar(heatmap)
figure.tight_layout()
print("\nBLOSUM62\n")
print(matrix)
# Load another matrix from internal database
matrix = align.SubstitutionMatrix(alph, alph, "BLOSUM50")
# Load a matrix dictionary representation,
# modify it, and create the SubstitutionMatrix
# (The dictionary could be alternatively loaded from a string containing
# the matrix in NCBI format)
matrix_dict = align.SubstitutionMatrix.dict_from_db("BLOSUM62")
matrix_dict[("P","Y")] = 100
matrix = align.SubstitutionMatrix(alph, alph, matrix_dict)
# And now create a matrix by directly provding the ndarray
# containing the similarity scores
# (identity matrix in our case)
scores = np.identity(len(alph), dtype=int)
matrix = align.SubstitutionMatrix(alph, alph, scores)
print("\n\nIdentity matrix\n")
print(matrix)
########################################################################
# For our protein alignment we will use the standard *BLOSUM62* matrix.
seq1 = seq.ProteinSequence("BIQTITE")
seq2 = seq.ProteinSequence("IQLITE")
matrix = align.SubstitutionMatrix.std_protein_matrix()
print("\nLocal alignment")
alignments = align.align_optimal(seq1, seq2, matrix, local=True)
for ali in alignments:
print(ali)
print("Global alignment")
alignments = align.align_optimal(seq1, seq2, matrix, local=False)
for ali in alignments:
gb_file = gb.GenBankFile.read(file_name)
annot_seq = gb.get_annotated_sequence(gb_file, include_only=["gene"])
# Find leuL gene
for feature in annot_seq.annotation:
if "gene" in feature.qual and feature.qual["gene"] == "leuL":
leul_feature = feature
# Get leuL sequence
leul_seq = annot_seq[leul_feature]
# Download Salmonella enterica genome without annotations
file_name = entrez.fetch("CP019649", biotite.temp_dir(),
"fa", "nuccore", "fasta")
fasta_file = fasta.FastaFile.read(file_name)
se_genome = fasta.get_sequence(fasta_file)
# Find leuL in genome by local alignment
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Use general gap penalty to save RAM
alignments = align.align_optimal(
leul_seq, se_genome, matrix, gap_penalty=-7, local=True
)
# Do the same for reverse complement genome
se_genome_rev = se_genome.reverse().complement()
rev_alignments = align.align_optimal(
leul_seq, se_genome_rev, matrix, gap_penalty=-7, local=True
)
########################################################################
# Now that we have both alignments (forward and reverse strand),
# we can can check which of them has a higher score.
# We simply take the score of the first alignment in each list.
# Due to the nature of the dynamic programming algorithm, every
# alignment in each list has the same score.
import biotite.sequence.align as align
import biotite.sequence.phylo as phylo
import biotite.sequence.graphics as graphics
# Obtain BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print(matrix)
########################################################################
# The original *BLOSUM62* contains symbols for ambiguous amino acids and
# the stop signal.
# As these are not actual amino acids, a new substitution matrix is
# created, where these symbols are are removed.
# Matrix should not contain ambiguous symbols or stop signal
matrix = align.SubstitutionMatrix(
seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]),
seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]),
matrix.score_matrix()[:-4, :-4]
)
similarities = matrix.score_matrix()
print(matrix)
########################################################################
# Now a function must be defined, that converts the similarity depicted
# by a substitution matrix into a distance required by the UPGMA method.
# In this case, the distance is defined as the difference between the
# similarity of the two symbols and the average maximum similarity of
# the symbols to themselves.
#
# Finally the obtained (phylogenetic) tree is plotted as dendrogram.
def get_distance(similarities, i, j):
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez
# Generate example alignment
# (the same as in the bacterial luciferase example)
query = entrez.SimpleQuery("luxA", "Gene Name") \
& entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
uids, None, db_name="protein", ret_type="fasta"
))
sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()]
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment according to the guide tree
alignment = alignment[:, order]
alignment = alignment[220:300]
# Get color scheme names
alphabet = seq.ProteinSequence.alphabet
schemes = [
"rainbow", "clustalx",
"flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean",
"zappo", "taylor", "buried", "hydrophobicity",
"prophelix", "propstrand", "propturn"
]
count = len(schemes)
# Assert that this example displays all available amino acid color schemes
all_schemes = graphics.list_color_scheme_names(alphabet)
sequences = []
for seq_str in fasta_file.values():
sequences.append(seq.ProteinSequence(seq_str))
alignment = mafft.MafftApp.align(sequences)
########################################################################
# As measure for the positional conservation, the similarity score is
# used.
# For this purpose each column is extracted from the alignment and
# scored.
# The scores are put into an array with the index being the
# corresponding position of the HCN1 sequence.
matrix = align.SubstitutionMatrix.std_protein_matrix()
scores = np.zeros(len(hcn1))
for i in range(len(alignment)):
# The column is also an alignment with length 1
column = alignment[i:i+1]
hcn1_index = column.trace[0,0]
if hcn1_index == -1:
# Gap in HCN1 row
# As similarity score should be analyzed in dependence of the
# HCN1 sequence position, alignment columns with a gap in HCN1
# are ignored
continue
scores[hcn1_index] = align.score(column, matrix, gap_penalty=-5)
scores = moving_average(scores, 2*ma_radius+1)
########################################################################