Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def sequences():
return [seq.ProteinSequence(string) for string in [
"BIQTITE",
"TITANITE",
"BISMITE",
"IQLITE"
]]
# (the same as in the bacterial luciferase example)
query = entrez.SimpleQuery("luxA", "Gene Name") \
& entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
uids, None, db_name="protein", ret_type="fasta"
))
sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()]
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment according to the guide tree
alignment = alignment[:, order]
alignment = alignment[220:300]
# Get color scheme names
alphabet = seq.ProteinSequence.alphabet
schemes = [
"rainbow", "clustalx",
"flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean",
"zappo", "taylor", "buried", "hydrophobicity",
"prophelix", "propstrand", "propturn"
]
count = len(schemes)
# Assert that this example displays all available amino acid color schemes
all_schemes = graphics.list_color_scheme_names(alphabet)
assert set(schemes) == set(all_schemes)
# Visualize each scheme using the example alignment
fig = plt.figure(figsize=(8.0, count*2.0))
gridspec = GridSpec(2, count)
for i, name in enumerate(schemes):
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
# Download and parse protein sequences of avidin and streptavidin
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
biotite.temp_file("sequences.fasta"),
"protein", "fasta")
file = fasta.FastaFile.read(file_name)
for name, sequence in file.items():
if "CAC34569" in name:
avidin_seq = seq.ProteinSequence(sequence)
elif "ACL82594" in name:
streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
gap_penalty=(-10, -1), terminal_penalty=False)
# Draw first and only alignment
# The color intensity indicates the similiarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
ax, alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"],
show_numbers=True, show_line_position=True
)
fig.tight_layout()
import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
# Download and parse protein sequences of avidin and streptavidin
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
biotite.temp_file("sequences.fasta"),
"protein", "fasta")
file = fasta.FastaFile.read(file_name)
for name, sequence in file.items():
if "CAC34569" in name:
avidin_seq = seq.ProteinSequence(sequence)
elif "ACL82594" in name:
streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
gap_penalty=(-10, -1), terminal_penalty=False)
# Draw first and only alignment
# The color intensity indicates the similiarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
ax, alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"],
show_numbers=True, show_line_position=True
)
for gb_file in multi_file:
annotation = gb.get_annotation(gb_file)
# Find ID of strain in 'source' feature
strain = None
for feature in annotation:
if feature.key == "source":
strain = int(feature.qual["strain"])
assert strain is not None
# Find corresponding protein sequence in 'CDS' feature
sequence = None
for feature in annotation:
if feature.key == "CDS":
sequence = seq.ProteinSequence(
# Remove whitespace in sequence
# resulting from line breaks
feature.qual["translation"].replace(" ", "")
)
assert sequence is not None
sequences[strain] = sequence
# None of the THCA synthase variants have an insertion or deletion
# -> each one should have the same sequence length
seq_len = len(list(sequences.values())[0])
for sequence in sequences.values():
assert len(sequence) == seq_len
# Create consensus sequences for the drug-type and fiber-type cannabis
# ----------------------------
#
# .. currentmodule:: biotite.application.muscle
#
# For *multiple sequence alignments* (MSAs) :mod:`biotite.application`
# offers several interfaces to MSA software.
# For our example we choose the software MUSCLE:
# The subpackage :mod:`biotite.application.muscle` contains the class
# :class:`MuscleApp` that does the job.
# You simply input the sequences you want to have aligned, run the
# application and get the resulting :class:`Alignment` object.
import biotite.application.muscle as muscle
import biotite.sequence as seq
seq1 = seq.ProteinSequence("BIQTITE")
seq2 = seq.ProteinSequence("TITANITE")
seq3 = seq.ProteinSequence("BISMITE")
seq4 = seq.ProteinSequence("IQLITE")
app = muscle.MuscleApp([seq1, seq2, seq3, seq4])
app.start()
app.join()
alignment = app.get_alignment()
print(alignment)
########################################################################
# In most MSA software even more information than the mere alignment can
# be extracted.
# For instance the guide tree, that was used for the alignment, can be
# obtained from the MUSCLE output.
import matplotlib.pyplot as plt
# :class:`Application` classes in depth.
#
# Finding homologous sequences with BLAST
# ---------------------------------------
#
# .. currentmodule:: biotite.application.blast
#
# the :mod:`biotite.application.blast` subpackage provides an
# interface to NCBI BLAST: the :class:`BlastWebApp` class.
# Let's dive directly into the code, we try to find
# homologous sequences to the miniprotein *TC5b*:
import biotite.application.blast as blast
import biotite.sequence as seq
tc5b_seq = seq.ProteinSequence("NLYIQWLKDGGPSSGRPPPS")
app = blast.BlastWebApp("blastp", tc5b_seq)
app.start()
app.join()
alignments = app.get_alignments()
best_ali = alignments[0]
print(best_ali)
print()
print("HSP position in query: ", best_ali.query_interval)
print("HSP position in hit: ", best_ali.hit_interval)
print("Score: ", best_ali.score)
print("E-value: ", best_ali.e_value)
print("Hit UID: ", best_ali.hit_id)
print("Hit name: ", best_ali.hit_definition)
########################################################################
# This was too simple for BLAST:
# Finally the obtained (phylogenetic) tree is plotted as dendrogram.
def get_distance(similarities, i, j):
s_max = (similarities[i,i] + similarities[j,j]) / 2
return s_max - similarities[i,j]
distances = np.zeros(similarities.shape)
for i in range(distances.shape[0]):
for j in range(distances.shape[1]):
distances[i,j] = get_distance(similarities, i, j)
tree = phylo.upgma(distances)
fig = plt.figure(figsize=(8.0, 5.0))
ax = fig.add_subplot(111)
# Use the 3-letter amino acid code aa label
labels = [seq.ProteinSequence.convert_letter_1to3(letter).capitalize()
for letter in matrix.get_alphabet1()]
graphics.plot_dendrogram(
ax, tree, orientation="top", labels=labels
)
ax.set_ylabel("Distance")
# Add grid for clearer distance perception
ax.yaxis.grid(color="lightgray")
plt.show()
#
# .. currentmodule:: biotite.application.muscle
#
# For *multiple sequence alignments* (MSAs) :mod:`biotite.application`
# offers several interfaces to MSA software.
# For our example we choose the software MUSCLE:
# The subpackage :mod:`biotite.application.muscle` contains the class
# :class:`MuscleApp` that does the job.
# You simply input the sequences you want to have aligned, run the
# application and get the resulting :class:`Alignment` object.
import biotite.application.muscle as muscle
import biotite.sequence as seq
seq1 = seq.ProteinSequence("BIQTITE")
seq2 = seq.ProteinSequence("TITANITE")
seq3 = seq.ProteinSequence("BISMITE")
seq4 = seq.ProteinSequence("IQLITE")
app = muscle.MuscleApp([seq1, seq2, seq3, seq4])
app.start()
app.join()
alignment = app.get_alignment()
print(alignment)
########################################################################
# In most MSA software even more information than the mere alignment can
# be extracted.
# For instance the guide tree, that was used for the alignment, can be
# obtained from the MUSCLE output.
import matplotlib.pyplot as plt
import biotite.sequence.graphics as graphics