How to use the biotite.sequence.align.SubstitutionMatrix function in biotite

To help you get started, we’ve selected a few biotite examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biotite-dev / biotite / tests / sequence / test_align.py View on Github external
def test_matrix_str():
    """
    Test conversion of substitution matrix to string.
    """
    alph1 = seq.Alphabet("abc")
    alph2 = seq.Alphabet("def")
    score_matrix = np.arange(9).reshape((3,3))
    matrix = align.SubstitutionMatrix(alph1, alph2, score_matrix)
    assert str(matrix) == "\n".join(
        ["    d   e   f",
         "a   0   1   2",
         "b   3   4   5",
         "c   6   7   8"]
    )
github biotite-dev / biotite / tests / sequence / test_align.py View on Github external
def test_scoring(sequences, gap_penalty, term, seq_indices):
    """
    Test `score()` function.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    index1, index2 = seq_indices
    seq1 = sequences[index1]
    seq2 = sequences[index2]
    alignment = align.align_optimal(
        seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term,
        max_number=1
    )[0]
    try:
        assert align.score(alignment, matrix, gap_penalty, term) \
               == alignment.score
    except AssertionError:
        print(alignment)
        raise
github biotite-dev / biotite / doc / examples / scripts / sequence / avidin_alignment.py View on Github external
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download and parse protein sequences of avidin and streptavidin
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
                                     biotite.temp_file("sequences.fasta"),
                                     "protein", "fasta")
file = fasta.FastaFile.read(file_name)
for name, sequence in file.items():
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
    elif "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
                                 gap_penalty=(-10, -1), terminal_penalty=False)
# Draw first and only alignment
# The color intensity indicates the similiarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
    ax, alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"],
    show_numbers=True, show_line_position=True
)
fig.tight_layout()

plt.show()
github biotite-dev / biotite / doc / examples / scripts / alignments / avidin_alignment.py View on Github external
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download and parse protein sequences of avidin and streptavidin
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
                                     biotite.temp_file("sequences.fasta"),
                                     "protein", "fasta")
file = fasta.FastaFile()
file.read(file_name)
for name, sequence in file:
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
    elif "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
                                 gap_penalty=(-10, -1), terminal_penalty=False)
# Draw first and only alignment
# The color intensity indicates the similiarity
vis = graphics.AlignmentSimilarityVisualizer(
    alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"]
)
fig = vis.generate()
plt.show()
github biotite-dev / biotite / doc / examples / scripts / sequence / pi3k_alignment.py View on Github external
for i in range(trace_code.shape[0]):
            code2 = trace_code[i, pos_i]
            if code2 == -1:
                similarities[i] = 0
            else:
                sim = matrix[code1, code2]
                # Normalize (range 0.0 - 1.0)
                min_sim = np.min(matrix[code1])
                max_sim = np.max(matrix[code1])
                sim = (sim - min_sim) / (max_sim - min_sim)
                similarities[i] = sim
        # Delete self-similarity
        similarities = np.delete(similarities, seq_i)
        return np.average(similarities)

matrix = align.SubstitutionMatrix.std_protein_matrix()
# Get the alignment columns as symbols codes (-1 for gaps)
trace_code = align.get_codes(alignment)
similarities = np.zeros(trace_code.shape)
for i in range(similarities.shape[0]):
    for j in range(similarities.shape[1]):
        similarities[i,j] = get_average_normalized_similarity(
            trace_code, matrix.score_matrix(), i, j
        )

figure = plt.figure(figsize=(8.0, 3.0))
ax = figure.add_subplot(111)
heatmap = ax.pcolor(
    similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0
)
cbar = figure.colorbar(heatmap)
figure.tight_layout()
github biotite-dev / biotite / doc / tutorial / src / sequence.py View on Github external
print("\nBLOSUM62\n")
print(matrix)
# Load another matrix from internal database
matrix = align.SubstitutionMatrix(alph, alph, "BLOSUM50")
# Load a matrix dictionary representation,
# modify it, and create the SubstitutionMatrix
# (The dictionary could be alternatively loaded from a string containing
# the matrix in NCBI format)
matrix_dict = align.SubstitutionMatrix.dict_from_db("BLOSUM62")
matrix_dict[("P","Y")] = 100
matrix = align.SubstitutionMatrix(alph, alph, matrix_dict)
# And now create a matrix by directly provding the ndarray
# containing the similarity scores
# (identity matrix in our case)
scores = np.identity(len(alph), dtype=int)
matrix = align.SubstitutionMatrix(alph, alph, scores)
print("\n\nIdentity matrix\n")
print(matrix)

########################################################################
# For our protein alignment we will use the standard *BLOSUM62* matrix.

seq1 = seq.ProteinSequence("BIQTITE")
seq2 = seq.ProteinSequence("IQLITE")
matrix = align.SubstitutionMatrix.std_protein_matrix()
print("\nLocal alignment")
alignments = align.align_optimal(seq1, seq2, matrix, local=True)
for ali in alignments:
    print(ali)
print("Global alignment")
alignments = align.align_optimal(seq1, seq2, matrix, local=False)
for ali in alignments:
github biotite-dev / biotite / doc / examples / scripts / sequence / sw_genome_search.py View on Github external
gb_file = gb.GenBankFile.read(file_name)
annot_seq = gb.get_annotated_sequence(gb_file, include_only=["gene"])
# Find leuL gene
for feature in annot_seq.annotation:
    if "gene" in feature.qual and feature.qual["gene"] == "leuL":
        leul_feature = feature
# Get leuL sequence
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
file_name = entrez.fetch("CP019649", biotite.temp_dir(),
                         "fa", "nuccore", "fasta")
fasta_file = fasta.FastaFile.read(file_name)
se_genome = fasta.get_sequence(fasta_file)
# Find leuL in genome by local alignment
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Use general gap penalty to save RAM
alignments = align.align_optimal(
    leul_seq, se_genome, matrix, gap_penalty=-7, local=True
)
# Do the same for reverse complement genome
se_genome_rev = se_genome.reverse().complement()
rev_alignments = align.align_optimal(
    leul_seq, se_genome_rev, matrix, gap_penalty=-7, local=True
)

########################################################################
# Now that we have both alignments (forward and reverse strand),
# we can can check which of them has a higher score.
# We simply take the score of the first alignment in each list.
# Due to the nature of the dynamic programming algorithm, every
# alignment in each list has the same score.
github biotite-dev / biotite / doc / examples / scripts / sequence / blosum_dendrogram.py View on Github external
import biotite.sequence.align as align
import biotite.sequence.phylo as phylo
import biotite.sequence.graphics as graphics

# Obtain BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print(matrix)

########################################################################
# The original *BLOSUM62* contains symbols for ambiguous amino acids and
# the stop signal.
# As these are not actual amino acids, a new substitution matrix is
# created, where these symbols are are removed.

# Matrix should not contain ambiguous symbols or stop signal
matrix = align.SubstitutionMatrix(
    seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]),
    seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]),
    matrix.score_matrix()[:-4, :-4]
)
similarities = matrix.score_matrix()
print(matrix)

########################################################################
# Now a function must be defined, that converts the similarity depicted
# by a substitution matrix into a distance required by the UPGMA method.
# In this case, the distance is defined as the difference between the
# similarity of the two symbols and the average maximum similarity of
# the symbols to themselves.
#
# Finally the obtained (phylogenetic) tree is plotted as dendrogram.
def get_distance(similarities, i, j):
github biotite-dev / biotite / doc / examples / scripts / sequence / color_schemes_protein.py View on Github external
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez


# Generate example alignment
# (the same as in the bacterial luciferase example)
query =   entrez.SimpleQuery("luxA", "Gene Name") \
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="fasta"
))
sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()]
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment according to the guide tree
alignment = alignment[:, order]
alignment = alignment[220:300]

# Get color scheme names
alphabet = seq.ProteinSequence.alphabet
schemes = [
    "rainbow", "clustalx",
    "flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean",
    "zappo", "taylor", "buried", "hydrophobicity",
    "prophelix", "propstrand", "propturn"
]
count = len(schemes)
# Assert that this example displays all available amino acid color schemes
all_schemes = graphics.list_color_scheme_names(alphabet)
github biotite-dev / biotite / doc / examples / scripts / sequence / hcn_hydropathy.py View on Github external
sequences = []
for seq_str in fasta_file.values():
    sequences.append(seq.ProteinSequence(seq_str))

alignment = mafft.MafftApp.align(sequences)

########################################################################
# As measure for the positional conservation, the similarity score is
# used.
# For this purpose each column is extracted from the alignment and
# scored.
# The scores are put into an array with the index being the
# corresponding position of the HCN1 sequence.

matrix = align.SubstitutionMatrix.std_protein_matrix()
scores = np.zeros(len(hcn1))
for i in range(len(alignment)):
    # The column is also an alignment with length 1
    column = alignment[i:i+1]
    hcn1_index = column.trace[0,0]
    if hcn1_index == -1:
        # Gap in HCN1 row
        # As similarity score should be analyzed in dependence of the
        # HCN1 sequence position, alignment columns with a gap in HCN1
        # are ignored
        continue
    scores[hcn1_index] = align.score(column, matrix, gap_penalty=-5)

scores = moving_average(scores, 2*ma_radius+1)

########################################################################