How to use the biotite.sequence.align function in biotite

To help you get started, we’ve selected a few biotite examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biotite-dev / biotite / tests / application / test_msa.py View on Github external
def test_custom_substitution_matrix(sequences, app_cls):
    alph = seq.ProteinSequence.alphabet
    # Strong identity matrix
    score_matrix = np.identity(len(alph)) * 1000
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)
    exp_ali = (
        "BI-QTITE\n"
        "TITANITE\n"
        "BI-SMITE\n"
        "-I-QLITE"
    )
    app = app_cls(sequences, matrix=matrix)
    app.start()
    app.join()
    alignment = app.get_alignment()
    assert str(alignment) == exp_ali
github biotite-dev / biotite / doc / examples / scripts / sequence / luxa_comparison.py View on Github external
query =   entrez.SimpleQuery("luxA", "Gene Name") \
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="fasta"
))

ids = []
sequences = []
for header, seq_str in fasta_file.items():
    # Extract the UniProt Entry name from header
    identifier = header.split("|")[-1].split()[0]
    ids.append(identifier)
    sequences.append(seq.ProteinSequence(seq_str))

matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, tree, distances = align.align_multiple(
    sequences, matrix, gap_penalty=(-10,-1), terminal_penalty=False
)
# Order alignment according to the guide tree
alignment = alignment[:, order]
ids = [ids[i] for i in order]

fig = plt.figure(figsize=(8.0, 20.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_type_based(
    ax, alignment, labels=ids, show_numbers=True, spacing=2.0
)
fig.tight_layout()

plt.show()
github biotite-dev / biotite / doc / tutorial / src / application.py View on Github external
# Internally the sequences and the matrix are converted into protein
# sequences/matrix.
# Then the masquerading sequences are aligned via the software and
# finally the sequences are mapped back into the original sequence type.
# Let's show this on the example of a nonsense alphabet.

import numpy as np
import biotite.application.mafft as mafft
import biotite.sequence.align as align

alphabet = seq.Alphabet(("foo", "bar", 42))
sequences = [seq.GeneralSequence(alphabet, sequence) for sequence in [
    ["foo", "bar", 42, "foo", "foo", 42, 42],
    ["foo", 42, "foo", "bar", "foo", 42, 42],
]]
matrix = align.SubstitutionMatrix(
    alphabet, alphabet, np.array([
        [ 100, -100, -100],
        [-100,  100, -100],
        [-100, -100,  100]
    ])
)
alignment = mafft.MafftApp.align(sequences, matrix=matrix)
# As the alphabet do not has characters as symbols
# the alignment cannot be directly printed
# However, we can print the trace
print(alignment.trace)

########################################################################
# Secondary structure annotation
# ------------------------------
# 
github biotite-dev / biotite / doc / examples / scripts / sequence / blosum_dendrogram.py View on Github external
of amino acids in the *BLOSUM62* substitution matrix.
The amino acids are clustered with the UPGMA method.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.phylo as phylo
import biotite.sequence.graphics as graphics

# Obtain BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print(matrix)

########################################################################
# The original *BLOSUM62* contains symbols for ambiguous amino acids and
# the stop signal.
# As these are not actual amino acids, a new substitution matrix is
# created, where these symbols are are removed.

# Matrix should not contain ambiguous symbols or stop signal
matrix = align.SubstitutionMatrix(
    seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]),
    seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]),
    matrix.score_matrix()[:-4, :-4]
)
similarities = matrix.score_matrix()
print(matrix)
github biotite-dev / biotite / doc / tutorial / src / sequence.py View on Github external
# :class:`ndarray` storing the similarity scores.
# You can choose one of many predefined matrices from an internal
# database or you can create a custom matrix on your own.
# 
# So much for theory.
# Let's start by showing different ways to construct a
# :class:`SubstitutionMatrix`, in our case for protein sequence
# alignments:

import biotite.sequence as seq
import biotite.sequence.align as align
import numpy as np

alph = seq.ProteinSequence.alphabet
# Load the standard protein substitution matrix, which is BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print("\nBLOSUM62\n")
print(matrix)
# Load another matrix from internal database
matrix = align.SubstitutionMatrix(alph, alph, "BLOSUM50")
# Load a matrix dictionary representation,
# modify it, and create the SubstitutionMatrix
# (The dictionary could be alternatively loaded from a string containing
# the matrix in NCBI format)
matrix_dict = align.SubstitutionMatrix.dict_from_db("BLOSUM62")
matrix_dict[("P","Y")] = 100
matrix = align.SubstitutionMatrix(alph, alph, matrix_dict)
# And now create a matrix by directly provding the ndarray
# containing the similarity scores
# (identity matrix in our case)
scores = np.identity(len(alph), dtype=int)
matrix = align.SubstitutionMatrix(alph, alph, scores)
github biotite-dev / biotite / doc / examples / scripts / structure / pb_alignment.py View on Github external
) % 360 - 180
        )**2,
        axis=-1
    )
    # Chose PB, where the RMSDA to the reference angle is lowest
    # Due to the definition of Biotite symbol codes
    # the index of the chosen PB is directly the symbol code
    pb_seq_code = np.argmin(rmsda, axis=0)
    # Put the array of symbol codes into actual sequence objects
    pb_sequence = seq.GeneralSequence(pb_alphabet)
    pb_sequence.code = pb_seq_code
    pb_seqs.append(pb_sequence) 

# Perfrom a multiple sequence alignment of the PB sequences
matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str)
matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict)
alignment, order, _, _ = align.align_multiple(
    pb_seqs, matrix, gap_penalty=(-500,-100), terminal_penalty=False
)

# Visualize the alignment
# Order alignment according to guide tree
alignment = alignment[:, order.tolist()]
labels = [organisms[i] for i in order]
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
# The color scheme was generated with the 'Gecos' software
graphics.plot_alignment_type_based(
    ax, alignment, labels=labels, symbols_per_line=45, spacing=2,
    show_numbers=True, color_scheme="flower"
)
# Organism names in italic
github biotite-dev / biotite / doc / examples / scripts / sequence / thca_synthase_polymorphism.py View on Github external
# sequence
    consensus_sequence.code = consensus_code
    return consensus_sequence

drug_type_consensus = create_consensus(
    [sequences[strain] for strain in (1, 10, 13, 20, 53, 54)]
)
fiber_type_consensus = create_consensus(
    [sequences[strain] for strain in (9, 5, 11, 45, 66, 68, 78)]
)


# Create an alignment for visualization purposes
# No insertion/deletions -> Align ungapped
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment = align.align_ungapped(
    drug_type_consensus, fiber_type_consensus, matrix=matrix
)

# A colormap for hightlighting sequence dissimilarity:
# At low similarity the symbols are colored red,
# at high similarity the symbols are colored white
cmap = LinearSegmentedColormap.from_list(
    "custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)]
    #                    ^ reddish        ^ white
)

fig = plt.figure(figsize=(8.0, 6.0))
ax = fig.add_subplot(111)

graphics.plot_alignment_similarity_based(
    ax, alignment, matrix=matrix, symbols_per_line=50,
github biotite-dev / biotite / doc / examples / scripts / sequence / anderson_logo.py View on Github external
seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"),
        seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"),
        seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"),
        seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"),
        seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"),
        seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"),
        seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"),
        seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"),
        seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc")]
# Sequences do not need to be aligned
# -> Create alignment with trivial trace
# [[0 0 0 ...]
#  [1 1 1 ...]
#  [2 2 2 ...]
#     ...     ]
alignment = align.Alignment(
    sequences = seqs,
    trace     = np.tile(np.arange(len(seqs[0])), len(seqs)) \
                .reshape(len(seqs), len(seqs[0])) \
                .transpose(),
    score     = 0
)
# Create sequence logo from alignment
fig = plt.figure(figsize=(8.0, 1.5))
ax = fig.add_subplot(111)
graphics.plot_sequence_logo(ax, alignment)
# Remove the entire frame
ax.axis("off")
fig.tight_layout()
plt.show()