How to use the biotite.sequence.io.fasta.FastaFile.read function in biotite

To help you get started, we’ve selected a few biotite examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biotite-dev / biotite / tests / sequence / test_align.py View on Github external
def sequences():
    """
    Provide the 10 Cas9 sequences stored in ``cas9.fasta``.

    The FASTA file is read from the ``sequence`` test data directory
    and every entry is converted into a :class:`ProteinSequence`.
    """
    cas9_path = join(data_dir("sequence"), "cas9.fasta")
    cas9_file = fasta.FastaFile.read(cas9_path)
    proteins = []
    for seq_str in cas9_file.values():
        proteins.append(seq.ProteinSequence(seq_str))
    return proteins
github biotite-dev / biotite / tests / database / test_entrez.py View on Github external
def test_fetch(common_name, as_file_like):
    """
    Fetch the entry ``"1L2Y_A"`` with :func:`entrez.fetch` and parse
    the result as a FASTA file.

    Parameters
    ----------
    common_name : bool
        Selects the spelling of the database name: ``"Protein"`` if
        true, ``"protein"`` otherwise.
    as_file_like : bool
        If true, no target directory is passed to the fetch call
        (``path`` is ``None``); otherwise the biotite temporary
        directory is used.
    """
    # NOTE(review): presumably a `None` path makes `fetch()` return a
    # file-like object instead of writing to disk -- confirm against
    # the `entrez.fetch()` documentation
    path = None if as_file_like else biotite.temp_dir()
    # NOTE(review): the two spellings presumably exercise different
    # accepted forms of the database name -- confirm
    db_name = "Protein" if common_name else "protein"
    file = entrez.fetch("1L2Y_A", path, "fa", db_name,
                        "fasta", overwrite=True)
    # The fetched result is handed directly to the FASTA reader
    fasta_file = fasta.FastaFile.read(file)
    prot_seq = fasta.get_sequence(fasta_file)
github biotite-dev / biotite / tests / sequence / test_fasta.py View on Github external
def test_access():
    """
    Check the dictionary-like interface of :class:`FastaFile`:
    item lookup, ``items()`` iteration, item assignment and item
    deletion, using ``nuc.fasta`` from the sequence test data.
    """
    nuc_path = os.path.join(data_dir("sequence"), "nuc.fasta")
    fasta_file = fasta.FastaFile.read(nuc_path)

    # Entries expected directly after reading the file
    expected_initial = {
        "dna sequence" : "ACGCTACGT",
        "another dna sequence" : "A",
        "third dna sequence" : "ACGT"
    }
    # Lookup by header ...
    for header, sequence in expected_initial.items():
        assert fasta_file[header] == sequence
    # ... and the complete content via items()
    assert dict(fasta_file.items()) == expected_initial

    # Overwrite one entry, remove one entry and add a new entry
    fasta_file["another dna sequence"] = "AA"
    del fasta_file["dna sequence"]
    fasta_file["yet another sequence"] = "ACGT"

    expected_modified = {
        "another dna sequence" : "AA",
        "third dna sequence"   : "ACGT",
        "yet another sequence" : "ACGT"
    }
    assert dict(fasta_file.items()) == expected_modified
github biotite-dev / biotite / tests / sequence / test_fasta.py View on Github external
path = os.path.join(data_dir("sequence"), "nuc.fasta")
    file = fasta.FastaFile.read(path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)
    
    seq_dict = fasta.get_sequences(file)
    file2 = fasta.FastaFile()
    fasta.set_sequences(file2, seq_dict)
    seq_dict2 = fasta.get_sequences(file2)
    assert seq_dict == seq_dict2
    
    file3 = fasta.FastaFile()
    fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
    assert file3["sequence"] == "AACCTTGG"
    
    path = os.path.join(data_dir("sequence"), "prot.fasta")
    file4 = fasta.FastaFile.read(path)
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(file4)
    
    path = os.path.join(data_dir("sequence"), "invalid.fasta")
    file5 = fasta.FastaFile.read(path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(file5))
github biotite-dev / biotite / doc / examples / scripts / sequence / luxa_comparison.py View on Github external
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez


# Search for protein products of the luxA gene in the
# UniProtKB/Swiss-Prot database:
# both conditions (gene name and source database) are combined with '&'
query =   entrez.SimpleQuery("luxA", "Gene Name") \
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
# Fetch all search hits in FASTA format and parse the result;
# `None` is passed as file name
# NOTE(review): presumably this yields a file-like object instead of
# a file on disk -- confirm against `entrez.fetch_single_file()` docs
fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="fasta"
))

# Parallel lists: ids[i] is the identifier belonging to sequences[i]
ids = []
sequences = []
for header, seq_str in fasta_file.items():
    # Extract the UniProt Entry name from header:
    # take the part after the last '|' and keep its first
    # whitespace-separated token
    identifier = header.split("|")[-1].split()[0]
    ids.append(identifier)
    sequences.append(seq.ProteinSequence(seq_str))

# Multiple sequence alignment using the standard protein
# substitution matrix
# NOTE(review): gap_penalty is presumably (gap open, gap extension)
# -- confirm against `align.align_multiple()` docs
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, tree, distances = align.align_multiple(
    sequences, matrix, gap_penalty=(-10,-1), terminal_penalty=False
)
# Order alignment according to the guide tree
github biotite-dev / biotite / doc / examples / scripts / sequence / pi3k_alignment.py View on Github external
import warnings
import numpy as np
import matplotlib.pyplot as plt
import biotite
import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.sequence.io.fasta as fasta
import biotite.application.clustalo as clustalo

# Parallel lists: names[i] is the display name for the entry uids[i]
uids  = ["5JHB_A", "5LUQ_A",   "5FLC_B", "5YZ0_A", "5NP0_A", "4FUL_A"]
names = ["PI3K",   "DNA-PKcs", "mTOR",   "ATR",    "ATM",    "hSMG-1"]

# Fetch all entries in FASTA format with a single request and convert
# each record into a protein sequence
# NOTE(review): assumes the records come back in the same order as
# `uids`, otherwise `names` would not match `sequences` -- confirm
sequences = []
file = fasta.FastaFile.read(entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="fasta"
))
# Only the sequence strings are used, the headers are ignored
for header, seq_str in file.items():
    sequences.append(seq.ProteinSequence(seq_str))

# Align the sequences via the Clustal Omega application interface
alignment = clustalo.ClustalOmegaApp.align(sequences)

########################################################################
# Since the sequences are relatively long, the display of the entire
# alignment using a :func:`plot_alignment` would be too big.
# Instead we use a heat map, which highlights the similarity in the
# alignment column using a color map.
#
# Like the :class:`LetterSimilarityPlotter` we will use the
# *average normalized similarity* as measure.
github biotite-dev / biotite / doc / examples / scripts / sequence / orf_identification.py View on Github external
Since we want to perform a six-frame translation we have to look at
the complementary strand of the genome as well.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt

# Download Porcine circovirus genome
file = entrez.fetch("KP282147", None, "fa", "nuccore", "fasta")
fasta_file = fasta.FastaFile.read(file)
genome = fasta.get_sequence(fasta_file)
# Perform translation for forward strand
proteins, positions = genome.translate()
print("Forward strand:")
# `proteins` and `positions` are parallel: each position holds the
# two boundary values printed in front of its protein
for protein, position in zip(proteins, positions):
    print("{:4d} - {:4d}:   {:}"
          .format(position[0], position[1], str(protein)))
print("\n")
# Perform translation for complementary strand
genome_rev = genome.reverse().complement()
proteins, positions = genome_rev.translate()
print("Reverse strand:")
# The positions printed here refer to `genome_rev`,
# i.e. the reverse complement, not to the original genome
for protein, position in zip(proteins, positions):
    print("{:5d} - {:5d}:   {:}"
          .format(position[0], position[1], str(protein)))
github biotite-dev / biotite / doc / examples / scripts / sequence / bionigma_alignment.py View on Github external
for ax in (axes, twin):
        ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color":"white"})
    axes.get_figure().patch.set_facecolor("#181818")




# Using cyclotide sequences as example
# Note on precedence: in Python '&' binds tighter than '^', so the
# query is evaluated as
# (Cyclotide & cter & srcdb_swiss-prot) ^ Precursor
# NOTE(review): '^' presumably excludes entries matching "Precursor"
# -- confirm against the `entrez` query documentation
query = (
    entrez.SimpleQuery("Cyclotide") &
    entrez.SimpleQuery("cter") &
    entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^
    entrez.SimpleQuery("Precursor")
)
uids = entrez.search(query, "protein")
# Fetch all hits in FASTA format and parse them
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(uids, None, "protein", "fasta")
)
# Header -> sequence mapping; headers and sequences are kept as
# parallel lists
sequence_dict = fasta.get_sequences(fasta_file)
headers = list(sequence_dict.keys())
sequences = list(sequence_dict.values())
# Each label is only the last character of the respective header
labels = [header[-1] for header in headers]

# Perform a multiple sequence alignment
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment according to guide tree
alignment = alignment[:, order.tolist()]
# Apply the same order to the labels, so they still match the alignment
labels = [labels[i] for i in order]

# Visualize the alignment using the new alignment plotter
fig = plt.figure(figsize=(8.0, 3.7))
github biotite-dev / biotite / doc / examples / scripts / sequence / codon_usage.py View on Github external
# Again, symbol codes are used here
# Maps an amino acid symbol code to the code of its most frequent codon
# NOTE(review): `table` and `codon_counter` are defined earlier in the
# script (not visible here); `table[code]` presumably gives the codon
# codes for an amino acid and `codon_counter[code]` the number of
# occurrences of a codon -- confirm
opt_codons = {}
# NOTE(review): codes 0-19 presumably are the 20 standard amino acids
# -- confirm against the protein alphabet
for amino_acid_code in range(20):
    codon_codes_for_aa = table[amino_acid_code]
    # Find codon with maximum frequency
    # If no codon of this amino acid has a count above 0,
    # `best_codon_code` stays None
    max_freq = 0
    best_codon_code = None
    for codon_code in codon_codes_for_aa:
        # Strict '>' keeps the first codon in case of equal counts
        if codon_counter[codon_code] > max_freq:
            max_freq = codon_counter[codon_code]
            best_codon_code = codon_code
    # Map the amino acid to the codon with maximum frequency
    opt_codons[amino_acid_code] = best_codon_code

# Fetch the streptavidin protein sequence from Streptomyces avidinii
fasta_file = fasta.FastaFile.read(
    entrez.fetch("P22629", None, "fasta", "protein", "fasta")
)
strep_prot_seq = fasta.get_sequence(fasta_file)
# Create a DNA sequence from the protein sequence
# using the optimal codons
# The sequence code is assembled by concatenating the optimal codon
# code of each amino acid code in the protein sequence
strep_dna_seq = seq.NucleotideSequence()
strep_dna_seq.code = np.concatenate(
    [opt_codons[amino_acid_code] for amino_acid_code in strep_prot_seq.code]
)
# Add stop codon
strep_dna_seq += seq.NucleotideSequence("TAA")
# Put the DNA sequence into a FASTA file
# (a new, empty file object replaces the fetched one)
fasta_file = fasta.FastaFile()
fasta_file["Codon optimized streptavidin"] = str(strep_dna_seq)
# Print the contents of the created FASTA file
print(fasta_file)