How to use the biotite.database.entrez function in biotite

To help you get started, we’ve selected a few biotite examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biotite-dev / biotite / doc / examples / scripts / sequence / luxa_comparison.py View on Github external
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez


# Search for protein products of LexA gene in UniProtKB/Swiss-Prot database
query =   entrez.SimpleQuery("luxA", "Gene Name") \
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="fasta"
))

ids = []
sequences = []
for header, seq_str in fasta_file.items():
    # Extract the UniProt Entry name from header
    identifier = header.split("|")[-1].split()[0]
    ids.append(identifier)
    sequences.append(seq.ProteinSequence(seq_str))

matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, tree, distances = align.align_multiple(
github biotite-dev / biotite / doc / examples / scripts / sequence / lexa_conservation.py View on Github external
import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez
# Search for protein products of LexA gene in UniProtKB/Swiss-Prot database
query =   entrez.SimpleQuery("lexA", "Gene Name") \
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
# Search for the first 200 hits
# More than 200 UIDs are not recommended for the EFetch service
# for a single fetch
uids = entrez.search(query, db_name="protein", number=200)
file = entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="gp"
)
# The file contains multiple concatenated GenPept files
# -> Usage of MultiFile
multi_file = gb.MultiFile.read(file)
# Separate MultiFile into single GenBankFile instances
files = [f for f in multi_file]
print("Definitions:")
for file in files[:20]:
    print(gb.get_definition(file))
print()
print("Sources:")
for file in files[:20]:
    print(gb.get_source(file))

########################################################################
github biotite-dev / biotite / doc / examples / scripts / sequence / color_schemes_protein.py View on Github external
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez


# Generate example alignment
# (the same as in the bacterial luciferase example)
query =   entrez.SimpleQuery("luxA", "Gene Name") \
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="fasta"
))
sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()]
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment according to the guide tree
alignment = alignment[:, order]
alignment = alignment[220:300]

# Get color scheme names
alphabet = seq.ProteinSequence.alphabet
schemes = [
    "rainbow", "clustalx",
github biotite-dev / biotite / doc / examples / scripts / alignments / avidin_alignment.py View on Github external
with streptavidin (*Streptomyces lavendulae*).
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download and parse protein sequences of avidin and streptavidin
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
                                     biotite.temp_file("sequences.fasta"),
                                     "protein", "fasta")
file = fasta.FastaFile()
file.read(file_name)
for name, sequence in file:
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
    elif "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
                                 gap_penalty=(-10, -1), terminal_penalty=False)
# Draw first and only alignment
github biotite-dev / biotite / doc / tutorial / src / sequence.py View on Github external
# One popular source to obtain information about sequence features are
# GenBank (for DNA and RNA) and GenPept (for peptides) files.
# As example for sequence features we will work with the GenBank file
# for the avidin gene (Accession: ``AJ311647``),
# that we can download from the NCBI Entrez database.
# After downloading we can load the file using the :class:`GenBankFile`
# class from :mod:`biotite.sequence.io.genbank`.
# Similar to the other file classes we have encountered, a
# :class:`GenBankFile` provides a low-level interface.
# In contrast, the :mod:`biotite.sequence.io.genbank` module contains
# high-level functions to directly obtain useful objects from a
# :class:`GenBankFile` object.

import biotite.sequence.io.genbank as gb

file_path = entrez.fetch(
    "AJ311647", gettempdir(), suffix="gb",
    db_name="nuccore", ret_type="gb"
)
file = gb.GenBankFile.read(file_path)
print("Accession:", gb.get_accession(file))
print("Definition:", gb.get_definition(file))

########################################################################
# 
# .. currentmodule:: biotite.sequence
# 
# Now that we have loaded the file, we want to have a look at the
# sequence features.
# Therefore, we grab the :class:`Annotation` from the file.
# An annotation is the collection of features corresponding to one
# sequence (the sequence itself is not included, though).
github biotite-dev / biotite / doc / examples / scripts / sequence / sigma_domains.py View on Github external
import biotite.database.entrez as entrez


# The names of the sigma factors and the corresponding genes
genes = OrderedDict({
    r"$\sigma^{70}$": "rpoD",
    r"$\sigma^{24}$": "rpoE",
    r"$\sigma^{28}$": "rpoF",
    r"$\sigma^{32}$": "rpoH",
    r"$\sigma^{38}$": "rpoS",
})

# Find SwissProt entries for these genes in NCBI Entrez protein database
uids = []
for name, gene in genes.items():
    query =   entrez.SimpleQuery(gene, "Gene Name") \
            & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") \
            & entrez.SimpleQuery("Escherichia coli K-12", "Organism")
    ids = entrez.search(query, "protein")
    # Only one entry per gene in E. coli K-12 is expected
    assert len(ids) == 1
    uids += ids
# Download corresponding GenBank files as single, merged file
file = entrez.fetch_single_file(
    uids, None, "protein", ret_type="gb"
)

# Array that will hold for each of the genes and each of the 4 domains
# the first and last position
# The array is initally filled with -1, as the value -1 will indicate
# that the domain does not exist in the sigma factor
domain_pos = np.full((len(genes), 4, 2), -1, dtype=int)
github biotite-dev / biotite / doc / examples / scripts / sequence / orf_identification.py View on Github external
since domestic pigs are the host of the virus.

Since we want to perform a six-frame translation we have to look at
the complementary strand of the genome as well.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt

# Download Porcine circovirus genome
file = entrez.fetch("KP282147", None, "fa", "nuccore", "fasta")
fasta_file = fasta.FastaFile.read(file)
genome = fasta.get_sequence(fasta_file)
# Perform translation for forward strand
proteins, positions = genome.translate()
print("Forward strand:")
for i in range(len(proteins)):
    print("{:4d} - {:4d}:   {:}"
          .format(positions[i][0], positions[i][1], str(proteins[i])))
print("\n")
# Perform translation for complementary strand
genome_rev = genome.reverse().complement()
proteins, positions = genome_rev.translate()
print("Reverse strand:")
for i in range(len(proteins)):
    print("{:5d} - {:5d}:   {:}"
          .format(positions[i][0], positions[i][1], str(proteins[i])))
github biotite-dev / biotite / doc / examples / scripts / sequence / sw_genome_search.py View on Github external
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Download E. coli BL21 genome
file_name = entrez.fetch("CP001509", biotite.temp_dir(), "gb", "nuccore", "gb")
gb_file = gb.GenBankFile.read(file_name)
annot_seq = gb.get_annotated_sequence(gb_file, include_only=["gene"])
# Find leuL gene
for feature in annot_seq.annotation:
    if "gene" in feature.qual and feature.qual["gene"] == "leuL":
        leul_feature = feature
# Get leuL sequence
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
file_name = entrez.fetch("CP019649", biotite.temp_dir(),
                         "fa", "nuccore", "fasta")
fasta_file = fasta.FastaFile.read(file_name)
se_genome = fasta.get_sequence(fasta_file)
# Find leuL in genome by local alignment
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
github biotite-dev / biotite / doc / examples / scripts / sequence / homolog_msa.py View on Github external
ref_seq = fasta.get_sequence(fasta_file)
# Find homologous proteins using NCBI Blast
# Search only the UniProt/SwissProt database
blast_app = blast.BlastWebApp("blastp", ref_seq, "swissprot", obey_rules=False)
blast_app.start()
blast_app.join()
alignments = blast_app.get_alignments()
# Get hit IDs for hits with score > 200
hits = []
for ali in alignments:
    if ali.score > 200:
        hits.append(ali.hit_id)
# Get the sequences from hit IDs
hit_seqs = []
for hit in hits:
    file_name = entrez.fetch(hit, gettempdir(), "fa", "protein", "fasta")
    fasta_file = fasta.FastaFile.read(file_name)
    hit_seqs.append(fasta.get_sequence(fasta_file))

# Perform a multiple sequence alignment using MUSCLE
app = muscle.MuscleApp(hit_seqs)
app.start()
app.join()
alignment = app.get_alignment()
# Print the MSA with hit IDs
print("MSA results:")
gapped_seqs = alignment.get_gapped_sequences()
for i in range(len(gapped_seqs)):
    print(hits[i], " "*3, gapped_seqs[i])

# Visualize the first 200 columns of the alignment
# Reorder alignments to reflect sequence distance