How to use the biotite.database.entrez.SimpleQuery class in biotite

To help you get started, we’ve selected a few biotite examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biotite-dev / biotite / doc / examples / scripts / sequence / bionigma_alignment.py View on Github external
labels=labels, label_size=label_size,
        show_line_position=show_line_position,
        spacing=spacing
    )
    
    twin = axes.get_shared_x_axes().get_siblings(axes)[0]
    for ax in (axes, twin):
        ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color":"white"})
    axes.get_figure().patch.set_facecolor("#181818")




# Using cyclotide sequences as example
# NOTE: Python's ``&`` binds more tightly than ``^``, so the expression
# below groups as (Cyclotide & cter & srcdb_swiss-prot) ^ Precursor
query = (
    entrez.SimpleQuery("Cyclotide") &
    entrez.SimpleQuery("cter") &
    entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^
    entrez.SimpleQuery("Precursor")
)
# Search the 'protein' database and fetch the hits with 'fasta' as
# return type; no file name is given (None)
uids = entrez.search(query, "protein")
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(uids, None, "protein", "fasta")
)
sequence_dict = fasta.get_sequences(fasta_file)
headers = list(sequence_dict.keys())
sequences = list(sequence_dict.values())
# Only the final character of each header is used as label
# NOTE(review): presumably the headers end with a one-letter
# cyclotide identifier -- confirm against the fetched data
labels = [header[-1] for header in headers]

# Perform a multiple sequence alignment
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
github biotite-dev / biotite / doc / examples / scripts / sequence / lexa_conservation.py View on Github external
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez
# Restrict the search to products of the LexA gene that originate
# from the UniProtKB/Swiss-Prot database
query = (
    entrez.SimpleQuery("lexA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
# Request at most 200 hits:
# larger batches of UIDs are not recommended for a single fetch
# from the EFetch service
uids = entrez.search(query, db_name="protein", number=200)
file = entrez.fetch_single_file(uids, None, db_name="protein", ret_type="gp")
# The download consists of multiple concatenated GenPept files,
# hence it is read as MultiFile
multi_file = gb.MultiFile.read(file)
# Iterating over the MultiFile gives the single GenBankFile instances
files = list(multi_file)
# Print the definition of up to 20 of the files
print("Definitions:")
for file in files[:20]:
    print(gb.get_definition(file))
print()
github biotite-dev / biotite / doc / examples / scripts / sequence / lexa_conservation.py View on Github external
the entries for their definition (title) and source (species).
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez
# Restrict the search to products of the LexA gene that originate
# from the UniProtKB/Swiss-Prot database
query = (
    entrez.SimpleQuery("lexA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
# Request at most 200 hits:
# larger batches of UIDs are not recommended for a single fetch
# from the EFetch service
uids = entrez.search(query, db_name="protein", number=200)
file = entrez.fetch_single_file(uids, None, db_name="protein", ret_type="gp")
# The download consists of multiple concatenated GenPept files,
# hence it is read as MultiFile
multi_file = gb.MultiFile.read(file)
# Iterating over the MultiFile gives the single GenBankFile instances
files = list(multi_file)
# Print the definition of up to 20 of the files
print("Definitions:")
for file in files[:20]:
    print(gb.get_definition(file))
github biotite-dev / biotite / doc / examples / scripts / sequence / thca_synthase_polymorphism.py View on Github external
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.genbank as gb
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez
import biotite.application.clustalo as clustalo


# Identify the cited article by journal, volume and page range and
# search for the DNA sequences that belong to it
query = (
    entrez.SimpleQuery("Forensic Sci. Int.", "Journal")
    & entrez.SimpleQuery("159", "Volume")
    & entrez.SimpleQuery("132-140", "Page Number")
)
uids = entrez.search(query, db_name="nuccore")

# Fetch the GenBank records for the THCA synthase genes
# and read them as MultiFile
multi_file = gb.MultiFile.read(
    entrez.fetch_single_file(
        uids, file_name=None, db_name="nuccore", ret_type="gb"
    )
)


# This dictionary maps the strain ID to the protein sequence
sequences = {}

for gb_file in multi_file:
    annotation = gb.get_annotation(gb_file)
github biotite-dev / biotite / doc / tutorial_src / database.py View on Github external
db_name="protein", ret_type="fasta"
)
print(relpath(file_path))

########################################################################
# Similar to the *RCSB PDB*, you can also search in the *NCBI Entrez*
# database, but in an even more powerful manner:
# Due to the simple design of the search queries accepted by
# *NCBI Entrez*, you can search in every
# `field <https://www.ncbi.nlm.nih.gov/books/NBK49540/>`_
# of the database.

# Search in all fields
print(entrez.SimpleQuery("BL21 genome"))
# Search in the 'Organism' field
print(entrez.SimpleQuery("Escherichia coli", field="Organism"))

########################################################################
# You can even combine multiple :class:`Query` objects in any way you
# like using the binary operators ``|``, ``&`` and ``^``,
# that represent ``OR``, ``AND`` and ``NOT`` linkage, respectively.

composite_query = (
    entrez.SimpleQuery("50:100", field="Sequence Length") &
    # The inner parentheses are required: Python's ``&`` binds more
    # tightly than ``|``, so without them the length restriction would
    # be combined with the first organism only
    (
        entrez.SimpleQuery("Escherichia coli", field="Organism") |
        entrez.SimpleQuery("Bacillus subtilis", field="Organism")
    )
)
print(composite_query)
github biotite-dev / biotite / doc / examples / scripts / sequence / hcn_hydropathy.py View on Github external
# Smooth the hydropathy values with a moving average spanning
# 15 amino acids (the radius of 7 on both sides plus the center)
# for clearer visualization
ma_radius = 7
hydropathies = moving_average(hydropathies, 2 * ma_radius + 1)

########################################################################
# In order to assess the positional conservation, the sequences
# of all human HCN proteins are downloaded and aligned.

names = ["HCN1", "HCN2", "HCN3", "HCN4"]

# Collect the UIDs of the hits for each gene name
uids = []
for name in names:
    query = (
        entrez.SimpleQuery(name, "Gene Name")
        & entrez.SimpleQuery("homo sapiens", "Organism")
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
    )
    uids.extend(entrez.search(query, db_name="protein"))
# Fetch all collected entries at once in FASTA format
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("fasta"), db_name="protein", ret_type="fasta"
)

fasta_file = fasta.FastaFile.read(file_name)

for header in fasta_file:
    print(header)

# Convert each sequence string into a ProteinSequence
sequences = [
    seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()
]

alignment = mafft.MafftApp.align(sequences)
github biotite-dev / biotite / doc / tutorial_src / database.py View on Github external
# Fetch both protein entries in FASTA format into a single file,
# whose path is obtained from biotite.temp_file("fa"),
# and print the path that fetch_single_file() returned
# NOTE(review): relpath is presumably os.path.relpath -- confirm import
file_path = entrez.fetch_single_file(
    ["1L2Y_A","1AKI_A"], biotite.temp_file("fa"),
    db_name="protein", ret_type="fasta"
)
print(relpath(file_path))

########################################################################
# Similar to the *RCSB PDB*, you can also search in the *NCBI Entrez*
# database, but in an even more powerful manner:
# Due to the simple design of the search queries accepted by
# *NCBI Entrez*, you can search in every
# `field <https://www.ncbi.nlm.nih.gov/books/NBK49540/>`_
# of the database.

# Search in all fields
print(entrez.SimpleQuery("BL21 genome"))
# Search in the 'Organism' field
print(entrez.SimpleQuery("Escherichia coli", field="Organism"))

########################################################################
# You can even combine multiple :class:`Query` objects in any way you
# like using the binary operators ``|``, ``&`` and ``^``,
# that represent ``OR``, ``AND`` and ``NOT`` linkage, respectively.

composite_query = (
    entrez.SimpleQuery("50:100", field="Sequence Length") &
    # The inner parentheses are required: Python's ``&`` binds more
    # tightly than ``|``, so without them the length restriction would
    # be combined with the first organism only
    (
        entrez.SimpleQuery("Escherichia coli", field="Organism") |
        entrez.SimpleQuery("Bacillus subtilis", field="Organism")
    )
)
print(composite_query)
github biotite-dev / biotite / doc / examples / scripts / sequence / color_schemes_protein.py View on Github external
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez


# Generate example alignment
# (the same as in the bacterial luciferase example)
query = (
    entrez.SimpleQuery("luxA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(
        uids, None, db_name="protein", ret_type="fasta"
    )
)
sequences = [seq.ProteinSequence(s) for s in fasta_file.values()]
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order the alignment according to the guide tree
# and keep only the slice 220:300 of the ordered alignment
alignment = alignment[:, order][220:300]

# Get color scheme names
alphabet = seq.ProteinSequence.alphabet
schemes = [
    "rainbow", "clustalx",
    "flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean",
github biotite-dev / biotite / doc / tutorial_src / database.py View on Github external
# of the database.

# Search in all fields
print(entrez.SimpleQuery("BL21 genome"))
# Search in the 'Organism' field
print(entrez.SimpleQuery("Escherichia coli", field="Organism"))

########################################################################
# You can even combine multiple :class:`Query` objects in any way you
# like using the binary operators ``|``, ``&`` and ``^``,
# that represent ``OR``, ``AND`` and ``NOT`` linkage, respectively.

composite_query = (
    entrez.SimpleQuery("50:100", field="Sequence Length") &
    # The inner parentheses are required: Python's ``&`` binds more
    # tightly than ``|``, so without them the length restriction would
    # be combined with the first organism only
    (
        entrez.SimpleQuery("Escherichia coli", field="Organism") |
        entrez.SimpleQuery("Bacillus subtilis", field="Organism")
    )
)
print(composite_query)


########################################################################
# Finally, the query is given to the :func:`search()` function to obtain
# the GIs, which can be used as input to :func:`fetch()`.

# Search the 'protein' database and
# return a maximum number of 10 entries
gis = entrez.search(composite_query, "protein", number=10)
print(gis)