Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""Stub for the docmeta service."""
import os
import json
from flask import Flask
from flask.json import jsonify
from werkzeug.exceptions import NotFound, InternalServerError
from arxiv.base import Base
from arxiv.base.converter import ArXivConverter
from arxiv.base import logging
logger = logging.getLogger(__name__)
# Base location for metadata, taken from the ``METADATA_DIR`` environment
# variable; ``None`` when the variable is unset.
METADATA_DIR = os.environ.get("METADATA_DIR")
# Flask application for this stub service, named "metadata".
app = Flask("metadata")
# Attach arXiv base to the application (see ``arxiv.base.Base``).
Base(app)
# Register ``ArXivConverter`` under the name "arxiv" so that route patterns
# on this app can use it.
app.url_map.converters["arxiv"] = ArXivConverter
@app.route("/docmeta/", methods=["GET"])
def docmeta(document_id):
"""Retrieve document metadata."""
logger.debug(f"Get metadata for {document_id}")
logger.debug(f"Metadata base is {METADATA_DIR}")
if not METADATA_DIR:
"""Query-builders and helpers for searching by author name."""
from typing import Tuple, Optional, List
import re
from functools import reduce, wraps
from operator import ior, iand
from elasticsearch_dsl import Search, Q, SF
from arxiv.base import logging
from .util import wildcard_escape, escape, STRING_LITERAL, \
remove_single_characters, has_wildcard
logger = logging.getLogger(__name__)
# Do not pass this logger's records up to ancestor loggers.
logger.propagate = False
# We don't remove stopwords from author names at index time because
# institutions and collaborations are often treated as authors just like
# people.
# The words below are instead stripped from the non-literal parts of a query
# string by ``_remove_stopwords``.
STOP = ["and", "or", "the", "of", "a", "for"]
def _remove_stopwords(term: str) -> str:
    """
    Remove common stopwords, except in literal queries.

    The term is split on ``STRING_LITERAL``; segments that begin with a
    single or double quote are treated as literals and returned untouched.
    In every other segment, each word in ``STOP`` that stands alone (bounded
    by whitespace or by the start/end of the segment) is replaced, together
    with its surrounding whitespace, by a single space.

    Parameters
    ----------
    term : str
        Raw query string.

    Returns
    -------
    str
        The query string with stopwords removed from non-literal segments.

    """
    parts = re.split(STRING_LITERAL, term)
    # Raw strings keep ``\s`` a regex escape rather than an (invalid) string
    # escape; ``re.escape`` keeps each stopword a literal match.
    patterns = [re.compile(rf"(^|\s+){re.escape(stopword)}(\s+|$)")
                for stopword in STOP]
    cleaned = []
    for part in parts:
        if not part.startswith(('"', "'")):
            for pattern in patterns:
                # A single pass misses a stopword that directly follows
                # another match (e.g. "the the x"), because the whitespace
                # between them has already been consumed. Repeat until the
                # segment stops changing.
                previous = None
                while previous != part:
                    previous = part
                    part = pattern.sub(" ", part)
        cleaned.append(part)
    return "".join(cleaned)
"""Handle requests to support sequential navigation between arXiv IDs."""
from flask import url_for, escape
from typing import Tuple, Dict, Any
from werkzeug.exceptions import BadRequest
from browse.domain.identifier import Identifier, IdentifierException
from browse.services.database import get_sequential_id
from arxiv import status
from arxiv.taxonomy.definitions import ARCHIVES, CATEGORIES_ACTIVE
from arxiv.base import logging
Response = Tuple[Dict[str, Any], int, Dict[str, Any]]
logger = logging.getLogger(__name__)
def get_prevnext(id: str, function: str, context: str) -> Response:
"""
Get the next or previous arXiv ID in the browse context.
The 'site' parameter from the classic prevnext is no longer supported.
Parameters
----------
id
arxiv id
function
prev or next
context
which archive or category to browse
.. _todo: This should move to arXiv-base, per ARXIVNG-281.
"""
import time
import json
import os
from typing import Any, Optional, Tuple, Generator, Callable
from contextlib import contextmanager
import signal
import boto3
from botocore.exceptions import WaiterError, NoCredentialsError, \
PartialCredentialsError, BotoCoreError, ClientError
from arxiv.base import logging
logger = logging.getLogger(__name__)
logger.propagate = False
class CheckpointError(RuntimeError):
    """Signals that a checkpointing attempt did not succeed."""
class StreamNotAvailable(RuntimeError):
    """Signals that the stream could not be found or connected to."""
class KinesisRequestFailed(RuntimeError):
    """Signals that a request to Kinesis has failed permanently."""
class StopProcessing(RuntimeError):
    """Signals a graceful halt of processing after an unrecoverable error."""
MappingError,
)
from search.services.index.util import MAX_RESULTS
from search.services.index.advanced import advanced_search
from search.services.index.simple import simple_search
from search.services.index.api import api_search
from search.services.index.api_classic import classic_search
from search.services.index import highlighting
from search.services.index import results
logger = logging.getLogger(__name__)
# Disable the Elasticsearch logger. When enabled, the Elasticsearch logger
# dumps entire Tracebacks prior to propagating exceptions. Thus we end up with
# tracebacks in the logs even for handled exceptions.
logging.getLogger("elasticsearch").disabled = True
# Names of document fields made available to search code in one list.
# NOTE(review): presumably the set covered by an "all fields" query --
# confirm against the callers.
ALL_SEARCH_FIELDS = [
"author",
"title",
"abstract",
"comments",
"journal_ref",
"acm_class",
"msc_class",
"report_num",
"paper_id",
"doi",
"orcid",
"author_id",
]
QueryError,
IndexConnectionError,
DocumentNotFound,
IndexingError,
OutsideAllowedRange,
MappingError,
)
from search.services.index.util import MAX_RESULTS
from search.services.index.advanced import advanced_search
from search.services.index.simple import simple_search
from search.services.index.api import api_search
from search.services.index.api_classic import classic_search
from search.services.index import highlighting
from search.services.index import results
logger = logging.getLogger(__name__)
# Disable the Elasticsearch logger. When enabled, the Elasticsearch logger
# dumps entire Tracebacks prior to propagating exceptions. Thus we end up with
# tracebacks in the logs even for handled exceptions.
logging.getLogger("elasticsearch").disabled = True
ALL_SEARCH_FIELDS = [
"author",
"title",
"abstract",
"comments",
"journal_ref",
"acm_class",
"msc_class",
"report_num",
from string import punctuation
from elasticsearch_dsl import Search, Q, SF
from arxiv.base import logging
from search.domain import SimpleQuery, Query, AdvancedQuery, Classification, \
ClassificationList
from .util import strip_tex, Q_, is_tex_query, is_literal_query, escape, \
wildcard_escape, remove_single_characters, has_wildcard, is_old_papernum, \
parse_date, parse_date_partial
from .highlighting import HIGHLIGHT_TAG_OPEN, HIGHLIGHT_TAG_CLOSE
from .authors import author_query, author_id_query, orcid_query
logger = logging.getLogger(__name__)
# Earliest year treated as valid.
# NOTE(review): presumably bounds date-based queries -- confirm at call sites.
START_YEAR = 1991
# Current calendar year. NOTE(review): evaluated once at import time, so a
# long-running process keeps the year in which it was started.
END_YEAR = datetime.now().year
def _query_title(term: str, default_operator: str = 'AND') -> Q:
    """
    Build the query clause for a search on the title field.

    Terms recognised by ``is_tex_query`` become a ``match`` query on
    ``title.tex``. All other terms become a ``query_string`` query on
    ``title.english``, with ``title`` added when ``is_literal_query`` says
    the term is a literal. Leading wildcards are not allowed.

    Parameters
    ----------
    term : str
        The search term.
    default_operator : str
        Operator passed to the ``query_string`` query (default ``'AND'``).

    Returns
    -------
    Q
        The query clause for the title.

    """
    if is_tex_query(term):
        tex_clause = {'title.tex': {'query': term}}
        return Q("match", **tex_clause)

    if is_literal_query(term):
        search_fields = ['title.english', 'title']
    else:
        search_fields = ['title.english']

    return Q(
        "query_string",
        fields=search_fields,
        default_operator=default_operator,
        allow_leading_wildcard=False,
        query=escape(term),
    )
def _query_abstract(term: str, default_operator: str = 'AND') -> Q:
"""Web Server Gateway Interface entry-point for classic API."""
import os
from arxiv.base import logging
from search.factory import create_classic_api_web_app
logger = logging.getLogger(__name__)
__flask_app__ = None
def application(environ, start_response):
"""WSGI application factory."""
for key, value in environ.items():
# In some deployment scenarios (e.g. uWSGI on k8s), uWSGI will pass in
# the hostname as part of the request environ. This will usually just
# be a container ID, which is not helpful for things like building
# URLs. We want to keep ``SERVER_NAME`` explicitly configured, either
# in config.py or via an os.environ var loaded by config.py.
if key == "SERVER_NAME":
continue
if type(value) is str:
os.environ[key] = value
"""Creates the Kinesis stream if it does not already exist."""
import boto3
from botocore.exceptions import ClientError
from botocore.vendored.requests.exceptions import ConnectionError
from botocore.client import Config
import os
from search.factory import create_ui_web_app
from search.context import get_application_config
import sys
import time
from arxiv.base import logging
logger = logging.getLogger(__name__)
if __name__ == '__main__':
logger.debug('Pre-start routine for indexing agent')
app = create_ui_web_app()
app.app_context().push()
config = get_application_config()
logger.debug('App context initialized')
endpoint = config.get('KINESIS_ENDPOINT')
region = config.get('AWS_REGION', 'us-east-1')
access_key = config.get('AWS_ACCESS_KEY_ID', "")
aws_secret = config.get('AWS_SECRET_ACCESS_KEY', "")
verify = config.get('KINESIS_VERIFY') == 'true'
stream_name = config.get('KINESIS_STREAM', 'MetadataIsAvailable')
logger.debug(f'Kinesis endpoint: {endpoint}')
from browse.services.util.response_headers import abs_expires_header, \
mime_header_date
from browse.services.document import metadata
from browse.services.document.metadata import AbsException,\
AbsNotFoundException, AbsVersionNotFoundException, AbsDeletedException
from arxiv.browse.domain.identifier import Identifier, IdentifierException,\
IdentifierIsArchiveException
from browse.services.database import count_trackback_pings,\
get_trackback_ping_latest_date, has_sciencewise_ping, \
get_dblp_listing_path, get_dblp_authors
from browse.services.util.external_refs_cits import include_inspire_link,\
include_dblp_section, get_computed_dblp_listing_path, get_dblp_bibtex_path
from browse.services.document.config.external_refs_cits import DBLP_BASE_URL,\
DBLP_BIBTEX_PATH, DBLP_AUTHOR_SEARCH_PATH
logger = logging.getLogger(__name__)
Response = Tuple[Dict[str, Any], int, Dict[str, Any]]
truncate_author_list_size = 100
def get_abs_page(arxiv_id: str) -> Response:
"""
Get abs page data from the document metadata service.
Parameters
----------
arxiv_id : str
The arXiv identifier as provided in the request.
download_format_pref: str
Download format preference.