How to use the feedparser._FeedParserMixin.namespaces mapping in feedparser

To help you get started, we've selected a few feedparser examples based on popular ways it is used in public projects.

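feedparser._FeedParserMixin.namespaces is a plain dict on feedparser's parser mixin class: it maps namespace URIs to the short prefixes feedparser uses when naming parsed elements. Adding an entry before calling feedparser.parse() makes elements from that namespace reachable under the chosen prefix (for example, OpenSearch's totalResults becomes feed.feed.opensearch_totalresults). Below is a minimal sketch of the pattern the projects on this page use; the arXiv query URL is only illustrative, and on recent feedparser releases the class may live at feedparser.mixin._FeedParserMixin rather than at the package top level.

import urllib.request

import feedparser

# Register extra namespaces before parsing. _FeedParserMixin is private API,
# so this is a long-standing workaround rather than a documented feature.
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# Illustrative arXiv API query.
url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=5'
with urllib.request.urlopen(url) as resp:
    feed = feedparser.parse(resp.read())

# OpenSearch metadata is now exposed under the 'opensearch' prefix ...
print('totalResults: %s' % feed.feed.opensearch_totalresults)

# ... and arXiv-specific entry elements under the 'arxiv' prefix.
for entry in feed.entries:
    print(entry.id, entry.get('arxiv_comment', '(no arxiv:comment)'))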

github logangraham / arXausality / fetch_papers.py View on Github external
                                                        specific_abs,
                                                        specific_ti)

    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                         start,
                                                         max_results)

    if only_recent:
        suffix = "&sortBy=submittedDate&sortOrder=descending"
        query += suffix

    full_url = base_url + query
    print(full_url)

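    # register the OpenSearch and arXiv namespaces so their elements get the 'opensearch_'/'arxiv_' prefixes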
    feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
    feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

    with urllib.request.urlopen(base_url+query) as url:
        response = url.read()

    feed = feedparser.parse(response)

    print('Feed title: %s' % feed.feed.title)
    print('Feed last updated: %s' % feed.feed.updated)

    print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
    print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)
    print('startIndex for this query: %s'   % feed.feed.opensearch_startindex)

    rows = []
    for entry in feed.entries:  # extract information & add to list
        entry_id = entry.id.split('/abs/')[-1]
github logangraham / arXausality / pull_weekly_update.py View on Github external
                                                        general_ti,
                                                        specific_abs,
                                                        specific_ti)

    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                         start,
                                                         max_results)

    if only_recent:
        suffix = "&sortBy=submittedDate&sortOrder=descending"
        query += suffix

    full_url = base_url + query
    print(full_url)

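    # register the OpenSearch and arXiv namespaces so their elements get the 'opensearch_'/'arxiv_' prefixes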
    feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
    feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

    response = urllib.urlopen(base_url+query).read()

    feed = feedparser.parse(response)

    print('Feed title: %s' % feed.feed.title)
    print('Feed last updated: %s' % feed.feed.updated)

    print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
    print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)
    print('startIndex for this query: %s'   % feed.feed.opensearch_startindex)

    rows = []
    for entry in feed.entries:  # extract information & add to list
        entry_id = entry.id.split('/abs/')[-1]
github zingale / lazy-astroph / lazy_astroph.py View on Github external
    def do_query(self, keywords=None, old_id=None):
        """ perform the actual query """

        # note, in python3 this will be bytes not str
        response = urlopen(self.get_url()).read()
        response = response.replace(b"author", b"contributor")

        # this feedparser magic comes from the example of Julius Lucks / Andrea Zonca
        # https://github.com/zonca/python-parse-arxiv/blob/master/python_arXiv_parsing_example.py
        feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
        feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

        feed = feedparser.parse(response)

        if int(feed.feed.opensearch_totalresults) == 0:
            sys.exit("no results found")

        results = []

        latest_id = None

        for e in feed.entries:

            arxiv_id = e.id.split("/abs/")[-1]
            title = e.title.replace("\n", " ")
github pculture / vidscraper / vidscraper / bulk_import / opensearch.py View on Github external
import urlparse
import urllib

import feedparser

from vidscraper.bulk_import.utils import join_feeds

# add the OpenSearch namespace to FeedParser
# http://code.google.com/p/feedparser/issues/detail?id=55
feedparser._FeedParserMixin.namespaces[
    'http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'


def _opensearch_get(parsed_feed, key):
    return (parsed_feed.feed.get('opensearch_%s' % key) or
            parsed_feed.feed.get(key, None))

def video_count(parsed_feed):
    """
    Returns the number of videos that we think are in this feed in total.  If
    the feed isn't a valid OpenSearch feed, return None.
    """
    if not (_opensearch_get(parsed_feed, 'startindex') and
            _opensearch_get(parsed_feed, 'itemsperpage') and
            _opensearch_get(parsed_feed, 'totalresults')):
        return None # not a valid OpenSearch feed
github pculture / vidscraper / vidscraper / suites / kaltura.py View on Github external
import urlparse

import feedparser

from vidscraper.exceptions import UnhandledFeed
from vidscraper.suites import BaseSuite, registry
from vidscraper.utils.feedparser import (get_accepted_enclosures,
                                         struct_time_to_datetime)
from vidscraper.videos import FeedparserFeed, VideoFile


# add the Kaltura namespace to FeedParser.
# http://code.google.com/p/feedparser/issues/detail?id=55
feedparser._FeedParserMixin.namespaces[
    'http://kaltura.com/playlist/1.0'] = 'kaltura'


class Feed(FeedparserFeed):
    schemes = ('http', 'https')
    netlocs = ('kaltura.com', 'www.kaltura.com')
    path = '/index.php/partnerservices2/executeplaylist'
    page_url_format = ('http://www.kaltura.com/index.php/partnerservices2/'
                       'executeplaylist?format=8&partner_id={partner_id}'
                       '&subp_id={subp_id}&playlist_id={playlist_id}')

    def _next_page(self):
        if self.start_index != 1 or self.item_count > 0:
            raise StopIteration
        super(Feed, self)._next_page()
github pculture / vidscraper / vidscraper / suites / youtube.py View on Github external
from datetime import datetime
import json
import re
import time
import urllib
import urlparse

from bs4 import BeautifulSoup, SoupStrainer

import feedparser
# add the OpenSearch namespace to FeedParser
# http://code.google.com/p/feedparser/issues/detail?id=55
feedparser._FeedParserMixin.namespaces[
    'http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
import requests

from vidscraper.exceptions import UnhandledVideo, UnhandledFeed
from vidscraper.suites import BaseSuite, registry
from vidscraper.utils.feedparser import struct_time_to_datetime
from vidscraper.videos import (BaseFeed, BaseSearch, VideoLoader,
                               OEmbedLoaderMixin, VideoFile)


# Information on the YouTube API can be found at the following links:
# * https://developers.google.com/youtube/2.0/developers_guide_protocol
# * https://developers.google.com/youtube/2.0/reference


class PathMixin(object):
github wowdd1 / xlinkBook / arXiv.py View on Github external
# Search parameters
search_query = 'ti:vision+AND+cat:cs.AI' # search for 'vision' in the title within the cs.AI category
start = 0                     # start at the first result
max_results = 50              # retrieve up to 50 results

query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)

# OpenSearch metadata such as totalResults, startIndex,
# and itemsPerPage live in the opensearch namespace.
# Some entry metadata lives in the arXiv namespace.
# This is a hack to expose both of these namespaces in
# feedparser v4.1
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# perform a GET request using the base_url and query
response = urllib.urlopen(base_url+query).read()

# change author -> contributor (feedparser then exposes them as a 'contributors' list)
response = response.replace('author','contributor')

# parse the response using feedparser
feed = feedparser.parse(response)

# print out feed information
#print 'Feed title: %s' % feed.feed.title
#print 'Feed last updated: %s' % feed.feed.updated

# print opensearch metadata
github 8planes / mirosubs / libs / vidscraper / bulk_import / opensearch.py View on Github external
import feedparser

from vidscraper.bulk_import import util

# add the OpenSearch namespace to FeedParser
# http://code.google.com/p/feedparser/issues/detail?id=55
feedparser._FeedParserMixin.namespaces[
    'http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'


def _opensearch_get(parsed_feed, key):
    return (parsed_feed.feed.get('opensearch_%s' % key) or
            parsed_feed.feed.get(key, None))

def video_count(parsed_feed):
    """
    Returns the number of videos that we think are in this feed in total.  If
    the feed isn't a valid OpenSearch feed, return None.
    """
    if not (_opensearch_get(parsed_feed, 'startindex') and
            _opensearch_get(parsed_feed, 'itemsperpage') and
            _opensearch_get(parsed_feed, 'totalresults')):
        return None # not a valid OpenSearch feed
github Keep-Current / web-miner / webminer / use_cases / request_arxiv / arxiv_repo.py View on Github external
    Raises:
        ValueError: If formatting operator is not supported
        AssertionError: If url parsing went wrong

    Returns:
        dict: Dictionary of results
    """

    # Base api query url
    base_url = "http://export.arxiv.org/api/query?"

    # Expose both the OpenSearch and arXiv metadata namespaces in feedparser
    feedparser._FeedParserMixin.namespaces[  # pylint: disable=W0212
        "http://a9.com/-/spec/opensearch/1.1/"
    ] = "opensearch"
    feedparser._FeedParserMixin.namespaces[  # pylint: disable=W0212
        "http://arxiv.org/schemas/atom"
    ] = "arxiv"

    def __init__(self, entries=None):
        self._entries = []
        if entries:
            self._entries.extend(entries)

    def _check(self, element, key, value):
        """Checks elements and formats them

        Args:
            element (obj): Document object
            key (string): Key in document object
            value (string): Value of the corresponding document object
github pvskand / arXiv_download / arXiv.py View on Github external
import requests
import urllib
import feedparser
from docopt import docopt
import os

base_url = "http://export.arxiv.org/api/query?"

search_query = "DeepLab"
start = 0
max_result = 10


feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

''' A function to query from the arXiv API '''

def get_query():
    query = 'search_query=%s&start=%i&max_results=%i' % (search_query, start, max_result)
    # let feedparser fetch the URL itself so the HTTP status is recorded on the result
    feed = feedparser.parse(base_url + query)
    if feed.get('status') != 200:
        raise Exception("HTTP Error " + str(feed.get('status', 'no status')) + " in query")

    return feed

''' Function to print the list of papers returned by the query '''

def print_query(feed):