                                     specific_abs,
                                     specific_ti)
    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                         start,
                                                         max_results)
    if only_recent:
        suffix = "&sortBy=submittedDate&sortOrder=descending"
        query += suffix
    full_url = base_url + query
    print(full_url)

    feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
    feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

    with urllib.request.urlopen(base_url + query) as url:
        response = url.read()
    feed = feedparser.parse(response)

    print('Feed title: %s' % feed.feed.title)
    print('Feed last updated: %s' % feed.feed.updated)
    print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
    print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)
    print('startIndex for this query: %s' % feed.feed.opensearch_startindex)

    rows = []
    for entry in feed.entries:  # extract information & add to list
        entry_id = entry.id.split('/abs/')[-1]
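For reference, here is a minimal, self-contained Python 3 sketch of the same flow. The helper name, default search terms, and result handling are illustrative assumptions, not code from the repository above:

import urllib.request

import feedparser

# Register the OpenSearch and arXiv namespaces so feedparser exposes
# opensearch_* and arxiv_* attributes on the parsed feed.
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

def fetch_arxiv(search_query='all:electron', start=0, max_results=10):
    # Hypothetical helper wrapping the steps shown in the fragment above.
    base_url = 'http://export.arxiv.org/api/query?'
    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                         start, max_results)
    with urllib.request.urlopen(base_url + query) as url:
        response = url.read()
    return feedparser.parse(response)

feed = fetch_arxiv()
print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
for entry in feed.entries:
    print(entry.id.split('/abs/')[-1], entry.title.replace('\n', ' '))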
import sys
from urllib.request import urlopen  # on Python 2: from urllib2 import urlopen

import feedparser

def do_query(self, keywords=None, old_id=None):
    """ perform the actual query """
    # note, in python3 this will be bytes not str
    response = urlopen(self.get_url()).read()
    response = response.replace(b"author", b"contributor")
    # this feedparser magic comes from the example of Julius Lucks / Andrea Zonca
    # https://github.com/zonca/python-parse-arxiv/blob/master/python_arXiv_parsing_example.py
    feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
    feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'
    feed = feedparser.parse(response)
    # opensearch_totalresults is parsed as a string, so cast before comparing
    if int(feed.feed.opensearch_totalresults) == 0:
        sys.exit("no results found")
    results = []
    latest_id = None
    for e in feed.entries:
        arxiv_id = e.id.split("/abs/")[-1]
        title = e.title.replace("\n", " ")
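The method above calls self.get_url(), which is not shown. A plausible sketch, assuming it builds the same export.arxiv.org query string used throughout this page (the attribute names are guesses, not confirmed by the snippet):

def get_url(self):
    # Hypothetical reconstruction; self.search_query, self.start, and
    # self.max_results are assumed attributes.
    base_url = "http://export.arxiv.org/api/query?"
    return base_url + "search_query=%s&start=%i&max_results=%i" % (
        self.search_query, self.start, self.max_results)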
import urlparse
import urllib

import feedparser

from vidscraper.bulk_import.utils import join_feeds

# add the OpenSearch namespace to FeedParser
# http://code.google.com/p/feedparser/issues/detail?id=55
feedparser._FeedParserMixin.namespaces[
    'http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'

def _opensearch_get(parsed_feed, key):
    return (parsed_feed.feed.get('opensearch_%s' % key) or
            parsed_feed.feed.get(key, None))

def video_count(parsed_feed):
    """
    Returns the number of videos that we think are in this feed in total. If
    the feed isn't a valid OpenSearch feed, return None.
    """
    if not (_opensearch_get(parsed_feed, 'startindex') and
            _opensearch_get(parsed_feed, 'itemsperpage') and
            _opensearch_get(parsed_feed, 'totalresults')):
        return None  # not a valid OpenSearch feed
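The snippet ends at the validity check; the success path is cut off. A brief usage sketch, assuming the function goes on to return the advertised total (the feed URL is a placeholder):

# Assumed continuation of video_count, not shown above:
#     return int(_opensearch_get(parsed_feed, 'totalresults'))

parsed = feedparser.parse('http://example.com/videos.atom')  # placeholder URL
count = video_count(parsed)
print('unknown total' if count is None else 'feed advertises %s videos' % count)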
import urlparse

import feedparser

from vidscraper.exceptions import UnhandledFeed
from vidscraper.suites import BaseSuite, registry
from vidscraper.utils.feedparser import (get_accepted_enclosures,
                                         struct_time_to_datetime)
from vidscraper.videos import FeedparserFeed, VideoFile

# add the Kaltura namespace to FeedParser.
# http://code.google.com/p/feedparser/issues/detail?id=55
feedparser._FeedParserMixin.namespaces[
    'http://kaltura.com/playlist/1.0'] = 'kaltura'

class Feed(FeedparserFeed):
    schemes = ('http', 'https')
    netlocs = ('kaltura.com', 'www.kaltura.com')
    path = '/index.php/partnerservices2/executeplaylist'
    page_url_format = ('http://www.kaltura.com/index.php/partnerservices2/'
                       'executeplaylist?format=8&partner_id={partner_id}'
                       '&subp_id={subp_id}&playlist_id={playlist_id}')

    def _next_page(self):
        if self.start_index != 1 or self.item_count > 0:
            raise StopIteration
        super(Feed, self)._next_page()
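For illustration, page_url_format expands as follows; the IDs are placeholders, not real Kaltura credentials:

url = Feed.page_url_format.format(partner_id='12345', subp_id='0',
                                  playlist_id='1_abcdef')
print(url)
# -> http://www.kaltura.com/index.php/partnerservices2/executeplaylist
#    ?format=8&partner_id=12345&subp_id=0&playlist_id=1_abcdef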
from datetime import datetime
import json
import re
import time
import urllib
import urlparse
from bs4 import BeautifulSoup, SoupStrainer
import feedparser
# add the OpenSearch namespace to FeedParser
# http://code.google.com/p/feedparser/issues/detail?id=55
feedparser._FeedParserMixin.namespaces[
    'http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
import requests
from vidscraper.exceptions import UnhandledVideo, UnhandledFeed
from vidscraper.suites import BaseSuite, registry
from vidscraper.utils.feedparser import struct_time_to_datetime
from vidscraper.videos import (BaseFeed, BaseSearch, VideoLoader,
OEmbedLoaderMixin, VideoFile)
# Information on the YouTube API can be found at the following links:
# * https://developers.google.com/youtube/2.0/developers_guide_protocol
# * https://developers.google.com/youtube/2.0/reference
class PathMixin(object):
# Search parameters
search_query = 'ti:vision+AND+cat:cs.AI'  # search for "vision" in titles within cs.AI
start = 0                                 # start at the first result
max_results = 50                          # retrieve at most 50 results

query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)

# OpenSearch metadata such as totalResults, startIndex,
# and itemsPerPage live in the opensearch namespace.
# Some entry metadata lives in the arXiv namespace.
# This is a hack to expose both of these namespaces in
# feedparser v4.1
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# perform a GET request using the base_url and query
response = urllib.urlopen(base_url + query).read()

# change author -> contributors (because contributors is a list)
response = response.replace('author', 'contributor')

# parse the response using feedparser
feed = feedparser.parse(response)

# print out feed information
#print 'Feed title: %s' % feed.feed.title
#print 'Feed last updated: %s' % feed.feed.updated

# print opensearch metadata
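The example is truncated at this point; the matching OpenSearch prints, identical to those shown earlier on this page (written with print() so they also run on Python 3), would be:

print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)
print('startIndex for this query: %s' % feed.feed.opensearch_startindex)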
    Raises:
        ValueError: If formatting operator is not supported
        AssertionError: If url parsing went wrong

    Returns:
        dict: Dictionary of results
    """
    # Base api query url
    base_url = "http://export.arxiv.org/api/query?"

    # Expose both the OpenSearch and arXiv metadata namespaces in feedparser
    feedparser._FeedParserMixin.namespaces[  # pylint: disable=W0212
        "http://a9.com/-/spec/opensearch/1.1/"
    ] = "opensearch"
    feedparser._FeedParserMixin.namespaces[  # pylint: disable=W0212
        "http://arxiv.org/schemas/atom"
    ] = "arxiv"

def __init__(self, entries=None):
    self._entries = []
    if entries:
        self._entries.extend(entries)

def _check(self, element, key, value):
    """Checks elements and formats them

    Args:
        element (obj): Document object
        key (string): Key in document object
        value (string): Value of the corresponding document object
import requests
import urllib
import feedparser
from docopt import docopt
import os

base_url = "http://export.arxiv.org/api/query?"
search_query = "DeepLab"
start = 0
max_result = 10

feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

''' A function to query the arXiv API '''
def get_query():
    query = 'search_query=%s&start=%i&max_results=%i' % (search_query, start, max_result)
    # let feedparser fetch the URL itself; it sets feed['status'] for HTTP
    # fetches, so a separate urllib request is unnecessary
    feed = feedparser.parse(base_url + query)
    if feed.get('status') != 200:
        raise Exception("HTTP Error " + str(feed.get('status', 'no status')) + " in query")
    return feed

''' Function to print the list of papers that was queried for '''
def print_query(feed):
    # assumed body: the snippet is truncated at the definition line
    for entry in feed.entries:
        print('%s - %s' % (entry.id.split('/abs/')[-1], entry.title))
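A short usage sketch tying the two helpers together (an assumption, since the original snippet ends at the definition):

if __name__ == '__main__':
    feed = get_query()
    print_query(feed)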