Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def get_internet_archive_document(url) -> str:
"""Downloads a document (book, etc.) from Internet Archive and returns it as a string. The linked document must
have a text version. PDF text extraction is not supported at this time.
Returns a ParsedText instance.
"""
validate_url(url, expected_netloc='archive.org')
url_parts = urlsplit(url).path.split("/")
if len(url_parts) > 2:
document_id = url_parts[2]
else:
raise Exception(f'Not a valid url')
try:
response = download(document_id, glob_pattern="*txt", return_responses=True)[0]
# Remove single newlines, preserve double newlines (because they demarcate paragraphs
text = re.sub('(?
def download_jp2(self, item, glob_pattern):
    """Download the files of ``item`` matching ``glob_pattern`` into ``self.top_dir``.

    The call to ``download`` is retried indefinitely: whenever it raises, the
    failure is logged and another attempt is made after a 60 second pause.
    The method returns only once a ``download`` call completes without raising.

    Args:
        item: Passed through as the first positional argument of ``download``.
        glob_pattern: Passed through as the ``glob_pattern`` keyword argument.
    """
    # Local import: the file's top-level import block is not visible from here.
    import logging

    while True:
        try:
            download(item, glob_pattern=glob_pattern, destdir=self.top_dir,
                     ignore_existing=True, retries=10)
        except Exception:
            # Deliberately best-effort (never give up), but record why the
            # attempt failed so a permanently failing item is diagnosable.
            logging.getLogger(__name__).warning(
                "download of %r (pattern %r) failed; retrying in 60 seconds",
                item, glob_pattern, exc_info=True)
            time.sleep(60)
        else:
            return
from internetarchive import download
# Walk every item of an Internet Archive collection and download its files
# matching the listed media extensions into a local folder. Items whose
# identifier already appears as an entry of that folder are not downloaded
# again. Processing stops after 5000 items.
ident = 'podcasts'           # collection searched via 'collection:<ident>'
destifolder = 'iapodcasts'   # local destination directory

search = ia.search_items('collection:%s' % ident)

# os.listdir raises FileNotFoundError for a missing directory, so make sure
# the destination exists before listing it (e.g. on the very first run).
os.makedirs(destifolder, exist_ok=True)
# A set keeps the "already present?" check O(1) for each item in the loop.
current = set(os.listdir(destifolder))

num = 0  # stays 0 when the search yields nothing
for num, result in enumerate(search, start=1):  # num is the 1-based item count
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)
    if itemid not in current:
        try:
            download(itemid, destdir=destifolder, retries=5,
                     glob_pattern=['*.ogg', '*.mp3', '*.wav', '*.flv'])
            print('\t\t Download success.')
        except Exception as e:
            # Report both the item and the reason it failed.
            print("Error Occurred downloading {} = {}".format(itemid, e))
            # NOTE(review): no pause actually happens here -- the original
            # time.sleep(1200) was commented out. Confirm whether the pause
            # should be restored or this message dropped.
            print('Pausing for 20 minutes')
    if num == 5000:  # hard cap on the number of items processed per run
        break