How to use the internetarchive.download function in internetarchive

To help you get started, we’ve selected a few internetarchive examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github coreybobco / generativepoetry-py / generativepoetry / decomposer.py View on Github external
def get_internet_archive_document(url) -> str:
    """Downloads a document (book, etc.) from Internet Archive and returns it as a string. The linked document must
       have a text version. PDF text extraction is not supported at this time.
       Returns a ParsedText instance.
    """
    validate_url(url, expected_netloc='archive.org')
    url_parts = urlsplit(url).path.split("/")
    if len(url_parts) > 2:
        document_id = url_parts[2]
    else:
        raise Exception(f'Not a valid url')
    try:
        response = download(document_id, glob_pattern="*txt", return_responses=True)[0]
        # Remove single newlines, preserve double newlines (because they demarcate paragraphs)
        text = re.sub('(?
github sushant354 / egazette / gvision.py View on Github external
def download_jp2(self, item, glob_pattern):
    """Download the files of *item* matching *glob_pattern* into ``self.top_dir``.

    The call to ``download`` is repeated until it finishes without raising.
    Any ``Exception`` raised by an attempt is discarded and the next attempt
    starts after a 60 second pause, so this method only returns once a
    download call has completed.
    """
    while True:
        try:
            download(item, glob_pattern=glob_pattern, destdir=self.top_dir,
                     ignore_existing=True, retries=10)
        except Exception:
            # Back off for a minute before trying the same item again.
            time.sleep(60)
        else:
            return
github gittb / audiosandbox / scrape / podcast_scrape.py View on Github external
from internetarchive import download

# Walk the search results for one Internet Archive collection and download
# the audio/video files of every item that is not already present locally.
ident = 'podcasts'
destifolder = 'iapodcasts'
search = ia.search_items('collection:%s' % ident)
# Names already present in the destination folder; a set keeps the
# per-item "already downloaded?" check constant-time.
current = set(os.listdir(destifolder))

# Kept defined up front so `num` exists even when the search yields nothing.
num = 0

for num, result in enumerate(search, start=1):  # for all items in a collection
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)
    if itemid not in current:
        try:
            download(itemid, destdir=destifolder, retries=5, glob_pattern=['*.ogg', '*.mp3', '*.wav', '*.flv'])
            print('\t\t Download success.')
        except Exception as e:
            # Report both the failing item and the reason; the loop then
            # carries on with the next item instead of aborting the run.
            print("Error Occurred downloading {} = {}".format(itemid, e))
            print('Pausing for 20 minutes')
            # NOTE(review): the pause announced above is currently disabled.
            #time.sleep(1200)
        #time.sleep(0.5)

    # Stop after 5000 search results have been processed.
    if num == 5000:
        break