How to use the internetarchive.search_items function in internetarchive

To help you get started, we’ve selected a few internetarchive examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github eellak / gsoc2018-3gm / scripts / batch_ia_upload.py View on Github external
https://archive.org/details/greekGovernmentgazette''')
    # NOTE(review): fragment of an argparse-based uploader; the parser object and
    # the opening of the epilog string above are outside this view.
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')

    # arguments
    required.add_argument('-d', help='Input directory', required=True)
    optional.add_argument('--w', help='Number of workers', type=int, default=1)

    # NOTE(review): 'argparser' is not defined anywhere in the visible code --
    # this looks like a typo for 'parser' (the object the argument groups were
    # added to above); confirm against the full file.
    args = argparser.parse_args()

    # pdfs listed recursively
    pdfs = list_files(args.d, '.pdf', recursive=True)

    # frozenset for faster lookup
    # returns uploaded files
    # NOTE(review): 'uploaded' is never referenced in the visible portion --
    # presumably ia_upload (defined elsewhere) consults it; verify.
    uploaded = frozenset([x['identifier'] for x in search_items(
        'collection:greekgovernmentgazette')])

    # pool for multiprocessing
    pool = multiprocessing.Pool(args.w)
    pool.map(ia_upload, pdfs)
github unscrollinc / unscroll / server / bin / download / dl_archive_radio.py View on Github external
def search(search_term):
    """Iterate archive.org items matching *search_term* and build Collection
    records for items that carry "VBR MP3" files.

    NOTE(review): this function is truncated in the visible source -- the body
    continues below line 49, and 'dictset' is never used in the visible span
    (presumably appended to further down; confirm against the full file).
    """
    dictset = []
    
    for item in search_items(search_term).iter_as_items():
        # Keep only the VBR MP3 derivatives of this item.
        files = [file_meta for file_meta in item.files
                 if file_meta["format"] == "VBR MP3"]
        
        if (len(files) > 0):
            # Make Collection
            item_datetime = None
            if 'date' in item.metadata:
                # Normalize the item's date string to a midnight datetime.
                item_date = parse(item.metadata['date'])
                item_datetime = datetime.combine(item_date,
                                                 datetime.min.time())
            
                # NOTE(review): the Collection is constructed INSIDE the 'date'
                # branch, so items lacking a 'date' field are skipped entirely
                # even though item_datetime defaults to None -- confirm intended.
                c = Collection()
                c.id = item.identifier
                c.source = "{}{}".format(DETAILS_URL,
                                         item.identifier)
                c.description = None
github gittb / audiosandbox / scrape / podcast_scrape.py View on Github external
import os
import time
import sys
import internetarchive as ia
from internetarchive.session import ArchiveSession
from internetarchive import get_item
from internetarchive import download

# Mirror every audio file from one archive.org collection into a local folder.
ident = 'podcasts'          # identifier of the archive.org collection to crawl
destifolder = 'iapodcasts'  # local destination directory (must already exist)

# Lazily iterate all items in the collection.
search = ia.search_items('collection:%s' % ident)
# Set of already-present names for O(1) membership tests (was a list, O(n) per check).
current = set(os.listdir(destifolder))

for num, result in enumerate(search, start=1):  # for all items in the collection
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)
    if itemid not in current:
        try:
            # NOTE(review): glob_pattern as a list requires a recent
            # internetarchive release; older versions expect a single
            # '|'-joined pattern string -- confirm installed version.
            download(itemid, destdir=destifolder, retries=5,
                     glob_pattern=['*.ogg', '*.mp3', '*.wav', '*.flv'])
            print('\t\t Download success.')
        except Exception as e:
            # Bug fix: the original format string "Error Occurred downloading
            # () = {}" had one placeholder for two arguments, so the exception
            # text was dropped and a literal "()" was printed.
            print("Error Occurred downloading {} = {}".format(itemid, e))
            print('Pausing for 20 minutes')
            #time.sleep(1200)
github unscrollinc / unscroll / server / bin / download / archive.org / old-radio-news.py View on Github external
def search(search_term):
    """Iterate archive.org items matching *search_term* and build Collection
    records for items that carry "VBR MP3" files.

    NOTE(review): this function is truncated in the visible source -- the body
    continues below line 97, and 'dictset' is never used in the visible span
    (presumably appended to further down; confirm against the full file).
    """
    dictset = []
    
    for item in search_items(search_term).iter_as_items():
        # Keep only the VBR MP3 derivatives of this item.
        files = [file_meta for file_meta in item.files
                 if file_meta["format"] == "VBR MP3"]
        
        if (len(files) > 0):
            # Make Collection
            item_datetime = None
            if 'date' in item.metadata:
                # Normalize the item's date string to a midnight datetime.
                item_date = parse(item.metadata['date'])
                item_datetime = datetime.combine(item_date, datetime.min.time())
            
                # NOTE(review): the Collection is constructed INSIDE the 'date'
                # branch, so items lacking a 'date' field are skipped entirely
                # even though item_datetime defaults to None -- confirm intended.
                c = Collection()
                c.id = item.identifier
                c.source = "{}{}".format(DETAILS_URL,
                                         item.identifier)
                c.description = None # item.metadata['description']