How to use internetarchive - 10 common examples

To help you get started, we’ve selected a few internetarchive examples based on popular ways the library is used in public projects.
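
Most of the snippets below revolve around a handful of top-level calls: get_item(), Item.upload(), and download(). As a minimal, self-contained sketch (the identifier, file name, and metadata here are placeholders, not taken from any of the projects below):

import internetarchive

# Credentials can be stored once with configure(); the values here are placeholders.
# internetarchive.configure('user@example.com', 'password')

# Look up an item by identifier and upload a file with some metadata.
item = internetarchive.get_item('my-example-item')
if not item.exists:
    item.upload('example.txt', metadata=dict(title='Example', mediatype='texts'))

# Download the text files attached to an existing item.
internetarchive.download('my-example-item', glob_pattern='*.txt')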


github bibanon / tubeup / tests / test_tubeup.py
import os
import json
import time
import requests_mock
import glob
import internetarchive

from logging import Logger
from tubeup.TubeUp import TubeUp, DOWNLOAD_DIR_NAME
from tubeup.utils import LogErrorToStdout
from youtube_dl import YoutubeDL
from .constants import info_dict_playlist, info_dict_video


current_path = os.path.dirname(os.path.realpath(__file__))

INTERNET_ARCHIVE_VERSION = 'Internet Archive Python library {0}'.format(
    internetarchive.__version__)


def get_testfile_path(name):
    return os.path.join(current_path, 'test_tubeup_files', name)


def mocked_ydl_progress_hook(d):
    pass


def mock_upload_response_by_videobasename(m, ia_id, videobasename):
    files_to_upload = glob.glob(videobasename + '*')

    for file_path in files_to_upload:
        filename = os.path.basename(file_path)
        # Snippet truncated in the source; a minimal success response for the
        # archive.org S3 upload endpoint is assumed here.
        m.put('https://s3.us.archive.org/%s/%s' % (ia_id, filename),
              content=b'', status_code=200)
github gdamdam / iagitup / iagitup / iagitup.py
    # here we set the ia identifier
    itemname = '%s-%s_-_%s' % ('github.com', repo_name, pushed_date)
    title = '%s' % (itemname)

    # initializing the main metadata
    meta = dict(mediatype=mediatype, creator=uploader_name, collection=collection, title=title, year=year, date=date, \
           subject=subject, uploaded_with=uploader, originalurl=originalurl, pushed_date=raw_pushed_date, description=description)

    # override default metadata with any supplemental metadata provided.
    if custom_meta is not None:
        meta.update(custom_meta)

    try:
        # upload the item to the Internet Archive
        print(("Creating item on Internet Archive: %s") % meta['title'])
        item = internetarchive.get_item(itemname)
        # checking if the item already exists:
        if not item.exists:
            print(("Uploading file to the internet archive: %s") % bundle_file)
            item.upload(bundle_file, metadata=meta, retries=9001, request_kwargs=dict(timeout=9001), delete=False)
            # upload the item to the Internet Archive
            print("Uploading avatar...")
            item.upload('{}/cover.jpg'.format(gh_repo_folder), retries=9001, request_kwargs=dict(timeout=9001), delete=True)
        else:
            print("\nSTOP: The same repository seems already archived.")
            print(("---->>  Archived repository URL: \n \thttps://archive.org/details/%s") % itemname)
            print("---->>  Archived git bundle file: \n \thttps://archive.org/download/{0}/{1}.bundle \n\n".format(itemname,bundle_filename))
            shutil.rmtree(gh_repo_folder)
            exit(0)

    except Exception as e:
        print(str(e))
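
The library calls above follow a common create-or-skip pattern: get_item(), a check on item.exists, then item.upload() with a metadata dict. Stripped of the project-specific printing and cleanup, the pattern looks roughly like this (identifier, file name, and metadata values are placeholders):

import internetarchive

meta = dict(mediatype='software', title='github.com-example-repo',
            description='Placeholder metadata for illustration.')

item = internetarchive.get_item('github.com-example-repo_-_2024-01-01')
if not item.exists:
    item.upload('repo.bundle', metadata=meta,
                retries=9001, request_kwargs=dict(timeout=9001), delete=False)
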
github WikiTeam / wikiteam / uploader.py
        wiki = wiki.lower()
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})

        wikiname = prefix.split('-')[0]
        dumps = []
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            item = get_item('wiki-' + wikiname)
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print('DELETED %s-%s-wikidump/' % (wikiname, wikidate))
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                    # Simplistic quick & dirty check for the presence of this file in the item
                    stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE,
                                                      universal_newlines=True).communicate()
                    dumphash = re.sub(' +.+\n?', '', stdout)

                    if dumphash in map(lambda x: x['md5'], item.files):
                        log(wiki, dump, 'verified')
                        rmline = 'rm -rf %s' % dump
                        if not os.system(rmline):
                            print('DELETED ' + dump)
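
The md5 check above works because the library exposes item.files as a list of per-file metadata dicts (each with keys such as 'name' and 'md5'). A small sketch of the same verification, with a hypothetical identifier and path:

import hashlib
from internetarchive import get_item

def dump_already_uploaded(identifier, local_path):
    # Compare the local file's md5 with the md5s recorded on the item.
    with open(local_path, 'rb') as fh:
        local_md5 = hashlib.md5(fh.read()).hexdigest()
    item = get_item(identifier)
    return any(f.get('md5') == local_md5 for f in item.files)

# e.g. dump_already_uploaded('wiki-examplewiki', 'examplewiki-20240101-wikidump.7z')
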
github ArchiveTeam / NewsGrabber / server_main / upload.py
def upload_single(self, name, f, ia_args):
        with open(settings.keys, 'r') as keys:
            access_key, secret_key = keys.read().strip().split(':')
        try:
            internetarchive.upload('archiveteam_newssites_{name}'.format(name=name),
                os.path.join(settings.dir_ready, f),
                metadata=ia_args,
                access_key=access_key,
                secret_key=secret_key,
                queue_derive=True,
                verify=True,
                verbose=True,
                delete=True,
                retries=10,
                retries_sleep=300)
        except:
            pass # see code below
        self.concurrent_uploads -= 1
        os.remove(os.path.join(settings.dir_ready, f+'.upload'))
        if os.path.isfile(os.path.join(settings.dir_ready, f)):
            # Snippet truncated in the source; closing the format call with the obvious argument.
            settings.irc_bot.send('PRIVMSG', '{name} uploaded unsuccessful.'.format(
                name=name))
github coreybobco / generativepoetry-py / generativepoetry / decomposer.py
import re
from urllib.parse import urlsplit

from internetarchive import download


# validate_url comes from this project's own helpers, not from internetarchive.
def get_internet_archive_document(url) -> str:
    """Downloads a document (book, etc.) from Internet Archive and returns it as a string. The linked document must
       have a text version. PDF text extraction is not supported at this time.
    """
    validate_url(url, expected_netloc='archive.org')
    url_parts = urlsplit(url).path.split("/")
    if len(url_parts) > 2:
        document_id = url_parts[2]
    else:
        raise Exception(f'Not a valid url')
    try:
        response = download(document_id, glob_pattern="*txt", return_responses=True)[0]
        # Remove single newlines, preserve double newlines (because they demarcate paragraphs).
        # The regex was cut off in the source; this reconstruction matches the comment above.
        text = re.sub(r'(?<!\n)\n(?!\n)', ' ', response.text)
        return text
    except Exception:
        # Snippet truncated in the source; original error handling not shown.
        raise
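
download() normally writes files to disk; the return_responses=True flag used above makes it return the underlying requests.Response objects instead, so the text can be read without touching the filesystem. A minimal sketch with a placeholder identifier:

from internetarchive import download

# Fetch an item's plain-text file into memory rather than onto disk.
responses = download('my-example-item', glob_pattern='*txt', return_responses=True)
text = responses[0].text
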
github sushant354 / egazette / gvision.py
def delete_imagepdf(self, item, abby_filegz):
        head, abby_file = os.path.split(abby_filegz)
        pdffile = re.sub('_abbyy.gz$', '.pdf', abby_file)

        itemobj = internetarchive.get_item(item)
        fileobj = internetarchive.File(itemobj, pdffile)
        if fileobj and fileobj.source == 'derivative' and \
                fileobj.format == 'Image Container PDF':
            fileobj.delete(access_key=self.access_key, headers=self.headers,
                           secret_key=self.secret_key)
            self.logger.warning('Old image pdf exists in %s. Deleted it', item)
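
internetarchive.File, as used above, wraps a single file that already exists on an item, and its delete() method removes that file using the account's S3 keys. A stripped-down sketch (identifier, filename, and keys are placeholders):

import internetarchive

item = internetarchive.get_item('my-example-item')
fobj = internetarchive.File(item, 'old-derivative.pdf')
if fobj and fobj.source == 'derivative':
    fobj.delete(access_key='ACCESS_KEY', secret_key='SECRET_KEY')
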
github pastpages / pastpages.org / archive / models.py
            saved_crop = self.save_crop()
            files.append(saved_crop)
        internetarchive.upload(
            self.ia_id,
            files,
            metadata=self.ia_metadata,
            access_key=settings.IA_ACCESS_KEY_ID,
            secret_key=settings.IA_SECRET_ACCESS_KEY,
            checksum=False,
            verbose=True
        )
        if self.has_image:
            os.remove(saved_image)
        if self.has_crop:
            os.remove(saved_crop)
        return internetarchive.get_item(self.ia_id)
github pastpages / pastpages.org / archive / models.py
def get_ia_item(self):
        logger.debug("Getting IA item for {}".format(self.ia_id))
        config = dict(s3=dict(access=settings.IA_ACCESS_KEY_ID, secret=settings.IA_SECRET_ACCESS_KEY))
        return internetarchive.get_item(self.ia_id, config=config)
github rohit-dua / BUB / bot / worker.py
def get_valid_identifier(self, primary=True):
        """Iterate over identifiers suffixed by _, until found."""
        if self.ia_identifier:
            ia_key = self.ia_identifier
        else:
            ia_key = "%s_%s_%s" %('bub', self.library, self.Id)
        item = ia.get_item(ia_key)
        if not item.exists and primary:
            return item
        for index in range(2,10):
            item = ia.get_item("%s_%s" %(ia_key, index))
            if item.identifier == self.ia_identifier:
                continue
            if not item.exists:
                return item
        item = ia.get_item(urandom(16).hex())  # random hex string as a last-resort identifier
        return item
github ekansa / open-context-py / opencontext_py / apps / ocitems / mediafiles / internetarchive.py
if isinstance(json_ld, dict):
            # cache the remote file locally to upload it
            item_id = self.id_prefix + '-' + json_ld['slug']
            if not isinstance(cache_dir, str):
                cache_dir = self.cache_file_dir
            dir_file = self.bin_file_obj.join_dir_filename(file_name,
                                                           cache_dir)
            if not os.path.exists(dir_file):
                print('Cannot find the cached file: ' + dir_file + '!')
            else:
                sleep(self.delay_before_request)
                print('Ready to upload: ' + file_name)
                # start an internet archive session
                s = self.start_ia_session()
                # get or make an item
                item = get_item(item_id,
                                archive_session=s,
                                debug=True)
                # now upload file
                try:
                    # sometimes the connect fails with an uncaught exception, so
                    # catch it here.
                    r = item.upload_file(dir_file,
                                         key=file_name,
                                         metadata=metadata)
                    # set the uri for the media item just uploaded
                    if r.status_code == requests.codes.ok or self.save_db:
                        ia_file_uri = self.make_ia_image_uri(item_id, file_name)
                except:
                    print('Upload failure for: ' + file_name + ' uuid: ' + man_obj.uuid)
                    ia_file_uri = None
        return ia_file_uri
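
Item.upload_file(), used above, uploads a single local file to the item under the remote name given by key and returns a response whose status code can be checked, as the snippet does. A minimal sketch built on a session (identifier, path, and metadata are placeholders):

from internetarchive import get_session

s = get_session()  # reads credentials from the ia config file, if one is present
item = s.get_item('my-example-item')
r = item.upload_file('cache/files/example.jpg', key='example.jpg',
                     metadata=dict(title='Example image'))
print(r.status_code)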