import glob
import json
import os
import time
from logging import Logger

import internetarchive
import requests_mock
from youtube_dl import YoutubeDL

from tubeup.TubeUp import TubeUp, DOWNLOAD_DIR_NAME
from tubeup.utils import LogErrorToStdout

from .constants import info_dict_playlist, info_dict_video
current_path = os.path.dirname(os.path.realpath(__file__))

INTERNET_ARCHIVE_VERSION = 'Internet Archive Python library {0}'.format(
    internetarchive.__version__)

def get_testfile_path(name):
    return os.path.join(current_path, 'test_tubeup_files', name)

def mocked_ydl_progress_hook(d):
    pass

def mock_upload_response_by_videobasename(m, ia_id, videobasename):
    files_to_upload = glob.glob(videobasename + '*')
    for file_path in files_to_upload:
        filename = os.path.basename(file_path)
        # Register a mocked S3 PUT for each file the uploader would send; the
        # empty 200 response stands in for a successful archive.org upload
        # (call completed here, the original snippet was cut off mid-call).
        m.put('https://s3.us.archive.org/%s/%s' % (ia_id, filename),
              content_type='text/plain',
              text='')
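# Usage sketch for the mock helper above; the identifier and basename are
# illustrative values, with requests_mock.Mocker() supplying the adapter.
with requests_mock.Mocker() as mocker:
    mock_upload_response_by_videobasename(
        mocker, 'tubeup-test-item', get_testfile_path('example_video'))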
import shutil

import internetarchive

# Build the Internet Archive identifier for the item (repo_name, pushed_date
# and the metadata values below come from the surrounding script).
itemname = '%s-%s_-_%s' % ('github.com', repo_name, pushed_date)
title = itemname

# Initialize the main metadata.
meta = dict(mediatype=mediatype, creator=uploader_name, collection=collection,
            title=title, year=year, date=date, subject=subject,
            uploaded_with=uploader, originalurl=originalurl,
            pushed_date=raw_pushed_date, description=description)

# Override default metadata with any supplemental metadata provided.
if custom_meta is not None:
    meta.update(custom_meta)

try:
    print('Creating item on Internet Archive: %s' % meta['title'])
    item = internetarchive.get_item(itemname)

    # Check whether the item already exists.
    if not item.exists:
        # Upload the git bundle to the Internet Archive.
        print('Uploading file to the Internet Archive: %s' % bundle_file)
        item.upload(bundle_file, metadata=meta, retries=9001,
                    request_kwargs=dict(timeout=9001), delete=False)

        # Upload the repository avatar.
        print('Uploading avatar...')
        item.upload('{}/cover.jpg'.format(gh_repo_folder), retries=9001,
                    request_kwargs=dict(timeout=9001), delete=True)
    else:
        print('\nSTOP: The same repository seems already archived.')
        print('---->> Archived repository URL: \n\thttps://archive.org/details/%s' % itemname)
        print('---->> Archived git bundle file: \n\thttps://archive.org/download/{0}/{1}.bundle \n\n'.format(itemname, bundle_filename))
        shutil.rmtree(gh_repo_folder)
        exit(0)
except Exception as e:
    print(str(e))
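# item.upload returns one requests Response per uploaded file, so success can
# be checked explicitly instead of assumed. A minimal sketch reusing
# item/bundle_file/meta from the snippet above:
responses = item.upload(bundle_file, metadata=meta, retries=9001,
                        request_kwargs=dict(timeout=9001), delete=False)
if all(r.status_code == 200 for r in responses):
    print('Bundle uploaded OK.')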
wiki = wiki.lower()
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
wikiname = prefix.split('-')[0]
dumps = []
for dirname, dirnames, filenames in os.walk('.'):
    if dirname == '.':
        # Collect this wiki's finished dump archives from the working directory.
        for f in filenames:
            if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                dumps.append(f)
        break

c = 0
for dump in dumps:
    wikidate = dump.split('-')[1]
    item = get_item('wiki-' + wikiname)
    if dump in uploadeddumps:
        if config['prune-directories']:
            rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
            # With -f the deletion might have happened before and we won't know.
            if not os.system(rmline):
                print('DELETED %s-%s-wikidump/' % (wikiname, wikidate))
        if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
            # Simplistic quick & dirty check for the presence of this file in the item.
            stdout, stderr = subprocess.Popen(['md5sum', dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
            dumphash = re.sub(r' +.+\n?', '', stdout.decode())
            if dumphash in [f['md5'] for f in item.files]:
                log(wiki, dump, 'verified')
                rmline = 'rm -rf %s' % dump
                if not os.system(rmline):
                    print('DELETED ' + dump)
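# The md5sum/rm shell-outs above are Unix-only. A portable sketch of the same
# verification step using hashlib; the commented lines reuse dump/item from
# the loop, so this is illustrative rather than drop-in:
import hashlib

def md5_of_file(path, chunk_size=1 << 20):
    """Hash a dump in chunks so large .7z files are never fully in memory."""
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

# if md5_of_file(dump) in [f['md5'] for f in item.files]:
#     log(wiki, dump, 'verified')
#     os.remove(dump)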
def upload_single(self, name, f, ia_args):
    with open(settings.keys, 'r') as keys:
        access_key, secret_key = keys.read().strip().split(':')
    try:
        internetarchive.upload('archiveteam_newssites_{name}'.format(name=name),
                               os.path.join(settings.dir_ready, f),
                               metadata=ia_args,
                               access_key=access_key,
                               secret_key=secret_key,
                               queue_derive=True,
                               verify=True,
                               verbose=True,
                               delete=True,
                               retries=10,
                               retries_sleep=300)
    except Exception:
        pass  # see code below; delete=True removes the local file only on success
    self.concurrent_uploads -= 1
    os.remove(os.path.join(settings.dir_ready, f + '.upload'))
    if os.path.isfile(os.path.join(settings.dir_ready, f)):
        # If the source file is still present the upload did not succeed.
        # (Call completed here; the original snippet was cut off mid-call.)
        settings.irc_bot.send('PRIVMSG', '{name} uploaded unsuccessfully.'.format(
            name=f))
import re
from urllib.parse import urlsplit

from internetarchive import download

# validate_url is a helper from the surrounding project.
def get_internet_archive_document(url) -> str:
    """Download a document (book, etc.) from Internet Archive and return its
    text as a string. The linked document must have a text version; PDF text
    extraction is not supported at this time.
    """
    validate_url(url, expected_netloc='archive.org')
    url_parts = urlsplit(url).path.split("/")
    if len(url_parts) > 2:
        document_id = url_parts[2]
    else:
        raise Exception(f'Not a valid url: {url}')
    try:
        response = download(document_id, glob_pattern="*txt",
                            return_responses=True)[0]
    except Exception as ex:
        raise Exception(f'Could not download text for {url}') from ex
    # Remove single newlines, preserve double newlines (because they demarcate
    # paragraphs). The lookaround pattern reconstructs the truncated original:
    # it matches a newline neither preceded nor followed by another newline.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', response.text)
    return text
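# Usage might look like this; the URL is a placeholder for any archive.org
# item that carries a plain-text file.
text = get_internet_archive_document(
    'https://archive.org/details/example-item')
print(text[:500])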
def delete_imagepdf(self, item, abby_filegz):
    head, abby_file = os.path.split(abby_filegz)
    pdffile = re.sub('_abbyy.gz$', '.pdf', abby_file)
    itemobj = internetarchive.get_item(item)
    fileobj = internetarchive.File(itemobj, pdffile)
    # Only remove the derivative image PDF, never an original upload.
    if fileobj and fileobj.source == 'derivative' and \
            fileobj.format == 'Image Container PDF':
        fileobj.delete(access_key=self.access_key, headers=self.headers,
                       secret_key=self.secret_key)
        self.logger.warning('Old image pdf exists in %s. Deleted it', item)
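# Before deleting, listing which files an item marks as derivatives can
# confirm the target. Sketch; 'example-item' is a placeholder identifier and
# item.files is a list of per-file metadata dicts.
itemobj = internetarchive.get_item('example-item')
derivative_pdfs = [f['name'] for f in itemobj.files
                   if f.get('source') == 'derivative' and f['name'].endswith('.pdf')]
print(derivative_pdfs)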
saved_crop = self.save_crop()
files.append(saved_crop)
internetarchive.upload(
    self.ia_id,
    files,
    metadata=self.ia_metadata,
    access_key=settings.IA_ACCESS_KEY_ID,
    secret_key=settings.IA_SECRET_ACCESS_KEY,
    checksum=False,
    verbose=True
)
# Remove the local copies now that they have been uploaded.
if self.has_image:
    os.remove(saved_image)
if self.has_crop:
    os.remove(saved_crop)
return internetarchive.get_item(self.ia_id)
def get_ia_item(self):
    logger.debug("Getting IA item for {}".format(self.ia_id))
    config = dict(s3=dict(access=settings.IA_ACCESS_KEY_ID,
                          secret=settings.IA_SECRET_ACCESS_KEY))
    return internetarchive.get_item(self.ia_id, config=config)
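# When many items are fetched with the same credentials, an alternative
# (sketch, assuming the same settings module) is one reusable ArchiveSession:
session = internetarchive.get_session(config=dict(
    s3=dict(access=settings.IA_ACCESS_KEY_ID,
            secret=settings.IA_SECRET_ACCESS_KEY)))
item = session.get_item('example-item')  # placeholder identifier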
def get_valid_identifier(self, primary=True):
    """Iterate over identifiers suffixed by _<index> until a free one is found."""
    if self.ia_identifier:
        ia_key = self.ia_identifier
    else:
        ia_key = "%s_%s_%s" % ('bub', self.library, self.Id)
    item = ia.get_item(ia_key)
    if not item.exists and primary:
        return item
    for index in range(2, 10):
        item = ia.get_item("%s_%s" % (ia_key, index))
        if item.identifier == self.ia_identifier:
            continue
        if not item.exists:
            return item
    # Fall back to a random identifier if all suffixed candidates are taken
    # (bytes.hex() replaces the Python 2-only .encode("hex")).
    item = ia.get_item(urandom(16).hex())
    return item
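# The same probing logic, sketched outside the class for clarity;
# 'bub_examplelib_123' is a made-up base key and ia is the internetarchive
# module as above.
base = 'bub_examplelib_123'
candidates = [base] + ['%s_%s' % (base, i) for i in range(2, 10)]
free = next((key for key in candidates if not ia.get_item(key).exists), None)
print(free)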
if isinstance(json_ld, dict):
    # Cache the remote file locally so it can be uploaded.
    item_id = self.id_prefix + '-' + json_ld['slug']
    if not isinstance(cache_dir, str):
        cache_dir = self.cache_file_dir
    dir_file = self.bin_file_obj.join_dir_filename(file_name,
                                                   cache_dir)
    if not os.path.exists(dir_file):
        print('Cannot find the cached file: ' + dir_file + ' !')
    else:
        sleep(self.delay_before_request)
        print('Ready to upload: ' + file_name)
        # Start an Internet Archive session.
        s = self.start_ia_session()
        # Get or make an item.
        item = get_item(item_id,
                        archive_session=s,
                        debug=True)
        # Now upload the file.
        try:
            # Sometimes the connection fails with an uncaught exception,
            # so catch it here.
            r = item.upload_file(dir_file,
                                 key=file_name,
                                 metadata=metadata)
            # Set the URI for the media item just uploaded.
            if r.status_code == requests.codes.ok or self.save_db:
                ia_file_uri = self.make_ia_image_uri(item_id, file_name)
        except Exception:
            print('Upload failure for: ' + file_name + ' uuid: ' + man_obj.uuid)
            ia_file_uri = None
return ia_file_uri
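# For reference, make_ia_image_uri presumably builds a standard archive.org
# download URL. A hypothetical sketch of such a helper (name and layout are
# assumptions, not the project's actual code):
def make_ia_image_uri(item_id, file_name):
    # archive.org serves item files at /download/<identifier>/<filename>.
    return 'https://archive.org/download/{}/{}'.format(item_id, file_name)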