# here we set the ia identifier
itemname = '%s-%s_-_%s' % ('github.com', repo_name, pushed_date)
title = '%s' % (itemname)
#initializing the main metadata
meta = dict(mediatype=mediatype, creator=uploader_name, collection=collection,
            title=title, year=year, date=date, subject=subject,
            uploaded_with=uploader, originalurl=originalurl,
            pushed_date=raw_pushed_date, description=description)
# override default metadata with any supplemental metadata provided.
if custom_meta is not None:
    meta.update(custom_meta)
try:
    # get or create the item on the Internet Archive
    print("Creating item on Internet Archive: %s" % meta['title'])
    item = internetarchive.get_item(itemname)
    # check whether the item already exists
    if not item.exists:
        print("Uploading file to the internet archive: %s" % bundle_file)
        item.upload(bundle_file, metadata=meta, retries=9001, request_kwargs=dict(timeout=9001), delete=False)
        # upload the repository avatar
        print("Uploading avatar...")
        item.upload('{}/cover.jpg'.format(gh_repo_folder), retries=9001, request_kwargs=dict(timeout=9001), delete=True)
    else:
        print("\nSTOP: The same repository seems already archived.")
        print("---->> Archived repository URL: \n \thttps://archive.org/details/%s" % itemname)
        print("---->> Archived git bundle file: \n \thttps://archive.org/download/{0}/{1}.bundle \n\n".format(itemname, bundle_filename))
        shutil.rmtree(gh_repo_folder)
        exit(0)
except Exception as e:
    print(str(e))
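
# A minimal, self-contained sketch of the pattern above, for reference: fetch an
# item by identifier, skip the upload when it already exists, otherwise push a
# file with metadata. The identifier, file path, and metadata below are
# hypothetical placeholders, not values from the code above.
import internetarchive

itemname = 'github.com-example-repo_-_2024-01-01'   # hypothetical identifier
bundle_file = '/tmp/example-repo.bundle'             # hypothetical local file
meta = dict(mediatype='software', title=itemname, description='git bundle mirror')

item = internetarchive.get_item(itemname)
if not item.exists:
    item.upload(bundle_file, metadata=meta, retries=3,
                request_kwargs=dict(timeout=300))
else:
    print('Already archived: https://archive.org/details/%s' % itemname)
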
wiki = wiki.lower()
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
wikiname = prefix.split('-')[0]
dumps = []
for dirname, dirnames, filenames in os.walk('.'):
    if dirname == '.':
        for f in filenames:
            if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                dumps.append(f)
        break
c = 0
for dump in dumps:
    wikidate = dump.split('-')[1]
    item = get_item('wiki-' + wikiname)
    if dump in uploadeddumps:
        if config['prune-directories']:
            rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
            # With -f the deletion might have happened before and we won't know
            if not os.system(rmline):
                print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
        if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
            # Simplistic quick&dirty check for the presence of this file in the item
            stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
            dumphash = re.sub(' +.+\n?', '', stdout)
            if dumphash in map(lambda x: x['md5'], item.files):
                log(wiki, dump, 'verified')
                rmline = 'rm -rf %s' % dump
                if not os.system(rmline):
                    print 'DELETED ' + dump
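
# A hedged sketch of the verification step above, computing the local dump's
# MD5 with hashlib instead of shelling out to md5sum, then comparing it against
# the item's file listing (item.files is a list of dicts with an 'md5' key, as
# used above). The dump filename and item identifier are hypothetical.
import hashlib
from internetarchive import get_item

dump = 'examplewiki-20240101-wikidump.7z'   # hypothetical local dump file
item = get_item('wiki-examplewiki')          # hypothetical item identifier

md5 = hashlib.md5()
with open(dump, 'rb') as fh:
    for chunk in iter(lambda: fh.read(1 << 20), b''):
        md5.update(chunk)

if md5.hexdigest() in [f['md5'] for f in item.files]:
    print('verified: %s is already in the item, safe to prune locally' % dump)
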
saved_crop = self.save_crop()
files.append(saved_crop)
internetarchive.upload(
    self.ia_id,
    files,
    metadata=self.ia_metadata,
    access_key=settings.IA_ACCESS_KEY_ID,
    secret_key=settings.IA_SECRET_ACCESS_KEY,
    checksum=False,
    verbose=True
)
if self.has_image:
    os.remove(saved_image)
if self.has_crop:
    os.remove(saved_crop)
return internetarchive.get_item(self.ia_id)

def get_ia_item(self):
    logger.debug("Getting IA item for {}".format(self.ia_id))
    config = dict(s3=dict(access=settings.IA_ACCESS_KEY_ID, secret=settings.IA_SECRET_ACCESS_KEY))
    return internetarchive.get_item(self.ia_id, config=config)

def get_valid_identifier(self, primary=True):
    """Iterate over identifiers suffixed with _<n> until an unused one is found."""
    if self.ia_identifier:
        ia_key = self.ia_identifier
    else:
        ia_key = "%s_%s_%s" % ('bub', self.library, self.Id)
    item = ia.get_item(ia_key)
    if not item.exists and primary:
        return item
    for index in range(2, 10):
        item = ia.get_item("%s_%s" % (ia_key, index))
        if item.identifier == self.ia_identifier:
            continue
        if not item.exists:
            return item
    # fall back to a random identifier (Python 2 hex encoding)
    item = ia.get_item(urandom(16).encode("hex"))
    return item
if isinstance(json_ld, dict):
    # cache the remote file locally to upload it
    item_id = self.id_prefix + '-' + json_ld['slug']
    if not isinstance(cache_dir, str):
        cache_dir = self.cache_file_dir
    dir_file = self.bin_file_obj.join_dir_filename(file_name, cache_dir)
    if not os.path.exists(dir_file):
        print('Cannot find the cached file: ' + dir_file + ' !')
    else:
        sleep(self.delay_before_request)
        print('Ready to upload: ' + file_name)
        # start an Internet Archive session
        s = self.start_ia_session()
        # get or make an item
        item = get_item(item_id,
                        archive_session=s,
                        debug=True)
        # now upload the file
        try:
            # sometimes the connection fails with an uncaught exception,
            # so catch it here
            r = item.upload_file(dir_file,
                                 key=file_name,
                                 metadata=metadata)
            # set the URI for the media item just uploaded
            if r.status_code == requests.codes.ok or self.save_db:
                ia_file_uri = self.make_ia_image_uri(item_id, file_name)
        except:
            print('Upload failure for: ' + file_name + ' uuid: ' + man_obj.uuid)
            ia_file_uri = None
return ia_file_uri
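
# A hedged sketch of the session-based flow above, substituting the library's
# own get_session() for the class's start_ia_session() helper; the identifier,
# local path, remote key, and metadata below are hypothetical placeholders.
from internetarchive import get_session, get_item

s = get_session()                                            # reads the local ia config by default
item = get_item('example-media-item', archive_session=s)     # hypothetical identifier
r = item.upload_file('/tmp/cache/image.jpg',                 # hypothetical cached file
                     key='image.jpg',
                     metadata=dict(mediatype='image', title='Example image'))
print(r.status_code)
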
os.remove(description_file_path)
# Delete empty annotations.xml file so it isn't uploaded
annotations_file_path = videobasename + '.annotations.xml'
if (os.path.exists(annotations_file_path) and
        (('annotations' in vid_meta and
          vid_meta['annotations'] in {'', EMPTY_ANNOTATION_FILE}) or
         check_is_file_empty(annotations_file_path))):
    os.remove(annotations_file_path)
# Upload all files with videobase name: e.g. video.mp4,
# video.info.json, video.srt, etc.
files_to_upload = glob.glob(videobasename + '*')
# Upload the item to the Internet Archive
item = internetarchive.get_item(itemname)
if custom_meta:
    metadata.update(custom_meta)
# Parse internetarchive configuration file.
parsed_ia_s3_config = parse_config_file(self.ia_config_path)[1]['s3']
s3_access_key = parsed_ia_s3_config['access']
s3_secret_key = parsed_ia_s3_config['secret']
if None in {s3_access_key, s3_secret_key}:
    msg = ('`internetarchive` configuration file is not configured'
           ' properly.')
    self.logger.error(msg)
    if self.verbose:
        print(msg)
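
# A hedged sketch tying the steps above together: read the S3 keys out of an
# internetarchive config file (indexed the same way as above) and pass them
# explicitly to item.upload(). The config path, identifier, file names, and
# metadata are hypothetical placeholders.
import internetarchive
from internetarchive.config import parse_config_file

ia_config_path = '/home/user/.config/ia.ini'    # hypothetical config location
s3_config = parse_config_file(ia_config_path)[1]['s3']
access_key, secret_key = s3_config['access'], s3_config['secret']

item = internetarchive.get_item('example-video-item')    # hypothetical identifier
item.upload(['video.mp4', 'video.info.json'],             # hypothetical files
            metadata=dict(mediatype='movies', title='Example video'),
            access_key=access_key, secret_key=secret_key,
            retries=3, verbose=True)
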
def upload(self):
    item = internetarchive.get_item(self.identifier)
    metadata = dict(
        title=self.title,
        collection=self.collection,
        mediatype='software',
        subject=self.subject,
        description=self.description,
    )
    logger.info('Begin upload %s %s.', self.identifier, self.filenames)
    item.upload(self.filenames, metadata=metadata,
                verify=True, verbose=True,
                access_key=self.access_key, secret_key=self.secret_key,
                retries=10)
    logger.info('Done upload.')

@ia_online(logger = log)
def upload_to_IA(self, library, Id):
"""Upload book to IA with appropriate metadata."""
if self.ia_identifier == None:
item = self.get_valid_identifier()
self.ia_identifier = item.identifier
else:
item = ia.get_item(self.ia_identifier)
metadata = dict(
mediatype = "text",
creator = self.author,
title = re.sub(r"""[!#\n\r|^\\\"~()\[\]:\-/]""",'',self.title)[:330],
publisher = self.publisher,
description = re.sub(r"""[!#\n\r|^\\\"~()\[\]:\-/]""",'',self.description),
source = self.infoLink,
language = self.language,
year = self.year,
date = self.publishedDate,
subject = "bub_upload",
licenseurl = "http://creativecommons.org/publicdomain/mark/1.0/" if self.publicDomain == True else "",
scanner = self.scanner,
sponsor = self.sponser,
uploader = "bub")
metadata['google-id'] = self.Id if self.library == 'gb' else ""
def main(args, root='root'):
    item = internetarchive.get_item('johns_hopkins_costar_dataset')
    path = os.path.expanduser(args['path'])
    dryrun = args['dryrun']
    r = item.download(
        destdir=path,           # the directory to download files to
        ignore_existing=True,   # skip files that already exist locally
        checksum=True,          # skip files based on checksum
        verbose=True,           # print progress to stdout
        retries=100,            # the number of times to retry failed requests
        # set to True to print headers to stdout and exit without downloading
        dryrun=dryrun)
    print(r)