def setUp(self):
    self.resolver = Resolver()
    self.tempdir = mkdtemp()
    self.workspace_dir = join(self.tempdir, 'kant_aufklaerung_1784')
    copytree(assets.path_to('kant_aufklaerung_1784/data'), self.workspace_dir)
    self.workspace = Workspace(self.resolver, directory=self.workspace_dir)
    self.mgr = WorkspaceBackupManager(self.workspace)
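A matching tearDown would normally remove the temporary copy again; a minimal sketch, assuming shutil.rmtree cleanup is acceptable here (not part of the excerpt above):

from shutil import rmtree

def tearDown(self):
    # remove the temporary workspace copy created in setUp
    rmtree(self.tempdir)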
define named groups that can be used in --page-id, --file-id, --mimetype, --url and
--file-grp by referencing the named group 'grp' in the regex as '{{ grp }}'.
\b
Example:
    ocrd workspace bulk-add \\
        --regex '^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$' \\
        --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
        --page-id 'PHYS_{{ pageid }}' \\
        --file-grp "{{ fileGrp }}" \\
        --url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
        path/to/files/*/*.*
"""
log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
try:
    pat = re.compile(regex)
except Exception as e:
    log.error("Invalid regex: %s" % e)
    sys.exit(1)
file_paths = []
for fglob in file_glob:
    file_paths += [Path(x).resolve() for x in glob(fglob)]
for i, file_path in enumerate(file_paths):
    log.info("[%4d/%d] %s" % (i, len(file_paths), file_path))
    # match regex
    m = pat.match(str(file_path))
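To illustrate how the named groups feed the '{{ grp }}' templates, a minimal standalone sketch (the render helper is hypothetical, not the actual ocrd implementation; regex and templates are taken from the example above):

import re

def render(template, groups):
    # replace each '{{ name }}' placeholder with the corresponding named group
    for name, value in groups.items():
        template = template.replace('{{ %s }}' % name, value)
    return template

pat = re.compile(r'^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$')
m = pat.match('/data/OCR-D-IMG/page_0001.tif')
if m:
    print(render('FILE_{{ fileGrp }}_{{ pageid }}', m.groupdict()))  # FILE_OCR-D-IMG_0001
    print(render('PHYS_{{ pageid }}', m.groupdict()))                # PHYS_0001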
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype, 'pageId': page_id, 'force': force, 'ignore': ignore}
    log = getLogger('ocrd.cli.workspace.add')
    log.debug("Adding '%s' (%s)", fname, kwargs)
    if not (fname.startswith('http://') or fname.startswith('https://')):
        if not fname.startswith(ctx.directory):
            if not isabs(fname) and exists(join(ctx.directory, fname)):
                fname = join(ctx.directory, fname)
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
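The nested conditions above decide between remote URLs, workspace-local paths and outside files; a reduced standalone sketch of that decision, independent of the Workspace/Resolver API (function name and example path are hypothetical):

from os.path import isabs, exists, join

def classify_fname(fname, workspace_dir):
    # mirrors the branching above: URL, already inside the workspace,
    # workspace-relative existing file, or an outside file that needs copying
    if fname.startswith(('http://', 'https://')):
        return 'remote URL'
    if fname.startswith(workspace_dir):
        return 'already in workspace'
    if not isabs(fname) and exists(join(workspace_dir, fname)):
        return 'workspace-relative file'
    return 'outside file, copy into workspace'

print(classify_fname('OCR-D-IMG/page_0001.tif', '/data/ws'))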
def set_id(ctx, id): # pylint: disable=redefined-builtin
    """
    Set METS ID.
    If one of the supported identifier mechanisms is used, will set this identifier.
    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    workspace.mets.unique_identifier = id
    workspace.save_mets()
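A usage sketch of the same unique_identifier property from Python (directory and identifier value are hypothetical):

from ocrd import Resolver, Workspace

workspace = Workspace(Resolver(), directory='/path/to/workspace')  # hypothetical path
workspace.mets.unique_identifier = 'urn:example:kant_aufklaerung_1784'  # hypothetical value
workspace.save_mets()
print(workspace.mets.unique_identifier)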
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download):
    """
    Find files.
    (If any ``FILTER`` starts with ``//``, then its remainder
    will be interpreted as a regular expression.)
    """
    modified_mets = False
    ret = list()
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    for f in workspace.mets.find_files(
        ID=file_id,
        fileGrp=file_grp,
        mimetype=mimetype,
        pageId=page_id,
    ):
        if download and not f.local_filename:
            workspace.download_file(f)
            modified_mets = True
        # for the 'pageId' column, record the file ID for now (to be mapped to
        # the physical page ID further below)
        ret.append([f.ID if field == 'pageId' else getattr(f, field) or ''
                    for field in output_field])
    if modified_mets:
        workspace.save_mets()
    if 'pageId' in output_field:
        idx = output_field.index('pageId')
        # collect the file IDs recorded in the 'pageId' column
        fileIds = list(map(lambda fields: fields[idx], ret))
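The ``//`` convention from the docstring (a FILTER starting with ``//`` has its remainder treated as a regular expression) could be realised along these lines; a hedged sketch, not the actual find_files internals:

import re

def make_matcher(filter_value):
    # '//...' means: treat the remainder as a regular expression,
    # anything else is compared literally
    if filter_value.startswith('//'):
        pattern = re.compile(filter_value[2:])
        return lambda value: bool(pattern.fullmatch(value or ''))
    return lambda value: value == filter_value

match_grp = make_matcher('//OCR-D-(IMG|GT-SEG-PAGE)')
print(match_grp('OCR-D-IMG'))       # True
print(match_grp('OCR-D-OCR-TESS'))  # False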
def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).
    (If any ``ID`` starts with ``//``, then its remainder
    will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    for i in id:
        workspace.remove_file(i, force=force, keep_file=keep_file)
    workspace.save_mets()
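The same call can be driven from Python with the ``//`` regex form of the ID described in the docstring; a usage sketch (the ID pattern is hypothetical):

# remove all matching file entries from METS, but keep the files on disk
workspace.remove_file('//FILE_OCR-D-BIN_.*', force=False, keep_file=True)
workspace.save_mets()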
def list_pages(ctx):
    """
    List physical page IDs
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    print("\n".join(workspace.mets.physical_pages))
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files.
    (If any ``FILTER`` starts with ``//``, then its remainder
    will be interpreted as a regular expression.)
    """
    log = getLogger('ocrd.cli.workspace.prune-files')
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    with pushd_popd(workspace.directory):
        for f in workspace.mets.find_files(
            ID=file_id,
            fileGrp=file_grp,
            mimetype=mimetype,
            pageId=page_id,
        ):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
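prune_files relies on pushd_popd so that relative local_filename values are checked against the workspace directory; a standalone equivalent of such a context manager could look as follows (a sketch, not the ocrd_utils implementation):

import os
from contextlib import contextmanager

@contextmanager
def pushd_popd_sketch(newcwd):
    # change into newcwd for the duration of the with-block, then restore
    oldcwd = os.getcwd()
    try:
        os.chdir(newcwd)
        yield
    finally:
        os.chdir(oldcwd)

with pushd_popd_sketch('/tmp'):
    print(os.getcwd())  # /tmp (or its resolved path)
print(os.getcwd())      # back to the previous directory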