Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test():
    """ Main function, called when running file as script

    Expects exactly one command line argument, the file to inspect. Prints
    the result of get_type() for that file, then every element yielded by
    XmlParser.iter_xml() whose depth is below 4, then the non-XML subfiles
    yielded by XmlParser.iter_non_xml() (output is cut off with '...' once
    the index exceeds 100).

    see module doc for more info

    :returns: 2 if the number of command line arguments is wrong, 0 otherwise
    """
    log_helper.enable_logging(False, 'debug')
    try:
        if len(sys.argv) != 2:
            print(u'To test this code, give me a single file as arg')
            return 2
        filename = sys.argv[1]

        # test get_type
        print('Detected type: ' + get_type(filename))

        # test complete parsing
        parser = XmlParser(filename)
        for subfile, elem, depth in parser.iter_xml():
            if depth < 4:
                print(u'{0} {1}{2}'.format(subfile, ' ' * depth,
                                           debug_str(elem)))
        for index, (subfile, content_type, _) in \
                enumerate(parser.iter_non_xml()):
            print(u'Non-XML subfile: {0} of type {1}'
                  .format(subfile, content_type or u'unknown'))
            if index > 100:
                print(u'...')
                break
        return 0
    finally:
        # pair every enable_logging() with end_logging(), including the
        # usage-error return above and any exception raised while parsing
        log_helper.end_logging()
# TODO: option to extract objects to files (false by default)
print('-'*79)
print('File: %r' % filename)
index = 1
# do not throw errors but remember them and try to continue with other streams
err_stream = False
err_dumping = False
did_dump = False
xml_parser = None
if is_zipfile(filename):
log.info('file could be an OOXML file, looking for relationships with '
'external links')
xml_parser = XmlParser(filename)
for relationship, target in find_external_relationships(xml_parser):
did_dump = True
print("Found relationship '%s' with external link %s" % (relationship, target))
# look for ole files inside file (e.g. unzip docx)
# have to finish work on every ole stream inside iteration, since handles
# are closed in find_ole
for ole in find_ole(filename, data, xml_parser):
if ole is None: # no ole file found
continue
for path_parts in ole.listdir():
stream_path = '/'.join(path_parts)
log.debug('Checking stream %r', stream_path)
if path_parts[-1] == '\x01Ole10Native':
stream = None
def get_type(filename):
""" return one of the DOCTYPE_* constants or raise error """
parser = XmlParser(filename)
if parser.is_single_xml():
match = None
with open(filename, 'r') as handle:
match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024))
if not match:
return DOCTYPE_NONE
prog_id = match.groups()[0]
if prog_id == WORD_XML_PROG_ID:
return DOCTYPE_WORD_XML
if prog_id == EXCEL_XML_PROG_ID:
return DOCTYPE_EXCEL_XML
return DOCTYPE_NONE
is_doc = False
is_xls = False
is_ppt = False
try:
if olefile.isOleFile(arg_for_ole):
if is_ppt(arg_for_ole):
log.info('is ppt file: ' + filename)
for ole in find_ole_in_ppt(arg_for_ole):
yield ole
ole = None # is closed in find_ole_in_ppt
# in any case: check for embedded stuff in non-sectored streams
log.info('is ole file: ' + filename)
ole = olefile.OleFileIO(arg_for_ole)
yield ole
elif xml_parser is not None or is_zipfile(arg_for_zip):
# keep compatibility with 3rd-party code that calls this function
# directly without providing an XmlParser instance
if xml_parser is None:
xml_parser = XmlParser(arg_for_zip)
# force iteration so XmlParser.iter_non_xml() returns data
[x for x in xml_parser.iter_xml()]
log.info('is zip file: ' + filename)
# we looped through the XML files before, now we can
# iterate the non-XML files looking for ole objects
for subfile, _, file_handle in xml_parser.iter_non_xml():
try:
head = file_handle.read(len(olefile.MAGIC))
except RuntimeError:
log.error('zip is encrypted: ' + filename)
yield None
continue
if head == olefile.MAGIC:
file_handle.seek(0)