Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def getURIsAndDatetimesInCDXJ(cdxjFilePath=INDEX_FILE):
indexFileContents = getIndexFileContents(cdxjFilePath)
if not indexFileContents:
return 0
lines = indexFileContents.strip().split('\n')
uris = {}
for i, l in enumerate(lines):
if not ipwbUtils.isValidCDXJLine(l):
continue
if ipwbUtils.isCDXJMetadataRecord(l):
continue
cdxjFields = l.split(' ', 2)
uri = unsurt(cdxjFields[0])
datetime = cdxjFields[1]
try:
jsonFields = json.loads(cdxjFields[2])
except Exception as e: # Skip lines w/o JSON block
continue
if uri not in uris:
uris[uri] = []
mementoAsJSON = {
'datetime': datetime,
lines = indexFileContents.strip().split('\n')
if not lines:
return errReturn
mementoInfo = {
'mementoCount': 0,
'htmlCount': 0,
'surtURIs': {},
'oldestDatetime': None,
'newestDatetime': None
}
for i, l in enumerate(lines):
validCDXJLine = ipwbUtils.isValidCDXJLine(l)
metadataRecord = ipwbUtils.isCDXJMetadataRecord(l)
if validCDXJLine and not metadataRecord:
mementoInfo['mementoCount'] += 1
(surtURI, datetime, jsonInLine) = l.split(' ', 2)
if surtURI not in mementoInfo['surtURIs']:
mementoInfo['surtURIs'][surtURI] = 1
else: # Unnecessary to keep count now, maybe useful later
mementoInfo['surtURIs'][surtURI] += 1
j = json.loads(jsonInLine)
# Count only non-redirect HTML pages for htmlCount display
if j['mime_type'] and \
j['mime_type'].lower().startswith('text/html') and \
j['status_code'][0] != '3':
mementoInfo['htmlCount'] += 1