How to use the ipwb.util.isCDXJMetadataRecord function in ipwb

To help you get started, we’ve selected a few examples of ipwb.util.isCDXJMetadataRecord, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: GitHub repository oduwsdl/ipwb, file ipwb/replay.py (view on GitHub, external link)
def getURIsAndDatetimesInCDXJ(cdxjFilePath=INDEX_FILE):
    indexFileContents = getIndexFileContents(cdxjFilePath)

    if not indexFileContents:
        return 0

    lines = indexFileContents.strip().split('\n')

    uris = {}
    for i, l in enumerate(lines):
        if not ipwbUtils.isValidCDXJLine(l):
            continue

        if ipwbUtils.isCDXJMetadataRecord(l):
            continue

        cdxjFields = l.split(' ', 2)
        uri = unsurt(cdxjFields[0])
        datetime = cdxjFields[1]

        try:
            jsonFields = json.loads(cdxjFields[2])
        except Exception as e:  # Skip lines w/o JSON block
            continue

        if uri not in uris:
            uris[uri] = []

        mementoAsJSON = {
            'datetime': datetime,
Source: GitHub repository oduwsdl/ipwb, file ipwb/replay.py (view on GitHub, external link); a separate excerpt taken from inside a function body
lines = indexFileContents.strip().split('\n')

    if not lines:
        return errReturn

    mementoInfo = {
        'mementoCount': 0,
        'htmlCount': 0,
        'surtURIs': {},
        'oldestDatetime': None,
        'newestDatetime': None
    }

    for i, l in enumerate(lines):
        validCDXJLine = ipwbUtils.isValidCDXJLine(l)
        metadataRecord = ipwbUtils.isCDXJMetadataRecord(l)
        if validCDXJLine and not metadataRecord:
            mementoInfo['mementoCount'] += 1
            (surtURI, datetime, jsonInLine) = l.split(' ', 2)
            if surtURI not in mementoInfo['surtURIs']:
                mementoInfo['surtURIs'][surtURI] = 1
            else:  # Unnecessary to keep count now, maybe useful later
                mementoInfo['surtURIs'][surtURI] += 1

            j = json.loads(jsonInLine)

            # Count only non-redirect HTML pages for htmlCount display
            if j['mime_type'] and \
                    j['mime_type'].lower().startswith('text/html') and \
                    j['status_code'][0] != '3':
                mementoInfo['htmlCount'] += 1