How to use the pywb.utils.canonicalize.unsurt function in pywb

To help you get started, we’ve selected a few pywb examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github oduwsdl / ipwb / ipwb / replay.py View on Github external
for i, line in enumerate(cdxjLines):
        (surtURI, datetime, json) = line.split(' ', 2)
        dtRFC1123 = ipwbConfig.datetimeToRFC1123(datetime)
        firstLastStr = ''

        if len(cdxjLines) > 1:
            if i == 0:
                firstLastStr = 'first '
            elif i == len(cdxjLines) - 1:
                firstLastStr = 'last '
        elif len(cdxjLines) == 1:
            firstLastStr = 'first last '

        tmData += '<{0}{1}/{2}>; rel="{3}memento"; datetime="{4}",\n'.format(
                hostAndPort, datetime, unsurt(surtURI),
                firstLastStr, dtRFC1123)
    tmData = tmData[0:-2]  # Trim final , and LF
    return tmData
github oduwsdl / ipwb / ipwb / replay.py View on Github external
def generateCDXJTimeMapFromCDXJLines(cdxjLines, original, tmself):
    tmurl = getProxiedURIT(tmself)
    if app.proxy is not None:
        tmself = urlunsplit(tmurl)

    # unsurted URI will never have a scheme, add one
    originalURI = 'http://{0}'.format(unsurt(original))

    tmData = '!context ["http://tools.ietf.org/html/rfc7089"]\n'
    tmData += '!id {{"uri": "{0}"}}\n'.format(tmself)
    tmData += '!keys ["memento_datetime_YYYYMMDDhhmmss"]\n'
    tmData += '!meta {{"original_uri": "{0}"}}\n'.format(originalURI)

    linkTMURI = tmself.replace('/timemap/cdxj/', '/timemap/link/')
    tmData += ('!meta {{"timemap_uri": {{'
               '"link_format": "{0}", '
               '"cdxj_format": "{1}"'
               '}}}}\n').format(linkTMURI, tmself)
    hostAndPort = tmself[0:tmself.index('timemap/')]

    for i, line in enumerate(cdxjLines):
        (surtURI, datetime, json) = line.split(' ', 2)
        dtRFC1123 = ipwbConfig.datetimeToRFC1123(datetime)
github oduwsdl / ipwb / ipwb / replay.py View on Github external
if not indexFileContents:
        return 0

    lines = indexFileContents.strip().split('\n')

    uris = {}
    for i, l in enumerate(lines):
        if not ipwbConfig.isValidCDXJLine(l):
            continue

        if ipwbConfig.isCDXJMetadataRecord(l):
            continue

        cdxjFields = l.split(' ', 2)
        uri = unsurt(cdxjFields[0])
        datetime = cdxjFields[1]

        try:
            jsonFields = json.loads(cdxjFields[2])
        except Exception as e:  # Skip lines w/o JSON block
            continue

        if uri not in uris:
            uris[uri] = {}
            uris[uri]['datetimes'] = []
        uris[uri]['datetimes'].append(datetime)
        uris[uri]['mime'] = jsonFields['mime_type']

        pass
    return json.dumps(uris)
github oduwsdl / ipwb / ipwb / replay.py View on Github external
def generateLinkTimeMapFromCDXJLines(cdxjLines, original, tmself):
    tmurl = getProxiedURIT(tmself)
    if app.proxy is not None:
        tmself = urlunsplit(tmurl)

    # Extract and trim for host:port prepending
    tmurl[2] = ''  # Clear TM path
    hostAndPort = urlunsplit(tmurl) + '/'

    # unsurted URI will never have a scheme, add one
    originalURI = 'http://{0}'.format(unsurt(original))

    tmData = '<{0}>; rel="original",\n'.format(originalURI)
    tmData += '<{0}>; rel="self timemap"; '.format(tmself)
    tmData += 'type="application/link-format",\n'

    cdxjTMURI = tmself.replace('/timemap/link/', '/timemap/cdxj/')
    tmData += '<{0}>; rel="timemap"; '.format(cdxjTMURI)
    tmData += 'type="application/cdxj+ors",\n'

    for i, line in enumerate(cdxjLines):
        (surtURI, datetime, json) = line.split(' ', 2)
        dtRFC1123 = ipwbConfig.datetimeToRFC1123(datetime)
        firstLastStr = ''

        if len(cdxjLines) > 1:
            if i == 0:
github oduwsdl / ipwb / ipwb / replay.py View on Github external
respString = ('{0} not found :(' +
                      ' <a href="http://{1}:{2}">Go home</a>').format(
            path, IPWBREPLAY_IP, IPWBREPLAY_PORT)
        return Response(respString)
    if cdxjLine is None:  # Resource not found in archives
        msg = '<h1>ERROR 404</h1>'
        msg += 'No capture found for {0} at {1}.'.format(path, datetime)
        linesWithSameURIR = getCDXJLinesWithURIR(path, None)
        print('CDXJ lines with URI-R at {0}'.format(path))
        print(linesWithSameURIR)

        # TODO: Use closest instead of conditioning on single entry
        #  temporary fix for core functionality in #225
        if len(linesWithSameURIR) == 1:
            fields = linesWithSameURIR[0].split(' ', 2)
            redirectURI = '/{1}/{0}'.format(unsurt(fields[0]), fields[1])
            return redirect(redirectURI, code=302)

        urir = ''
        if linesWithSameURIR:
            msg += '<p>{0} capture(s) available:</p><ul>'.format(
                  len(linesWithSameURIR))
            for line in linesWithSameURIR:
                fields = line.split(' ', 2)
                urir = unsurt(fields[0])
                msg += ('<li><a href="/{1}/{0}">{0} at {1}</a></li>'
                        .format(urir, fields[1]))
            msg += '</ul>'

        msg += '<p>TimeMaps: '
        msg += '<a href="/timemap/link/{0}">Link</a> '.format(urir)
        msg += '<a href="/timemap/cdxj/{0}">CDXJ</a> '.format(urir)
github oduwsdl / ipwb / ipwb / replay.py View on Github external
def showMementosForURIRs(urir):
    """Show the captures recorded in the replay index for a URI-R.

    The URI is first passed through ``getCompleteURI``; when
    ``ipwbConfig.isLocalHosty`` accepts it, only the part after the
    fourth ``/`` is kept. The matching CDXJ lines are then fetched with
    ``getCDXJLinesWithURIR``.

    Returns a 302 redirect to ``/<datetime>/<uri>`` when exactly one
    line matches. Otherwise returns a ``Response`` whose body is an HTML
    list with one link per capture, or an empty body when nothing
    matches.
    """
    urir = getCompleteURI(urir)
    if ipwbConfig.isLocalHosty(urir):
        # Keep only what follows the fourth '/' of the URI
        urir = urir.split('/', 4)[4]

    # NOTE(review): the SURT form is computed but never read below
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbConfig.getIPWBReplayIndexPath()

    lookupNotice = 'Getting CDXJ Lines with the URI-R {0} from {1}'
    print(lookupNotice.format(urir, indexPath))
    matches = getCDXJLinesWithURIR(urir, indexPath)

    # A lone capture needs no listing: send the client straight to it
    if len(matches) == 1:
        onlyFields = matches[0].split(' ', 2)
        return redirect(
            '/{1}/{0}'.format(unsurt(onlyFields[0]), onlyFields[1]),
            code=302)

    if not matches:
        return Response('')

    itemTemplate = '<li><a href="/{1}/{0}">{0} at {2}</a></li>'
    parts = ['<p>{0} capture(s) available:</p><ul>'.format(len(matches))]
    for cdxjLine in matches:
        lineFields = cdxjLine.split(' ', 2)
        dt14 = lineFields[1]
        readableDatetime = ipwbConfig.datetimeToRFC1123(dt14)
        parts.append(itemTemplate.format(
            unsurt(lineFields[0]), dt14, readableDatetime))
    parts.append('</ul>')
    return Response(''.join(parts))
github oduwsdl / ipwb / ipwb / replay.py View on Github external
if ipwbConfig.isLocalHosty(urir):
        urir = urir.split('/', 4)[4]
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbConfig.getIPWBReplayIndexPath()

    print('Getting CDXJ Lines with the URI-R {0} from {1}'
          .format(urir, indexPath))
    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    closestLine = getCDXJLineClosestTo(datetime, cdxjLinesWithURIR)
    if closestLine is None:
        msg = '<h1>ERROR 404</h1>'
        msg += 'No capture found for {0} at {1}.'.format(urir, datetime)
        return Response(msg, status=404)

    uri = unsurt(closestLine.split(' ')[0])
    newDatetime = closestLine.split(' ')[1]
    return show_uri(uri, newDatetime)
github webrecorder / pywb / pywb / cdx / cdxdomainspecific.py View on Github external
def unsurt(self):
        """
        urlkey is assumed to be in surt format by default
        In the case of non-surt format, this method is called
        to desurt any urls

        Rewrites ``self.url_prefix``, ``self.regex`` and ``self.replace``
        in place. ``url_prefix`` always ends up as a list; ``regex`` and
        ``replace`` are left untouched when they are falsy.
        """
        # Build a real list: on Python 3 ``map`` returns a one-shot
        # iterator, so the prefixes would be gone after the first pass
        # over them. The ``unsurt`` called here is the module-level
        # helper, not this method.
        self.url_prefix = [unsurt(prefix) for prefix in self.url_prefix]
        if self.regex:
            # Recompile from the desurted pattern text
            self.regex = re.compile(unsurt(self.regex.pattern))

        if self.replace:
            self.replace = unsurt(self.replace)
github webrecorder / pywb / pywb / cdx / cdxdomainspecific.py View on Github external
def unsurt(self):
        """
        urlkey is assumed to be in surt format by default
        In the case of non-surt format, this method is called
        to desurt any urls

        Rewrites ``self.url_prefix``, ``self.regex`` and ``self.replace``
        in place. ``url_prefix`` always ends up as a list; ``regex`` and
        ``replace`` are left untouched when they are falsy.
        """
        # Build a real list: on Python 3 ``map`` returns a one-shot
        # iterator, so the prefixes would be gone after the first pass
        # over them. The ``unsurt`` called here is the module-level
        # helper, not this method.
        self.url_prefix = [unsurt(prefix) for prefix in self.url_prefix]
        if self.regex:
            # Recompile from the desurted pattern text
            self.regex = re.compile(unsurt(self.regex.pattern))

        if self.replace:
            self.replace = unsurt(self.replace)