How to use the pywb.rewrite.wburl.WbUrl class in pywb

To help you get started, we’ve selected a few pywb examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github webrecorder / pywb / pywb / rewrite / wburl.py View on Github external
def to_uri(url):
        """ Converts a url to an ascii %-encoded form
        where:
        - scheme is ascii,
        - host is punycode,
        - and remainder is %-encoded
        Not using urlsplit to also decode partially encoded
        scheme urls
        """
        parts = WbUrl.FIRST_PATH.split(url, 1)

        sep = url[len(parts[0])] if len(parts) > 1 else None

        scheme_dom = unquote_plus(parts[0])

        if six.PY2 and isinstance(scheme_dom, six.binary_type):
            if scheme_dom == parts[0]:
                url = url.replace('#', '%23')
                return url

            scheme_dom = scheme_dom.decode('utf-8', 'ignore')

        scheme_dom = scheme_dom.rsplit('/', 1)
        domain = scheme_dom[-1]

        try:
github harvard-lil / perma / services / docker / webrecorder / contentcontroller.py View on Github external
def check_remote_archive(self, wb_url, mode, wb_url_obj=None):
        """Find a remote archive for the requested url and build its extract target.

        :param wb_url: wburl string; parsed with ``WbUrl`` only when
            ``wb_url_obj`` is missing or falsy.
        :param mode: accepted for call compatibility; the incoming value is
            never read and is replaced by the ``'extract:<id>'`` mode.
        :param wb_url_obj: optional, already-parsed form of ``wb_url``.
        :return: ``None`` when ``self.wam_loader.find_archive_for_url`` yields
            a falsy result, otherwise a ``(mode, new_url)`` tuple where
            ``new_url`` is the located url re-serialised with the mod of the
            original request.
        """
        parsed = wb_url_obj or WbUrl(wb_url)

        match = self.wam_loader.find_archive_for_url(parsed.url)
        if not match:
            return

        # The lookup result is a 3-tuple; its first element is not needed here.
        _, archive_url, archive_id = match

        extract_mode = 'extract:' + archive_id
        rewritten = WbUrl(archive_url).to_str(mod=parsed.mod)

        return extract_mode, rewritten
github harvard-lil / perma / perma_web / warc_server / pywb_config.py View on Github external
# Store the line for use in PermaCDXSource
        # so we don't need to hit the DB again
        wbrequest.custom_params['lines'] = cdx_lines
        wbrequest.custom_params['guid'] = guid

        # Adds the Memento-Datetime header
        # Normally this is done in MementoReqMixin#_parse_extra
        # but we need the GUID to make the DB query and that
        # isn't parsed from the url until this point
        wbrequest.wb_url.set_replay_timestamp(CDXLine(raw=cdx_lines[0]).timestamp)


# prevent mod getting added to rewritten urls
# timestamp already disabled via 'redir_to_exact' flag
class PermaUrl(WbUrl):
    """``WbUrl`` subclass that always serialises with empty mod and timestamp overrides."""

    def to_str(self, **overrides):
        """Delegate to ``WbUrl.to_str`` with ``mod`` and ``timestamp`` forced to ``''``.

        Any values the caller supplies for those two keys are discarded;
        every other override is passed through unchanged.
        """
        overrides.update(mod='', timestamp='')
        return super(PermaUrl, self).to_str(**overrides)


class PermaMementoResponse(MementoResponse):
    def _init_derived(self, params):
        """
            Override MementoResponse to set cache time based on type of response (single memento or timegate).
        """
        # is_timegate logic via super _init_derived function:
        wbrequest = params.get('wbrequest')
        if not wbrequest or not wbrequest.wb_url:
            return
        is_top_frame = wbrequest.options.get('is_top_frame', False)
github harvard-lil / perma / services / docker / webrecorder / contentcontroller.py View on Github external
self.redir_set_session()
                else:
                    self._raise_error(404, 'no_such_collection')

            if access != 'public':
                frontend_cache_header = ('Cache-Control', 'private')

            if type == 'replay':
                if not recording:
                    self._raise_error(404, 'no_such_recording')

        request.environ['SCRIPT_NAME'] = quote(request.environ['SCRIPT_NAME'], safe='/:')

        wb_url = self._context_massage(wb_url)

        wb_url_obj = WbUrl(wb_url)

        is_top_frame = (wb_url_obj.mod == self.frame_mod or wb_url_obj.mod.startswith('$br:'))

        if type == 'record' and is_top_frame:
            result = self.check_remote_archive(wb_url, type, wb_url_obj)
            if result:
                mode, wb_url = result
                new_url = '/{user}/{coll}/{rec}/{mode}/{url}'.format(user=user,
                                                                     coll=coll_name,
                                                                     rec=rec_name,
                                                                     mode=mode,
                                                                     url=wb_url)
                return self.redirect(new_url)

        elif type == 'replay-coll' and not is_top_frame:
            collection.sync_coll_index(exists=False, do_async=False)
github webrecorder / pywb / pywb / rewrite / url_rewriter.py View on Github external
def rebase_rewriter(self, base_url):
        """Return a rewriter rebased onto ``base_url``.

        A ``base_url`` that does not start with one of ``self.PROTOCOLS`` is
        first joined against the current ``self.wburl.url`` using
        ``self.urljoin``. The current wburl is then re-serialised with that
        url, re-parsed into a new ``WbUrl``, and handed together with
        ``self.prefix`` to ``self._create_rebased_rewriter``.
        """
        has_protocol = base_url.startswith(self.PROTOCOLS)
        if not has_protocol:
            base_url = self.urljoin(self.wburl.url, base_url)

        rebased_wburl = WbUrl(self.wburl.to_str(url=base_url))
        return self._create_rebased_rewriter(rebased_wburl, self.prefix)
github webrecorder / pywb / pywb / apps / rewriterapp.py View on Github external
def unrewrite_referrer(self, environ, full_prefix):
        """Replace a rewritten ``HTTP_REFERER`` in ``environ`` with its original url.

        :param environ: request environment dict; ``HTTP_REFERER`` is updated
            in place when the referrer can be un-rewritten.
        :param full_prefix: prefix a rewritten referrer is expected to start with.
        :return: ``True`` when ``environ['HTTP_REFERER']`` was replaced,
            ``False`` when the referrer is absent/empty, does not start with
            ``full_prefix``, or has nothing left after the prefix.
        """
        referrer = environ.get('HTTP_REFERER')
        if not referrer or not referrer.startswith(full_prefix):
            return False

        # What remains after the prefix is parsed as a wburl string.
        remainder = referrer[len(full_prefix):]
        if not remainder:
            return False

        environ['HTTP_REFERER'] = WbUrl(remainder).url
        return True
github webrecorder / pywb / pywb / framework / memento.py View on Github external
def make_timemap_memento_link(cdx, prefix, datetime=None,
                             rel='memento', end=',\n', mod=''):
    """Format one timemap link entry for the capture described by ``cdx``.

    :param cdx: mapping providing the ``'url'`` and ``'timestamp'`` of the capture.
    :param prefix: string prepended to the generated replay wburl string.
    :param datetime: value for the ``datetime`` attribute; when falsy it is
        derived from ``cdx['timestamp']`` via ``timestamp_to_http_date``.
    :param rel: value for the ``rel`` attribute.
    :param end: text appended after the entry (part of the format template).
    :param mod: modifier passed to ``WbUrl.to_wburl_str``.
    :return: ``'<url>; rel="..."; datetime="..."'`` followed by ``end``.
    """
    # ``end`` is joined to the template before formatting, as it always was.
    template = '<{0}>; rel="{1}"; datetime="{2}"' + end

    replay_str = WbUrl.to_wburl_str(url=cdx['url'],
                                    mod=mod,
                                    timestamp=cdx['timestamp'],
                                    type=WbUrl.REPLAY)
    link_target = prefix + replay_str

    when = datetime if datetime else timestamp_to_http_date(cdx['timestamp'])

    return template.format(link_target, rel, when)
github webrecorder / pywb / pywb / apps / rewriterapp.py View on Github external
def render_content(self, wb_url, kwargs, environ):
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

        if proto:
            environ['wsgi.url_scheme'] = proto

        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
github webrecorder / pywb / pywb / urlrewrite / platformhandler.py View on Github external
def render_content(self, wbrequest):
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url

        urlkey = canonicalize(wbrequest.wb_url.url)
        url = wbrequest.wb_url.url

        inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        headers = {'Content-Length': len(req_data),
                   'Content-Type': 'application/request'}

        if wbrequest.wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wbrequest.wb_url.timestamp