How to use the ural.patterns.QUERY_VALUE_IN_URL_TEMPLATE function in ural

To help you get started, we’ve selected a few ural examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github medialab / ural / ural / google.py View on Github external
# =============================================================================
# Ural Google-related heuristic functions
# =============================================================================
#
# Collection of functions related to Google urls.
#
import re
from ural.utils import safe_urlsplit, unquote
from ural.patterns import QUERY_VALUE_IN_URL_TEMPLATE

# Matches, case-insensitively, the literal "amp" followed by an
# underscore-prefixed suffix (captured in group 1) and an optional "=".
AMP_QUERY_RE = re.compile(r'amp(_.+)=?', re.I)
# Matches, case-insensitively, one of:
#   - ".amp" immediately before a final ".html" (the ".html" is not consumed),
#   - ".amp" with an optional slash at the end of the string,
#   - "amp" with an optional slash at the end of the string, when directly
#     preceded by "/".
AMP_SUFFIXES_RE = re.compile(r'(?:\.amp(?=\.html$)|\.amp/?$|(?<=/)amp/?$)', re.I)

# Built by substituting the parameter name `url` into
# QUERY_VALUE_IN_URL_TEMPLATE.
# NOTE(review): the template is defined in ural.patterns and is not shown
# here; presumably it captures the value of the named query parameter -- confirm.
URL_EXTRACT_RE = re.compile(QUERY_VALUE_IN_URL_TEMPLATE % r'url')


def is_amp_url(url):
    splitted = safe_urlsplit(url)

    if splitted.hostname.endswith('.ampproject.org'):
        return True

    if splitted.hostname.startswith('amp-'):
        return True

    if splitted.hostname.startswith('amp.'):
        return True

    if '/amp/' in splitted.path:
        return True
github medialab / ural / ural / infer_redirection.py View on Github external
# =============================================================================
# Ural Redirection Inference Function
# =============================================================================
#
# A lot of urls contain an obvious hint that they will in fact trigger
# a redirection. This module gathers routines aimed at discovering
# those redirections without even firing a HTTP request.
#
import re

from ural.patterns import QUERY_VALUE_IN_URL_TEMPLATE
from ural.utils import unquote, urljoin

# Case-insensitive pattern built by substituting an alternation of parameter
# names into QUERY_VALUE_IN_URL_TEMPLATE: "redirect", "redirect_to", "url",
# "l" or "u".
# NOTE(review): the template is defined in ural.patterns and is not shown
# here; presumably it captures the value of the named query parameter -- confirm.
OBVIOUS_REDIRECTS_RE = re.compile(QUERY_VALUE_IN_URL_TEMPLATE % r'(?:redirect(?:_to)?|url|[lu])', re.I)
# Case-insensitive pattern matching one of these url fragments:
#   - ".ampproject.org/c/" or ".ampproject.org/v/", optionally followed by "s/",
#   - "bc.marfeelcache.com/amp/",
#   - "bc.marfeel.com/".
REDIRECTION_DOMAINS_RE = re.compile(r'(?:\.ampproject\.org/[cv]/(?:s/)?|bc\.marfeelcache\.com/amp/|bc\.marfeel\.com/)', re.I)


def infer_redirection(url):
    """
    Function returning the url that the given url will redirect to. This is done
    by finding obvious hints in the GET parameters that the given url is in
    fact a redirection.

    Args:
        url (string): Target url.

    Returns:
        string: Redirected url or the original url if nothing was found.
    """
github medialab / ural / ural / facebook.py View on Github external
urlunsplit,
    safe_urlsplit,
    SplitResult
)

# Matches a run of eight or more digits 0-9 (the pattern itself is unanchored).
NUMERIC_ID_RE = re.compile(r'[0-9]{8,}')

# Scheme and host of the main Facebook site, without a trailing slash.
BASE_FACEBOOK_URL = 'https://www.facebook.com'

# Matches a string made only of digits. Note that `$` also matches just
# before a trailing newline.
FACEBOOK_ID_RE = re.compile(r'^\d+$')
# Matches a string made of digits, one underscore, then digits.
FACEBOOK_FULL_ID_RE = re.compile(r'^\d+_\d+$')
# Matches, case-insensitively, a string ending with "facebook." followed by a
# single dot-free label, or ending with "fb.me".
FACEBOOK_DOMAIN_RE = re.compile(r'(?:facebook\.[^.]+$|fb\.me$)', re.I)
# Case-insensitive pattern built by substituting into DOMAIN_TEMPLATE a host
# expression allowing any number of leading dot-terminated labels followed by
# "facebook.<label>" or "fb.me".
# NOTE(review): DOMAIN_TEMPLATE is defined outside this excerpt; what it adds
# around the host expression cannot be checked here -- confirm.
FACEBOOK_URL_RE = re.compile(DOMAIN_TEMPLATE % r'(?:[^.]+\.)*(?:facebook\.[^.]+|fb\.me)', re.I)
# Matches, case-insensitively and at the start of the string, an optional
# dot-terminated label (captured in group 1) followed by "facebook.".
MOBILE_REPLACE_RE = re.compile(r'^([^.]+\.)?facebook\.', re.I)

# Built by substituting the parameter name `u` into
# QUERY_VALUE_IN_URL_TEMPLATE.
# NOTE(review): the template is defined in ural.patterns and is not shown
# here; presumably it captures the value of the named query parameter -- confirm.
URL_EXTRACT_RE = re.compile(QUERY_VALUE_IN_URL_TEMPLATE % r'u')


def is_facebook_id(value):
    """
    Function returning whether the given value matches FACEBOOK_ID_RE.

    Args:
        value (str): Value to test.

    Returns:
        bool: True if the pattern is found in the value, False otherwise.
    """
    found = FACEBOOK_ID_RE.search(value)

    return found is not None


def is_facebook_full_id(value):
    """
    Function returning whether the given value matches FACEBOOK_FULL_ID_RE.

    Args:
        value (str): Value to test.

    Returns:
        bool: True if the pattern is found in the value, False otherwise.
    """
    found = FACEBOOK_FULL_ID_RE.search(value)

    return found is not None


def is_facebook_url(url):
    """
    Function returning whether the given url is a valid Facebook url.

    Args:
        url (str): Url to test.