How to use the ural.utils.urlsplit function in ural

To help you get started, we’ve selected a few ural examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github medialab / ural / ural / normalize_url.py View on Github external
def get_normalized_hostname(url, normalize_amp=True, strip_lang_subdomains=False,
                            infer_redirection=True):
    """
    Function extracting the hostname of the given url and normalizing it.

    Args:
        url (str or SplitResult): Target url, or an already split url.
        normalize_amp (bool, optional): Whether to use the AMP-aware
            subdomain pattern and to drop a leading 'amp-' from the hostname.
            Defaults to True.
        strip_lang_subdomains (bool, optional): Not used in the visible part
            of this function. Defaults to False.
        infer_redirection (bool, optional): Whether to pass the url through
            `resolve` before parsing it. Defaults to True.

    Returns:
        None when the url cannot be split or has no hostname.
        NOTE(review): the final return is not visible in this excerpt.
    """

    if infer_redirection:
        url = resolve(url)

    # An already split url is used as-is, anything else is parsed
    if isinstance(url, SplitResult):
        splitted = url
    else:
        try:
            splitted = urlsplit(ensure_protocol(url))
        except ValueError:
            return None

    if not splitted.hostname:
        return None

    hostname = splitted.hostname.lower()

    # Choosing which subdomain pattern to strip, depending on AMP handling
    pattern = IRRELEVANT_SUBDOMAIN_AMP_RE if normalize_amp else IRRELEVANT_SUBDOMAIN_RE

    # Removing whatever the chosen pattern matches from the hostname
    hostname = pattern.sub('', hostname)

    # Dropping a leading 'amp-' (4 characters)
    if normalize_amp and hostname.startswith('amp-'):
        hostname = hostname[4:]

    hostname = decode_punycode(hostname)
    # NOTE(review): the excerpt stops here; later steps (e.g. any use of
    # `strip_lang_subdomains`) and the return statement are not visible.
github medialab / ural / ural / is_shortened_url.py View on Github external
def is_shortened_url(url):
    """
    Function returning whether the hostname of the given url has a match
    in TRIE (presumably the registry of known url shortener domains).

    Args:
        url (str): Target url.

    Returns:
        bool: True if the reversed hostname labels have a match in TRIE.
            False otherwise, including when the url cannot be parsed or
            has no hostname.
    """
    # Parsing can raise ValueError on malformed urls (e.g. invalid IPv6
    # netloc): such urls cannot be shortened urls.
    try:
        hostname = urlsplit(ensure_protocol(url)).hostname
    except ValueError:
        return False

    # `hostname` is None when the url has no netloc; splitting it would
    # raise AttributeError.
    if not hostname:
        return False

    # The trie is queried with labels from the tld down to the subdomains
    return bool(TRIE.longest(reversed(hostname.split('.'))))
github medialab / ural / ural / lru / stems.py View on Github external
def lru_stems(url, tld_aware=False):
    """
    Function splitting the given url into its lru stems, i.e. its parts
    listed in hierarchical order.

    Args:
        url (str): Target URL as a string.
        tld_aware (bool, optional): Forwarded as-is to
            `lru_stems_from_parsed_url`. Defaults to False.

    Returns:
        list: The lru, with a prefix identifying the type of each part.
    """

    # A protocol is ensured first, then the url is split
    parsed = urlsplit(ensure_protocol(url))

    return lru_stems_from_parsed_url(parsed, tld_aware=tld_aware)
github medialab / ural / ural / get_domain_name.py View on Github external
def get_hostname(url):
    """
    Function returning the hostname of the given url.

    Args:
        url (str): Target url.

    Returns:
        The parsed hostname, or None if the url has no hostname or if
        a ValueError is raised while handling it.
    """
    try:
        parsed = urlsplit(ensure_protocol(url))
        host = parsed.hostname
    except ValueError:
        return None

    # Falsy hostnames (e.g. an empty string) are reported as None
    return host if host else None
github medialab / ural / ural / youtube.py View on Github external
# Inferring redirection
    url = infer_redirection(url)

    # Continuation urls
    m = NEXT_V_RE.search(url) or NESTED_NEXT_V_RE.search(url)

    if m:
        return YoutubeVideo(id=m.group(1))

    # Parsing
    if isinstance(url, SplitResult):
        parsed = url
    else:
        url = ensure_protocol(url)
        parsed = urlsplit(url)

    if not is_youtube_url(parsed):
        return

    _, _, path, query, fragment = parsed

    # youtu.be
    if parsed.hostname.endswith('youtu.be'):

        if path.count('/') > 0:
            v = urlpathsplit(path)[0]

            if fix_common_mistakes:
                v = v[:11]

            if not is_youtube_video_id(v):
github medialab / ural / ural / facebook.py View on Github external
def convert_facebook_url_to_mobile(url):
    """
    Function parsing the given facebook url and returning the same but for
    the mobile website.

    Args:
        url (str): Target facebook url.

    Raises:
        Exception: If 'facebook' is not found in the netloc of the url.
    """
    safe_url = ensure_protocol(url)

    # True when `ensure_protocol` returned the url unchanged
    has_protocol = safe_url == url

    scheme, netloc, path, query, fragment = urlsplit(safe_url)

    if 'facebook' not in netloc:
        raise Exception('ural.facebook.convert_facebook_url_to_mobile: %s is not a facebook url' % url)

    # Rewriting the part of the netloc matched by MOBILE_REPLACE_RE
    netloc = re.sub(MOBILE_REPLACE_RE, 'm.facebook.', netloc)

    result = (
        scheme,
        netloc,
        path,
        query,
        fragment
    )

    # Reassembling the url from its parts
    result = urlunsplit(result)
    # NOTE(review): the excerpt stops here; how `has_protocol` and `result`
    # are used afterwards, and the return statement, are not visible.
github medialab / ural / ural / lru / stems.py View on Github external
def lru_stems(url, tld_aware=False):
    """
    Function computing the lru stems of the given url, i.e. its parts
    in hierarchical order.

    Args:
        url (str): Target URL as a string.
        tld_aware (bool, optional): Passed through unchanged to
            `lru_stems_from_parsed_url`. Defaults to False.

    Returns:
        list: The lru, with a prefix identifying the type of each part.
    """

    # Making sure the url has a protocol before splitting it
    splitted = urlsplit(ensure_protocol(url))

    return lru_stems_from_parsed_url(splitted, tld_aware=tld_aware)
github medialab / ural / ural / normalize_url.py View on Github external
if infer_redirection:
        url = resolve(url)

    if isinstance(url, SplitResult):
        has_protocol = bool(splitted.scheme)
        splitted = url
    else:
        has_protocol = PROTOCOL_RE.match(url)

        # Ensuring scheme so parsing works correctly
        if not has_protocol:
            url = 'http://' + url

        # Parsing
        try:
            splitted = urlsplit(url)
        except ValueError:
            return original_url_arg

    scheme, netloc, path, query, fragment = splitted

    # Fixing common mistakes
    if fix_common_mistakes:
        if query:
            query = re.sub(MISTAKES_RE, '&', query)

    # Handling punycode
    netloc = decode_punycode(netloc)

    # Dropping :80 & :443
    if netloc.endswith(':80'):
        netloc = netloc[:-3]