How to use the tldextract.tldextract.TLDExtract function in tldextract

To help you get started, we’ve selected a few tldextract examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github john-kurkowski / tldextract / tldextract / cli.py View on Github external
prog='tldextract',
        description='Parse hostname from a url or fqdn')

    parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)  # pylint: disable=no-member
    parser.add_argument('input', metavar='fqdn|url',
                        type=unicode, nargs='*', help='fqdn or url')

    parser.add_argument('-u', '--update', default=False, action='store_true',
                        help='force fetch the latest TLD definitions')
    parser.add_argument('-c', '--cache_file',
                        help='use an alternate TLD definition file')
    parser.add_argument('-p', '--private_domains', default=False, action='store_true',
                        help='Include private domains')

    args = parser.parse_args()
    tld_extract = TLDExtract(include_psl_private_domains=args.private_domains)

    if args.cache_file:
        tld_extract.cache_file = args.cache_file

    if args.update:
        tld_extract.update(True)
    elif not args.input:
        parser.print_usage()
        sys.exit(1)
        return

    for i in args.input:
        print(' '.join(tld_extract(i)))  # pylint: disable=superfluous-parens
github robbielynch / RoblySearch / tldextract / tldextract.py View on Github external
if sys.version_info < (3,):
                    sys.stderr.write(line.encode('utf-8') + "\n")
                else:
                    sys.stderr.write(line + "\n")

        if self.cache_file:
            try:
                with open(self.cache_file, 'wb') as f:
                    pickle.dump(tlds, f)
            except IOError as e:
                LOG.warn("unable to cache TLDs in file %s: %s", self.cache_file, e)

        self._extractor = _PublicSuffixListTLDExtractor(tlds)
        return self._extractor

TLD_EXTRACTOR = TLDExtract()

@wraps(TLD_EXTRACTOR.__call__)
def extract(url):
    return TLD_EXTRACTOR(url)

@wraps(TLD_EXTRACTOR.update)
def update(*args, **kwargs):
    return TLD_EXTRACTOR.update(*args, **kwargs)

def get_tlds_from_raw_suffix_list_data(suffix_list_source):
    tld_finder = re.compile(r'^(?P[.*!]*\w[\S]*)', re.UNICODE | re.MULTILINE)
    tld_iter = (m.group('tld') for m in tld_finder.finditer(suffix_list_source))
    return frozenset(tld_iter)

def fetch_file(urls):
    """ Decode the first successfully fetched URL, from UTF-8 encoding to
github john-kurkowski / tldextract / tldextract / tldextract.py View on Github external
LOG.debug('computed TLD diff:\n' + '\n'.join(difflib.unified_diff(
                snapshot,
                new,
                fromfile=".tld_set_snapshot",
                tofile=self.cache_file
            )))

        if self.cache_file:
            try:
                with open(self.cache_file, 'w') as cache_file:
                    json.dump(tlds, cache_file)
            except IOError as ioe:
                LOG.warning("unable to cache TLDs in file %s: %s", self.cache_file, ioe)


TLD_EXTRACTOR = TLDExtract()


@wraps(TLD_EXTRACTOR.__call__)
def extract(url):
    return TLD_EXTRACTOR(url)


@wraps(TLD_EXTRACTOR.update)
def update(*args, **kwargs):
    return TLD_EXTRACTOR.update(*args, **kwargs)


def get_tlds_from_raw_suffix_list_data(suffix_list_source, include_psl_private_domains=False):
    if include_psl_private_domains:
        text = suffix_list_source
    else:

tldextract

Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.

BSD-3-Clause
Latest version published 17 days ago

Package Health Score

86 / 100
Full package analysis

Similar packages