How to use the tldextract.extract function in tldextract

To help you get started, we’ve selected a few tldextract examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github gwen001 / pentest-tools / csp-analyzer.py View on Github external
def getWarningLevel( t_tld_orig, item ):
    """Rate how far `item` strays from the reference host `t_tld_orig`.

    `t_tld_orig` is an object exposing `subdomain`, `domain` and `suffix`
    attributes (the code compares them with the result of
    `tldextract.extract`). `item` is a host or URL string.

    Returns:
        0 if `item` is listed in the module-level `t_help`;
        1 if subdomain, domain and suffix all match `t_tld_orig`;
        2 if only domain and suffix match;
        3 otherwise.
        The non-zero levels are raised by one when the network location
        contains a `*` wildcard.
    """
    if item in t_help:
        return 0

    # urlparse only fills in netloc when a scheme is present, so add one.
    url = item if item.startswith('http') else 'https://' + item
    netloc = urlparse( url ).netloc
    parts = tldextract.extract( netloc )

    same_registered = (
        parts.domain == t_tld_orig.domain
        and parts.suffix == t_tld_orig.suffix
    )
    if not same_registered:
        level = 3
    elif parts.subdomain == t_tld_orig.subdomain:
        level = 1
    else:
        level = 2

    # A wildcard host is treated as one step worse.
    if '*' in netloc:
        level += 1
    return level
github gwen001 / pentest-tools / cloudflare-origin-ip.py View on Github external
def grabSubs( domain ):
    """Collect subdomains of `domain` from crt.sh and resolve them.

    Queries crt.sh's JSON output for `%.<domain>`. Each returned
    `name_value` (with any `*.` stripped) that differs from `domain` and is
    not yet known is appended to the module-level `t_subs`. Every new
    subdomain is then resolved with `socket.gethostbyname`, and IPs not yet
    present in the module-level `t_ips` are appended there.

    Nothing is returned; progress and a final summary are printed. If the
    HTTP request raises, the error is printed and the function returns. A
    non-200 response is ignored silently.
    """
    print( "[+] Grabbing subdomains from crt.sh: %s" % domain )
    url = 'https://crt.sh/?q=%25.' + domain + '&output=json'
    try:
        r = requests.get( url )
    except Exception as e:
        print( colored("[-] error occured: %s" % e, 'red') )
        return
    if r.status_code != 200:
        return

    n = 0
    for item in r.json():
        sub = item['name_value'].replace( '*.', '' )
        if sub == domain or sub in t_subs:
            continue
        t_subs.append( sub )
        try:
            data = socket.gethostbyname( sub )
        except Exception:
            # Resolution is best-effort: names that do not resolve are
            # still kept in t_subs, they just contribute no IP.
            continue
        if data not in t_ips:
            n = n + 1
            t_ips.append( data )
    print( colored("[+] %d subdomains found, %d ips added" % (len(t_subs),n), 'green') )
github PrivacyScore / privacyscanner / privacyscanner / scanmodules / openwpm.py View on Github external
rule = line.split('$')[0]
            if is_acceptable_rule(rule):
                rules.append(rule)
        except Exception:
            logger.exception('Unexpected error while applying easylist rules.')

    abr = AdblockRules(rules)

    elapsed = timeit.default_timer() - start_time
    logger.info('Took %i secs to parse easylist rules' % elapsed)

    i = 0

    for url in third_party_requests:
        if abr.should_block(url):
            ext = tldextract.extract(url)
            trackers.append("{}.{}".format(ext.domain, ext.suffix))
        i = i + 1
        if i % 20 == 0:
            elapsed = timeit.default_timer() - start_time
            logger.info("Checked %i domains, %i secs elapsed..." % (i, elapsed))
    return list(set(trackers))
github bit4woo / teemo / domainsites / CrtSearch.py View on Github external
def get_related_domains(self):
        result = []
        main_of_domain = tldextract.extract(self.domain).domain

        reg_urls = re.compile('<a href="\?id=(.*?)">
        urls = reg_urls.findall(self.resp)


        reg_domains = re.compile('DNS:(.*?)<br>') #DNS:*.jdpay.com<br>

        for item in urls:
            url = "https://crt.sh/?id={0}".format(item)
            resp = req.get(url, proxies=self.proxy).content

            reg_common_name = re.compile("Subject:<br>(.*?)<br>")
            common_name = reg_common_name.findall(resp)
            if len(common_name) !=0:
                common_name = common_name[0].replace("&nbsp;", "").split("=")[-1]
                main_of_cn_domain = tldextract.extract(common_name).domain</a>
github paulnaoki / DomainFinderSrcUniversal / DomainFinderSrc / Scrapers / LinkChecker.py View on Github external
def get_root_domain(full_link: str, use_www=True) ->(False, str, str, str, str, str, str):
        """
        Split a URL into its root domain and related links.

        :param full_link: the URL to analyse, e.g. "http://www.google.com"
        :param use_www: when true, the link to the root domain is built with a
        "www." prefix (for a link without sub-domain, only if "www." does not
        already occur in full_link)
        :return: Tuple(True if the link has no sub-domain part else False,
        root domain, link to root domain, link to sub.domain, sub.domain,
        suffix of the domain, domain pure). When the domain or the suffix
        cannot be extracted the result is False followed by six empty strings.
        """
        www = "www."
        # Only http and https are distinguished; anything else becomes http.
        scheme = ("https" if full_link.startswith("https") else "http") + "://"

        parts = tldextract.extract(full_link)
        root = parts.domain + "." + parts.suffix

        if not (parts.domain and parts.suffix):
            return False, "", "", "", "", "", ""

        if not parts.subdomain:
            # No sub-domain: the root link and the "sub-domain" link coincide.
            host = www + root if (use_www and www not in full_link) else root
            link = scheme + host
            return True, root, link, link, host, parts.suffix, parts.domain

        sub_domain = parts.subdomain + "." + root
        root_link = scheme + (www + root if use_www else root)
        return False, root, root_link, scheme + sub_domain, sub_domain, parts.suffix, parts.domain
github waterbear-cloud / paco / src / paco / aws_api / acm / ACM.py View on Github external
def get_domain_from_host(validation_dns_record):
            """Return "<domain>.<suffix>" for the given host name.

            The host is split with tldextract.extract; the sub-domain part
            is discarded.
            """
            parts = tldextract.extract(validation_dns_record)
            return "{}.{}".format(parts.domain, parts.suffix)
github observerss / pygodaddy / pygodaddy / client.py View on Github external
def _split_hostname(self, hostname):
        """Return (prefix, domain) for `hostname`.

        `prefix` is the sub-domain part reported by tldextract, or '@' when
        there is none; `domain` is tldextract's `registered_domain`.
        """
        parts = tldextract.extract(hostname)
        return parts.subdomain or '@', parts.registered_domain
github fportantier / habu / habu / cli / cmd_data_extract_domain.py View on Github external
result = set()

    for m in match:

        candidate = m.group(0).lower()

        if '.' not in candidate:
            continue

        if not re.match('[a-z]+', candidate):
            continue

        if not re.match('[a-z0-9]+\.[a-z0-9]', candidate):
            continue

        tld = tldextract.extract(candidate)
        if tld.suffix:
            result.add(tld.domain + '.' + tld.suffix.rstrip('.'))

    return list(result)
github PrivacyScore / privacyscanner / privacyscanner / scanmodules / serverleaks.py View on Github external
def _concat_full(url, suffix):
    """Rebuild the host of `url` from its extracted parts and append `suffix`.

    The host is "<domain>.<suffix-of-url>", preceded by "<subdomain>." when
    the extracted sub-domain is not the empty string.
    """
    parts = extract(url)
    registered = parts.domain + "." + parts.suffix
    if parts.subdomain == "":
        host = registered
    else:
        host = parts.subdomain + "." + registered
    return host + suffix
github alienwithin / Scripts-Sploits / giveWebHead.py View on Github external
def gwhEngine(target, wordlist, method, redirects=False):
    error_codes_non_redir=[200,403]
    error_codes_redir= [200,301,302,403]
    with open(wordlist) as dirPerLine:
		for dir in dirPerLine:
			cleanDirName=str(dir.rstrip('\n'))
			fullURL=tldextract.extract(target)
			getHostname=fullURL.domain	
			resultFile=open(str(getHostname)+'.csv', 'a')
			badResults=open(str(getHostname)+'_ignored.csv', 'a')
			csvWritingObject = csv.writer(resultFile)
			BadResultObject=csv.writer(badResults)
			if method=="HEAD" and redirects=="False":
				gwhRequester=requests.head(target+cleanDirName,verify=False)
				gwhStatus=gwhRequester.status_code
				if gwhStatus in error_codes_non_redir:
					csvWritingObject.writerow( (target+cleanDirName, gwhStatus) )
					resultFile.close()
					print target+cleanDirName+" => "+ str(gwhStatus)		
			elif method=="HEAD" and redirects=="True":
				gwhRequester=requests.head(target+cleanDirName,verify=False)
				gwhStatus=gwhRequester.status_code
				if gwhStatus in error_codes_redir:

tldextract

Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.

BSD-3-Clause
Latest version published 4 days ago

Package Health Score

88 / 100
Full package analysis

Similar packages