Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
'password', 'imgur', 'flickr', 'search', 'subscription', 'itunes',
'siteindex', 'events', 'stop', 'jobs', 'careers', 'newsletter',
'subscribe', 'academy', 'shopping', 'purchase', 'site-map',
'shop', 'donate', 'newsletter', 'product', 'advert', 'info',
'tickets', 'coupons', 'forum', 'board', 'archive', 'browse',
'howto', 'how to', 'faq', 'terms', 'charts', 'services',
'contact', 'plus', 'admin', 'login', 'signup', 'register',
'developer', 'proxy']
_valid_categories = []
# TODO Stop spamming urlparse and tldextract calls...
for p_url in valid_categories:
path = urls.get_path(p_url)
subdomain = tldextract.extract(p_url).subdomain
conjunction = path + ' ' + subdomain
bad = False
for badword in stopwords:
if badword.lower() in conjunction.lower():
if self.config.verbose:
print(('elim category url %s for subdomain '
'contain stopword!' % p_url))
bad = True
break
if not bad:
_valid_categories.append(p_url)
_valid_categories.append('/') # add the root
for i, p_url in enumerate(_valid_categories):
if p_url.startswith('://'):
# Constructor: imports the rule table, the `extract` helper and a Porter
# stemmer, then stores each of them on the instance.
# NOTE(review): this excerpt has lost its indentation and the method may
# continue past the last line shown here -- confirm against the full file.
def __init__(self):
"""Loads in Ed and Olivier's domainRules.json file, now converted to a big (7k+ entry) dict object"""
# The rules are imported from the `domain_rules` module rather than read
# from a JSON file here. The import is function-local, so it only runs
# when an instance is constructed.
from domain_rules import domain_rules
# Imported from the `tldextract.tldextract` submodule.
# NOTE(review): presumably the same callable as the package-level
# `tldextract.extract` -- confirm before changing the import path.
from tldextract.tldextract import extract
# Keep the extractor on the instance so it is reachable as `self.extract`.
self.extract = extract
# The `as PorterStemmer` alias is redundant: it rebinds the class to its own name.
from nltk.stem.porter import PorterStemmer as PorterStemmer
# Keep the imported rule table on the instance as `self.domain_rules`.
self.domain_rules = domain_rules
# Create one stemmer instance and keep it as `self.Stemmer`.
self.Stemmer = PorterStemmer()