logger.setLevel(logging.INFO)
""" Print version """
logger.info(pkg_resources.require("music_dl")[0])
""" Validate parameters """
logger.info('Validating parameters...')
try:
# Validate download url
url_parsed = urlparse(self.download_url)
if not url_parsed.scheme.startswith('http'):
raise DirectoryException('Invalid URL. URL must start with http*. Input value is {}'.format(self.download_url))
tld_parsed = tldextract.extract(self.download_url)
if tld_parsed.domain not in ['youtube', 'soundcloud']:
raise DirectoryException('Invalid URL. Music Downloader supports only YouTube and SoundCloud. Input value is {}'.format(self.download_url))
# Validate download directory
if not is_path_exists_or_creatable(self.working_dir):
raise DirectoryException('Invalid directory. Please specify valid download directory. Input value is {}'.format(self.working_dir))
except DirectoryException as e:
logger.error(str(e))
logger.fatal('Aborted.')
exit()
# Validate playlist configuration
try:
self.playlist.validate()
except PlaylistParameterException as e:
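# A standalone sketch of the same validation flow as the snippet above; SUPPORTED_SITES
# and the ValueError messages are illustrative stand-ins for DirectoryException and the
# original wording, not part of the original code.
from urllib.parse import urlparse

import tldextract

SUPPORTED_SITES = {'youtube', 'soundcloud'}

def validate_download_url(url):
    """Raise ValueError unless url is an http(s) link to a supported site."""
    if not urlparse(url).scheme.startswith('http'):
        raise ValueError('Invalid URL. URL must start with http*. Input value is {}'.format(url))
    if tldextract.extract(url).domain not in SUPPORTED_SITES:
        raise ValueError('Only YouTube and SoundCloud are supported. Input value is {}'.format(url))

validate_download_url('https://www.youtube.com/watch?v=9bZkp7q19f0')  # passes silently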
# if the file type is a media type, reject instantly
if file_type and file_type not in ALLOWED_TYPES:
if verbose: print('\t%s rejected due to bad filetype' % url)
return False
last_chunk = path_chunks[-1].split('.')
# the file type is of no use to us anymore; remove it from the url
if len(last_chunk) > 1:
path_chunks[-1] = last_chunk[-2]
# Index gives us no information
if 'index' in path_chunks:
path_chunks.remove('index')
# extract the tld (top level domain)
tld_dat = tldextract.extract(url)
subd = tld_dat.subdomain
tld = tld_dat.domain.lower()
url_slug = path_chunks[-1] if path_chunks else ''
if tld in BAD_DOMAINS:
if verbose: print('%s caught for a bad tld' % url)
return False
if len(path_chunks) == 0:
dash_count, underscore_count = 0, 0
else:
dash_count = url_slug.count('-')
underscore_count = url_slug.count('_')
# If the url has a news slug title
except Exception:
continue
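# A hedged, standalone sketch of the slug heuristic used above: article URLs usually end
# in a dash-separated slug, while index/category pages do not. BAD_DOMAINS and the dash
# threshold here are illustrative assumptions, not the library's actual values.
from urllib.parse import urlparse

import tldextract

BAD_DOMAINS = {'amazon', 'doubleclick', 'twitter'}

def looks_like_article(url):
    path_chunks = [c for c in urlparse(url).path.split('/') if c]
    if tldextract.extract(url).domain.lower() in BAD_DOMAINS:
        return False
    url_slug = path_chunks[-1] if path_chunks else ''
    # e.g. "senate-passes-budget-bill-after-long-debate" contains six dashes
    return url_slug.count('-') > 4

print(looks_like_article('https://news.example.com/2018/senate-passes-budget-bill-after-long-debate'))  # True
print(looks_like_article('https://news.example.com/politics/index'))  # False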
# appending top-level domains, from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
# Version 2018040300, Last Updated Tue Apr 3 07:07:01 2018 UTC
df = pd.read_csv(config.datasets + 'data/iana/org/TLD/tlds-alpha-by-domain.txt', sep=" ", header=None)
for index, row in df.iterrows():
print(index, row[0])
domain.append(str(row[0]).lower())
df = pd.read_csv(DATASET_MICROSOFT_PATH, delimiter='\t', header=0)
for index, row in df.iterrows():
url = str(row[3])
print(index, url)
try:
o = tldextract.extract(url)
if o.suffix:  # tldextract returns '' rather than None when there is no suffix
domain_s.append(str(o.suffix).lower())
if o.domain:
domain.append(str(o.domain).lower())
except Exception:
continue
le1.fit(domain)
joblib.dump(le1, ENC_WEB_DOMAIN)
print(le1.classes_)
le2.fit(domain_s)
joblib.dump(le2, ENC_WEB_DOMAIN_SUFFIX)
print(le2.classes_)
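# A minimal sketch of reusing the persisted encoders; it assumes the same ENC_WEB_DOMAIN
# and ENC_WEB_DOMAIN_SUFFIX paths used above and that joblib is importable as below.
import joblib
import tldextract

le_domain = joblib.load(ENC_WEB_DOMAIN)
le_suffix = joblib.load(ENC_WEB_DOMAIN_SUFFIX)

parts = tldextract.extract('https://www.bbc.co.uk/news')
# transform() raises ValueError for labels that were not seen during fit()
domain_id = le_domain.transform([parts.domain.lower()])[0]
suffix_id = le_suffix.transform([parts.suffix.lower()])[0]
print(domain_id, suffix_id)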
if self.config.verbose:
print('elim category url %s for no domain and path'
% p_url)
continue
if path and path.startswith('#'):
if self.config.verbose:
print('elim category url %s path starts with #' % p_url)
continue
if scheme and (scheme != 'http' and scheme != 'https'):
if self.config.verbose:
print(('elim category url %s for bad scheme, '
'not http nor https' % p_url))
continue
if domain:
child_tld = tldextract.extract(p_url)
domain_tld = tldextract.extract(source_url)
child_subdomain_parts = child_tld.subdomain.split('.')
subdomain_contains = False
for part in child_subdomain_parts:
if part == domain_tld.domain:
if self.config.verbose:
print(('subdomain contains at %s and %s' %
(str(part), str(domain_tld.domain))))
subdomain_contains = True
break
# Ex. microsoft.com is definitely not related to
# espn.com, but espn.go.com is probably related to espn.com
if not subdomain_contains and \
(child_tld.domain != domain_tld.domain):
if self.config.verbose:
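# Standalone sketch of the relatedness rule above: a candidate URL is kept when its
# registered domain matches the source domain, or when the source domain appears as a
# part of the candidate's subdomain (e.g. espn.go.com is treated as related to espn.com).
import tldextract

def is_related(candidate_url, source_url):
    child = tldextract.extract(candidate_url)
    source = tldextract.extract(source_url)
    if child.domain == source.domain:
        return True
    return source.domain in child.subdomain.split('.')

print(is_related('https://espn.go.com/nba', 'https://espn.com'))  # True
print(is_related('https://microsoft.com', 'https://espn.com'))    # False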
def get_open_page_rank(self, url):
try:
o = tldextract.extract(url)
domain = '%s.%s' % (o.domain, o.suffix)
try:
pginfo = self.page_rank.pg[domain]
except KeyError:
config.logger.warning('page rank information for domain [' + domain + '] not found')
return MISSING_FEATURE * 2, True
return [pginfo['page_rank_decimal'], pginfo['rank']], False
except Exception as e:
config.logger.error(repr(e))
return MISSING_FEATURE * 2, True
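# Hypothetical usage of get_open_page_rank above (extractor stands in for an instance of
# the enclosing class): it returns a (features, is_missing) pair, where features is
# [page_rank_decimal, rank] when the domain is known and MISSING_FEATURE * 2 otherwise.
features, missing = extractor.get_open_page_rank('https://www.wikipedia.org/')
if not missing:
    page_rank_decimal, rank = features
    print(page_rank_decimal, rank)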
import pandas as pd
from sklearn import preprocessing
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
domain_s = ['']
domain = ['']
df_sites = pd.read_csv(DATASET_3C_SITES_PATH, na_values=0, delimiter=',', usecols=['document_url'])
for index, row in df_sites.iterrows():
url = str(row[0])
print(index, url)
try:
o = tldextract.extract(url)
if o.suffix:  # tldextract returns '' rather than None when there is no suffix
domain_s.append(str(o.suffix).lower())
if o.domain:
domain.append(str(o.domain).lower())
except Exception:
continue
# appending top-level domains, from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
# Version 2018040300, Last Updated Tue Apr 3 07:07:01 2018 UTC
df = pd.read_csv(config.datasets + 'data/iana/org/TLD/tlds-alpha-by-domain.txt', sep=" ", header=None)
for index, row in df.iterrows():
print(index, row[0])
domain.append(str(row[0]).lower())
df = pd.read_csv(DATASET_MICROSOFT_PATH, delimiter='\t', header=0)
for index, row in df.iterrows():
def get_base_url(url):
"""
Takes as input a URL and returns the subdomain, domain and suffix joined
with dots to form the base URL of the website. Uses the tldextract library.
"""
tld = tldextract.extract(url)
print(tld.subdomain, ' - ', tld.domain, ' - ', tld.suffix)
if tld.subdomain != "":
base_url = '.'.join([tld.subdomain, tld.domain, tld.suffix])
else:
base_url = '.'.join([tld.domain, tld.suffix])
return base_url
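# Example calls for get_base_url (the scheme and path are dropped; only the host parts
# reported by tldextract are rejoined):
print(get_base_url('https://blog.example.co.uk/posts/1'))  # blog.example.co.uk
print(get_base_url('http://example.com/about'))            # example.com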
self.url = url
self.url = urls.prepare_url(url)
self.domain = urls.get_domain(self.url)
self.scheme = urls.get_scheme(self.url)
self.categories = []
self.feeds = []
self.articles = []
self.html = ''
self.doc = None
self.logo_url = ''
self.favicon = ''
self.brand = tldextract.extract(self.url).domain
self.description = ''
self.is_parsed = False
self.is_downloaded = False
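# The brand attribute above is simply the registered domain reported by tldextract, e.g.:
import tldextract
print(tldextract.extract('https://www.nytimes.com/section/world').domain)  # nytimes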