            'start': self.gsearch['n_res_page'] * self.gsearch['n_page'],  # offset into the results; page number = (start / num) + 1
            'pws': '0'  # personalization turned off by default
        }
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'close',
            'DNT': '1'
        }
        print("Initiating search with params={}".format(params))
        r = requests.get('http://www.google.com/search', params=params, headers=headers)
        html = r.text
        # Try to parse the Google HTML result using lxml
        try:
            doc = UnicodeDammit(html, is_html=True)
            parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
            dom = lxml.html.document_fromstring(html, parser=parser)
            dom.resolve_base_href()
        except Exception as e:
            # Exception objects have no .msg attribute; format the exception itself
            print('Some error occurred while lxml tried to parse: {}'.format(e))
            return False
        try:
            res = dom.xpath(HTMLTranslator().css_to_xpath('div#resultStats'))[0].text_content()
            print("Got number of results: `{}` for query {}".format(res, self.gsearch['query']))
        except Exception as e:
            print(e)
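# --- A minimal, self-contained sketch of the css_to_xpath() step used above.
# The HTML string is invented for illustration; only lxml and cssselect are
# assumed to be installed.
import lxml.html
from cssselect import HTMLTranslator

sample_html = '<div id="resultStats">About 1,000 results</div>'
sample_dom = lxml.html.document_fromstring(sample_html)

# css_to_xpath() compiles a CSS selector into an equivalent XPath
# expression, which lxml evaluates natively.
xpath = HTMLTranslator().css_to_xpath('div#resultStats')
matches = sample_dom.xpath(xpath)
if matches:
    print(matches[0].text_content())  # About 1,000 results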
    :return: A tuple of the newly formatted data, the mappings from class names
             to numbers, and the mappings from feature names to numbers.
    :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict})
    '''
    # Initialize variables
    field_num_dict = UniqueNumberDict()
    class_num_dict = UniqueNumberDict()
    result_list = []
    # Iterate through MegaM file
    for line in lines:
        line_fields = set()
        # Process encoding
        line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup.strip()
        # Ignore comments (and TEST/DEV lines)
        if not line.startswith('#') and line not in ('TEST', 'DEV'):
            result_string = ''
            split_line = line.split()
            result_string += '{0}'.format(class_num_dict[split_line[0]])
            # Handle features if there are any
            if len(split_line) > 1:
                del split_line[0]
                # Loop through all feature-value pairs, printing out pairs
                # separated by commas (and with feature names replaced with
                # numbers). Even-indexed tokens are feature names, odd-indexed
                # tokens are values; sorting orders the pairs by feature number.
                field_nums = (field_num_dict[field_name]
                              for field_name in islice(split_line, 0, None, 2))
                values = (float(value) if value != 'N/A' else 0.0
                          for value in islice(split_line, 1, None, 2))
                for field_num, value in sorted(zip(field_nums, values)):
                    # Check for duplicates
                    if field_num in line_fields:
def decode_html(html_string):
    converted = UnicodeDammit(html_string)
    if not converted.unicode_markup:
        # UnicodeDecodeError takes five positional arguments, so raise a
        # ValueError carrying the same diagnostic message instead
        raise ValueError(
            "Failed to detect encoding, tried [%s]" %
            ', '.join(str(enc) for enc in converted.tried_encodings))
    return converted.unicode_markup
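# --- Usage sketch for decode_html() above (assumes bs4 is installed).
# Both byte strings are invented for illustration: UnicodeDammit detects
# UTF-8 directly and falls back to a legacy-encoding guess otherwise.
from bs4 import UnicodeDammit

print(decode_html('<p>café</p>'.encode('utf-8')))
print(decode_html('<p>café</p>'.encode('latin-1')))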
        except ImportError:
            logging.debug("BAZARR falling back to file content analysis to detect language.")
        if is_binary(subtitle_path):
            logging.debug("BAZARR subtitles file doesn't seem to be text based. Skipping this file: " +
                          subtitle_path)
            continue
        detected_language = None
        if six.PY3:
            with open(subtitle_path, 'r', errors='ignore') as f:
                text = f.read()
        else:
            with open(subtitle_path, 'r') as f:
                text = f.read()
        try:
            # UnicodeDammit guesses the encoding; langdetect needs decoded text
            dammit = UnicodeDammit(text)
            if six.PY2:
                text = text.decode(dammit.original_encoding)
            detected_language = langdetect.detect(text)
        except Exception:
            logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
                              subtitle_path + ' You should try to delete this subtitles file manually and ask '
                              'Bazarr to download it again.')
        else:
            if detected_language:
                logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
                    detected_language))
                try:
                    subtitles[subtitle] = Language.rebuild(Language.fromietf(detected_language))
                except Exception:
                    pass
    return subtitles
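# --- A minimal sketch of the same detection pipeline for a single file.
# Assumes bs4 and langdetect are installed; the path below is hypothetical.
from bs4 import UnicodeDammit
import langdetect

def guess_subtitle_language(path):
    with open(path, 'rb') as f:
        raw = f.read()
    # Let UnicodeDammit pick an encoding, then hand clean text to langdetect
    text = UnicodeDammit(raw).unicode_markup
    return langdetect.detect(text)

# guess_subtitle_language('movie.en.srt')  # e.g. 'en'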
            else:
                # The server is not reporting an encoding in the headers.
                # Use content-aware mechanisms for determining the content encoding.
                if ("text/html" in cType or
                        'text/javascript' in cType or
                        'text/css' in cType or
                        'application/xml' in cType or
                        'application/atom+xml' in cType):
                    # HTML/text content: decode it using the locally detected encoding
                    pgctnt = self.decodeHtml(pgctnt, cType)
                elif "text/plain" in cType or "text/xml" in cType:
                    pgctnt = bs4.UnicodeDammit(pgctnt).unicode_markup
                # Assume JSON is UTF-8. Probably a bad idea?
                elif "application/json" in cType:
                    pgctnt = pgctnt.decode('utf-8')
                elif "text" in cType:
                    self.log.critical("Unknown content type!")
                    self.log.critical(cType)
        else:
            self.log.critical("No content disposition header!")
            self.log.critical("Cannot guess content type!")
        return pgctnt
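# --- A standalone sketch of the content-type dispatch above.
# decode_by_content_type() is a hypothetical name; only the bs4 calls
# and the UTF-8 rule for JSON (RFC 8259) are established facts.
import bs4

def decode_by_content_type(content, ctype):
    if 'text/html' in ctype or 'application/xml' in ctype:
        # HTML/XML may declare their own charset, so let UnicodeDammit
        # inspect the document itself instead of trusting the headers
        return bs4.UnicodeDammit(content, is_html=True).unicode_markup
    if 'application/json' in ctype:
        return content.decode('utf-8')  # JSON is UTF-8 on the wire
    # Fall back to byte sniffing for anything else labelled as text
    return bs4.UnicodeDammit(content).unicode_markup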
def to_unicode(data, is_html=False, detwingle=False, verbose=False,
               lang=None):
    """ Produce unicode from text of unknown encoding.
    Input: bytestring """
    dammit = UnicodeDammit(data, is_html=is_html)
    if detwingle and dammit.original_encoding == 'windows-1252':
        # Data mixes UTF-8 and Windows-1252; detwingle() rewrites it as pure UTF-8
        new_data = UnicodeDammit.detwingle(data)
        dammit = UnicodeDammit(new_data, is_html=is_html)
    if verbose:
        sys.stderr.write("Original encoding (via BS): %s\n" %
                         dammit.original_encoding)
    if lang is None:
        return dammit.unicode_markup
    if lang == 'auto':
        lang = TextSanitizer.guess_lang_from_data(
            dammit.unicode_markup, is_html=is_html)
        if verbose:
            sys.stderr.write("Detected language: %s\n" % lang)
    return TextSanitizer._to_unicode_chared(data, lang, verbose=verbose)
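# --- Demonstration of the detwingle branch in to_unicode() above.
# The byte string mixes UTF-8 text with Windows-1252 smart quotes, which
# is exactly the case UnicodeDammit.detwingle() repairs; data is invented.
from bs4 import UnicodeDammit

mixed = '<p>café</p>'.encode('utf-8') + b'\x93mixed\x94'  # 0x93/0x94 = 1252 curly quotes
fixed = UnicodeDammit.detwingle(mixed)  # re-encodes the 1252 bytes as UTF-8
print(UnicodeDammit(fixed).unicode_markup)  # <p>café</p>“mixed”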
def parse(raw_content, base_href=None, notify=lambda *args: None):
    try:
        # unicode_markup is UnicodeDammit's documented attribute for the decoded text
        content = UnicodeDammit(raw_content, is_html=True).unicode_markup
        cleaned = _clean_crufty_html(content)
        return create_doc(cleaned, base_href)
    except compat_html_parser.HTMLParseError as e:
        notify("parsing failed:", e)
        raise Unparseable()
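# --- An equivalent of parse() using plain lxml, since _clean_crufty_html,
# create_doc and Unparseable are not shown here. parse_html_bytes() is a
# hypothetical helper reproducing only the decode-then-parse step.
import lxml.html
from bs4 import UnicodeDammit

def parse_html_bytes(raw_content, base_href=None):
    content = UnicodeDammit(raw_content, is_html=True).unicode_markup
    doc = lxml.html.document_fromstring(content)
    if base_href:
        doc.make_links_absolute(base_href, resolve_base_href=True)
    return doc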
        # changing the path to that particular directory
        os.chdir(path)
        for email in os.listdir(path)[:self.limit]:
            # binary mode: UnicodeDammit.detwingle() expects bytes, and the
            # with-block closes the file even when we skip it below
            with open(email, 'rb') as email_file:  # explicit better than implicit
                email_text = email_file.read()
            """
            Don't even get me started on the Unicode issues that I faced
            here. Thankfully 'BeautifulSoup' came to the rescue.
            Thanks to Leonard Richardson for this module.
            """
            try:
                email_text = bs4.UnicodeDammit.detwingle(
                    email_text).decode('utf-8')
            except UnicodeDecodeError:
                # Skipping the file for bad encoding
                continue
            # Fold to ASCII so downstream feature extraction sees plain text
            email_text = email_text.encode("ascii", "ignore").decode("ascii")
            # Extracting the features from the text
            features = self.extract_features(email_text)
            # Training the classifier
            self.classifier.train(features, label)
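# --- The detwingle-then-ASCII-fold step above, extracted as a reusable
# helper; normalize_email_text() and the path are hypothetical names
# introduced purely for illustration.
import bs4

def normalize_email_text(raw_bytes):
    # Repair mixed UTF-8/Windows-1252 bytes, then drop non-ASCII leftovers
    text = bs4.UnicodeDammit.detwingle(raw_bytes).decode('utf-8')
    return text.encode('ascii', 'ignore').decode('ascii')

# with open('spam/0001.txt', 'rb') as f:
#     print(normalize_email_text(f.read()))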