How to use the bs4.UnicodeDammit class in bs4

To help you get started, we’ve selected a few bs4.UnicodeDammit examples based on popular ways it is used in public projects.
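Before the project snippets below, here is a minimal sketch of the core pattern they all share: hand UnicodeDammit a bytestring of unknown encoding and read back the decoded text plus the encoding it settled on. The sample string and variable names are purely illustrative.

from bs4 import UnicodeDammit

# a made-up bytestring whose encoding we do not know up front (here: windows-1252)
raw = "Déjà vu".encode("windows-1252")

dammit = UnicodeDammit(raw, ["utf-8", "windows-1252"])
print(dammit.original_encoding)   # the encoding UnicodeDammit detected
print(dammit.unicode_markup)      # the decoded text as a str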

github NikolaiT / GoogleScraper / tests / threadtests.py
            'start': self.gsearch['n_res_page']*self.gsearch['n_page'], # the offset to the search results. page number = (start / num) + 1
            'pws': '0'      # personalization turned off by default
        }
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'close',
            'DNT': '1'
        }
        print("Initiating search with params={}".format(params))
        r = requests.get('http://www.google.com/search', params=params, headers=headers)
        html = r.text
        # Try to parse the Google HTML result using lxml
        try:
            doc = UnicodeDammit(html, is_html=True)
            parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
            dom = lxml.html.document_fromstring(html, parser=parser)
            dom.resolve_base_href()
        except Exception as e:
            print('Some error occurred while lxml tried to parse: {}'.format(e))
            return False

        try:
            res = dom.xpath(HTMLTranslator().css_to_xpath('div#resultStats'))[0].text_content()
            print("Got number of results: `{}` for query {}".format(res, self.gsearch['query']))
        except Exception as e:
            print(e)
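The snippet above leans on UnicodeDammit's declared_html_encoding attribute to tell lxml which encoding the page itself claims to use. A minimal, self-contained sketch of that handoff (the sample HTML is made up for illustration):

from bs4 import UnicodeDammit
import lxml.html

# made-up sample document with a <meta> charset declaration
html = b'<html><head><meta charset="iso-8859-1"></head><body>caf\xe9</body></html>'

dammit = UnicodeDammit(html, is_html=True)
print(dammit.declared_html_encoding)   # encoding declared inside the document
print(dammit.original_encoding)        # encoding UnicodeDammit actually decoded with

# hand the declared encoding to lxml so both libraries agree on the bytes-to-text step
parser = lxml.html.HTMLParser(encoding=dammit.declared_html_encoding)
dom = lxml.html.document_fromstring(html, parser=parser)
print(dom.text_content())              # 'café'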
github EducationalTestingService / skll / skll / utilities / megam_to_libsvm.py
    :return: A tuple of the newly formatted data, the mappings from class names
             to numbers, and the mappings from feature names to numbers.
    :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict})
    '''

    # Initialize variables
    field_num_dict = UniqueNumberDict()
    class_num_dict = UniqueNumberDict()

    result_list = []
    # Iterate through MegaM file
    for line in lines:
        line_fields = set()
        # Process encoding
        line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup.strip()

        # Ignore comments (and TEST/DEV lines)
        if not line.startswith('#') and not line == 'TEST' and not line == 'DEV':
            result_string = ''
            split_line = line.split()
            result_string += '{0}'.format(class_num_dict[split_line[0]])
            # Handle features if there are any
            if len(split_line) > 1:
                del split_line[0]
                # Loop through all feature-value pairs printing out pairs
                # separated by commas (and with feature names replaced with
                # numbers)
                for field_num, value in sorted(zip((field_num_dict[field_name] for field_name in islice(split_line, 0, None, 2)),
                                                   (float(value) if value != 'N/A' else 0.0 for value in islice(split_line, 1, None, 2)))):
                    # Check for duplicates
                    if field_num in line_fields:
github qazbnm456 / VWGen / core / attack / mod_unfilter.py
try:
    from bs4 import UnicodeDammit

    def decode_html(html_string):
        converted = UnicodeDammit(html_string)
        if not converted.unicode_markup:
            raise UnicodeDecodeError(
                "Failed to detect encoding, tried [%s]",
                ', '.join(converted.tried_encodings))
        return converted.unicode_markup
except ImportError:
github morpheus65535 / bazarr / bazarr / list_subtitles.py
logging.debug("BAZARR falling back to file content analysis to detect language.")
                if is_binary(subtitle_path):
                    logging.debug("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
                                  subtitle_path)
                    continue
                detected_language = None

                if six.PY3:
                    with open(subtitle_path, 'r', errors='ignore') as f:
                        text = f.read()
                else:
                    with open(subtitle_path, 'r') as f:
                        text = f.read()

                try:
                    encoding = UnicodeDammit(text)
                    if six.PY2:
                        text = text.decode(encoding.original_encoding)
                    detected_language = langdetect.detect(text)
                except Exception as e:
                    logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
                                      subtitle_path + ' You should try to delete this subtitles file manually and ask '
                                                      'Bazarr to download it again.')
                else:
                    if detected_language:
                        logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
                            detected_language))
                        try:
                            subtitles[subtitle] = Language.rebuild(Language.fromietf(detected_language))
                        except:
                            pass
    return subtitles
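A note on the snippet above: UnicodeDammit works directly on raw bytes, so an alternative (hypothetical, simplified) pattern is to read the subtitle file in binary mode and let UnicodeDammit do the decoding in one step. The path and variable names below are illustrative only.

from bs4 import UnicodeDammit

# hypothetical subtitle path; read raw bytes and let UnicodeDammit pick the encoding
with open("subtitle.srt", "rb") as f:
    raw = f.read()

dammit = UnicodeDammit(raw)
text = dammit.unicode_markup           # decoded text, ready for langdetect
print(dammit.original_encoding)        # e.g. 'utf-8' or 'windows-1252'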
github fake-name / ReadableWebProxy / common / util / webFunctions.py
			else:
				# The server is not reporting an encoding in the headers.
				# Use content-aware mechanisms for determining the content encoding.


				if "text/html" in cType or \
					'text/javascript' in cType or    \
					'text/css' in cType or    \
					'application/xml' in cType or    \
					'application/atom+xml' in cType:				# If this is a html/text page, we want to decode it using the local encoding

					pgctnt = self.decodeHtml(pgctnt, cType)

				elif "text/plain" in cType or "text/xml" in cType:
					pgctnt = bs4.UnicodeDammit(pgctnt).unicode_markup

				# Assume JSON is utf-8. Probably a bad idea?
				elif "application/json" in cType:
					pgctnt = pgctnt.decode('utf-8')

				elif "text" in cType:
					self.log.critical("Unknown content type!")
					self.log.critical(cType)

		else:
			self.log.critical("No content disposition header!")
			self.log.critical("Cannot guess content type!")

		return pgctnt
github bitextor / bitextor / textsanitizer.py
    def to_unicode(data, is_html=False, detwingle=False, verbose=False,
                   lang=None):
        """ Produce unicode from text of unknown encoding.
        Input: bytestring """
        dammit = UnicodeDammit(data, is_html=is_html)
        if detwingle and dammit.original_encoding == 'windows-1252':
            new_data = UnicodeDammit.detwingle(data)
            dammit = UnicodeDammit(new_data, is_html=is_html)

        if verbose:
            sys.stderr.write("Original encoding (via BS): %s\n" %
                             (dammit.original_encoding))

        if lang is None:
            return dammit.unicode_markup

        if lang == 'auto':
            lang = TextSanitizer.guess_lang_from_data(
                dammit.unicode_markup, is_html=is_html)
            if verbose:
                sys.stderr.write("Detected language: %s\n" % (lang))

        return TextSanitizer._to_unicode_chared(data, lang, verbose=verbose)
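The detwingle branch above handles documents that mix UTF-8 with embedded Windows-1252 bytes, which would otherwise fail to decode cleanly. A minimal illustration of UnicodeDammit.detwingle on its own (the sample strings are invented for the example):

from bs4 import UnicodeDammit

# made-up document mixing UTF-8 and Windows-1252 bytes
utf8_part = "snowman: \N{SNOWMAN}".encode("utf-8")
cp1252_part = "quote: \N{LEFT DOUBLE QUOTATION MARK}hi\N{RIGHT DOUBLE QUOTATION MARK}".encode("windows-1252")
mixed = utf8_part + b"\n" + cp1252_part

# detwingle() rewrites the embedded Windows-1252 bytes as UTF-8, so the
# whole document can then be decoded as UTF-8 without errors.
fixed = UnicodeDammit.detwingle(mixed)
print(fixed.decode("utf-8"))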
github harshavardhana / boilerpipy / boilerpipy / common.py
def parse(raw_content, base_href=None, notify=lambda *args: None):
    try:
        content = UnicodeDammit(raw_content, is_html=True).markup
        cleaned = _clean_crufty_html(content)
        return create_doc(cleaned, base_href)
    except compat_html_parser.HTMLParseError as e:
        notify("parsing failed:", e)
    raise Unparseable()
github tasdikrahman / spammy / spammy / train.py
        # changing the path to that particular directory
        os.chdir(path)

        for email in os.listdir(path)[:self.limit]:
            email_file = open(email, 'r')  # explicit better than implicit
            email_text = email_file.read()

            """
            Don't even get me started on the Unicode issues that I faced
            here. Thankfully, 'BeautifulSoup' came to our rescue.

            Thanks to Leonard Richardson for this module
            """

            try:
                email_text = bs4.UnicodeDammit.detwingle(
                    email_text).decode('utf-8')
            except:
                "Skipping the file for bad encoding"
                continue

            email_file.close()
            email_text = email_text.encode("ascii", "ignore")

            # Extracting the features from the text
            features = self.extract_features(email_text)

            # Training the classifier
            self.classifier.train(features, label)