How to use the bs4.NavigableString class in bs4

To help you get started, we’ve selected a few bs4 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github crossroadchurch / paul / openlp / plugins / bibles / lib / http.py View on Github external
# garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
            # completely.
            try:
                clean_verse_num = int(str(raw_verse_num))
            except ValueError:
                verse_parts = str(raw_verse_num).split('-')
                if len(verse_parts) > 1:
                    clean_verse_num = int(verse_parts[0])
            except TypeError:
                log.warning('Illegal verse number: %s', str(raw_verse_num))
            if clean_verse_num:
                verse_text = raw_verse_num.next_element
                part = raw_verse_num.next_element.next_element
                while not (isinstance(part, Tag) and part.get('class')[0] == 'versenum'):
                    # While we are still in the same verse grab all the text.
                    if isinstance(part, NavigableString):
                        verse_text += part
                    if isinstance(part.next_element, Tag) and part.next_element.name == 'div':
                        # Run out of verses so stop.
                        break
                    part = part.next_element
                verse_list[clean_verse_num] = str(verse_text)
        return verse_list
github kmyk / online-judge-tools / onlinejudge / _implementation / utils.py View on Github external
def parse_content(parent: Union[bs4.NavigableString, bs4.Tag]) -> bs4.NavigableString:
    """Flatten a node into text, turning ``<br>`` into a newline.

    :param parent: a string node or a tag to flatten.
    :return: for a plain string node, the node itself; for a comment, an
        empty string; for a childless tag, ``'\\n'`` when the tag is a
        ``br`` and ``''`` otherwise; for any other tag, the concatenation
        of its children's flattened text.
    """
    # bs4.Comment subclasses NavigableString, so it has to be tested first;
    # otherwise the comment body would be returned as if it were content.
    if isinstance(parent, bs4.Comment):
        return bs4.NavigableString('')
    if isinstance(parent, bs4.NavigableString):
        return parent

    children = parent.contents
    if not children:
        # Compare the tag name rather than searching the rendered markup:
        # a substring test also matches e.g. <abbr></abbr> or any empty tag
        # with "br" somewhere in an attribute value.
        return bs4.NavigableString('\n' if parent.name == 'br' else '')

    return bs4.NavigableString(''.join(parse_content(child) for child in children))
github sslab-gatech / hdfi / chip / chisel / bin / tex2html.py View on Github external
{{text}}
""")
        text = ''
        for img in soup.find_all('img'):
            img['src'] = os.path.join(
                'figs', os.path.basename(img['src']))
        for node in soup.body.contents:
            if isinstance(node, Tag) and node.name == 'pre':
                codefile = tempfile.NamedTemporaryFile(delete=False)
                codepath = codefile.name
                with codefile:
                    for code in node.contents:
                        if isinstance(code, Tag) and code.name == 'br':
                            codefile.write('\n')
                        elif not isinstance(code, Comment):
                            if isinstance(code, NavigableString):
                                codefile.write(code.string)
                            else:
                                codefile.write(code.get_text().encode('utf-8'))
                codetext = subprocess.check_output(
                    ["source-highlight", "-s", "scala", "-i", codepath])
                os.remove(codepath)
                text = text + codetext.decode('utf-8')
            elif not isinstance(node, Comment):
                text = text + str(node).decode('utf-8')
        htmlfile.write("""
{% endblock %}
""")

    # Dress up with base layout
    env = Environment(loader=ChoiceLoader([
                FileSystemLoader('.'),
github polyrabbit / hacker-news-digest / page_content_extractor / html.py View on Github external
def calc_effective_text_len(self, node):
        """
        Calc the total length of text under a node, similar to
        sum(len(s) for s in cur_node.stripped_strings)
        except that <a> subtrees are skipped and every comma (ASCII or
        fullwidth) adds one extra unit on top of its own character.

        The value is stored on node.text_len and returned; a node whose
        text_len is already set is returned as-is. When
        self.has_negative_effect(node) is true the total is scaled by 0.2.
        """
        if node.text_len is not None:
            # Already computed for this node.
            return node.text_len
        text_len = 0
        for child in node.children:
            if isinstance(child, Tag):
                if child.name == 'a':
                    continue
                text_len += self.calc_effective_text_len(child)
            # Comment is also an instance of NavigableString,
            # so we should not use isinstance(child, NavigableString)
            elif type(child) is NavigableString:
                text = child.string
                # Written as an escape so the fullwidth comma cannot be
                # silently normalised into a second ASCII comma.
                text_len += (len(text.strip())
                             + text.count(',')
                             + text.count(u'\uff0c'))  # Chinese (fullwidth) comma
        node.text_len = text_len * .2 if self.has_negative_effect(node) else text_len
        return node.text_len
github santoshphilip / eppy / p3 / eppy / readhtml.py View on Github external
def is_simpletable(table):
    """Return True if every td cell is empty or holds exactly one string."""
    for cell in table('td'):
        children = cell.contents
        if children == []:
            # An empty cell does not disqualify the table.
            continue
        if len(children) != 1:
            return False
        if not isinstance(children[0], NavigableString):
            return False
    return True
github jhpyle / docassemble / docassemble_base / docassemble / base / file_docx.py View on Github external
def traverse(self, elem):
        for part in elem.contents:
            if isinstance(part, NavigableString):
                self.run.add(text_type(part), italic=self.italic, bold=self.bold, underline=self.underline, strike=self.strike, size=self.size)
                self.still_new = False
            elif isinstance(part, Tag):
                # logmessage("Part name is " + text_type(part.name))
                if part.name == 'p':
                    self.new_paragraph()
                    self.traverse(part)
                elif part.name == 'li':
                    self.new_paragraph()
                    self.traverse(part)
                elif part.name == 'ul':
                    # logmessage("Entering a UL")
                    oldstyle = self.style
                    self.style = 'ul'
                    self.indentation += 10
                    self.traverse(part)
github buffer / thug / thug / DOM / W3C / Core / Document.py View on Github external
def createTextNode(self, data):
        """Return a Text built from this object and data wrapped as a bs4.NavigableString."""
        from .Text import Text

        wrapped = bs4.NavigableString(data)
        return Text(self, wrapped)
github titipata / scipdf_parser / scipdf / pdf / parse_pdf.py View on Github external
def parse_sections(article):
    """
    Parse list of sections from a given BeautifulSoup of an article 
    """
    article_text = article.find('text')
    divs = article_text.find_all('div', attrs={'xmlns': 'http://www.tei-c.org/ns/1.0'})
    sections = []
    for div in divs:
        div_list = list(div.children)
        if len(div_list) == 0:
            heading = ''
            text = ''
        elif len(div_list) == 1:
            if isinstance(div_list[0], NavigableString):
                heading = str(div_list[0])
                text = ''
            else:
                heading = ''
                text = div_list[0].text
        else:
            text = []
            heading = div_list[0]
            if isinstance(heading, NavigableString):
                heading = str(heading)
                p_all = list(div.children)[1:]
            else:
                heading = ''
                p_all = list(div.children)
            for p in p_all:
                if p is not None:
github the-blue-alliance / the-blue-alliance / datafeeds / parser_base.py View on Github external
def _recurseUntilString(self, node):
        """
        Digs through HTML that Word made worse.
        Written to deal with http://www2.usfirst.org/2011comp/Events/cmp/matchresults.html
        """
        from bs4 import NavigableString
        if node.string is not None:
            return re.sub('\s+', ' ', node.string.replace(u'\xa0', ' ')).strip()  # remove multiple whitespaces
        if isinstance(node, NavigableString):
            return node
        if hasattr(node, 'contents'):
            results = []
            for content in node.contents:
                result = self._recurseUntilString(content)
                if result is not None:
                    result = result.strip().replace('\r', '').replace('\n', '').replace('  ', ' ')
                if result is not None and result != "":
                    results.append(result)
            if results != []:
                return ' '.join(results)
        return None
github Integreat / cms-django / src / cms / page_xliff_converter.py View on Github external
def _add_navigable_string_to_empty_tag(soup):
        for el in list(soup.descendants):
            if isinstance(el, Tag) and not list(el.children) and el.name not in ('br',):
                el.append(NavigableString(' '))