Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _breakRemNewlines(tag):
    """Collapse runs of spaces and remove newlines in the tag's direct text children.

    Non-recursive: only the immediate entries of ``tag.contents`` are examined,
    and only those whose exact type is ``bs4.element.NavigableString`` (instances
    of subclasses are left untouched). Each matching string is replaced in the
    tree by a copy in which every run of two or more spaces becomes a single
    space and every newline is removed.
    """
    # Iterate over a snapshot: replace_with() edits tag.contents, and looping
    # over a list while it is being modified is fragile.
    for child in list(tag.contents):
        if type(child) is not bs4.element.NavigableString:
            continue
        collapsed = re.sub(r' {2,}', ' ', child).replace('\n', '')
        child.replace_with(collapsed)
def insert_escaped_tags(tags, label=None):
    """For each tag in "tags", insert contextual tags (e.g., <p> </p>) as escaped text
    so that these tags are still there when html markup is stripped out.

    Args:
        tags: iterable of tag objects exposing ``strings`` and ``name``.
        label: name to use inside the inserted markers; when None, each tag's
            own ``name`` is used.

    Returns:
        True if at least one tag contained a string and received markers,
        otherwise False.
    """
    found = False
    for tag in tags:
        strs = list(tag.strings)
        if not strs:
            # Nothing to wrap: the tag holds no text at all.
            continue
        name = label if label is not None else tag.name
        # Opening marker goes in front of everything in the first string's
        # parent; closing marker goes after everything in the last string's
        # parent.
        strs[0].parent.insert(0, NavigableString("<" + name + ">"))
        strs[-1].parent.append(NavigableString("</" + name + ">"))
        found = True
    return found
def embed_css_in_html_file(html_file, css_dir):
    """Inline every linked stylesheet of an HTML file, rewriting the file in place.

    Each ``link`` element with ``rel="stylesheet"`` is replaced by a ``style``
    element (``type="text/css"``) holding the contents of the CSS file named by
    the link's ``href``, joined onto ``css_dir``.

    Args:
        html_file: path of the HTML file to read and then overwrite.
        css_dir: directory onto which each stylesheet ``href`` is joined.

    Raises:
        KeyError: if a stylesheet link has no ``href`` attribute.
        OSError: if the HTML file or a referenced CSS file cannot be opened.
    """
    with open(html_file, 'r') as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    # find_all / replace_with are the current method names; the camelCase
    # findAll / replaceWith spellings are deprecated aliases.
    for link in soup.find_all("link", {"rel": "stylesheet"}):
        css_file = link["href"]
        print(f"found link to {css_file}")
        with open(os.path.join(css_dir, css_file), 'r') as f:
            css_text = bs4.element.NavigableString(f.read())

        style = soup.new_tag('style')
        style['type'] = 'text/css'
        style.insert(0, css_text)
        link.replace_with(style)

    with open(html_file, 'w') as f:
        f.write(str(soup))
# NOTE(review): fragment — the enclosing method header and the opening `if` of
# this chain are outside the visible source, and the final `if` has no body here.
string_child = child = node
elif isinstance(node, Tag):
# Some other piece of code decided to pass in a Tag
# instead of creating an Element object to contain the
# Tag.
child = node
elif node.element.__class__ == NavigableString:
# node wraps a NavigableString: use the wrapped element as string data.
string_child = child = node.element
else:
# Any other wrapper: use its wrapped element as the child.
child = node.element
# A non-string child that already has a parent is extracted from its current
# position first.
if not isinstance(child, basestring) and child.parent is not None:
node.element.extract()
# Merge case: we have string data and the last existing child is itself a
# NavigableString.
if (string_child and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like
# "aaa..."
old_element = self.element.contents[-1]
# Build one string from old + new and swap it in for the old one.
new_element = self.soup.new_string(old_element + string_child)
old_element.replace_with(new_element)
# Record the merged string as the soup's most recent element.
self.soup._most_recent_element = new_element
else:
if isinstance(node, basestring):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
# Tell Beautiful Soup to act as if it parsed this element
# immediately after the parent's last descendant. (Or
# immediately after the parent, if it has no children.)
if self.element.contents:
def new_string(self, s):
    """Return a fresh NavigableString for ``s``, associated with this soup.

    The string object is built from ``s`` and its ``setup()`` method is
    called (with no arguments) before it is handed back.
    """
    text_node = NavigableString(s)
    text_node.setup()
    return text_node
def insert_escaped_tags(tags, label=None):
    """For each tag in "tags", insert contextual tags (e.g., <p> </p>) as escaped text
    so that these tags are still there when html markup is stripped out.

    Args:
        tags: iterable of tag objects exposing ``strings`` and ``name``.
        label: name to use inside the inserted markers; when None, each tag's
            own ``name`` is used.

    Returns:
        True if at least one tag contained a string and received markers,
        otherwise False.
    """
    found = False
    for tag in tags:
        strs = list(tag.strings)
        if not strs:
            # Nothing to wrap: the tag holds no text at all.
            continue
        name = label if label is not None else tag.name
        # Opening marker goes in front of everything in the first string's
        # parent; closing marker goes after everything in the last string's
        # parent.
        strs[0].parent.insert(0, NavigableString("<" + name + ">"))
        strs[-1].parent.append(NavigableString("</" + name + ">"))
        found = True
    return found
# NOTE(review): fragment — the enclosing function header and the definition of
# `blocks` are outside the visible source.
block = blocks[0]
# If there aren't any inner sections, we are done
if block.find() is None:
return block
# Otherwise, fix punctuation errors
punctuation = ".,!/;:%'\""
# NOTE(review): block.find() with no arguments presumably returns the first
# nested tag, so this loop visits that one tag's children only — confirm that
# is the intent.
for x in block.find():
# Skip anything that is not a NavigableString.
if not isinstance(x, bs4.element.NavigableString):
continue
# Need at least two characters to look at x[0] and x[1].
if len(x) <= 1:
continue
# A leading space directly followed by punctuation: drop the space.
if x[0] == ' ' and x[1] in punctuation:
xs = bs4.element.NavigableString(x.string[1:])
x.replace_with(xs)
return block
def new_string(self, s, subclass=NavigableString):
    """Return ``s`` wrapped in ``subclass`` as a new string for this soup.

    ``subclass`` is called with ``s`` as its only argument; it defaults to
    NavigableString.
    """
    wrapped = subclass(s)
    return wrapped
# NOTE(review): fragment of a larger routine — `element`, `rv`, `indent`,
# `prefixes` and `doctype_re` are defined outside the visible source.
# Each branch appends one "|"-prefixed line, indented by `indent` spaces, to rv.
if isinstance(element, Doctype):
m = doctype_re.match(element)
if m:
name = m.group(1)
if m.lastindex > 1:
publicId = m.group(2) or ""
systemId = m.group(3) or m.group(4) or ""
# NOTE(review): the format string below has one %s but four values are
# supplied — it looks truncated; confirm against the upstream source.
rv.append("""|%s""" %
(' ' * indent, name, publicId, systemId))
else:
# NOTE(review): one %s, two values supplied — likely truncated as well.
rv.append("|%s" % (' ' * indent, name))
else:
rv.append("|%s" % (' ' * indent,))
elif isinstance(element, Comment):
# NOTE(review): one %s, two values supplied — likely truncated as well.
rv.append("|%s" % (' ' * indent, element))
elif isinstance(element, NavigableString):
# Text is emitted wrapped in double quotes.
rv.append("|%s\"%s\"" % (' ' * indent, element))
else:
# Remaining case: an element with a name, optionally namespaced.
if element.namespace:
name = "%s %s" % (prefixes[element.namespace],
element.name)
else:
name = element.name
rv.append("|%s<%s>" % (' ' * indent, name))
if element.attrs:
# Collect (name, value) pairs, normalising namespaced names and
# list-valued attributes to plain strings.
attributes = []
for name, value in element.attrs.items():
if isinstance(name, NamespacedAttribute):
name = "%s %s" % (prefixes[name.namespace], name.name)
if isinstance(value, list):
value = " ".join(value)
attributes.append((name, value))
def endData(self, containerClass=NavigableString):
    """Flush the text buffered in ``self.currentData`` into a parsed object.

    Does nothing when the buffer is empty. Otherwise the buffered pieces are
    joined; if the result is empty after translating with
    ``self.STRIP_ASCII_SPACES`` and no tag on ``self.tagStack`` is named in
    ``self.builder.preserve_whitespace_tags``, it is reduced to a single
    newline (when it contained one) or a single space. The buffer is then
    reset, and unless the ``parse_only`` filter rejects the text, it is
    wrapped in ``containerClass`` and passed to ``self.object_was_parsed``.
    """
    if not self.currentData:
        return

    text = u''.join(self.currentData)

    if text.translate(self.STRIP_ASCII_SPACES) == '':
        open_tag_names = set(tag.name for tag in self.tagStack)
        if not open_tag_names.intersection(self.builder.preserve_whitespace_tags):
            text = '\n' if '\n' in text else ' '

    # The buffer is cleared even if the text ends up being filtered out below.
    self.currentData = []

    if self.parse_only and len(self.tagStack) <= 1:
        if not self.parse_only.text or not self.parse_only.search(text):
            return

    self.object_was_parsed(containerClass(text))