Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def parse_content_listing(page: Page) -> list:
    """Parse the content listing found on *page*'s first subpage.

    Collects the text of direct string children and of tags that do NOT
    carry any of the highlight classes ('bgB', 'bgY', 'Y'), splits the
    accumulated text into lines, drops blank (space-only) lines, and runs
    each remaining line through ``parse_content_entry``.

    :param page: a ``Page`` whose ``subpages[0].children`` yields
        BeautifulSoup nodes (plain strings and ``bs4.element.Tag``).
    :return: list of parsed content entries.
    """
    skip_classes = ('bgB', 'bgY', 'Y')
    # Accumulate fragments in a list and join once: avoids the quadratic
    # cost of repeated str += (and drops the dead `pass` the original had).
    parts = []
    for node in page.subpages[0].children:
        if isinstance(node, str):
            parts.append(node)
        elif isinstance(node, bs4.element.Tag):
            classes = node.attrs.get('class', [])
            # Skip tags marked with any highlight class; everything else
            # contributes its visible text.
            if not any(c in classes for c in skip_classes):
                parts.append(node.get_text())
    raw = ''.join(parts)
    # Keep only lines that contain something other than spaces.
    entries = [line for line in raw.splitlines()
               if not re.fullmatch(' *', line)]
    return [parse_content_entry(line) for line in entries]
# --- Fragment: tail of an HTML clean-up routine; `meta`, `soup` and the
# --- enclosing loop over <meta> tags are defined before this chunk.
# Detach the current <meta> tag from its original position.
meta.extract()
# Handle the title differently
# A meta named "Title" is promoted to a real <title> element in <head>;
# every other meta tag is re-inserted at the top of <head> as-is.
if meta["name"] == "Title":
title = soup.new_tag("title")
title.insert(0, meta["content"])
soup.head.insert(0, title)
else:
soup.head.insert(0, meta)
# Add the HTML doctype
soup.insert(0, bs4.element.Doctype("HTML"))
# Unwrap all useless p tags
# "Useless" means: no visible text at all once whitespace is stripped.
# unwrap() keeps the children (if any) but removes the <p> wrapper itself.
for ele in soup.find_all("p"):
if isinstance(ele, bs4.element.Tag):
if not ele.get_text().strip():
ele.unwrap()
# Wrap the parent element codes if needed
# Only spans that (a) have a class, (b) are not equations, and (c) sit
# directly inside a heading tag survive the guard clauses below.
for ele in soup.find_all("span"):
if "class" not in ele.attrs:
continue
if "inline-equation" in ele["class"]:
continue
if "block-equation" in ele["class"]:
continue
parent = ele.parent
if parent.name not in ["h1", "h2", "h3", "h4", "h5", "h6"]:
continue
# Decide whether a BeautifulSoup node should be treated as visible text.
# Processing instructions and doctypes are never visible; on Python 2
# (`not PY3`) the element is first encoded with the configured encoding.
def is_visible(self, elem):
if isinstance(elem, (bs4.element.ProcessingInstruction,
bs4.element.Doctype)):
return False
if not PY3:
elem = elem.encode(self.encoding, self.encoding_errors)
# NOTE(review): re.match('', elem) uses an EMPTY pattern, which matches
# every string at position 0 -- so this branch always fires and the
# function always returns False here. The real pattern (presumably one
# matching comments or other invisible markup) appears to have been lost;
# confirm against the upstream source before relying on this function.
if re.match('', elem):
return False
return True
def trimSoup(bs):
'''Strip leading and trailing whitespace-only nodes and <br> tags from
the children of *bs*, in place.

Runs two passes: forward (rev=False) trims the leading edge, reverse
(rev=True) trims the trailing edge. Whitespace-only strings and <br>
tags are removed; the first node with real content has its edge
whitespace stripped and the pass stops there.
'''
for rev in [False, True]:
# Snapshot the children so extract() during iteration is safe.
children = list(bs.children)
if rev:
children = reversed(children)
for c in children:
if isinstance(c, bs4.element.Tag):
if c.name == 'br':
# Edge <br> tags count as whitespace: drop and keep going.
c.extract()
continue
if isinstance(c, bs4.element.NavigableString):
text = c.string
# Strip the side we are currently trimming.
text = text.rstrip() if rev else text.lstrip()
if not text:
# Pure-whitespace node: remove it and continue inward.
c.extract()
continue
c.string.replace_with(text)
# NOTE(review): the flattened indentation makes the placement of this
# `break` ambiguous -- it appears to end the pass at the first node
# with real content (including non-<br> tags); confirm against the
# original source before restructuring.
break
# --- Fragment: top-level scraper script; `start_url`, `features`, `urls`
# --- and `file` are defined before this chunk.
# Fetch the first listing page and collect the pagination links from it.
with urllib.request.urlopen(start_url) as response:
start_page = bs4.BeautifulSoup(response, features) # Use lxml's HTML parser if possible
paginator = start_page.find('div', 'paginator')
for page_href in paginator.find_all('a'):
# Paginator hrefs are relative; prefix with the base URL and dedupe.
url = start_url + page_href.get('href')
if url not in urls:
urls.append(url)
# Walk every listing page, printing "NNN: title" per movie to `file`.
# The progress bar total of 250 matches the expected number of entries.
with tqdm.tqdm(total=250) as pbar:
for url in urls:
with urllib.request.urlopen(url) as response:
soup = bs4.BeautifulSoup(response, features)
movies_list = soup.find(id = 'content').ol
for movie_item in movies_list:
# The <ol> also yields whitespace strings; only tags are movies.
if isinstance(movie_item, bs4.element.Tag):
movie_no = int(movie_item.em.string)
movie_title = movie_item.find('span', 'title').string
print('{:03d}: {}'.format(movie_no, movie_title), file=file)
pbar.update(1)
def add(self, item):
    """Record *item* as a key of this mapping (value is always None).

    A ``bs4.element.Tag`` contributes its ``.string``; anything falsy
    after that extraction (None, empty string) is silently ignored.
    """
    key = item.string if isinstance(item, bs4.element.Tag) else item
    if not key:
        return
    self[key] = None
:param selector: CSS selector or a bs4.element.Tag object to identify
the form to select.
If not specified, ``selector`` defaults to "form", which is
useful if, e.g., there is only one form on the page.
For ``selector`` syntax, see the `.select() method in BeautifulSoup
<https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`__.
:param nr: A zero-based index specifying which form among those that
match ``selector`` will be selected. Useful when one or more forms
have the same attributes as the form you want to select, and its
position on the page is the only way to uniquely identify it.
Default is the first matching form (``nr=0``).
:return: The selected form as a soup object. It can also be
retrieved later with the :attr:`form` attribute.
"""
# --- Fragment: body of select_form; the `def` line and the start of its
# --- docstring precede this chunk.
# A Tag argument is used directly -- but it must actually be a <form>.
if isinstance(selector, bs4.element.Tag):
if selector.name != "form":
raise LinkNotFoundError
self.__state.form = Form(selector)
else:
# nr is a 0-based index for consistency with mechanize
# limit=nr+1 fetches just enough matches to pick the nr-th one;
# fewer than nr+1 results means the requested form does not exist.
found_forms = self.page.select(selector,
limit=nr + 1)
if len(found_forms) != nr + 1:
if self.__debug:
print('select_form failed for', selector)
self.launch_browser()
raise LinkNotFoundError()
self.__state.form = Form(found_forms[-1])
return self.form
# --- Fragment: elif-chain from a `study` method; the opening `if` and the
# --- surrounding signature precede this chunk. Each branch records where
# --- `keyword` was found (tag name, comment, or text) as an injection point.
elif keyword in bs_node.name:
# print("Found in tag name")
noscript = self.closeNoscript(bs_node)
d = {"type": "tag", "value": bs_node.name, "noscript": noscript}
# Deduplicate: the same injection point may be reached more than once.
if d not in entries:
entries.append(d)
# recursively search injection points for the same variable
for x in bs_node.contents:
self.study(x, parent=bs_node, keyword=keyword, entries=entries)
elif isinstance(bs_node, element.Comment):
# print("Found in comment, tag {0}".format(parent.name))
noscript = self.closeNoscript(bs_node)
d = {"type": "comment", "parent": parent.name, "noscript": noscript}
if d not in entries:
entries.append(d)
elif isinstance(bs_node, element.NavigableString):
# print("Found in text, tag {0}".format(parent.name))
noscript = self.closeNoscript(bs_node)
d = {"type": "text", "parent": parent.name, "noscript": noscript}
if d not in entries:
entries.append(d)
def ExtractXPath(element, use_id=True):
# Check that element is a tag node
if type(element) != bs4.element.Tag:
raise ExtractXPathError(
'%s is not a supported data type. '
'Only tag nodes from the tag tree are accepted.'
% type(element)
)
# Starting node
# Check id first
if use_id and element.get('id') is not None:
return '//*/' + element.name + '[@id="' + element.get('id') + '"]'
xpath = check_previous_tags(element)
# Parent Nodes
for parent in element.parents:
# End of XPath - exclude from string