How to use the html.parser.HTMLParser class from Python's standard html package

To help you get started, we’ve selected a few html examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github GRASS-GIS / grass-ci / tools / g.html2man / ghtml.py View on Github external
# Trim the allowed-content table for tags whose start tag may be omitted.
# (`omit`, `allowed` and `omit_start` are defined earlier in this module —
# exact semantics not visible here.)
allowed = omit(allowed, omit_start)

# HTML 4 exclusion rules: maps a tag to the tags that must NOT be nested
# anywhere inside it — e.g. an <a> cannot contain another <a>, a <form>
# cannot contain another <form>.  `formctrl`, `block` and `pre_exclusion`
# are tag-name lists defined elsewhere in this module.
excluded = {
    "a": ["a"],
    "button": formctrl + ["a", "form", "isindex", "fieldset", "iframe"],
    "dir": block,
    "form": ["form"],
    "label": ["label"],
    "menu": block,
    "pre": pre_exclusion
}

# `setify` presumably converts the list values to sets for fast membership
# tests — TODO confirm against its definition earlier in this file.
excluded = setify(excluded)


class HTMLParser(base.HTMLParser):

    def __init__(self, entities=None):
        """Set up parser state: tag/data/exclusion stacks and the entity table.

        entities: optional mapping of entity names to replacement text;
                  an empty mapping is used when none is given.
        """
        base.HTMLParser.__init__(self)
        self.tag_stack = []
        self.excluded = frozenset()
        self.excluded_stack = []
        self.data = []
        self.data_stack = []
        self.decls = []
        # Fall back to an empty table when no entities were supplied.
        self.entities = entities if entities else {}

    def top(self):
        if self.tag_stack == []:
github qbittorrent / search-plugins / nova3 / engines / limetorrents.py View on Github external
import ssl
# NOTE(review): this globally disables HTTPS certificate verification for
# the whole process, so the tracker site stays reachable even with an
# invalid/expired certificate.  It affects *every* ssl user in the process,
# not just this plugin — confirm this trade-off is intended.
ssl._create_default_https_context = ssl._create_unverified_context


class limetorrents(object):
    # Base URL of the site this engine searches.
    url = "https://www.limetorrents.info"
    # Engine display name.
    name = "LimeTorrents"
    # Maps the caller's generic category names to the site's own category
    # slugs (note 'software' -> 'applications').
    supported_categories = {'all': 'all',
                            'anime': 'anime',
                            'software': 'applications',
                            'games': 'games',
                            'movies': 'movies',
                            'music': 'music',
                            'tv': 'tv'}

    class MyHtmlParser(HTMLParser):
        """ Sub-class for parsing results """

        def error(self, message):
            # HTMLParser.error() is abstract on some Python versions;
            # parsing here is best-effort, so markup errors are ignored.
            pass

        # Tag/attribute name constants used by the handler methods below.
        A, TD, TR, HREF = ('a', 'td', 'tr', 'href')

        def __init__(self, url):
            HTMLParser.__init__(self)
            self.url = url
            self.current_item = {}  # dict for found item
            self.item_name = None  # key's name in current_item dict
            self.page_empty = 22000
            self.inside_tr = False
            self.findTable = False
            self.parser_class = {"tdnormal": "size",  # class
github Thimxx / pyGenealogical-Tools / pyRegisters / pyrememori.py View on Github external
def handle_endtag(self, tag):
        """Track closing tags; emit one profile record per closed <li>.

        Relies on state set by handlers not shown here: `inside_results` /
        `internal_div` track nesting inside the results <div>, while `name`,
        `web_link`, `death_date` and `comments` hold the data collected for
        the current entry.
        """
        if tag == 'div' and self.inside_results:
            # One nested <div> inside the results block was closed; when the
            # counter reaches zero the results block itself has ended.
            if (self.internal_div > 0):
                self.internal_div -= 1
                if (self.internal_div == 0):
                    self.inside_results = False
        if self.inside_results:
            if (tag == "li"):
                # End of a result item: split the full name using the Spanish
                # surname convention, then build a profile record from the
                # accumulated fields (`gen_profile` presumably constructs the
                # record object — defined elsewhere).
                name, surname, _ = get_name_surname_from_complete_name(self.name, convention="spanish_surname")
                prof_record = gen_profile(name, surname)
                prof_record.setWebReference(self.web_link)
                prof_record.setCheckedDate("death", self.death_date.year, self.death_date.month,self.death_date.day,"EXACT")
                prof_record.setComments(self.comments)
                self.records.append(prof_record)
class RememoryPersonParser(HTMLParser):
    """Parse a single rememori person page for data useful in comparisons.

    `location` and `age` start as None; `located` flips to True once the
    first <br> tag is encountered.
    """

    def __init__(self):
        super().__init__()
        # Values extracted from the page; None until found.
        self.location = None
        self.age = None
        # Set once the first <br> is seen.
        self.located = False

    def handle_starttag(self, tag, attrs):
        # Only <br> is of interest; everything else is ignored.
        if tag != "br":
            return
        self.located = True
github erkyrath / tworld / twcommand.py View on Github external
def __init__(self, parent, child):
        """Remember which parent/child tag pair this parser looks for.

        parent: tag name of the container of interest.
        child:  tag name collected inside it.
        NOTE(review): semantics inferred from the parameter names — confirm
        against the handler methods, which are not shown here.  `results`
        accumulates finished items and `current` is the item being built.
        """
        html.parser.HTMLParser.__init__(self)
        self.parenttag = parent
        self.childtag = child
        self.results = []
        self.current = None
github amol9 / wallp / wallp / htmlparser.py View on Github external
if self._start_level is not None:
				if end:
					if level == self._start_level:
						self._start_level = None
			
			spaces = ''.join([' ' for i in range(level)])
			attr_string = None
			if attrs:
				attr_string = ''
				for (k, v) in attrs:
					attr_string += ' ' + str(k) + '=\"' + str(v) + '\"'
			print(('%s<%s%s%s>%s'%(spaces, ('/' if end else ''), tag, (attr_string if attr_string else ''),
						(' ' + msg if msg else ''))))
		

class HtmlParser(HTMLParser):
	def __init__(self, skip_tags=None, ddump=None):
		"""Initialize the parse-tree builder.

		skip_tags: iterable of tag names whose subtrees are skipped
		           during parsing; defaults to skipping nothing.
		ddump:     optional debug-dump helper — presumably used by the
		           handler methods (not shown here).
		"""
		self._root = None
		self._stack = []
		# Use a None sentinel instead of a mutable `[]` default so the
		# same list object is not shared across every parser instance.
		self._skip_tags = skip_tags if skip_tags is not None else []
		self._skip = False, None
		self._ddump = ddump

		# convert_charrefs only exists on the Python 3 HTMLParser; the
		# Python 2 constructor does not accept it.
		if is_py3():
			HTMLParser.__init__(self, convert_charrefs=True)
		else:
			HTMLParser.__init__(self)
	
	def handle_starttag(self, tag, attrs):
		if self._skip[0] == True:
			return
github frnsys / port / port / build.py View on Github external
from nom.md2html import compile_markdown
from port.fs import FileManager
from jinja2 import Environment, FileSystemLoader

# Front-matter block delimited by '---' lines at the top of a document;
# DOTALL lets the captured body span multiple lines (non-greedy).
meta_re = re.compile(r'^---\n(.*?)\n---', re.DOTALL)
# Leading '#' heading; captures the title text after the hash.
title_re = re.compile(r'^#\s?([^#\n]+)')
# Markdown image syntax; captures the path/URL inside the parentheses,
# optionally wrapped in backticks.
md_img_re = re.compile(r'!\[.*?\]\(`?([^`\(\)]+)`?\)')


class Bunch():
    """Simple attribute container: ``Bunch(a=1).a == 1``."""
    def __init__(self, **data):
        # Every keyword argument becomes an instance attribute.
        self.__dict__.update(data)


class HTMLCleaner(HTMLParser):
    """Collect only the text nodes of an HTML document.

    Feed markup with ``feed()``; retrieve the concatenated text via
    ``get_data()``.
    """

    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []

    def handle_data(self, d):
        # Character data between tags is the text we keep.
        self.fed.append(d)

    def get_data(self):
        # Chunks are joined in document order.
        return ''.join(self.fed)


def remove_html(html):
    """Return only the text content of the given HTML string."""
    stripper = HTMLCleaner()
    stripper.feed(html)
    return stripper.get_data()
github codeforpdx / recordexpungPDX / src / backend / expungeservice / crawler / parsers / node_parser.py View on Github external
from html.parser import HTMLParser


class NodeParser(HTMLParser):
    """Capture the ``value`` attribute of the first <option> tag seen.

    After feeding markup, ``node_id`` holds that value ("" if no option
    was encountered) and ``stop_flag`` records whether one was found.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.node_id = ""
        self.stop_flag = False

    def handle_starttag(self, tag, attrs):
        # Only the first <option> matters; later ones are ignored.
        if tag == "option" and not self.stop_flag:
            attr_map = dict(attrs)
            self.node_id = attr_map["value"]
            self.stop_flag = True

    # TODO: Add error response here.
    def error(self, message):
        pass
github googlefonts / nototools / nototools / extract_ohchr_attributions.py View on Github external
def __init__(self, trace=False):
    """Initialize the attribution parser.

    trace: stored on ``self.trace``; presumably enables progress logging
    in handler methods not shown here — confirm.
    """
    # NOTE(review): `html.HTMLParser` assumes this module binds `html` to
    # html.parser (or a py2 compat alias) — verify the file's imports.
    html.HTMLParser.__init__(self)
    self.trace = trace
    self.result_list = []
    # `restart()` is defined elsewhere; presumably resets per-run state.
    self.restart()
github CenterForOpenScience / modular-file-renderer / mfr / extensions / jasp / html_processor.py View on Github external
from io import StringIO
from html.parser import HTMLParser

import base64

class HTMLProcessor(HTMLParser):

    # The HTMLProcessor replaces the src attribute in <img> tags with the base64 equivalent
    # The image content comes from the zip_file (specified with set_src_source())
    # It also strips 
github Jenyay / outwiker / plugins / webpage / webpage / libs / bs4 / diagnose.py View on Github external
print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)

def lxml_trace(data, html=True, **kwargs):
    """Print the lxml events emitted while parsing *data*.

    This shows how lxml parses a document with no Beautiful Soup code
    running.  Extra keyword arguments are passed through to
    ``etree.iterparse``; ``html`` selects the HTML parser.
    """
    from lxml import etree

    events = etree.iterparse(StringIO(data), html=html, **kwargs)
    for event, element in events:
        print("%s, %4s, %s" % (event, element.tag, element.text))

class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single output choke point — easy to redirect or silence later.
        print(s)

    def handle_starttag(self, name, attrs):
        # e.g. <div ...> prints "div START"; attributes are ignored.
        self._p("%s START" % name)

    def handle_endtag(self, name):
        # e.g. </div> prints "div END".
        self._p("%s END" % name)

    def handle_data(self, data):
        # Text between tags, printed verbatim after "DATA"-style prefix.
        self._p("%s DATA" % data)

    def handle_charref(self, name):
        # Numeric character reference, e.g. &#62; prints "62 CHARREF".
        self._p("%s CHARREF" % name)