How to use the urlscan.urlscan.HTMLChunker class in urlscan

To help you get started, we’ve selected a few urlscan examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def extracthtmlurls(mesg):
    """Extract URLs with context from an HTML message.

    The HTML counterpart of extracturls: ``mesg`` is fed through an
    ``HTMLChunker`` and the parser's ``rval`` is handed to
    ``extract_with_context`` together with a predicate that accepts any
    group of chunks containing at least one chunk with a URL.

    The two trailing ``1`` arguments are presumably the amount of context
    to keep above and below each match -- confirm against
    ``extract_with_context``.

    """
    def has_url(chunks):
        # True as soon as one chunk in the group carries a URL.
        return any(piece.url is not None for piece in chunks)

    parser = HTMLChunker()
    parser.feed(mesg)
    parser.close()

    return extract_with_context(parser.rval, has_url, 1, 1)
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def handle_entityref(self, name):
        """Forward a named entity reference (``&name;``) as text.

        Names present in ``HTMLChunker.entities`` are replaced by their
        table entry; any other name is passed on verbatim as ``&name;``.
        """
        table = HTMLChunker.entities
        # An untranslated reference showing up in the output means the
        # name is missing from the entities table and should be added.
        text = table[name] if name in table else '&%s;' % name
        self.handle_data(text)
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def handle_starttag(self, tag, attrs):
        """React to an opening HTML tag by updating the parser's stacks.

        ``tag`` is the tag name and ``attrs`` is the attribute collection
        that gets passed on to ``self.findattr`` (presumably the list of
        ``(name, value)`` pairs supplied by the HTML parser -- confirm).

        NOTE(review): this snippet appears to be cut off after the ``img``
        branch; ``alt`` and ``src`` are computed but not used in the
        visible lines.
        """
        if tag == 'a':
            # Remember the link target for the text inside this anchor.
            self.anchor_stack.append(self.findattr(attrs, 'href'))
        elif tag in ('ul', 'ol'):
            # New list: push its kind with a counter starting at 1 and
            # close the current paragraph.
            self.list_stack.append((tag, 1))
            self.end_para()
        elif tag in HTMLChunker.tag_styles:
            # Styled tag: push the current style set extended with the
            # style mapped to this tag.
            self.style_stack.append(self.style_stack[-1] |
                                    set([HTMLChunker.tag_styles[tag]]))
        elif isheadertag(tag):
            # Header tags are rendered with the 'bold' style added.
            self.style_stack.append(self.style_stack[-1] | set(['bold']))
        elif tag in ('p', 'br'):
            # Paragraph and line breaks both end the current paragraph.
            self.end_para()
        elif tag == 'img':
            # Since we expect HTML *email*, image links
            # should be external.
            alt = self.findattr(attrs, 'alt')
            if alt is None:
                # No alt text: fall back to a generic placeholder.
                alt = '[IMG]'
            src = self.findattr(attrs, 'src')
            # Keep only absolute http(s) sources; anything else is
            # discarded.
            if src is not None and not src.startswith(('http://', 'https://')):
                src = None
github firecat53 / urlscan / urlscan / urlscan.py View on Github external
def handle_charref(self, name):
        """Translate a numeric character reference and forward it as text.

        ``name`` is the body of the reference without the leading ``&#``
        and trailing ``;``: either decimal digits (``'62'``) or a hex
        marker followed by hex digits (``'x3E'`` or ``'X3E'``).

        Code points below 128 are emitted as the character itself, code
        points listed in ``HTMLChunker.extrachars`` are emitted as their
        mapped replacement, and everything else is passed through
        unchanged as ``&#name;``.

        Raises ``ValueError`` if ``name`` is not a valid decimal or
        hexadecimal number.
        """
        # The hex marker may be written in either case; an uppercase 'X'
        # used to fall through to int(name) and raise ValueError.
        if name[:1] in ('x', 'X'):
            char = int(name[1:], 16)
        else:
            char = int(name)
        if char < 128:
            name = chr(char)
        elif char in HTMLChunker.extrachars:
            name = HTMLChunker.extrachars[char]
        else:
            # Unknown non-ASCII code point: keep the reference verbatim.
            name = '&#%s;' % name
        self.handle_data(name)