How to use the normality.collapse_spaces function in normality

To help you get started, we’ve selected a few normality examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github occrp-attic / ingestors / ingestors / support / html.py View on Github external
def extract_html_header(self, doc):
        """Get metadata from the HTML head element.

        Title, summary, author and publication/modification timestamps are
        looked up with ``self.get_meta`` (and ``doc.findtext`` for the
        ``<title>`` element) and handed to ``self.update``. Comma-separated
        keyword meta tags are split and emitted one keyword at a time via
        ``self.result.emit_keyword``.
        """
        self.update('title', self.get_meta(doc, 'og:title'))
        self.update('title', doc.findtext('.//title'))
        self.update('summary', self.get_meta(doc, 'og:description'))
        self.update('summary', self.get_meta(doc, 'description'))
        self.update('author', self.get_meta(doc, 'author'))
        self.update('author', self.get_meta(doc, 'og:site_name'))
        # Open Graph names these properties ``article:published_time`` and
        # ``article:modified_time``.
        self.update('published_at',
                    self.get_meta(doc, 'article:published_time'))
        self.update('modified_at',
                    self.get_meta(doc, 'article:modified_time'))

        for field in ('keywords', 'news_keywords'):
            content = self.get_meta(doc, field)
            if content is None:
                continue
            for keyword in content.split(','):
                keyword = collapse_spaces(keyword)
                # Truthiness skips empty keywords and is also safe if
                # collapse_spaces returns None.
                # NOTE(review): whether it can return None here depends on
                # the installed normality version - confirm.
                if keyword:
                    self.result.emit_keyword(keyword)
github alephdata / aleph / services / ingest-file / ingestors / support / html.py View on Github external
def get_meta(self, doc, field):
        """Return the content of the first usable ``<meta>`` tag for ``field``.

        Tags are matched on their ``property`` attribute first, then on
        ``name``. The ``content`` attribute is passed through
        ``collapse_spaces``; the first non-empty result is returned. Falls
        through (returning None) when no tag yields a value.
        """
        for attribute in ('property', 'name'):
            xpath = './/meta[@%s="%s"]' % (attribute, field)
            for meta in doc.findall(xpath):
                value = collapse_spaces(meta.get('content'))
                if value is None or not len(value):
                    continue
                return value
github alephdata / aleph / services / extract-entities / entityextractor / result.py View on Github external
def clean_name(cls, text):
        """Normalise an entity name, or return None if it is unusable.

        Returns None when ``text`` is None or longer than ``MAX_LENGTH``, and
        also when the result of ``clean_entity_name`` followed by
        ``collapse_spaces`` is empty or shorter than ``MIN_LENGTH``.
        Otherwise the cleaned text is returned.
        """
        if text is None or len(text) > MAX_LENGTH:
            return None
        text = collapse_spaces(clean_entity_name(text))
        # Test truthiness before calling len() so a None result is rejected
        # instead of raising TypeError.
        # NOTE(review): whether the helpers can return None depends on the
        # installed normality version - confirm.
        if not text or len(text) < MIN_LENGTH:
            return None
        return text
github alephdata / aleph / aleph / logic / bulk / formatting.py View on Github external
def apply(self, record):
        """Render ``self.template`` using values taken from ``record``.

        Every placeholder in ``self.replacements`` is substituted with the
        text form of the record value stored under the mapped key; missing or
        falsy values become the empty string. Whitespace in the rendered
        string is collapsed and the result stripped.
        """
        rendered = six.text_type(self.template)
        for placeholder, key in self.replacements.items():
            substitute = six.text_type(record.get(key) or '')
            rendered = rendered.replace(placeholder, substitute)
        collapsed = collapse_spaces(rendered)
        return collapsed.strip()
github occrp-attic / ingestors / ingestors / support / html.py View on Github external
def get_meta(self, doc, field):
        """Look up a ``<meta>`` value by its ``property`` or ``name``.

        Searches ``doc`` for meta elements whose ``property`` attribute (and,
        failing that, ``name`` attribute) equals ``field``. Returns the first
        ``content`` value that is non-empty after ``collapse_spaces``; returns
        None implicitly if there is none.
        """
        for key in ('property', 'name'):
            matches = doc.findall('.//meta[@%s="%s"]' % (key, field))
            for match in matches:
                text = collapse_spaces(match.get('content'))
                if text is None:
                    continue
                if len(text):
                    return text
github alephdata / opensanctions / opensanctions / models.py View on Github external
def name(self, name):
        """Store ``name`` on ``self._name`` after normalising it.

        The value is passed through ``stringify``; a non-None result also has
        its whitespace collapsed. None is stored when ``stringify`` yields
        None.
        """
        cleaned = stringify(name)
        self._name = None if cleaned is None else collapse_spaces(cleaned)
github alephdata / opensanctions / opensanctions / crawlers / us_cia_world_leaders.py View on Github external
def element_text(el):
    """Return the whitespace-collapsed text content of ``el``.

    Returns None when ``el`` is None or when ``stringify`` turns its text
    content into None.
    """
    content = None if el is None else stringify(el.text_content())
    return None if content is None else collapse_spaces(content)
github occrp-attic / ingestors / ingestors / support / html.py View on Github external
def extract_html_text(self, doc):
        """Get all text from a DOM, also used by the XML parser.

        The fragments produced by ``self.extract_html_elements(doc)`` are
        joined with single spaces and passed through ``collapse_spaces``.
        Returns the resulting text, or None when nothing is left.
        """
        text = collapse_spaces(' '.join(self.extract_html_elements(doc)))
        # Truthiness covers the empty string and is also safe if
        # collapse_spaces returns None.
        # NOTE(review): whether it can return None for an empty result depends
        # on the installed normality version - confirm.
        if text:
            return text
        return None
github alephdata / memorious / memorious / operations / parse.py View on Github external
def parse_for_metadata(context, data, html):
    """Copy metadata located in ``html`` into ``data`` and return ``data``.

    ``context.params['meta']`` and ``context.params['meta_date']`` map output
    keys to one or more XPath expressions. For each key, the first expression
    that matches an element supplies the value: its text content with
    whitespace collapsed. Keys that come from ``meta_date`` are additionally
    passed through ``iso_date``. A value of None is not stored.
    """
    # ``or {}`` also covers a parameter that is present but set to None.
    meta = context.params.get('meta') or {}
    meta_date = context.params.get('meta_date') or {}

    # Merge into a fresh dict so the mapping stored in ``context.params`` is
    # not modified as a side effect of this call.
    meta_paths = dict(meta)
    meta_paths.update(meta_date)

    for key, xpaths in meta_paths.items():
        for xpath in ensure_list(xpaths):
            element = html.find(xpath)
            if element is None:
                continue
            value = collapse_spaces(element.text_content())
            if key in meta_date:
                value = iso_date(value)
            if value is not None:
                data[key] = value
            # Only the first matching expression is used for each key.
            break

    return data