How to use the textacy.compat.unicode_ function in textacy

To help you get started, we’ve selected a few textacy examples based on popular ways textacy.compat.unicode_ is used in public projects.

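For context, textacy.compat.unicode_ is textacy's Python 2/3 compatibility alias for the unicode text type (str on Python 3, unicode on Python 2). The snippets below use it in two ways: as the type to check against in isinstance() calls, and as a constructor to coerce parser objects into plain text. A minimal sketch assuming that alias (the sample string is illustrative):

from textacy import compat

value = "All human beings are born free and equal in dignity and rights."
# type check that behaves the same under Python 2 and Python 3
assert isinstance(value, compat.unicode_)
# also usable as a cast, e.g. to stringify a parser node into plain text
text = compat.unicode_(value)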

github chartbeat-labs / textacy / tests / test_readme.py View on Github
from textacy import compat, preprocessing, text_utils

def test_plaintext_functionality(text):
    preprocessed_text = preprocessing.normalize_whitespace(text)
    # run punctuation removal on the already whitespace-normalized text
    preprocessed_text = preprocessing.remove_punctuation(preprocessed_text)
    preprocessed_text = preprocessed_text.lower()
    assert all(char.islower() for char in preprocessed_text if char.isalpha())
    assert all(char.isalnum() or char.isspace() for char in preprocessed_text)
    keyword = "America"
    kwics = text_utils.keyword_in_context(
        text, keyword, window_width=35, print_only=False
    )
    for pre, kw, post in kwics:
        assert kw == keyword
        assert isinstance(pre, compat.unicode_)
        assert isinstance(post, compat.unicode_)

github chartbeat-labs / textacy / tests / spacier / test_doc_extensions.py View on Github
def test_lang(self, doc):
        lang = doc._.lang
        assert isinstance(lang, compat.unicode_)
        assert lang == doc.vocab.lang

github chartbeat-labs / textacy / textacy / corpora / wiki_reader.py View on Github
if obj_title.startswith('File:') or obj_title.startswith('Image:'):
                            section.remove(obj)
                    except Exception:
                        pass
                sec['text'] = unicode_(section.strip_code(normalize=True, collapse=True)).strip()
                if sec.get('title'):
                    sec['text'] = re.sub(r'^' + re.escape(sec['title']) + r'\s*', '', sec['text'])
                parsed_page['sections'].append(sec)
                section_idx += 1

            # dammit! the parser has failed us; let's handle it as best we can
            elif len(headings) > 1:
                titles = [unicode_(h.title).strip() for h in headings]
                levels = [int(h.level) for h in headings]
                sub_sections = [
                    unicode_(ss) for ss in
                    re.split(r'\s*' + '|'.join(re.escape(unicode_(h)) for h in headings) + r'\s*', unicode_(section))]
                # re.split leaves an empty string result up front :shrug:
                if sub_sections[0] == '':
                    del sub_sections[0]
                if len(headings) != len(sub_sections):
                    LOGGER.warning(
                        '# headings = %s, but # sections = %s',
                        len(headings), len(sub_sections))
                for i, sub_section in enumerate(sub_sections):
                    try:
                        if titles[i].lower() in bad_section_titles:
                            continue
                        parsed_page['sections'].append({'title': titles[i], 'level': levels[i], 'idx': section_idx,
                                                        'text': strip_markup(sub_section)})
                        section_idx += 1
                    except IndexError:

github chartbeat-labs / textacy / textacy / ke / yake.py View on Github
must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``

    Returns:
        List[Tuple[str, float]]: Sorted list of top ``topn`` key terms and
        their corresponding scores.

    References:
        Campos, Mangaravite, Pasquali, Jorge, Nunes, and Jatowt. (2018).
        A Text Feature Based Automatic Keyword Extraction Method for Single Documents.
        Advances in Information Retrieval. ECIR 2018.
        Lecture Notes in Computer Science, vol 10772, pp. 684-691.
    """
    # validate / transform args
    ngrams = utils.to_collection(ngrams, int, tuple)
    include_pos = utils.to_collection(include_pos, compat.unicode_, set)
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn)
            )

    # bail out on empty docs
    if not doc:
        return []

    stop_words = set()
    seen_candidates = set()
    # compute key values on a per-word basis
    word_occ_vals = _get_per_word_occurrence_values(doc, normalize, stop_words, window_size)
    # doc doesn't have any words...
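
Based on the docstring above, a minimal sketch of calling this keyterm extractor might look like the following; the spaCy model name, sample text, and argument values are illustrative, and the exact import path can differ across textacy versions:

import textacy
import textacy.ke

sample = "Automatic keyword extraction identifies the most relevant terms in a document."
doc = textacy.make_spacy_doc(sample, lang="en_core_web_sm")
# topn may be an int (number of terms) or a float in (0.0, 1.0] (fraction of candidates)
keyterms = textacy.ke.yake(doc, ngrams=(1, 2, 3), window_size=2, topn=5)
for term, score in keyterms:
    print(term, score)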

github chartbeat-labs / textacy / textacy / corpora / wiki_reader.py View on Github
def _parse_content(self, content, parser):
        wikicode = parser.parse(content)
        parsed_page = {'sections': []}

        wikilinks = [unicode_(wc.title) for wc in wikicode.ifilter_wikilinks()]
        parsed_page['categories'] = [wc for wc in wikilinks if wc.startswith('Category:')]
        parsed_page['wiki_links'] = [wc for wc in wikilinks
                                     if not wc.startswith('Category:') and
                                     not wc.startswith('File:') and
                                     not wc.startswith('Image:')]
        parsed_page['ext_links'] = [
            unicode_(wc.url) for wc in wikicode.ifilter_external_links()]

        def _filter_tags(obj):
            return obj.tag == 'ref' or obj.tag == 'table'

        bad_section_titles = {'external links', 'notes', 'references'}
        section_idx = 0

        for section in wikicode.get_sections(flat=True, include_lead=True, include_headings=True):
            headings = section.filter_headings()

github chartbeat-labs / textacy / textacy / datasets / wikipedia.py View on Github
                wikilinks = [
                    compat.unicode_(wc.title)
                    for wc in wikicode.ifilter_wikilinks()
                ]
                categories = [
                    wc
                    for wc in wikilinks
                    if wc.startswith(cat_link) or wc.startswith(lc_cat_link)
                ]
                parsed_record["categories"] = categories
                parsed_record["wiki_links"] = [
                    wc
                    for wc in wikilinks
                    if wc not in categories
                    and not wc.startswith("File:")
                    and not wc.startswith("Image:")
                ]
                parsed_record["ext_links"] = [
                    compat.unicode_(wc.url) for wc in wikicode.ifilter_external_links()
                ]
                parsed_record["text"] = strip_markup_slow(
                    wikicode, include_headings, parser,
                )
                return parsed_record

github chartbeat-labs / textacy / textacy / corpora / wiki_reader.py View on Github
if sec_title.lower() in bad_section_titles:
                        continue
                    sec['title'] = sec_title
                    sec['level'] = int(headings[0].level)
                except IndexError:
                    if section_idx == 0:
                        sec['level'] = 1
                # strip out references, tables, and file/image links
                for obj in section.ifilter_tags(matches=_filter_tags, recursive=True):
                    try:
                        section.remove(obj)
                    except Exception:
                        continue
                for obj in section.ifilter_wikilinks(recursive=True):
                    try:
                        obj_title = unicode_(obj.title)
                        if obj_title.startswith('File:') or obj_title.startswith('Image:'):
                            section.remove(obj)
                    except Exception:
                        pass
                sec['text'] = unicode_(section.strip_code(normalize=True, collapse=True)).strip()
                if sec.get('title'):
                    sec['text'] = re.sub(r'^' + re.escape(sec['title']) + r'\s*', '', sec['text'])
                parsed_page['sections'].append(sec)
                section_idx += 1

            # dammit! the parser has failed us; let's handle it as best we can
            elif len(headings) > 1:
                titles = [unicode_(h.title).strip() for h in headings]
                levels = [int(h.level) for h in headings]
                sub_sections = [
                    unicode_(ss) for ss in

github chartbeat-labs / textacy / textacy / spacier / utils.py View on Github
an already-instantiated spaCy language pipeline.
        chunk_size (int): Number of characters comprising each text chunk
            (excluding the last chunk, which is probably smaller). For best
            performance, value should be somewhere between 1e3 and 1e7,
            depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunk edges probably
               won't respect natural language segmentation, which means that every
               ``chunk_size`` characters, spaCy will probably get tripped up and
               make weird parsing errors.

    Returns:
        :class:`spacy.tokens.Doc`: A single processed document, initialized from
        components accumulated chunk by chunk.
    """
    if isinstance(lang, compat.unicode_):
        lang = cache.load_spacy_lang(lang)
    elif not isinstance(lang, Language):
        raise TypeError(
            "`lang` must be {}, not {}".format({compat.unicode_, Language}, type(lang))
        )

    words = []
    spaces = []
    np_arrays = []
    cols = [attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB, attrs.ENT_TYPE]
    text_len = len(text)
    i = 0
    # iterate over text chunks and accumulate components needed to make a doc
    while i < text_len:
        chunk_doc = lang(text[i : i + chunk_size])
        words.extend(tok.text for tok in chunk_doc)
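
The function excerpted here appears to be textacy.spacier.utils.make_doc_from_text_chunks; assuming that name and a (text, lang, chunk_size) signature, a minimal sketch of calling it with an already-loaded pipeline might look like this (model name and sample text are illustrative):

import spacy
from textacy.spacier import utils as spacier_utils

nlp = spacy.load("en_core_web_sm")
very_long_text = "This is one sentence of a very long document. " * 50000
# chunk_size is measured in characters; the docstring suggests roughly 1e3 to 1e7
doc = spacier_utils.make_doc_from_text_chunks(very_long_text, lang=nlp, chunk_size=100000)
print(len(doc))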

github chartbeat-labs / textacy / textacy / datasets / wikipedia.py View on Github
def slow_parse(include_headings, parser, record):
                wikicode = parser.parse(record["text"])
                parsed_record = record.copy()
                cat_link = MAPPING_CAT[self.lang]
                # catch category links errantly marked up in lowercase
                lc_cat_link = cat_link.lower()
                wikilinks = [
                    compat.unicode_(wc.title)
                    for wc in wikicode.ifilter_wikilinks()
                ]
                categories = [
                    wc
                    for wc in wikilinks
                    if wc.startswith(cat_link) or wc.startswith(lc_cat_link)
                ]
                parsed_record["categories"] = categories
                parsed_record["wiki_links"] = [
                    wc
                    for wc in wikilinks
                    if wc not in categories
                    and not wc.startswith("File:")
                    and not wc.startswith("Image:")
                ]
                parsed_record["ext_links"] = [