How to use the pycorrector.text_preprocess.is_chinese_string function in pycorrector

To help you get started, we’ve selected a few pycorrector examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: GitHub — shibing624 / pycorrector / pycorrector / corrector.py (View on GitHub; external link)
# same first pinyin
        confusion_char_set = _get_confusion_set(word[0])
        confusion = [i + word[1:] for i in confusion_char_set if i]
        candidates_2_order.extend(confusion)
        # same last pinyin
        confusion_char_set = _get_confusion_set(word[-1])
        confusion = [word[:-1] + i for i in confusion_char_set]
        candidates_2_order.extend(confusion)
        if len(word) > 2:
            # same mid pinyin
            confusion_char_set = _get_confusion_set(word[1])
            confusion = [word[0] + i + word[2:] for i in confusion_char_set]
            candidates_3_order.extend(confusion)
    # add all confusion word list
    confusion_word_set = set(candidates_1_order + candidates_2_order + candidates_3_order)
    confusion_word_list = [item for item in confusion_word_set if is_chinese_string(item)]
    confusion_sorted = sorted(confusion_word_list, key=lambda k: \
        get_frequency(k), reverse=True)
    return confusion_sorted[:len(confusion_word_list) // fraction + 1]
Source: GitHub — shibing624 / pycorrector / pycorrector / corrector.py (View on GitHub; external link)
def _generate_items(word, fraction=2):
    if not is_chinese_string(word):
        return []
    candidates_1_order = []
    candidates_2_order = []
    candidates_3_order = []
    candidate_words = list(_known(_edit_distance_word(word, cn_char_set)))
    for candidate_word in candidate_words:
        if lazy_pinyin(candidate_word) == lazy_pinyin(word):
            # same pinyin
            candidates_1_order.append(candidate_word)
    if len(word) == 1:
        # same pinyin
        confusion_char_set = _get_confusion_set(word[0])
        confusion = [i for i in confusion_char_set if i]
        candidates_2_order.extend(confusion)
    if len(word) > 1:
        # same first pinyin