How to use underthesea.corpus.viet_dict_11K.words in underthesea

To help you get started, we’ve selected a few underthesea examples based on how it is used in public projects.

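Before looking at the project snippets below, here is a minimal sketch of loading and inspecting the dictionary. It is not taken from the repository; the import path is assumed from the module name above.

# Minimal sketch (assumption: viet_dict_11K is importable from underthesea.corpus)
from underthesea.corpus import viet_dict_11K

words = viet_dict_11K.words   # list of dictionary entries
print(len(words))             # 11373 entries, per the test below
print(words[0])               # first entry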

undertheseanlp/underthesea: tests/test_corpus/test_vietdict.py (view on GitHub)
def test_unicode(self):
        # Python 2: dictionary entries are expected to be unicode strings
        word = viet_dict_11K.words[0]
        self.assertEqual(unicode, type(word))
undertheseanlp/underthesea: tests/test_corpus/test_vietdict.py (view on GitHub)
def test_viet_dict(self):
        # The dictionary is expected to hold 11,373 entries
        words = viet_dict_11K.words
        self.assertEqual(11373, len(words))
undertheseanlp/word_tokenize: pipelines/model_evaluation/analysis_word.py (view on GitHub)
def compare_dictionary(model_output_folder):
    # Compare tokens produced by the model with the 11K-word dictionary.
    # f = open(join(dirname(__file__), "logs", "crf", "new_word.txt"), "w")
    # f1 = open(join(dirname(__file__), "logs", "crf", "word_in_dictionary.txt"), "w")
    corpus = PlainTextCorpus()
    corpus.load(model_output_folder)
    new_words = []
    words = []
    # Multi-syllable tokens in the model output are joined with '_'
    for document in corpus.documents:
        for sentences in document.sentences:
            for word in sentences.split(" "):
                if '_' in word:
                    new_words.append(word)
    dictionary = viet_dict_11K.words
    # Restore the space-separated form before looking words up in the dictionary
    for word in new_words:
        words.append(word.replace('_', ' '))
    new_word = [x for x in words if x not in dictionary]

    new_word = set(new_word)
    new_word = sorted(new_word)
    # Share of out-of-dictionary words, as a percentage of the dictionary size
    new_word_per_dict = float(len(new_word)) / float(len(dictionary)) * 100
    # f.write("Scale word not in dictionary %0.2f: \n" % new_word_per_dict)
    # for word in new_word:
    #     f.write(word.encode('utf-8') + "\n")
    word_in_dictionary = [x for x in words if x in dictionary]

    word_in_dictionary = set(word_in_dictionary)
    word_in_dictionary = sorted(word_in_dictionary)
    # Fraction of dictionary entries covered by the model output
    word_in_dictionary_per_total = float(len(word_in_dictionary)) / float(len(viet_dict_11K.words))
    # f1.write("scale word in dictionary: %0.2f \n" % word_in_dictionary_per_total)
undertheseanlp/word_tokenize: models/dictionary_based/model.py (view on GitHub)
def predict(self, sentence):
        # Dictionary-based tokenization: join every dictionary entry that
        # appears in the sentence with underscores.
        words = viet_dict_11K.words
        # Drop entries with a trailing space
        dictionary = [word for word in words if re.search(" $", word) is None]

        tokenized_words = [word.replace(" ", "_") for word in dictionary]
        s = sentence
        for word, tokenized_word in zip(dictionary, tokenized_words):
            if word in sentence:
                s = s.replace(word, tokenized_word)
        return s
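The predict method above belongs to a model class that is not shown in this snippet. The standalone sketch below applies the same dictionary-based replacement to a plain string; the function name and the import path are assumptions for illustration, not repository code.

import re

from underthesea.corpus import viet_dict_11K  # assumed import path


def dictionary_tokenize(sentence):
    # Same idea as predict() above: keep dictionary entries without a
    # trailing space, then replace every entry found in the sentence
    # with its underscore-joined form.
    entries = [w for w in viet_dict_11K.words if re.search(" $", w) is None]
    for entry in entries:
        if entry in sentence:
            sentence = sentence.replace(entry, entry.replace(" ", "_"))
    return sentence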
undertheseanlp/word_tokenize: pipelines/model_evaluation/analysis_word.py (view on GitHub)
    dictionary = viet_dict_11K.words
    for word in new_words:
        words.append(word.replace('_', ' '))
    new_word = [x for x in words if x not in dictionary]

    new_word = set(new_word)
    new_word = sorted(new_word)
    new_word_per_dict = float(len(new_word)) / float(len(dictionary)) * 100
    # f.write("Scale word not in dictionary %0.2f: \n" % new_word_per_dict)
    # for word in new_word:
    #     f.write(word.encode('utf-8') + "\n")
    word_in_dictionary = [x for x in words if x in dictionary]

    word_in_dictionary = set(word_in_dictionary)
    word_in_dictionary = sorted(word_in_dictionary)
    word_in_dictionary_per_total = float(len(word_in_dictionary)) / float(len(viet_dict_11K.words))
    # f1.write("scale word in dictionary: %0.2f \n" % word_in_dictionary_per_total)
    # for word in word_in_dictionary:
    #     f1.write(word.encode('utf-8') + "\n")
    return new_word, word_in_dictionary
undertheseanlp/word_tokenize: pipelines/benchmark1.py (view on GitHub)
    # Write the IW and O rows of the confusion matrix to the report file
    f.write(
        "IW\t" + str(confusion_matrix[1][0]) + "\t" + str(confusion_matrix[1][1]) + "\t" + str(
            confusion_matrix[1][2]) + "\n")
    f.write("O\t" + str(confusion_matrix[2][0]) + "\t\t" + str(confusion_matrix[2][1]) + "\t\t" + str(
        confusion_matrix[2][2]) + "\n")

    # Plot the confusion matrix for the BW / IW / O tags
    plt.figure()
    class_name = ["BW", "IW", "O"]
    Confution_Matrix.plot_confusion_matrix(confusion_matrix, classes=class_name,
                                           title='Confusion matrix')
    f.write("\n\n")
    # Word-level analysis against the 11K-word dictionary
    (new_word, word_in_dictionary) = compare_dictionary(model_output_folder)
    f.write("Word Analysis: \n")
    f.write("- Word in dictionary : %d\n" % len(word_in_dictionary))
    f.write("- New Word : %d\n" % len(new_word))
    coverage = float(len(new_word)) / float(len(viet_dict_11K.words))
    f.write("- Word Coverage : %0.2f\n" % coverage)
    f.write("\n\n")
    plt.savefig('confusion matrix.png')
    plt.show()
    # Average tagging time per token
    time_stop = time.time()
    time_per_token = (time_stop - time_start) / float(count_token(actual_corpus.documents))
    f.write("Time speed: %0.6f second per token\n" % time_per_token)
    print 0