How to use the underthesea.corpus.PlainTextCorpus class in underthesea

To help you get started, we’ve selected a few underthesea examples based on popular ways it is used in public projects.

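Taken together, the examples below follow a single pattern: construct a PlainTextCorpus, call load() on a folder of plain-text files, work with corpus.documents (each document exposes a sentences list of strings), and optionally call save() to write the documents to another folder. Here is a minimal sketch of that workflow; the folder names "input" and "output" are placeholders, not paths taken from the projects below.

from underthesea.corpus import PlainTextCorpus

corpus = PlainTextCorpus()
corpus.load("input")  # read every plain-text file in the folder

for document in corpus.documents:
    for sentence in document.sentences:
        pass  # each sentence is a plain string; process it here

corpus.save("output")  # write the (possibly modified) documents back to disk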

github undertheseanlp/underthesea/tests/test_corpus/test_plaintext.py (View on Github)
def test_save(self):
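        # round-trip: load the sample corpus, save a copy, and check that all four files were written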
        corpus = PlainTextCorpus()
        corpus.load(self.plaintext_folder)
        corpus.save(self.saved_plaintext_folder)
        files = listdir(self.saved_plaintext_folder)
        self.assertEqual(4, len(files))
        try:
            shutil.rmtree(self.saved_plaintext_folder)
        except Exception:
            pass

github undertheseanlp/underthesea/tests/test_corpus/test_plaintext.py (View on Github)
def test___init__(self):
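        # a newly constructed corpus has no documents until load() is called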
        corpus = PlainTextCorpus()
        self.assertIsNone(corpus.documents)

github undertheseanlp/word_tokenize/pipelines/model_evaluation/error_analysis.py (View on Github)
from os.path import dirname
from os.path import join
from underthesea.corpus import PlainTextCorpus

model_name = "output_crf"
model_output_folder = join(dirname(dirname(__file__)), "data", "corpus", "train", model_name)
input_folder = join(dirname(dirname(__file__)), "data", "corpus", "train", "input")
actual_corpus = PlainTextCorpus()
actual_corpus.load(model_output_folder)
input_corpus = PlainTextCorpus()
input_corpus.load(input_folder)

# output files for the word-level comparison between input and model output
f = open(join(dirname(__file__), "error_analysis", "input_word.txt"), "w")
f1 = open(join(dirname(__file__), "error_analysis", "output_word.txt"), "w")
actual_words = []
input_words = []
for a in actual_corpus.documents:
    for a_sentence in a.sentences:
        actual_words.extend(a_sentence.split(' '))
for i in input_corpus.documents:
    for i_sentence in i.sentences:
        input_words.extend(i_sentence.split(' '))
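The excerpt opens input_word.txt and output_word.txt but is cut off before anything is written to them. A plausible continuation, assuming the script simply dumps the two flattened word lists for side-by-side comparison, might be:

for word in input_words:
    f.write(word + "\n")
for word in actual_words:
    f1.write(word + "\n")
f.close()
f1.close()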

github undertheseanlp/word_tokenize/pipelines/model_evaluation/analysis_word.py (View on Github)
def compare_dictionary(model_output_folder):
    # f = open(join(dirname(__file__), "logs", "crf", "new_word.txt"), "w")
    # f1 = open(join(dirname(__file__), "logs", "crf", "word_in_dictionary.txt"), "w")
    corpus = PlainTextCorpus()
    corpus.load(model_output_folder)
    new_words = []
    words = []
    for document in corpus.documents:
        for sentences in document.sentences:
            for word in sentences.split(" "):
                if '_' in word:
                    new_words.append(word)
    dictionary = viet_dict_11K.words  # viet_dict_11K is a word list imported elsewhere in this module
    for word in new_words:
        words.append(word.replace('_', ' '))
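    # flag compounds produced by the tokenizer that do not appear in the dictionary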
    new_word = sorted(set(x for x in words if x not in dictionary))
    new_word_per_dict = float(len(new_word)) / float(len(dictionary)) * 100

github undertheseanlp/word_tokenize/eda/eda.py (View on Github)
from os.path import dirname
from os.path import join

import pandas as pd
from underthesea.corpus import PlainTextCorpus


def count_token(documents):
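    # count space-separated tokens across all documents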
    count = 0
    for document in documents:
        for sentences in document.sentences:
            for word in sentences.split(' '):
                count += 1
    return count


f = open(join(dirname(__file__), "eda", "anonymous", "stats.txt"), "w")
f.write("[Statistics] Train Data Set\n")
train_folder = join(dirname(__file__), "corpus", "anonymous", "train")
train_corpus = PlainTextCorpus()
train_corpus.load(train_folder)
f.write("Total documents: %d\n" % len(train_corpus.documents))
s = pd.Series([len(d.sentences) for d in train_corpus.documents])  # sentence count per document

stats = s.describe()
print(stats)
f.write("Min sentences per document: %d\n" % stats['min'])
f.write("Max sentences per document: %d\n" % stats['max'])
f.write("Total sentences: %d\n" % sum(s))
f.write("Total tokens: %d\n" % count_token(train_corpus.documents))
f.write("\n")

f.write("[Statistics] Test Data Set\n")
test_folder = join(dirname(__file__), "corpus", "anonymous", "test", "output")
test_corpus = PlainTextCorpus()
test_corpus.load(test_folder)
# (new_word, word_in_dictionary) = compare_dictionary(train_folder)

github undertheseanlp/word_tokenize/models/crf_2/transformer.py (View on Github)
def load_train_sents(self):
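        # load the anonymous training corpus and keep only its non-empty sentences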
        corpus = PlainTextCorpus()
        file_path = join(dirname(dirname(dirname(__file__))), "corpus", "anonymous", "train")
        corpus.load(file_path)
        sentences = []
        for document in corpus.documents:
            for sentence in document.sentences:
                if sentence != "":
                    sentences.append(sentence)
        return sentences

github undertheseanlp/word_tokenize/models/dummy/make_output.py (View on Github)
from underthesea.corpus import PlainTextCorpus
from os.path import join, dirname
from model import DummyModel

input_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus", "train", "input")
output_dummy_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus", "train", "output_dummy")
corpus = PlainTextCorpus()
corpus.load(input_folder)
model = DummyModel()
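# run the dummy tokenizer over every document and store its predictions back on the document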
for document in corpus.documents:
    sentences = document.sentences
    sentences = [sentence.lower() for sentence in sentences]
    output = [model.predict(s) for s in sentences]
    document.sentences = output
corpus.save(output_dummy_folder)

github undertheseanlp/word_tokenize/pipelines/model_evaluation/get_score.py (View on Github)
def get_data():
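    # model_name is defined elsewhere in the module; it selects which model's output to load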
    output_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus_2", "test", "output")
    model_output_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus_2", "test", "output_%s" % model_name)
    expected_corpus = PlainTextCorpus()
    expected_corpus.load(output_folder)
    actual_corpus = PlainTextCorpus()
    actual_corpus.load(model_output_folder)
    return expected_corpus, actual_corpus

github undertheseanlp/word_tokenize/models/crf_model/make_output.py (View on Github)
# -*- coding: utf-8 -*-
from os.path import dirname
from os.path import join
import time

from model import CRFModel
from underthesea.corpus import PlainTextCorpus

start = time.time()
input_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus", "test", "input")
output_crf_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus", "test", "output_crf")
# input_folder = join(dirname(dirname(dirname(__file__))), "data", "test", "input")
# output_crf_folder = join(dirname(dirname(dirname(__file__))), "data", "test", "output")
corpus = PlainTextCorpus()
corpus.load(input_folder)
model = CRFModel()
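# tokenize each document with the CRF model, one sentence at a time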
for document in corpus.documents:
    print(document.id)
    sentences = document.sentences
    output = []
    for sentence in sentences:
        sentence = model.predict(sentence)
        output.append(sentence)

    document.sentences = output

count = 0
for document in corpus.documents:
    count += len(document.sentences)
# path = join(dirname(dirname(dirname(__file__))), 'data', 'raw', 'train', 'output')