How to use the underthesea.word_sent.tokenize.tokenize function in underthesea

To help you get started, we’ve selected a few underthesea examples based on popular ways it is used in public projects.
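
Before working through the project examples, here is a minimal usage sketch of tokenize itself. It assumes the import path given in the page title (underthesea.word_sent.tokenize); the sample Vietnamese sentence is illustrative and the exact token boundaries depend on the library version.

# a minimal sketch, assuming the import path from the page title;
# the sample sentence and output comment are illustrative only
from underthesea.word_sent.tokenize import tokenize

text = u"Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò"
tokens = tokenize(text)        # returns a space-separated string of tokens
print(tokens.split(" "))       # split into a list, as the tests below do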

github undertheseanlp / underthesea / tests / word_sent / test_performance_2.py
def test_1(self):
        n_tokens = len(tokenize(self.text).split(" "))
        start = time.time()
        word_sent(self.text)
        end = time.time()
        duration = end - start  # in seconds
        if duration != 0:
            speed = n_tokens / duration
            print("Speed: ", speed)
            self.assertGreater(speed, EXPECTED_SPEED)
github undertheseanlp / underthesea / tests / word_sent / test_performance.py
def test_1(self):
        n_tokens = 0
        for text in self.texts:
            n_tokens += len(tokenize(text).split(" "))
        start = time.time()
        for text in self.texts:
            word_sent(text)
        end = time.time()
        duration = end - start  # in seconds
        speed = n_tokens / duration
        print("Speed: ", speed)
        self.assertGreater(speed, EXPECTED_SPEED)
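
Both performance tests follow the same pattern: count tokens with tokenize, time word_sent over the same input, and check a minimum tokens-per-second throughput. A standalone version of that pattern might look like the sketch below; the word_sent import path follows the file layout shown on this page, and EXPECTED_SPEED and the sample texts are placeholders.

# a standalone throughput check mirroring the tests above; import paths
# follow the file layout shown on this page, and EXPECTED_SPEED / texts
# are placeholder values
import time

from underthesea.word_sent.tokenize import tokenize
from underthesea.word_sent.word_sent import word_sent

EXPECTED_SPEED = 1000  # tokens per second, placeholder threshold
texts = [u"Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò"]

n_tokens = sum(len(tokenize(text).split(" ")) for text in texts)
start = time.time()
for text in texts:
    word_sent(text)
duration = time.time() - start
if duration != 0:
    speed = n_tokens / duration
    print("Speed: ", speed)
    assert speed > EXPECTED_SPEED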
github undertheseanlp / underthesea / underthesea / word_sent_4 / __init__.py
def word_sent(sentence, format=None):
    """
    word segmentation

    :param unicode|str sentence: raw sentence
    :return: segmented words
    :rtype: list (a unicode string when format == "text")
    """
    sentence = tokenize(sentence).split()
    crf_model = CRFModel.Instance()
    output = crf_model.predict(sentence, format)
    tokens = [token[0] for token in output]
    tags = [token[1] for token in output]
    output = []
    # merge tokens tagged "IW" (inside word) into the preceding word
    for tag, token in zip(tags, tokens):
        if tag == "IW":
            output[-1] = output[-1] + u" " + token
        else:
            output.append(token)
    if format == "text":
        # join into one string, marking multi-syllable words with underscores
        output = u" ".join([item.replace(" ", "_") for item in output])
    return output
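
To see what this variant returns, here is a short usage sketch: by default it produces a list of words, and with format="text" it produces one string in which multi-syllable words are joined by underscores, as the code above shows. The sentence is illustrative and the exact segmentation depends on the trained CRF model.

# illustrative call of the word_sent variant above; the sentence is a
# placeholder and the exact segmentation depends on the trained model
words = word_sent(u"Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò")
print(words)   # list of words; multi-syllable words appear as single items

line = word_sent(u"Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò", format="text")
print(line)    # one string, multi-syllable words joined with underscores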
github undertheseanlp / underthesea / underthesea / word_sent_3 / __init__.py
def word_sent(sentence, format=None):
    """
    word segmentation

    :param unicode|str sentence: raw sentence
    :return: segmented words
    :rtype: list (a unicode string when format == "text")
    """
    sentence = tokenize(sentence).split()
    crf_model = CRFModel.Instance()
    output = crf_model.predict(sentence, format)
    tokens = [token[0] for token in output]
    tags = [token[1] for token in output]
    output = []
    for tag, token in zip(tags, tokens):
        if tag == "IW":
            output[-1] = output[-1] + u" " + token
        else:
            output.append(token)
    if format == "text":
        output = u" ".join([item.replace(" ", "_") for item in output])
    return output
github undertheseanlp / underthesea / underthesea / word_sent_6 / __init__.py
def word_sent(sentence, format=None):
    """
    word segmentation

    :param unicode|str sentence: raw sentence
    :return: segmented words
    :rtype: list (a unicode string when format == "text")
    """
    sentence = tokenize(sentence).split()
    crf_model = CRFModel.Instance()
    output = crf_model.predict(sentence, format)
    tokens = [token[0] for token in output]
    tags = [token[1] for token in output]
    output = []
    for tag, token in zip(tags, tokens):
        if tag == "IW":
            output[-1] = output[-1] + u" " + token
        else:
            output.append(token)
    if format == "text":
        output = u" ".join([item.replace(" ", "_") for item in output])
    return output
github undertheseanlp / underthesea / underthesea / word_sent_5 / __init__.py
def word_sent(sentence, format=None):
    """
    word segmentation

    :param unicode|str sentence: raw sentence
    :return: segmented words
    :rtype: list (a unicode string when format == "text")
    """
    sentence = tokenize(sentence).split()
    crf_model = CRFModel.Instance()
    output = crf_model.predict(sentence, format)
    tokens = [token[0] for token in output]
    tags = [token[1] for token in output]
    output = []
    for tag, token in zip(tags, tokens):
        if tag == "IW":
            output[-1] = output[-1] + u" " + token
        else:
            output.append(token)
    if format == "text":
        output = u" ".join([item.replace(" ", "_") for item in output])
    return output
github undertheseanlp / underthesea / underthesea / word_sent_2 / __init__.py
def word_sent(sentence, format=None):
    """
    word segmentation

    :param unicode|str sentence: raw sentence
    :return: segmented words
    :rtype: list (a unicode string when format == "text")
    """
    sentence = tokenize(sentence).split()
    crf_model = CRFModel.Instance()
    output = crf_model.predict(sentence, format)
    tokens = [token[0] for token in output]
    tags = [token[1] for token in output]
    output = []
    for tag, token in zip(tags, tokens):
        if tag == "IW":
            output[-1] = output[-1] + u" " + token
        else:
            output.append(token)
    if format == "text":
        output = u" ".join([item.replace(" ", "_") for item in output])
    return output
github undertheseanlp / underthesea / underthesea / word_sent / word_sent.py
def word_sent(sentence, format=None):
    """
    :param unicode|str sentence: raw sentence
    :return: segmented sentence
    :rtype: unicode|str
    """
    sentence = tokenize(sentence)
    crf_model = CRFModel.Instance()
    result = crf_model.predict(sentence, format)
    return result
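
Unlike the numbered variants above, this version passes the whole tokenized string to CRFModel.predict and returns its result directly, without the explicit IW-merging loop; presumably the merging and text formatting happen inside predict. A quick way to inspect what it actually returns (the sentence is illustrative):

# inspect the return value of this variant; the sentence is illustrative
# and the shape of the result depends on CRFModel.predict
result = word_sent(u"Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò")
print(repr(result))

result_text = word_sent(u"Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò", format="text")
print(repr(result_text))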