How to use the konlpy.tag.Twitter class in konlpy

To help you get started, we've selected a few konlpy examples based on popular ways it is used in public projects. (Since KoNLPy v0.4.5 the Twitter class has been renamed Okt; Twitter remains available as a deprecated alias, which is why older projects like these still use it.)

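Before the project snippets, here is a minimal sketch of the tagger's two most-used methods. The sample sentence is mine, and the exact tokens and tags vary across KoNLPy versions:

from konlpy.tag import Twitter  # renamed Okt in KoNLPy >= 0.4.5

tagger = Twitter()

# pos() returns (token, tag) pairs; norm=True normalizes spelling and
# stem=True reduces verbs/adjectives to their dictionary form.
print(tagger.pos('이 영화 정말 재밌었어요', norm=True, stem=True))
# roughly: [('이', 'Determiner'), ('영화', 'Noun'), ..., ('재밌다', 'Adjective')]

# morphs() returns only the surface morphemes as a flat list.
print(tagger.morphs('이 영화 정말 재밌었어요'))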

github MSWon / Sentimental-Analysis / Word2Vec / Word2Vec_train.py
import os
import codecs
import gensim
import numpy as np
import tensorflow as tf
from konlpy.tag import Twitter

os.chdir("C:\\Users\\jbk48\\Desktop\\Sentimental-Analysis-master\\Sentimental-Analysis-master\\Word2Vec\\Movie_rating_data")

def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]  # skip the header row
    return data
    
train_data = read_data('ratings_train.txt') 
test_data = read_data('ratings_test.txt') 

pos_tagger = Twitter() 

def tokenize(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]


## train a Word2Vec model using skip-gram
tokens = [tokenize(row[1]) for row in train_data]
model = gensim.models.Word2Vec(size=300, sg=1, alpha=0.025, min_alpha=0.025, seed=1234)
model.build_vocab(tokens)

for epoch in range(30):
    model.train(tokens, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.002           # manually decay the learning rate each epoch
    model.min_alpha = model.alpha  # pin min_alpha so train() does no extra decay
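The model.iter attribute and the manual alpha bookkeeping above are gensim 3.x idioms. If you are on gensim 4.x (an assumption about your environment), size became vector_size, model.iter was removed, and a single train() call handles the learning-rate schedule. A minimal sketch:

import gensim

# gensim 4.x equivalent: one train() call, internal learning-rate decay.
model = gensim.models.Word2Vec(vector_size=300, sg=1, alpha=0.025,
                               min_alpha=0.0001, seed=1234)
model.build_vocab(tokens)
model.train(tokens, total_examples=model.corpus_count, epochs=30)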
github bzantium / nlp-tensorflow / [01] sentiment_analysis / data_process.py
import numpy as np
import re
from collections import Counter
from konlpy.tag import Twitter

def tokenizer(sentence):
    tokens = re.findall(r"[\w]+|[^\s\w]", sentence)
    return tokens

tw = Twitter()
def pos_extractor(sentence):
    """
    extract Noun, Adjective, Verb only
    """
    tokens = []
    pos = tw.pos(sentence, norm=True, stem=True)
    for token, p in pos:
        if p == 'Noun' or p == 'Adjective' or p == 'Verb':
            tokens.append(token)
    return tokens

def morphs_extractor(sentence):
    """
    extract morphs
    """
    tokens = tw.morphs(sentence, norm=True, stem=True)
    return tokens
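A quick usage sketch for the two extractors. The sentence is mine, and the exact tokens depend on the analyzer version:

sentence = '영화가 정말 재미있었어요'
print(pos_extractor(sentence))     # nouns, adjectives and verbs only
print(morphs_extractor(sentence))  # every morpheme, normalized and stemmed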
github TensorMSA / tensormsa / cluster / common / common_node.py
def _twitter_parse(self, str_arr, tag_combine=True):
        """
        POS-tag each string in str_arr with Twitter and flatten the results.

        :param str_arr: iterable of raw strings to tag
        :param tag_combine: whether to combine each token with its POS tag
        :return: flat list of tagged tokens
        """
        twitter = Twitter(jvmpath=None)
        return_arr = []
        for data in str_arr:
            return_arr = return_arr + self._flat(twitter.pos(str(data)), tag_combine=tag_combine)
        return return_arr
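_flat is defined elsewhere in this project; a plausible standalone equivalent (my assumption, not the project's code) would look like:

def _flat(self, pos_pairs, tag_combine=True):
    # Hypothetical helper: with tag_combine=True, turn [('영화', 'Noun'), ...]
    # into ['영화/Noun', ...]; otherwise keep just the tokens.
    if tag_combine:
        return ['/'.join(pair) for pair in pos_pairs]
    return [token for token, _ in pos_pairs]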
github DongjunLee / kino-bot / kino / nlp / disintegrator.py
def __init__(self):
        self.ko_twitter = Twitter()
github NLP-kr / tensorflow-ml-nlp / 6.CHATBOT / Appendix-transformer / data.py
from konlpy.tag import Twitter
from tqdm import tqdm

def prepro_like_morphlized(data):
    # Create the morphological analyzer object.
    morph_analyzer = Twitter()
    # List to collect the tokenized sentences.
    result_data = list()
    # Iterate so that every sentence in the data gets tokenized.
    for seq in tqdm(data):
        # Tokenize with Twitter.morphs, then rebuild the sentence as a
        # single whitespace-joined string.
        morphlized_seq = " ".join(morph_analyzer.morphs(seq.replace(' ', '')))
        result_data.append(morphlized_seq)

    return result_data
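Usage sketch (the input is assumed to be a plain list of Korean sentences; spaces are stripped first, so the analyzer re-segments each sentence from scratch):

data = ['오늘 날씨가 좋네요', '내일 또 만나요']
print(prepro_like_morphlized(data))
# each sentence comes back as its morphemes re-joined with single spaces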
github MSWon / Sentimental-Analysis / Bidirectional_LSTM / Word2Vec.py
def tokenize(self, doc):
        pos_tagger = Twitter()
        return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
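One caveat with this version: it constructs a new Twitter() on every tokenize() call, which repeats the tagger's JVM-backed setup work each time. A minimal refactor sketch, assuming the surrounding class has an __init__ you can extend:

def __init__(self):
    # Build the tagger once and reuse it for every document.
    self.pos_tagger = Twitter()

def tokenize(self, doc):
    return ['/'.join(t) for t in self.pos_tagger.pos(doc, norm=True, stem=True)]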