How to use the konlpy.tag.Mecab class in konlpy

To help you get started, we’ve selected a few konlpy examples based on popular ways the library is used in public projects.

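Before diving into the project examples below, here is a minimal usage sketch of the Mecab wrapper's core methods. It assumes mecab-ko and the mecab-ko-dic dictionary are already installed; the sample sentence is illustrative, and the exact segmentation depends on the dictionary version.

from konlpy.tag import Mecab

mecab = Mecab()  # pass dicpath=... to point at a custom mecab-ko-dic install

text = "한국어 형태소 분석기를 사용해 봅시다"
pos_tags = mecab.pos(text)      # list of (word, POS tag) tuples
morphemes = mecab.morphs(text)  # morphemes only
nouns = mecab.nouns(text)       # nouns only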

github TensorMSA / tensormsa / cluster / generator / ner_augmentation.py
def _conv_type_b(self, idx):
        """
        Read the pattern CSV, tokenize each encode line (with Mecab when
        self.use_mecab is set), and run data augmentation on the result.

        :param idx: worker thread index (used for logging)
        :return: None
        """
        # assumes: import pandas as pd; from konlpy.tag import Mecab
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')

        i = 0
        for key, line in zip(df_csv_read['decode'].values, df_csv_read['encode'].values):
            words = []
            if self.use_mecab:
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else:
                words = str(line).split(' ')
            match_keys = self._check_all_match(words)
            aug_data = self._aug_sent(match_keys, words, [])
            self._intent_formatter(aug_data, key, idx)

            if i % 100 == 0:
                print("====Thread{0} : {1} lines done".format(idx, i))
            i += 1
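Note that the example above constructs a new Mecab instance on every loop iteration. Dictionary loading is relatively expensive, so a common refinement (a sketch under that assumption, not part of the original code) is to build the tagger once and reuse it:

from konlpy.tag import Mecab

# construct once, outside the loop, with the same dictionary path as above
mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')

def tokenize(line):
    # reuse the shared instance for every line
    return [word for word, tag in mecab.pos(line)]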
github TensorMSA / tensormsa / cluster / neuralnet / neuralnet_node_w2v.py
def _pos_raw_data(self, lt):
        """
        POS-tag every raw string in the given list with Mecab and return
        the tokens formatted as "word/TAG" strings.

        :param lt: list of raw text strings
        :return: flat list of "word/TAG" tokens
        """
        # assumes: from konlpy.tag import Mecab
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return_arr = []
        for raw in lt:
            pos = mecab.pos(raw)
            for word, tag in pos:
                return_arr.append("{0}/{1}".format(word, tag))
        return return_arr
github edwardrha / Korean-NLP-Project / src / lemKR.py
def ko_lemmatize_nouns(inputString):
    '''
        Input:  string (Korean)
        Output: list of strings (Korean)
    ----------------------------------------------------------------------------
    Returns list of nouns from the input.
    '''
    # assumes: from konlpy.tag import Mecab
    mecab = Mecab()
    return mecab.nouns(inputString)
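A quick usage sketch of this helper; the sample sentence is made up and the output shown is indicative only, since segmentation varies with the installed dictionary:

print(ko_lemmatize_nouns("한국어 자연어 처리는 재미있다"))
# e.g. ['한국어', '자연어', '처리']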
github lyeoni / prenlp / prenlp / tokenizer / tokenizer.py
def __init__(self):
        try:
            from konlpy.tag import Mecab
        except ImportError:
            raise ImportError(
                'Mecab is not installed. '
                'You can install Mecab with "sh scripts/install_mecab.sh". '
                'See the installation guide at https://github.com/lyeoni/prenlp/blob/master/scripts/install_mecab.sh or https://bitbucket.org/eunjeon/mecab-ko-dic/src')
        self.tokenizer = Mecab()
github TensorMSA / tensormsa / cluster / common / common_node.py
def _mecab_parse(self, str_arr, tag_combine=True):
        """
        POS-tag each string in str_arr with Mecab and flatten the results
        into a single token list.

        :param str_arr: list of strings to parse
        :param tag_combine: if True, combine each token with its POS tag
        :return: flat list of parsed tokens
        """
        # assumes: from konlpy.tag import Mecab
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return_arr = []
        for data in str_arr:
            return_arr = return_arr + self._flat(mecab.pos(str(data)), tag_combine=tag_combine)
        return return_arr
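The _flat helper is defined elsewhere in the repository and is not shown in this excerpt. A hypothetical sketch of what such a flattener might look like (the name and behavior are assumed from how it is called above, not taken from the repo):

def _flat(pos_result, tag_combine=True):
    # flatten [(word, tag), ...] into tokens, optionally combined as "word/tag"
    if tag_combine:
        return ["{0}/{1}".format(word, tag) for word, tag in pos_result]
    return [word for word, tag in pos_result]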
github Kyubyong / g2pK / g2pk / g2pk.py
def __init__(self):
        self.mecab = Mecab() # for annotation
        self.table = parse_table()

        self.cmu = cmudict.dict() # for English

        self.rule2text = get_rule_id2text() # for comments of main rules
github TensorMSA / tensormsa / cluster / neuralnet / neuralnet_node_d2v.py
def _pos_raw_data(self, lt):
        """
        POS-tag every raw string in the given list with Mecab and return
        the tokens formatted as "word/TAG" strings.

        :param lt: list of raw text strings
        :return: flat list of "word/TAG" tokens
        """
        # assumes: from konlpy.tag import Mecab
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return_arr = []
        for raw in lt:
            pos = mecab.pos(raw)
            for word, tag in pos:
                return_arr.append("{0}/{1}".format(word, tag))
        return return_arr
github TensorMSA / tensormsa / cluster / neuralnet / neuralnet_node_seq2seq.py
def _pos_tag_predict_data(self, x_input):
        """
        POS-tag the prediction input with Mecab, pad it, and format each
        token as "word/TAG" (or the bare word when no tag is present).

        :param x_input: raw input string to tag
        :return: list of formatted tokens
        """
        word_list = []
        # assumes: from konlpy.tag import Mecab
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        for word_tuple in self._pad_predict_input(mecab.pos(x_input)):
            if len(word_tuple[1]) > 0:
                word = ''.join([word_tuple[0], "/", word_tuple[1]])
            else:
                word = word_tuple[0]
            word_list.append(word)
        return word_list
github lyeoni / pretraining-for-language-understanding / inference.py
print('#{} =============='.format(iter_*config.batch_size + i))
                print('Actu:\t{}\nPred:\t{}\n'.format(target_sentences, pred_sentences))

if __name__ == '__main__':
    config = argparser()
    print(config)

    # Load vocabulary
    import pickle
    with open(config.vocab, 'rb') as reader:
        vocab = pickle.load(reader)
    
    # Select tokenizer
    if config.tokenizer == 'mecab':
        from konlpy.tag import Mecab
        tokenizer = Tokenizer(tokenization_fn=Mecab().morphs,
                              vocab=vocab, max_seq_length=config.max_seq_len)

    # Build dataloader
    corpus = Corpus(corpus_path=config.corpus, tokenizer=tokenizer, model_type=config.model_type, cuda=config.cuda)
    loader = DataLoader(dataset=corpus, batch_size=config.batch_size)

    # Load model with trained parameters
    if config.model_type == 'LSTM':
        model = LSTMLM(input_size=len(vocab),
                       embedding_size=config.embedding_size,
                       hidden_size=config.hidden_size,
                       output_size=len(vocab),
                       n_layers=config.n_layers,
                       dropout_p=config.dropout_p)
    elif config.model_type == 'BiLSTM':
        model = BiLSTMLM(input_size=len(vocab),
github Kyubyong / word2word / _make.py View on Github external
def load_tokenizer(lang):
    if lang=="ko":
        from konlpy.tag import Mecab
        tokenizer = Mecab()
    elif lang=="ja":
        import Mykytea
        opt="-model jp-0.4.7-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang=="zh_cn":
        import Mykytea
        opt = "-model ctb-0.4.0-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang=="zh_tw":
        import jieba
        tokenizer = jieba
    elif lang=="vi":
        from pyvi import ViTokenizer
        tokenizer = ViTokenizer
    elif lang=="th":
        from pythainlp.tokenize import word_tokenize