How to use the text2vec.USER_DATA_DIR constant in text2vec

To help you get started, we’ve selected a few text2vec.USER_DATA_DIR examples, based on popular ways it is used in public projects.
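
For context: text2vec.USER_DATA_DIR is a module-level constant, not a function. It names the directory where text2vec caches downloaded models and data, with text2vec.USER_DIR serving as the cache root (see the get_file calls below). A minimal sketch of how it is used; the two constant names come from the snippets on this page, everything else is illustrative:

import os

import text2vec

# Both constants are plain string paths exposed at package level.
print(text2vec.USER_DIR)       # cache root, passed to get_file as cache_dir
print(text2vec.USER_DATA_DIR)  # data directory, passed as cache_subdir

# The recurring pattern below: build a model path inside the data directory.
model_dir = os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12')
print(os.path.exists(model_dir))  # True once a model has been downloaded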

github shibing624 / text2vec / text2vec / embeddings / bert_embedding.py View on GitHub
def _build_token2idx_from_bert(self):
    # Locate vocab.txt; if the model folder is not present locally,
    # download and unpack the pretrained BERT archive into USER_DATA_DIR.
    dict_path = os.path.join(self.model_folder, 'vocab.txt')
    if not os.path.exists(dict_path):
        model_name = self.model_key_map.get(self.model_folder, 'chinese_L-12_H-768_A-12')
        url = self.pre_trained_models.get(model_name)
        get_file(
            model_name + ".zip", url, extract=True,
            cache_dir=text2vec.USER_DIR,
            cache_subdir=text2vec.USER_DATA_DIR,
            verbose=1
        )
        self.model_folder = os.path.join(text2vec.USER_DATA_DIR, model_name)
        dict_path = os.path.join(self.model_folder, 'vocab.txt')
    logger.debug(f'load vocab.txt from {dict_path}')
    # Map each vocab line to its running index (line order = token id).
    token2idx = {}
    with codecs.open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            token = line.strip()
            token2idx[token] = len(token2idx)

    self.bert_token2idx = token2idx
    self.tokenizer = keras_bert.Tokenizer(token2idx)
    self.processor.token2idx = self.bert_token2idx
    self.processor.idx2token = {value: key for key, value in token2idx.items()}
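
The vocabulary loop above assigns each line of vocab.txt its running position as the token id. A tiny self-contained illustration of the same construction, using a toy vocabulary instead of a downloaded BERT model:

vocab_lines = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '你', '好']
token2idx = {}
for line in vocab_lines:
    token = line.strip()
    token2idx[token] = len(token2idx)  # id = current dict size, i.e. line order
idx2token = {idx: tok for tok, idx in token2idx.items()}
assert token2idx['[CLS]'] == 2 and idx2token[4] == '你'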
github shibing624 / text2vec / text2vec / bert / train.py View on GitHub
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: 
"""
import os

import tensorflow as tf

import text2vec
from text2vec.bert.model import BertSimilarity

if __name__ == '__main__':
    sim = BertSimilarity(data_dir='../data/', model_dir=os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12'),
                         output_dir=os.path.join(text2vec.USER_DATA_DIR, 'fine_tuned_bert_similarity'))
    sim.set_mode(tf.estimator.ModeKeys.TRAIN)
    sim.train()
    sim.set_mode(tf.estimator.ModeKeys.EVAL)
    sim.eval()
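
train() expects the referenced checkpoint to already exist under USER_DATA_DIR; the embedding examples on this page download it on first use. A hedged pre-flight check, assuming the standard layout of Google's chinese_L-12_H-768_A-12 release (vocab.txt plus bert_config.json):

import os

import text2vec

model_dir = os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12')
for fname in ('vocab.txt', 'bert_config.json'):
    path = os.path.join(model_dir, fname)
    print(path, '->', 'ok' if os.path.exists(path) else 'missing')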
github shibing624 / text2vec / text2vec / bert / extract_feature.py View on GitHub
# Excerpt from extract_feature.py; `re`, `InputExample`, `BertVector`, `os`,
# and `text2vec` are imported at module level in the original file.
def read_examples(input_file):
    # Reconstructed wrapper (the excerpt began mid-function): yields one
    # InputExample per line; ' ||| ' separates a sentence pair.
    unique_id = 0
    with open(input_file, 'r', encoding='utf-8') as reader:
        for line in reader:
            line = line.strip()
            if not line:
                continue
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
            unique_id += 1


if __name__ == "__main__":
    vector = BertVector(model_dir=os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12'),
                        output_dir=os.path.join(text2vec.USER_DATA_DIR, 'bert_vector'))
    emb = vector.encode(['你好吗朋友', '您好呀小盆友'])
    print(str(emb))
    print(emb.shape)
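
The ' ||| ' convention used by the generator is easy to probe in isolation: a line holds either a single sentence or a pair separated by ' ||| '. A standalone check of the same regex:

import re

for line in ['单句输入', '句子甲 ||| 句子乙']:
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        print('single sentence:', line)
    else:
        print('pair:', m.group(1), '/', m.group(2))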
github shibing624 / text2vec / text2vec / embeddings / word_embedding.py View on GitHub
def _build_token2idx_from_w2v(self):
    # Resolve the requested model to a local path; if it is missing,
    # download the pretrained word2vec archive into USER_DATA_DIR first.
    if not self.w2v_path or not os.path.exists(self.w2v_path):
        if self.w2v_path in self.model_key_map:
            self.w2v_path = self.model_key_map[self.w2v_path]
        model_dict = self.model_key_map.get(self.w2v_path, self.model_key_map['w2v-light-tencent-chinese'])
        tar_filename = model_dict.get('tar_filename')
        self.w2v_kwargs = {'binary': model_dict.get('binary')}
        url = model_dict.get('url')
        untar_filename = model_dict.get('untar_filename')
        self.w2v_path = os.path.join(text2vec.USER_DATA_DIR, untar_filename)
        if not os.path.exists(self.w2v_path):
            get_file(
                tar_filename, url, extract=True,
                cache_dir=text2vec.USER_DIR,
                cache_subdir=text2vec.USER_DATA_DIR,
                verbose=1
            )
    t0 = time.time()
    w2v = KeyedVectors.load_word2vec_format(self.w2v_path, **self.w2v_kwargs)
    w2v.init_sims(replace=True)  # L2-normalize the vectors in place
    logger.debug('load w2v from %s, spend %s s' % (self.w2v_path, time.time() - t0))
    # Indices 0-3 are reserved for special tokens; the word2vec vocabulary
    # follows in its original order.
    token2idx = {
        self.processor.token_pad: 0,
        self.processor.token_unk: 1,
        self.processor.token_bos: 2,
        self.processor.token_eos: 3
    }

    for token in w2v.index2word:
        token2idx[token] = len(token2idx)
    # ... (the method continues in the original file)
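
The method reserves indices 0-3 for the processor's special tokens, then appends the word2vec vocabulary in order. A toy illustration of the resulting layout; the token names here are placeholders for the real self.processor attributes:

special = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']  # placeholder special tokens
w2v_vocab = ['中国', '北京', '你好']  # stand-in for w2v.index2word
token2idx = {tok: i for i, tok in enumerate(special)}
for token in w2v_vocab:
    token2idx[token] = len(token2idx)
print(token2idx)  # {'<PAD>': 0, ..., '中国': 4, '北京': 5, '你好': 6}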
github shibing624 / text2vec / text2vec / bert / predict.py View on GitHub
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: 
"""
import os

import tensorflow as tf

import text2vec
from text2vec.bert.model import BertSimilarity

if __name__ == '__main__':
    sim = BertSimilarity(data_dir='../data/', model_dir=os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12'),
                         output_dir=os.path.join(text2vec.USER_DATA_DIR, 'fine_tuned_bert_similarity'))
    sim.set_mode(tf.estimator.ModeKeys.PREDICT)
    while True:
        print('input start:')
        sentence1 = input('sentence1: ')
        sentence2 = input('sentence2: ')
        predict = sim.predict(sentence1, sentence2)
        print(f'similarity:{predict[0][1]}')
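
For scripted use, the interactive loop can be replaced by a fixed list of pairs. This sketch reuses the setup above and assumes, as the loop does, that predict() returns per-pair class probabilities with index 1 being the 'similar' class:

import os

import tensorflow as tf

import text2vec
from text2vec.bert.model import BertSimilarity

sim = BertSimilarity(data_dir='../data/',
                     model_dir=os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12'),
                     output_dir=os.path.join(text2vec.USER_DATA_DIR, 'fine_tuned_bert_similarity'))
sim.set_mode(tf.estimator.ModeKeys.PREDICT)

pairs = [('你好吗朋友', '您好呀小盆友'), ('今天很热', '计算机视觉')]
for sentence1, sentence2 in pairs:
    predict = sim.predict(sentence1, sentence2)
    print(f'{sentence1} | {sentence2} -> similarity:{predict[0][1]}')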