How to use the text2vec.utils.get_file.get_file function in text2vec

To help you get started, we've selected a few text2vec examples based on popular ways the library is used in public projects.

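Both examples below call get_file the same way: pass an archive filename and a download URL, and the utility fetches the archive into a cache directory and optionally extracts it. A minimal sketch of a direct call, assuming the keyword arguments seen in the snippets; the filename and URL here are placeholders, not a real model file:

import text2vec
from text2vec.utils.get_file import get_file

# Placeholder archive name and URL, for illustration only.
# extract=True unpacks the archive after download; cache_dir and
# cache_subdir control where it lands; verbose=1 prints progress.
get_file(
    'my-model.tar.gz',
    'https://example.com/my-model.tar.gz',
    extract=True,
    cache_dir=text2vec.USER_DIR,
    cache_subdir=text2vec.USER_DATA_DIR,
    verbose=1,
)
# The extracted files end up under text2vec.USER_DATA_DIR, which is
# where both snippets below look for the model afterwards.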

From shibing624/text2vec, text2vec/embeddings/word_embedding.py (view on GitHub):
def _build_token2idx_from_w2v(self):
    # If w2v_path is not an existing local file, treat it as a
    # pretrained-model key and download the vectors on first use.
    if not self.w2v_path or not os.path.exists(self.w2v_path):
        # Resolve the key to its download metadata, falling back to the
        # lightweight Tencent Chinese vectors for unknown keys.
        model_dict = self.model_key_map.get(self.w2v_path, self.model_key_map['w2v-light-tencent-chinese'])
        tar_filename = model_dict.get('tar_filename')
        self.w2v_kwargs = {'binary': model_dict.get('binary')}
        url = model_dict.get('url')
        untar_filename = model_dict.get('untar_filename')
        self.w2v_path = os.path.join(text2vec.USER_DATA_DIR, untar_filename)
        if not os.path.exists(self.w2v_path):
            # Download the archive and extract it into the cache directory.
            get_file(
                tar_filename, url, extract=True,
                cache_dir=text2vec.USER_DIR,
                cache_subdir=text2vec.USER_DATA_DIR,
                verbose=1
            )
    t0 = time.time()
    # Load the vectors; w2v_kwargs selects binary vs. text word2vec format.
    w2v = KeyedVectors.load_word2vec_format(self.w2v_path, **self.w2v_kwargs)
    # L2-normalize the vectors in place (gensim < 4 API).
    w2v.init_sims(replace=True)
    logger.debug('load w2v from %s, spend %s s' % (self.w2v_path, time.time() - t0))
    # Reserve the first four indices for special tokens.
    token2idx = {
        self.processor.token_pad: 0,
        self.processor.token_unk: 1,
        self.processor.token_bos: 2,
        self.processor.token_eos: 3
    }
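The excerpt is cut off after the reserved special-token indices. In a typical token2idx build, the word2vec vocabulary is appended after those four slots; a minimal sketch, assuming the pre-4.0 gensim KeyedVectors API used above:

# Append each word2vec vocabulary token after the four reserved indices.
# index2word is the gensim < 4 attribute (matching init_sims above);
# gensim >= 4 renames it to index_to_key.
for token in w2v.index2word:
    token2idx[token] = len(token2idx)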

From shibing624/text2vec, text2vec/embeddings/bert_embedding.py (view on GitHub):
def _build_token2idx_from_bert(self):
    # Locate the BERT vocab file; download the pretrained checkpoint
    # on first use if it is not already cached locally.
    dict_path = os.path.join(self.model_folder, 'vocab.txt')
    if not os.path.exists(dict_path):
        # Resolve the folder name to a model key, defaulting to Google's
        # Chinese BERT-Base checkpoint.
        model_name = self.model_key_map.get(self.model_folder, 'chinese_L-12_H-768_A-12')
        url = self.pre_trained_models.get(model_name)
        # Download the zip archive and extract it into the cache directory.
        get_file(
            model_name + ".zip", url, extract=True,
            cache_dir=text2vec.USER_DIR,
            cache_subdir=text2vec.USER_DATA_DIR,
            verbose=1
        )
        self.model_folder = os.path.join(text2vec.USER_DATA_DIR, model_name)
        dict_path = os.path.join(self.model_folder, 'vocab.txt')
    logger.debug(f'load vocab.txt from {dict_path}')
    # Map each vocab line to its row index, preserving file order.
    token2idx = {}
    with codecs.open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            token = line.strip()
            token2idx[token] = len(token2idx)

    self.bert_token2idx = token2idx
    # keras_bert's Tokenizer handles WordPiece tokenization with this vocab.
    self.tokenizer = keras_bert.Tokenizer(token2idx)
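
Both methods run automatically the first time an embedding is built, so end users rarely call get_file directly. A hypothetical usage sketch; the class name and keyword argument are inferred from the file paths above and may differ across text2vec versions:

from text2vec.embeddings.word_embedding import WordEmbedding

# Passing a model key instead of a local file path makes
# _build_token2idx_from_w2v fetch the vectors via get_file on first use.
emb = WordEmbedding(w2v_path='w2v-light-tencent-chinese')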