How to use the text2vec.utils.logger.logger.debug function in text2vec

To help you get started, we’ve selected a few text2vec examples, based on popular ways it is used in public projects.

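The examples below all follow the same pattern: import the module-level logger from text2vec.utils.logger and emit diagnostics behind a debug flag. Here is a minimal sketch of that pattern; the helper function and the dummy array are illustrative, and only the import path and the logger.debug call mirror the library code.

import numpy as np
from text2vec.utils.logger import logger

def log_embedding_shape(embeds, debug=False):
    # Nothing is logged unless debug=True, mirroring the snippets below.
    if debug:
        logger.debug(f'sentence tensor shape: {embeds.shape}')
    return embeds

log_embedding_shape(np.zeros((2, 300)), debug=True)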

github shibing624 / text2vec / text2vec / embeddings / word_embedding.py
        embeds = []
        for sentence in sentence_list:
            emb = []
            count = 0
            for word in sentence:
                if word not in self.w2v.vocab:
                    continue
                emb.append(self.w2v[word])
                count += 1
            tensor_x = np.array(emb).sum(axis=0)  # sum word vectors along axis 0
            avg_tensor_x = np.divide(tensor_x, count)  # average over the in-vocabulary words
            embeds.append(avg_tensor_x)
        embeds = np.array(embeds)
        if debug:
            logger.debug(f'sentence tensor shape: {embeds.shape}')
        return embeds
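
The snippet above builds a sentence vector by summing the vectors of in-vocabulary words and dividing by their count. Here is a standalone sketch of the same averaging idea, with a toy dictionary standing in for the gensim model; note that the original code would divide by zero if no word is in the vocabulary, which the sketch guards against.

import numpy as np

toy_w2v = {
    'hello': np.array([0.1, 0.2, 0.3]),
    'world': np.array([0.4, 0.5, 0.6]),
}

def average_sentence_vector(sentence, w2v, vector_size=3):
    # Keep only vectors for words the model knows, then average them.
    vectors = [w2v[word] for word in sentence if word in w2v]
    if not vectors:
        return np.zeros(vector_size)  # every word was out of vocabulary
    return np.array(vectors).sum(axis=0) / len(vectors)

print(average_sentence_vector(['hello', 'world', 'unknown'], toy_w2v))
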
github shibing624 / text2vec / text2vec / embeddings / word_embedding.py
    def _build_model(self, **kwargs):
        if self.embed_model is None:
            from tensorflow import keras
            if self.token_count == 0:
                logger.debug('need to build after build_word2idx')
            else:
                input_tensor = keras.layers.Input(shape=(self.sequence_length,),
                                                  name='input')
                layer_embedding = keras.layers.Embedding(self.token_count,
                                                         self.embedding_size,
                                                         weights=[self.w2v_vector_matrix],
                                                         trainable=False,
                                                         name='layer_embedding')

                embedded_tensor = layer_embedding(input_tensor)
                self.embed_model = keras.Model(input_tensor, embedded_tensor)
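
For context, here is a self-contained version of the frozen-embedding model built above, with a random matrix standing in for the loaded word2vec vectors. The shapes are illustrative, and the weights= constructor argument follows the snippet; newer Keras releases may prefer an embeddings_initializer instead.

import numpy as np
from tensorflow import keras

token_count, embedding_size, sequence_length = 1000, 100, 50
w2v_vector_matrix = np.random.rand(token_count, embedding_size)  # stand-in for real word2vec vectors

input_tensor = keras.layers.Input(shape=(sequence_length,), name='input')
layer_embedding = keras.layers.Embedding(token_count,
                                         embedding_size,
                                         weights=[w2v_vector_matrix],
                                         trainable=False,  # keep the pretrained vectors frozen
                                         name='layer_embedding')
embed_model = keras.Model(input_tensor, layer_embedding(input_tensor))
embed_model.summary()
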
github shibing624 / text2vec / text2vec / embeddings / bert_embedding.py
        Returns:
            vectorized sentence list

            print(token, predicts[i].tolist()[:4])
            [CLS] [0.24250675737857819, 0.04605229198932648, ...]
            from [0.2858668565750122, 0.12927496433258057,  ...]
            that [-0.7514970302581787, 0.14548861980438232, ...]
            day [0.32245880365371704, -0.043174318969249725, ...]
            ...
        """
        if self.embed_model is None:
            raise ValueError('need to build model for embed sentence')

        tensor_x = self.process_x_dataset(sentence_list)
        if debug:
            logger.debug(f'sentence tensor: {tensor_x}')
        embed_results = self.embed_model.predict(tensor_x)
        return embed_results
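
A hypothetical end-to-end call of the method above. The BERTEmbedding class name, its model_folder argument, the local checkpoint path, and the embed method name are illustrative assumptions, not taken from the snippet; only the debug flag and the per-token output format follow the docstring.

from text2vec.embeddings.bert_embedding import BERTEmbedding  # class name is an assumption

# Placeholder path to a locally downloaded BERT checkpoint.
embedding = BERTEmbedding(model_folder='path/to/chinese_L-12_H-768_A-12', sequence_length=128)

tokens = ['from', 'that', 'day']
predicts = embedding.embed([tokens], debug=True)[0]  # debug=True triggers the logger.debug call above
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:4])  # first four dimensions, as in the docstring
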
github shibing624 / text2vec / text2vec / embeddings / word_embedding.py
            token2idx[token] = len(token2idx)

        vector_matrix = np.zeros((len(token2idx), w2v.vector_size))
        vector_matrix[1] = np.random.rand(w2v.vector_size)
        vector_matrix[4:] = w2v.vectors

        self.embedding_size = w2v.vector_size
        self.w2v_vector_matrix = vector_matrix
        self.w2v_token2idx = token2idx
        self.w2v_top_words = w2v.index2entity[:50]
        self.w2v_model_loaded = True
        self.w2v = w2v

        self.processor.token2idx = self.w2v_token2idx
        self.processor.idx2token = dict([(value, key) for key, value in self.w2v_token2idx.items()])
        logger.debug('word count   : {}'.format(len(self.w2v_vector_matrix)))
        logger.debug('emb size     : {}'.format(self.embedding_size))
        logger.debug('Top 50 word  : {}'.format(self.w2v_top_words))
        self.tokenizer = Tokenizer()
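
The snippet above reserves the first rows of the embedding matrix for special tokens (row 1 gets a random vector and the pretrained vectors start at row 4) before handing the mappings to the processor. Below is a toy reconstruction of that layout; the reserved token names are assumptions, and only the row arithmetic mirrors the snippet.

import numpy as np

vector_size = 3
reserved = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']       # hypothetical reserved tokens
words = ['hello', 'world']
pretrained = np.random.rand(len(words), vector_size)  # stands in for w2v.vectors

token2idx = {token: idx for idx, token in enumerate(reserved + words)}
vector_matrix = np.zeros((len(token2idx), vector_size))
vector_matrix[1] = np.random.rand(vector_size)        # e.g. the unknown token
vector_matrix[4:] = pretrained                        # real vectors start after the reserved rows

print(token2idx)
print(vector_matrix.shape)
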
github shibing624 / text2vec / text2vec / embeddings / word_embedding.py
            w2v_path: word2vec file path
            w2v_kwargs: params passed to the ``load_word2vec_format()`` function of ``gensim.models.KeyedVectors`` -
                https://radimrehurek.com/gensim/models/keyedvectors.html#module-gensim.models.keyedvectors
            sequence_length: ``'auto'``, ``'variable'`` or an integer. With ``'auto'``, the sequence length is set to
                cover 95% of the corpus. With ``'variable'``, the model input shape is set to None, so inputs of
                varying length are handled and each batch uses the length of its longest sequence.
                With an integer such as ``50``, the input and output sequence length is set to 50.
            processor:
        """
        if w2v_kwargs is None:
            w2v_kwargs = {}
        self.w2v_path = w2v_path
        self.w2v_kwargs = w2v_kwargs
        self.w2v = None
        self.w2v_model_loaded = False
        logger.debug('load w2v embedding ...')
        super(WordEmbedding, self).__init__(sequence_length=sequence_length,
                                            embedding_size=0,
                                            processor=processor)
        self._build_token2idx_from_w2v()
        if trainable:
            self._build_model()
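
A usage sketch for the constructor documented above. The import path follows the file shown in the example header; the file name is a placeholder, and the binary keyword is one that gensim's load_word2vec_format() accepts, per the docstring.

from text2vec.embeddings.word_embedding import WordEmbedding

# 'w2v.bin' is a placeholder for a local word2vec file; w2v_kwargs is forwarded
# to gensim's KeyedVectors.load_word2vec_format(), as the docstring notes.
embedding = WordEmbedding(w2v_path='w2v.bin',
                          w2v_kwargs={'binary': True},
                          sequence_length='auto')  # or 'variable', or an integer such as 50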