How to use the hanlp.utils.log_util.logger.warning function in hanlp

To help you get started, we’ve selected a few hanlp examples based on popular ways the library is used in public projects.
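
All three examples below share one pattern: detect a recoverable problem, report it through logger.warning, and carry on instead of raising. The warning calls follow the standard library logging API, so a minimal sketch of the function itself looks like this (the message is invented for illustration):

from hanlp.utils.log_util import logger

# warning() accepts a preformatted string or lazy %-style arguments,
# exactly like any standard logging.Logger.
logger.warning('line %d of %s looks malformed, skipping it', 42, 'vectors.txt')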

From hankcs/HanLP on GitHub, hanlp/utils/io_util.py (excerpt):

            logger.debug(f'Loaded {binpath}')
            return word2vec, dim
        except IOError:
            pass

    dim = None
    word2vec = dict()
    with open(realpath, encoding='utf-8', errors='ignore') as f:
        for idx, line in enumerate(f):
            line = line.rstrip().split(delimiter)
            if len(line) > 2:
                if dim is None:
                    dim = len(line)
                else:
                    if len(line) != dim:
                        logger.warning('{}#{} length mismatches with {}'.format(path, idx + 1, dim))
                        continue
                word, vec = line[0], line[1:]
                word2vec[word] = np.array(vec, dtype=np.float32)
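    # the first column of each row is the word itself, so exclude it from the reported dimension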
    dim -= 1
    if cache:
        save_pickle((word2vec, dim), binpath)
        logger.debug(f'Cached {binpath}')
    return word2vec, dim
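
Here logger.warning flags rows of a plain-text embedding file whose column count disagrees with the first row; the bad row is skipped and loading continues. A usage sketch, assuming the surrounding function is io_util's load_word2vec and takes a path to a word2vec/GloVe-style text file (the signature is inferred from the excerpt, not verified against the repository):

from hanlp.utils.io_util import load_word2vec

word2vec, dim = load_word2vec('glove.6B.100d.txt')
# malformed rows were logged as warnings and dropped, so every vector agrees
assert all(vec.shape == (dim,) for vec in word2vec.values())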

From hankcs/HanLP on GitHub, hanlp/components/parsers/conll.py (excerpt):

        raw_batch = [[], [], [], []]
        max_len = len(max([corpus[i] for i in indices], key=len))
        for idx in indices:
            arc = np.zeros((max_len, max_len), dtype=bool)  # np.bool was removed in NumPy 1.24; use the builtin
            rel = np.zeros((max_len, max_len), dtype=np.int64)
            for b in raw_batch[:2]:
                b.append([])
            for m, cells in enumerate(corpus[idx]):
                for b, c, v in zip(raw_batch, cells,
                                   [self.form_vocab, self.cpos_vocab]):
                    b[-1].append(v.get_idx_without_add(c))
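                # cells[2] holds the head indices and cells[3] the matching relation labels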
                for n, r in zip(cells[2], cells[3]):
                    arc[m, n] = True
                    rid = self.rel_vocab.get_idx_without_add(r)
                    if rid is None:
                        logger.warning(f'Relation OOV: {r} does not exist in the training set')
                        continue
                    rel[m, n] = rid
            raw_batch[-2].append(arc)
            raw_batch[-1].append(rel)
        batch = []
        for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
            b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                              value=v.safe_pad_token_idx,
                                                              dtype='int64')
            batch.append(b)
        batch += raw_batch[2:]
        assert len(batch) == 4
        yield (batch[0], batch[1]), (batch[2], batch[3])
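
The warning here guards against dependency relations that never occurred in training: get_idx_without_add returns None for out-of-vocabulary keys instead of growing the vocabulary, so the arc stays set while its relation id remains 0. A minimal sketch of that lookup contract (a stand-in class, not HanLP's actual Vocab):

class Vocab:
    def __init__(self, tokens):
        self.token_to_idx = {t: i for i, t in enumerate(tokens)}

    def get_idx_without_add(self, token):
        # Unseen tokens yield None rather than a fresh id, which is what
        # lets the caller above warn and fall through.
        return self.token_to_idx.get(token)

rel_vocab = Vocab(['nsubj', 'obj', 'root'])
assert rel_vocab.get_idx_without_add('obj') == 1
assert rel_vocab.get_idx_without_add('expl') is None  # would trigger the warning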

From hankcs/HanLP on GitHub, hanlp/components/taggers/transformers/utils.py (excerpt):

    tokens = []
    label_ids = []
    for word, label in zip(words, labels):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            # some weird characters cause the tokenizer to return an empty list
            word_tokens = [unk_token] * len(word)
        tokens.extend(word_tokens)
        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
        label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

    # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
    special_tokens_count = 3 if sep_token_extra else 2
    if len(tokens) > max_seq_length - special_tokens_count:
        logger.warning(
            f'Input tokens {words} exceed the max sequence length of {max_seq_length - special_tokens_count}. '
            f'The exceeded part will be truncated and ignored. '
            f'You are recommended to split your long text into several sentences within '
            f'{max_seq_length - special_tokens_count} tokens beforehand.')
        tokens = tokens[: (max_seq_length - special_tokens_count)]
        label_ids = label_ids[: (max_seq_length - special_tokens_count)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids:   0   0   0   0  0     0   0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence.
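
The truncation branch reserves room for special tokens before cutting: two slots for [CLS] and [SEP], or three when sep_token_extra is set, since RoBERTa-style models use an extra separator. A worked check of that budget arithmetic (numbers invented for illustration):

max_seq_length = 8
sep_token_extra = False
special_tokens_count = 3 if sep_token_extra else 2

tokens = ['is', 'this', 'jack', '##son', '##ville', '?', 'really']
if len(tokens) > max_seq_length - special_tokens_count:
    # this is the point where the excerpt calls logger.warning
    tokens = tokens[: max_seq_length - special_tokens_count]

print(tokens)  # six word pieces survive; [CLS] and [SEP] fill the last two slots

Truncating tokens and label_ids by the same bound keeps the first-subtoken label alignment intact.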