How to use the pycorrector.tokenizer.segment function in pycorrector

To help you get started, we’ve selected a few pycorrector examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github shibing624 / pycorrector / pycorrector / rnn_crf / preprocess.py View on Github external
locate_dict[i] = error_type
                # for i in range(int(start_off) - 1, int(end_off)):
                #     locate_dict[i] = error_type
                if text_id in truth_dict:
                    truth_dict[text_id].update(locate_dict)
                else:
                    truth_dict[text_id] = locate_dict

    # read input file and get token
    with open(input_path, 'r', encoding='utf-8') as input_f:
        for line in input_f:
            parts = line.strip().split('\t')
            text_id = parts[0].replace('(sid=', '').replace(')', '')
            text = parts[1]
            # segment with pos
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            word_arr, label_arr = [], []
            if text_id in truth_dict:
                locate_dict = truth_dict[text_id]
                for i in range(len(word_seq)):
                    if i in locate_dict:
                        word_arr.append(word_seq[i])
                        # fill with error type
                        label_arr.append(locate_dict[i])
                    else:
                        word_arr.append(word_seq[i])
                        # fill with pos tag
                        label_arr.append(pos_seq[i])
            else:
                word_arr = word_seq
                label_arr = pos_seq
            id_lst.append(text_id)
github shibing624 / pycorrector / pycorrector / seq2seq_attention / preprocess_short_text.py View on Github external
childNodes[0].data.strip()
        # Input the correct text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()

        texts = split_2_short_text(text)
        corrections = split_2_short_text(correction)
        if len(texts) != len(corrections):
            # print('error:' + text + '\t' + correction)
            continue
        for i in range(len(texts)):
            if len(texts[i]) > 40:
                # print('error:' + texts[i] + '\t' + corrections[i])
                continue
            source = segment(texts[i], cut_type='char')
            target = segment(corrections[i], cut_type='char')
            pair = [source, target]
            if pair not in data_list:
                data_list.append(pair)
    return data_list
github shibing624 / pycorrector / pycorrector / rnn_crf / preprocess.py View on Github external
errors = doc.getElementsByTagName('ERROR')
        # Locate the error position and error type
        locate_dict = {}
        for error in errors:
            start_off = error.getAttribute('start_off')
            end_off = error.getAttribute('end_off')
            error_type = error.getAttribute('type')
            for i in range(int(start_off) - 1, int(end_off)):
                if i == int(start_off) - 1:
                    error_type_change = 'B-' + error_type
                else:
                    error_type_change = 'I-' + error_type
                # locate_dict[i] = error_type_change
                locate_dict[i] = error_type
        # Segment with pos
        word_seq, pos_seq = segment(text, cut_type='char', pos=True)
        word_arr, label_arr = [], []
        for i in range(len(word_seq)):
            if i in locate_dict:
                word_arr.append(word_seq[i])
                # Fill with error type
                label_arr.append(locate_dict[i])
            else:
                word_arr.append(word_seq[i])
                # Fill with pos tag
                label_arr.append(pos_seq[i])
        id_lst.append(text_id)
        word_lst.append(word_arr)
        label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
github shibing624 / pycorrector / pycorrector / rnn_lm / preprocess.py View on Github external
def parse_xml_file(path):
    """Read a SIGHAN-style XML file and return char-segmented correction texts.

    For every ``<DOC>`` element, the text of its first ``<CORRECTION>`` node is
    stripped and segmented character by character; one token list is collected
    per document.

    :param path: path to the XML file to parse
    :return: list of token lists, one per ``<DOC>`` element
    """
    print('Parse data from %s' % path)
    with open(path, 'r', encoding='utf-8') as xml_f:
        tree = minidom.parse(xml_f)
    segmented = []
    for doc_node in tree.documentElement.getElementsByTagName('DOC'):
        # NOTE: the corrected sentence (not the raw TEXT node) is what gets
        # segmented here — this file feeds the language model with clean text.
        correction = doc_node.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        # Char-level segmentation, no POS tags needed for LM training data.
        segmented.append(segment(correction, cut_type='char', pos=False))
    return segmented
github shibing624 / pycorrector / pycorrector / seq2seq_attention / preprocess_short_text.py View on Github external
text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Input the correct text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()

        texts = split_2_short_text(text)
        corrections = split_2_short_text(correction)
        if len(texts) != len(corrections):
            # print('error:' + text + '\t' + correction)
            continue
        for i in range(len(texts)):
            if len(texts[i]) > 40:
                # print('error:' + texts[i] + '\t' + corrections[i])
                continue
            source = segment(texts[i], cut_type='char')
            target = segment(corrections[i], cut_type='char')
            pair = [source, target]
            if pair not in data_list:
                data_list.append(pair)
    return data_list