                    locate_dict[i] = error_type
                # for i in range(int(start_off) - 1, int(end_off)):
                #     locate_dict[i] = error_type
                if text_id in truth_dict:
                    truth_dict[text_id].update(locate_dict)
                else:
                    truth_dict[text_id] = locate_dict
    # read input file and get token
    with open(input_path, 'r', encoding='utf-8') as input_f:
        for line in input_f:
            parts = line.strip().split('\t')
            text_id = parts[0].replace('(sid=', '').replace(')', '')
            text = parts[1]
            # segment with pos
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            word_arr, label_arr = [], []
            if text_id in truth_dict:
                locate_dict = truth_dict[text_id]
                for i in range(len(word_seq)):
                    if i in locate_dict:
                        word_arr.append(word_seq[i])
                        # fill with error type
                        label_arr.append(locate_dict[i])
                    else:
                        word_arr.append(word_seq[i])
                        # fill with pos tag
                        label_arr.append(pos_seq[i])
            else:
                word_arr = word_seq
                label_arr = pos_seq
            id_lst.append(text_id)
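
# --- Hedged example (not part of the original file) --------------------------
# Minimal sketch of what the truth-file handling above produces, assuming each
# truth line is "text_id, start_off, end_off, error_type" with 1-based,
# inclusive character offsets (CGED-style). build_truth_dict is a hypothetical
# helper written only for illustration.
def build_truth_dict(truth_lines):
    truth_dict = {}
    for line in truth_lines:
        parts = [p.strip() for p in line.strip().split(',')]
        if len(parts) != 4:
            continue
        text_id, start_off, end_off, error_type = parts
        locate_dict = {i: error_type
                       for i in range(int(start_off) - 1, int(end_off))}
        truth_dict.setdefault(text_id, {}).update(locate_dict)
    return truth_dict

# Usage: two errors on the same sentence id merge into one position->type map.
# build_truth_dict(['A2-0001-1, 2, 3, R', 'A2-0001-1, 6, 6, M'])
#   -> {'A2-0001-1': {1: 'R', 2: 'R', 5: 'M'}}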
            # Input the text
            text = doc.getElementsByTagName('TEXT')[0]. \
                childNodes[0].data.strip()
            # Input the correct text
            correction = doc.getElementsByTagName('CORRECTION')[0]. \
                childNodes[0].data.strip()
            texts = split_2_short_text(text)
            corrections = split_2_short_text(correction)
            if len(texts) != len(corrections):
                # print('error:' + text + '\t' + correction)
                continue
            for i in range(len(texts)):
                if len(texts[i]) > 40:
                    # print('error:' + texts[i] + '\t' + corrections[i])
                    continue
                source = segment(texts[i], cut_type='char')
                target = segment(corrections[i], cut_type='char')
                pair = [source, target]
                if pair not in data_list:
                    data_list.append(pair)
    return data_list
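
# --- Hedged example (not part of the original file) --------------------------
# Sketch of consuming data_list as built above: each item is a
# [source_chars, target_chars] pair, so writing parallel seq2seq training
# files only needs to join the character lists back into lines. The file
# names are placeholders, not paths used by the original project.
def write_parallel(data_list, src_path='train.src', trg_path='train.trg'):
    with open(src_path, 'w', encoding='utf-8') as src_f, \
            open(trg_path, 'w', encoding='utf-8') as trg_f:
        for source, target in data_list:
            src_f.write(' '.join(source) + '\n')
            trg_f.write(' '.join(target) + '\n')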
            errors = doc.getElementsByTagName('ERROR')
            # Locate the error position and error type
            locate_dict = {}
            for error in errors:
                start_off = error.getAttribute('start_off')
                end_off = error.getAttribute('end_off')
                error_type = error.getAttribute('type')
                for i in range(int(start_off) - 1, int(end_off)):
                    if i == int(start_off) - 1:
                        error_type_change = 'B-' + error_type
                    else:
                        error_type_change = 'I-' + error_type
                    # locate_dict[i] = error_type_change
                    locate_dict[i] = error_type
            # Segment with pos
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            word_arr, label_arr = [], []
            for i in range(len(word_seq)):
                if i in locate_dict:
                    word_arr.append(word_seq[i])
                    # Fill with error type
                    label_arr.append(locate_dict[i])
                else:
                    word_arr.append(word_seq[i])
                    # Fill with pos tag
                    label_arr.append(pos_seq[i])
            id_lst.append(text_id)
            word_lst.append(word_arr)
            label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
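
# --- Hedged example (not part of the original file) --------------------------
# The loop above computes 'B-'/'I-' prefixed tags (error_type_change) but
# stores the bare error type; this is a sketch of the BIO variant under the
# same 1-based, inclusive start_off/end_off convention. bio_locate_dict is a
# hypothetical helper, not a function from the original project.
def bio_locate_dict(start_off, end_off, error_type):
    locate_dict = {}
    for i in range(int(start_off) - 1, int(end_off)):
        prefix = 'B-' if i == int(start_off) - 1 else 'I-'
        locate_dict[i] = prefix + error_type
    return locate_dict

# bio_locate_dict('2', '4', 'S') -> {1: 'B-S', 2: 'I-S', 3: 'I-S'}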
# Note: relies on xml.dom.minidom and the project's segment() helper
def parse_xml_file(path):
    print('Parse data from %s' % path)
    word_arr = []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
        docs = dom_tree.documentElement.getElementsByTagName('DOC')
        for doc in docs:
            # Read the corrected text of each <DOC>
            text = doc.getElementsByTagName('CORRECTION')[0]. \
                childNodes[0].data.strip()
            # Segment into characters
            word_seq = segment(text, cut_type='char', pos=False)
            word_arr.append(word_seq)
    return word_arr
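
# --- Hedged usage sketch (not part of the original file) ---------------------
# parse_xml_file returns one character sequence per <DOC>'s <CORRECTION> text;
# a typical next step is flattening it into a plain corpus file. The xml and
# output paths below are placeholders, not names from the original project.
def dump_corpus(xml_path='cged_train.xml', out_path='corpus.txt'):
    word_arr = parse_xml_file(xml_path)
    with open(out_path, 'w', encoding='utf-8') as out_f:
        for word_seq in word_arr:
            out_f.write(' '.join(word_seq) + '\n')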