Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def parse_xml_file(path):
    """Collect the segmented CORRECTION text of every DOC element in an XML file.

    The file is opened as UTF-8 text and parsed with ``minidom``. For each
    ``DOC`` element, the character data of the first child node of its first
    ``CORRECTION`` element is stripped of surrounding whitespace and passed
    to ``segment(..., cut_type='char', pos=False)``.

    Args:
        path: Location of the XML file to read.

    Returns:
        A list with one ``segment`` result per ``DOC`` element, in document
        order.

    Raises:
        IndexError: If a ``DOC`` has no ``CORRECTION`` element, or that
            element has no child nodes.
    """
    print('Parse data from %s' % path)
    with open(path, 'r', encoding='utf-8') as xml_file:
        root = minidom.parse(xml_file).documentElement

    sentences = []
    for doc_node in root.getElementsByTagName('DOC'):
        # Only the first CORRECTION element and its first child node are read.
        correction_node = doc_node.getElementsByTagName('CORRECTION')[0]
        corrected = correction_node.childNodes[0].data.strip()
        # NOTE(review): cut_type='char' presumably yields per-character
        # tokens -- confirm against segment().
        sentences.append(segment(corrected, cut_type='char', pos=False))
    return sentences
def parse_xml_file(path):
    """Build [source, target] pairs from the DOC elements of an XML file.

    For each ``DOC`` element, the stripped character data of the first
    ``TEXT`` element becomes the source and that of the first ``CORRECTION``
    element becomes the target. Both are passed through
    ``segment(..., cut_type='char')``.

    Args:
        path: File name (or file object) handed to ``minidom.parse``.

    Returns:
        A list of ``[source, target]`` lists, one per ``DOC`` element, in
        document order. Duplicates are kept.

    Raises:
        IndexError: If a ``DOC`` lacks a ``TEXT`` or ``CORRECTION`` element,
            or the element has no child nodes.
    """
    def first_text(node, tag):
        # Stripped data of the first child of the first <tag> under node.
        return node.getElementsByTagName(tag)[0].childNodes[0].data.strip()

    print('Parse data from %s' % path)
    root = minidom.parse(path).documentElement

    pairs = []
    for doc_node in root.getElementsByTagName('DOC'):
        # Read both fields before segmenting either one.
        original = first_text(doc_node, 'TEXT')
        corrected = first_text(doc_node, 'CORRECTION')
        pairs.append([
            segment(original, cut_type='char'),
            segment(corrected, cut_type='char'),
        ])
    return pairs
def parse_xml_file(path):
    """Build de-duplicated [source, target] pairs from an XML file.

    For each ``DOC`` element, the stripped character data of the first
    ``TEXT`` and first ``CORRECTION`` elements are passed through
    ``segment(..., cut_type='char')``. A pair is kept only the first time
    it is seen.

    Args:
        path: File name (or file object) handed to ``minidom.parse``.

    Returns:
        A list of unique ``[source, target]`` lists in first-seen order.

    Raises:
        IndexError: If a ``DOC`` lacks a ``TEXT`` or ``CORRECTION`` element,
            or the element has no child nodes.
    """
    def first_text(node, tag):
        # Stripped data of the first child of the first <tag> under node.
        return node.getElementsByTagName(tag)[0].childNodes[0].data.strip()

    print('Parse data from %s' % path)
    root = minidom.parse(path).documentElement

    unique_pairs = []
    for doc_node in root.getElementsByTagName('DOC'):
        raw_text = first_text(doc_node, 'TEXT')
        raw_correction = first_text(doc_node, 'CORRECTION')
        candidate = [
            segment(raw_text.strip(), cut_type='char'),
            segment(raw_correction.strip(), cut_type='char'),
        ]
        # Equality-based membership test: identical pairs are dropped.
        if candidate in unique_pairs:
            continue
        unique_pairs.append(candidate)
    return unique_pairs
def parse_xml_file(path):
    """Read TEXT/CORRECTION pairs from an XML file, skipping repeats.

    Each ``DOC`` element contributes one ``[source, target]`` pair. The
    source comes from the first ``TEXT`` element and the target from the
    first ``CORRECTION`` element. Both are stripped and passed to
    ``segment(..., cut_type='char')``. A pair already in the result is not
    added again.

    Args:
        path: File name (or file object) handed to ``minidom.parse``.

    Returns:
        A list of distinct ``[source, target]`` lists in first-seen order.

    Raises:
        IndexError: If a ``DOC`` lacks a ``TEXT`` or ``CORRECTION`` element,
            or the element has no child nodes.
    """
    print('Parse data from %s' % path)
    document = minidom.parse(path)

    collected = []
    for entry in document.documentElement.getElementsByTagName('DOC'):
        # Only the first matching element and its first child node are used.
        text_node = entry.getElementsByTagName('TEXT')[0].childNodes[0]
        wrong = text_node.data.strip()
        fix_node = entry.getElementsByTagName('CORRECTION')[0].childNodes[0]
        right = fix_node.data.strip()

        item = [
            segment(wrong.strip(), cut_type='char'),
            segment(right.strip(), cut_type='char'),
        ]
        if item not in collected:
            collected.append(item)
    return collected
def parse_xml_file(path):
    """Return one [source, target] pair for every DOC element in an XML file.

    The source is the stripped character data of the first ``TEXT`` element
    and the target that of the first ``CORRECTION`` element. Each is passed
    through ``segment(..., cut_type='char')``. Repeated pairs are not
    filtered out.

    Args:
        path: File name (or file object) handed to ``minidom.parse``.

    Returns:
        A list of ``[source, target]`` lists in document order.

    Raises:
        IndexError: If a ``DOC`` lacks a ``TEXT`` or ``CORRECTION`` element,
            or the element has no child nodes.
    """
    def build_pair(doc_node):
        # Extract both raw strings first, then segment source before target.
        raw_source = doc_node.getElementsByTagName('TEXT')[0] \
            .childNodes[0].data.strip()
        raw_target = doc_node.getElementsByTagName('CORRECTION')[0] \
            .childNodes[0].data.strip()
        return [
            segment(raw_source, cut_type='char'),
            segment(raw_target, cut_type='char'),
        ]

    print('Parse data from %s' % path)
    doc_nodes = minidom.parse(path).documentElement.getElementsByTagName('DOC')
    return [build_pair(doc_node) for doc_node in doc_nodes]
def parse_xml_file(path):
    """Extract unique segmented (TEXT, CORRECTION) pairs from an XML file.

    Walks every ``DOC`` element and reads the first child node of its first
    ``TEXT`` and first ``CORRECTION`` elements. Both strings are stripped
    and passed to ``segment(..., cut_type='char')``. The resulting
    ``[source, target]`` list is stored unless an equal one is already
    present.

    Args:
        path: File name (or file object) handed to ``minidom.parse``.

    Returns:
        A list of unique ``[source, target]`` lists in first-seen order.

    Raises:
        IndexError: If a ``DOC`` lacks a ``TEXT`` or ``CORRECTION`` element,
            or the element has no child nodes.
    """
    print('Parse data from %s' % path)
    root = minidom.parse(path).documentElement

    def iter_raw_pairs():
        # Lazily yield the stripped (text, correction) strings of each DOC.
        for doc_node in root.getElementsByTagName('DOC'):
            src = doc_node.getElementsByTagName('TEXT')[0] \
                .childNodes[0].data.strip()
            tgt = doc_node.getElementsByTagName('CORRECTION')[0] \
                .childNodes[0].data.strip()
            yield src, tgt

    result = []
    for src, tgt in iter_raw_pairs():
        segmented = [
            segment(src.strip(), cut_type='char'),
            segment(tgt.strip(), cut_type='char'),
        ]
        # Skip pairs that compare equal to one already collected.
        if segmented in result:
            continue
        result.append(segmented)
    return result
def parse_xml_file(path):
    """Load distinct [source, target] training pairs from an XML file.

    For every ``DOC`` element, the first ``TEXT`` and first ``CORRECTION``
    elements supply the raw strings, taken from each element's first child
    node and stripped. Each string is passed to
    ``segment(..., cut_type='char')``. A pair is appended only if an equal
    pair has not been stored yet.

    Args:
        path: File name (or file object) handed to ``minidom.parse``.

    Returns:
        A list of distinct ``[source, target]`` lists in first-seen order.

    Raises:
        IndexError: If a ``DOC`` lacks a ``TEXT`` or ``CORRECTION`` element,
            or the element has no child nodes.
    """
    print('Parse data from %s' % path)
    tree = minidom.parse(path)
    doc_nodes = tree.documentElement.getElementsByTagName('DOC')

    kept = []
    for doc_node in doc_nodes:
        # TEXT is read before CORRECTION, then both are segmented in order.
        wrong_text, right_text = [
            doc_node.getElementsByTagName(tag)[0].childNodes[0].data.strip()
            for tag in ('TEXT', 'CORRECTION')
        ]
        src_tokens = segment(wrong_text.strip(), cut_type='char')
        tgt_tokens = segment(right_text.strip(), cut_type='char')
        entry = [src_tokens, tgt_tokens]
        if entry not in kept:
            kept.append(entry)
    return kept
text = doc.getElementsByTagName('TEXT')[0]. \
childNodes[0].data.strip()
# Input the correct text
correction = doc.getElementsByTagName('CORRECTION')[0]. \
childNodes[0].data.strip()
texts = split_2_short_text(text)
corrections = split_2_short_text(correction)
if len(texts) != len(corrections):
# print('error:' + text + '\t' + correction)
continue
for i in range(len(texts)):
if len(texts[i]) > 40:
# print('error:' + texts[i] + '\t' + corrections[i])
continue
source = segment(texts[i], cut_type='char')
target = segment(corrections[i], cut_type='char')
pair = [source, target]
if pair not in data_list:
data_list.append(pair)
return data_list