Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def setUpClass(cls):
    """Prepare shared fixtures: logging, NLTK resources and both predictors.

    Sends DEBUG-level logging to stdout, downloads the NLTK resources listed
    below, and stores an English and a Russian ``RNNMorphPredictor`` on the
    class as ``en_predictor`` and ``ru_predictor``.
    """
    logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

    # Downloaded in this exact order.
    nltk_resources = ("wordnet", "averaged_perceptron_tagger", "universal_tagset")
    for resource_name in nltk_resources:
        nltk.download(resource_name)

    # One predictor per language, exposed as ``<language>_predictor``.
    for language_code in ("en", "ru"):
        setattr(cls, language_code + "_predictor", RNNMorphPredictor(language=language_code))
def setUpClass(cls):
    """Set up class-level fixtures used by the tests.

    Configures DEBUG logging to stdout, fetches the required NLTK resources,
    and creates the English and Russian ``RNNMorphPredictor`` instances that
    are stored as ``cls.en_predictor`` and ``cls.ru_predictor``.
    """
    logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

    # Resources are fetched one by one, in the listed order.
    for nltk_package in ("wordnet", "averaged_perceptron_tagger", "universal_tagset"):
        nltk.download(nltk_package)

    # Build the predictors in the same order: English first, then Russian.
    for lang in ("en", "ru"):
        predictor = RNNMorphPredictor(language=lang)
        setattr(cls, lang + "_predictor", predictor)
def test_en_accuracy(self):
    """Check that the English predictor's tag accuracy is greater than 85."""
    en_quality = tag_en_files(self.en_predictor)
    self.assertGreater(en_quality.tag_accuracy, 85)
# NOTE(review): fragment - the enclosing function header is not visible and the
# original indentation was lost; the code lines below are left exactly as found.
# Reads input_filename line by line and writes selected tab-separated fields of
# each record to output_filename.
with open(input_filename, "r", encoding='utf-8') as r, open(output_filename, "w", encoding='utf-8') as w:
# Record counter; it is only incremented when add_number is set.
i = 0
for line in r:
# Lines starting with "#" or "=" are skipped.
if line[0] == "#" or line[0] == "=":
continue
# A blank line is copied to the output and resets the record counter.
if line == "\n":
w.write("\n")
i = 0
continue
records = line.split("\t")
pos = records[3]
# The gram tag comes from field index 5 when with_forth_column is set,
# otherwise from field index 4.
if with_forth_column:
gram = records[5]
else:
gram = records[4]
gram = process_gram_tag(gram)
# Records whose pos is "PUNCT" are dropped unless with_punct is set.
if pos == "PUNCT" and not with_punct:
continue
# Output fields: [number,] records[1], lower-cased records[2], pos, gram.
if add_number:
i += 1
w.write("\t".join([str(i), records[1], records[2].lower(), pos, gram]) + "\n")
else:
w.write("\t".join([records[1], records[2].lower(), pos, gram]) + "\n")
# NOTE(review): this function is cut off after the vowel check at its end and
# its original indentation was lost; the code lines are left exactly as found.
def main():
# When a command-line argument is present it is used as the prompts directory.
if sys.argv.__len__() > 1:
init_dir_name = os.path.normpath(sys.argv[1])
# NOTE(review): where this `if` ends cannot be determined from the flattened text.
assert os.path.isdir(init_dir_name), 'Directory `{0}` does not exist!'.format(init_dir_name)
all_prompts = sorted(list(get_all_prompts(init_dir_name)))
accentor = Accentor()
morpho_predictor = RNNMorphPredictor()
i = 0
# Only the first 100 prompts of the sorted list are processed.
for cur_prompt in all_prompts[:100]:
trouble = False
unknown_words = []
for cur_subsentence in select_subsentences(cur_prompt):
# One "<pos> <tag>" string per item returned by predict_sentence_tags.
morphotags = ['{0} {1}'.format(cur_morpho.pos, cur_morpho.tag)
for cur_morpho in morpho_predictor.predict_sentence_tags(cur_subsentence)]
accent_variants = accentor.do_accents(cur_subsentence, morphotags)
# More than one accent variant marks the prompt as troublesome;
# otherwise the first variant is inspected word by word.
if len(accent_variants) > 1:
trouble = True
else:
accented_phrase = accent_variants[0]
for cur_word in accented_phrase:
vowels_counter = 0
# Each lower-cased character is tested against VOWEL_LETTERS.
for cur_char in cur_word.lower():
if cur_char in VOWEL_LETTERS:
def __init__(self, batch_size=1):
    """Remember the batch size and create a Russian morphology predictor.

    :param batch_size: value stored as ``self.batch_size`` (defaults to 1).
    """
    self.batch_size = batch_size
    predictor_language = "ru"
    self.predictor = RNNMorphPredictor(language=predictor_language)
def tag_en_files(predictor: RNNMorphPredictor):
    """Tag the English EWT UD test file and measure the result.

    The tagged output is written to ``TEST_TAGGED_EN_EWT_UD`` (the output
    folder ``TEST_TAGGED_FOLDER`` is created when missing) and then compared
    with ``TEST_GOLD_EN_EWT_UD``.

    :param predictor: predictor passed to ``tag``.
    :return: whatever ``measure`` returns for the gold/tagged pair.
    """
    # exist_ok=True replaces the exists() + makedirs() pair, which could fail
    # if the folder appeared between the check and the creation.
    os.makedirs(TEST_TAGGED_FOLDER, exist_ok=True)
    # The gold file is also used as the input to tag.
    tag(predictor, TEST_GOLD_EN_EWT_UD, TEST_TAGGED_EN_EWT_UD)
    return measure(TEST_GOLD_EN_EWT_UD, TEST_TAGGED_EN_EWT_UD, True, None)
def tag_ru_files(predictor: RNNMorphPredictor) -> Dict:
    """Tag the Russian test corpora and measure tagging quality.

    Tags the Lenta, VK and JZ untagged files with ``predictor``, measures each
    tagged file against its gold file (printing a header before each), and
    adds an aggregated ``'All'`` entry.

    :param predictor: predictor passed to ``tag`` for every corpus.
    :return: dict holding the ``measure`` result under ``'Lenta'``, ``'VK'``
        and ``'JZ'``, plus ``'All'`` - a dict with ``tag_accuracy``,
        ``pos_accuracy`` and ``sentence_accuracy`` computed over the three
        corpora together.
    :raises ZeroDivisionError: if the total tag or sentence count is zero.
    """
    # exist_ok=True replaces the exists() + makedirs() pair, which could fail
    # if the folder appeared between the check and the creation.
    os.makedirs(TEST_TAGGED_FOLDER, exist_ok=True)

    # (result key, untagged input, gold file, tagged output)
    corpora = (
        ('Lenta', TEST_UNTAGGED_LENTA, TEST_GOLD_LENTA, TEST_TAGGED_LENTA),
        ('VK', TEST_UNTAGGED_VK, TEST_GOLD_VK, TEST_TAGGED_VK),
        ('JZ', TEST_UNTAGGED_JZ, TEST_GOLD_JZ, TEST_TAGGED_JZ),
    )

    # All corpora are tagged first; measuring happens in a second pass.
    for _, untagged_path, _, tagged_path in corpora:
        tag(predictor, untagged_path, tagged_path)

    quality = dict()
    for corpus_name, _, gold_path, tagged_path in corpora:
        print(corpus_name + ":")
        quality[corpus_name] = measure(gold_path, tagged_path, True, None)

    print("All:")
    measured = [quality[corpus_name] for corpus_name, _, _, _ in corpora]
    count_correct_tags = sum(q.correct_tags for q in measured)
    count_correct_pos = sum(q.correct_pos for q in measured)
    count_tags = sum(q.total_tags for q in measured)
    count_correct_sentences = sum(q.correct_sentences for q in measured)
    count_sentences = sum(q.total_sentences for q in measured)

    # NOTE(review): these are ratios (correct / total), not percentages -
    # confirm this matches the scale of measure()'s own accuracy fields.
    quality['All'] = {
        'tag_accuracy': float(count_correct_tags) / count_tags,
        'pos_accuracy': float(count_correct_pos) / count_tags,
        'sentence_accuracy': float(count_correct_sentences) / count_sentences,
    }
    return quality
def tag_ru_files(predictor: RNNMorphPredictor) -> Dict:
    """Tag the Russian test corpora and measure tagging quality.

    Tags the Lenta, VK and JZ untagged files with ``predictor``, measures each
    tagged file against its gold file (printing a header before each), and
    adds an aggregated ``'All'`` entry.

    :param predictor: predictor passed to ``tag`` for every corpus.
    :return: dict holding the ``measure`` result under ``'Lenta'``, ``'VK'``
        and ``'JZ'``, plus ``'All'`` - a dict with ``tag_accuracy``,
        ``pos_accuracy`` and ``sentence_accuracy`` computed over the three
        corpora together.
    :raises ZeroDivisionError: if the total tag or sentence count is zero.
    """
    # exist_ok=True replaces the exists() + makedirs() pair, which could fail
    # if the folder appeared between the check and the creation.
    os.makedirs(TEST_TAGGED_FOLDER, exist_ok=True)

    # (result key, untagged input, gold file, tagged output)
    corpora = (
        ('Lenta', TEST_UNTAGGED_LENTA, TEST_GOLD_LENTA, TEST_TAGGED_LENTA),
        ('VK', TEST_UNTAGGED_VK, TEST_GOLD_VK, TEST_TAGGED_VK),
        ('JZ', TEST_UNTAGGED_JZ, TEST_GOLD_JZ, TEST_TAGGED_JZ),
    )

    # All corpora are tagged first; measuring happens in a second pass.
    for _, untagged_path, _, tagged_path in corpora:
        tag(predictor, untagged_path, tagged_path)

    quality = dict()
    for corpus_name, _, gold_path, tagged_path in corpora:
        print(corpus_name + ":")
        quality[corpus_name] = measure(gold_path, tagged_path, True, None)

    print("All:")
    measured = [quality[corpus_name] for corpus_name, _, _, _ in corpora]
    count_correct_tags = sum(q.correct_tags for q in measured)
    count_correct_pos = sum(q.correct_pos for q in measured)
    count_tags = sum(q.total_tags for q in measured)
    count_correct_sentences = sum(q.correct_sentences for q in measured)
    count_sentences = sum(q.total_sentences for q in measured)

    # NOTE(review): these are ratios (correct / total), not percentages -
    # confirm this matches the scale of measure()'s own accuracy fields.
    quality['All'] = {
        'tag_accuracy': float(count_correct_tags) / count_tags,
        'pos_accuracy': float(count_correct_pos) / count_tags,
        'sentence_accuracy': float(count_correct_sentences) / count_sentences,
    }
    # The computed results must be handed back; without this the function
    # would return None despite its ``-> Dict`` annotation.
    return quality
def tag_ru_files(predictor: RNNMorphPredictor) -> Dict:
    """Tag the Russian test corpora and measure tagging quality.

    Tags the Lenta, VK and JZ untagged files with ``predictor``, measures each
    tagged file against its gold file (printing a header before each), and
    adds an aggregated ``'All'`` entry.

    :param predictor: predictor passed to ``tag`` for every corpus.
    :return: dict holding the ``measure`` result under ``'Lenta'``, ``'VK'``
        and ``'JZ'``, plus ``'All'`` - a dict with ``tag_accuracy``,
        ``pos_accuracy`` and ``sentence_accuracy`` computed over the three
        corpora together.
    :raises ZeroDivisionError: if the total tag or sentence count is zero.
    """
    # exist_ok=True replaces the exists() + makedirs() pair, which could fail
    # if the folder appeared between the check and the creation.
    os.makedirs(TEST_TAGGED_FOLDER, exist_ok=True)

    # (result key, untagged input, gold file, tagged output)
    corpora = (
        ('Lenta', TEST_UNTAGGED_LENTA, TEST_GOLD_LENTA, TEST_TAGGED_LENTA),
        ('VK', TEST_UNTAGGED_VK, TEST_GOLD_VK, TEST_TAGGED_VK),
        ('JZ', TEST_UNTAGGED_JZ, TEST_GOLD_JZ, TEST_TAGGED_JZ),
    )

    # All corpora are tagged first; measuring happens in a second pass.
    for _, untagged_path, _, tagged_path in corpora:
        tag(predictor, untagged_path, tagged_path)

    quality = dict()
    for corpus_name, _, gold_path, tagged_path in corpora:
        print(corpus_name + ":")
        quality[corpus_name] = measure(gold_path, tagged_path, True, None)

    print("All:")
    measured = [quality[corpus_name] for corpus_name, _, _, _ in corpora]
    count_correct_tags = sum(q.correct_tags for q in measured)
    count_correct_pos = sum(q.correct_pos for q in measured)
    count_tags = sum(q.total_tags for q in measured)
    count_correct_sentences = sum(q.correct_sentences for q in measured)
    count_sentences = sum(q.total_sentences for q in measured)

    # NOTE(review): these are ratios (correct / total), not percentages -
    # confirm this matches the scale of measure()'s own accuracy fields.
    quality['All'] = {
        'tag_accuracy': float(count_correct_tags) / count_tags,
        'pos_accuracy': float(count_correct_pos) / count_tags,
        'sentence_accuracy': float(count_correct_sentences) / count_sentences,
    }
    return quality