def test_custom_metadata_parsers(self):
    data = dedent("""\
        # global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
        # newdoc id = mf920901-001
        # newpar id = mf920901-001-p1
        # sent_id = mf920901-001-p1s1A
        # text = Slovenská ústava: pro i proti
        # text_en = Slovak constitution: pros and cons
    """)

    _, metadata = parse_token_and_metadata(data)
    self.assertEqual(metadata, Token([
        ("global.columns", "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"),
        ("newdoc id", "mf920901-001"),
        ("newpar id", "mf920901-001-p1"),
        ("sent_id", "mf920901-001-p1s1A"),
        ("text", "Slovenská ústava: pro i proti"),
        ("text_en", "Slovak constitution: pros and cons"),
    ]))

    _, metadata = parse_token_and_metadata(
        data,
        metadata_parsers={"global.columns": lambda key, value: (key, value.split())}
    )
    self.assertEqual(metadata, Token([
        ("global.columns", ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]),
        ("newdoc id", "mf920901-001"),
("newpar id", "mf920901-001-p1"),
("sent_id", "mf920901-001-p1s1A"),
("text", "Slovenská ústava: pro i proti"),
("text_en", "Slovak constitution: pros and cons"),
]))
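
# Usage sketch (not part of the test suite): the same metadata_parsers hook is
# exposed on the public parse() API. Assumes the `conllu` package is installed;
# the one-token sentence below is made up for illustration.
def _demo_metadata_parsers():
    from conllu import parse
    sentences = parse(
        "# global.columns = ID FORM\n1\thej\n",
        metadata_parsers={"global.columns": lambda key, value: (key, value.split())},
    )
    # The custom parser turns the raw comment value into a list of column names.
    assert sentences[0].metadata["global.columns"] == ["ID", "FORM"]
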
def test_metadata(self):
    data = dedent("""\
        # newdoc
        # data = meta
        # meta = data
        1\tdog
    """)

    tokenlist = TokenList(*parse_token_and_metadata(data))
    self.assertEqual(serialize(tokenlist), data)

def test_empty(self):
    with self.assertRaises(ParseException):
        parse_token_and_metadata(None)

def test_invalid_metadata(self):
    data = dedent("""\
        # meta = data2
        # meta = data
        # newdoc
        # newpar
        # meta
        # = data
    """)

    _, metadata = parse_token_and_metadata(data)
    self.assertEqual(metadata, Token([
        ("meta", "data"),
        ("newdoc", None),
        ("newpar", None),
    ]))

def test_serialize_tricky_fields(self):
    data = dedent("""\
        5\tjumps\tjump\tVERB\tVBZ\tMood=Ind|Number=Sing\t0\troot\t_\tSpaceAfter=No
    """)

    tokenlist = TokenList(*parse_token_and_metadata(data))
    self.assertEqual(serialize(tokenlist).strip(), data.strip())
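
# The public equivalent of serialize() is TokenList.serialize(). A minimal
# round-trip sketch, assuming the `conllu` package is installed:
def _demo_serialize_roundtrip():
    from conllu import parse
    data = "# meta = data\n1\tdog\n"
    sentences = parse(data)
    # serialize() reproduces the comment and token lines of the sentence.
    assert sentences[0].serialize().strip() == data.strip()
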
def test_newlines_in_sentence(self):
    data = dedent("""\
        # meta = data
        1\thej
        2\tdå

        3\thej
        4\tdå
    """)

    tokens, metadata = parse_token_and_metadata(data)
    self.assertListEqual(tokens, [
        Token([("id", 1), ("form", "hej")]),
        Token([("id", 2), ("form", "då")]),
        Token([("id", 3), ("form", "hej")]),
        Token([("id", 4), ("form", "då")]),
    ])
    self.assertEqual(metadata, Token([("meta", "data")]))

def test_custom_fields(self):
    data = dedent("""\
        1\t1\t1
        2\t2\t2
    """)

    tokens, _ = parse_token_and_metadata(data, fields=("id", "id", "id"))
    self.assertEqual(tokens, [
        Token([("id", 1), ("id", 1), ("id", 1)]),
        Token([("id", 2), ("id", 2), ("id", 2)]),
    ])

def test_custom_field_parsers(self):
    data = dedent("""\
        1\tbackwards\tline
        2\tparis\tsirap
    """)
    fields = ("id", "backwards")

    # A field parser that takes all remaining fields, reverses their letters and joins them
    def parse_backwards(value):
        return " ".join([part[::-1] for part in value])

    # This overrides the default parsers, so the id is parsed as a string
    field_parsers = {
        "id": lambda line, i: line[i],
        "backwards": lambda line, i: parse_backwards(line[i:len(line)])
    }

    tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
    self.assertEqual(tokens, [
        Token([("id", '1'), ("backwards", "sdrawkcab enil")]),
        Token([("id", '2'), ("backwards", "sirap paris")]),
    ])
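
# Field parsers use the same (line, i) -> value signature on the public
# parse() API. A minimal sketch, assuming the `conllu` package is installed;
# the field name "shout" and the data are made up for illustration:
def _demo_field_parsers():
    from conllu import parse
    sentences = parse(
        "1\tUPPER\n",
        fields=("id", "shout"),
        # Each parser receives the tab-split line and the index of its field.
        field_parsers={"shout": lambda line, i: line[i].lower()},
    )
    assert sentences[0][0]["shout"] == "upper"
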
def parse_incr(in_file, fields=None, field_parsers=None, metadata_parsers=None):
    if not hasattr(in_file, 'read'):
        raise FileNotFoundError("Invalid file, 'parse_incr' needs an opened file as input")

    # For CoNLL-U Plus files the column list is declared in a global.columns
    # comment at the top of the file; fall back to it when no fields are given.
    if not fields:
        fields = parse_conllu_plus_fields(in_file, metadata_parsers=metadata_parsers)

    # Yield one TokenList per sentence instead of reading the whole file into memory.
    for sentence in parse_sentences(in_file):
        yield TokenList(*parse_token_and_metadata(
            sentence,
            fields=fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers
        ))
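
# Usage sketch for the incremental parser above; the file name is
# hypothetical. parse_incr() streams one TokenList per sentence from an open
# file handle, which is why it rejects a plain path in the check above.
def _demo_parse_incr(path="corpus.conllu"):
    with open(path, "r", encoding="utf-8") as in_file:
        for tokenlist in parse_incr(in_file):
            print(tokenlist.metadata.get("sent_id"))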