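# Imports assumed for the excerpts below. They target the conllu library's
# public API plus helpers from its parser module; exact import paths can vary
# between conllu versions. The test methods are excerpted from
# unittest.TestCase subclasses in conllu's own test suite, which is where
# self.assert* and the custom assertTreeEqual helper come from.
from textwrap import dedent

from conllu import parse, parse_tree
from conllu.models import Token, TokenList, TokenTree
from conllu.parser import parse_line, parse_token_and_metadata, serialize_field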
def test_parse_custom_feats(self):  # hypothetical name; the original def line was truncated in this excerpt
    # Only the final line of the original test data survives here; the four
    # preceding token lines and the metadata comment were cut off.
    data = dedent("""\
        5\tregnante\tregno\tt\tt\tt|-|s|p|p|a|m|b|-\t0\tADV\t_\t_
    """)
    sentences = parse(
        data,
        fields=(
            'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc'
        ),
        field_parsers={
            # Override the default feats parser: keep the "|"-separated values
            # as a plain list instead of parsing them into key=value pairs.
            "feats": lambda line, i: [feat for feat in line[i].split("|")]
        }
    )
    self.assertEqual(
        sentences[0][4],
        Token([
            ('id', 5),
            ('form', 'regnante'),
            ('lemma', 'regno'),
            ('upostag', 't'),
            ('xpostag', 't'),
            ('feats', ['t', '-', 's', 'p', 'p', 'a', 'm', 'b', '-']),
            ('head', 0),
            ('deprel', 'ADV'),
            ('deps', None),
            ('misc', None),
        ])
    )
    self.assertEqual(
        sentences[0].metadata,
        Token([
            ('id', "'1'-document_id='36:1047'-span='1'"),
        ])
    )
def test_deep_filtering(self):
    tokenlist = TokenList([
        {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
        {"form": "quick", "feats": Token([('Degree', 'Pos')])},
        {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        {"form": "fox", "feats": Token([('Number', 'Sing')])},
    ])
    # A double-underscore key (feats__Degree) filters on a nested value.
    self.assertEqual(
        tokenlist.filter(feats__Degree="Pos"),
        TokenList([
            {"form": "quick", "feats": Token([('Degree', 'Pos')])},
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    # Multiple keyword arguments must all match (logical AND).
    self.assertEqual(
        tokenlist.filter(form="brown", feats__Degree="Pos"),
        TokenList([
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    # Adding a key no token has ("id") matches nothing.
    self.assertEqual(
        tokenlist.filter(form="brown", feats__Degree="Pos", id=1),
        TokenList([])
    )
    # Likewise, an entirely unknown nested path matches nothing.
    self.assertEqual(
        tokenlist.filter(unknown__property__value="undefined"),
        TokenList([])
    )
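# Usage sketch, not part of the original tests: filter() returns a TokenList,
# so calls can be chained (reuses tokenlist from test_deep_filtering above).
chained = tokenlist.filter(feats__Degree="Pos").filter(form="brown")
assert chained == TokenList([
    {"form": "brown", "feats": Token([('Degree', 'Pos')])},
])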
""")
fields = ("id", "backwards")
# A field parser that takes all remaining field, reverses their letters and joins them
def parse_backwards(value):
return " ".join([part[::-1] for part in value])
# This overrides the default parsers, so the id is parsed as a string
field_parsers = {
"id": lambda line, i: line[i],
"backwards": lambda line, i: parse_backwards(line[i:len(line)])
}
tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
self.assertEqual(tokens, [
Token([("id", '1'), ("backwards", "sdrawkcab enil")]),
Token([("id", '2'), ("backwards", "sirap paris")]),
])
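# For contrast, a sketch of the same call with the default field parsers:
# "id" is parsed to an int and "form" stays a string, matching the behaviour
# of the hej/då example at the end of this page.
tokens, _ = parse_token_and_metadata("1\thej\n", fields=("id", "form"))
assert tokens == [Token([("id", 1), ("form", "hej")])]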
def test_simple_tree(self):
    tokenlist = TokenList([
        Token([("id", 2), ("form", "dog"), ("head", 0)]),
        Token([("id", 1), ("form", "a"), ("head", 2)]),
    ])
    tree = TokenTree(
        token=Token([("id", 2), ("form", "dog"), ("head", 0)]),
        children=[TokenTree(
            token=Token([("id", 1), ("form", "a"), ("head", 2)]),
            children=[]
        )]
    )
    self.assertTreeEqual(tokenlist.to_tree(), tree)
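# Usage sketch: a tree built by to_tree() can be inspected through the same
# .token / .children attributes used above (reuses tokenlist from the test).
# The token with head=0 becomes the root; the rest hang off their head ids.
root = tokenlist.to_tree()
assert root.token["form"] == "dog"
assert root.children[0].token["form"] == "a"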
def test_ordered_dict(self):
    data = Token()
    self.assertEqual(serialize_field(data), "")
    data = Token([('SpaceAfter', 'No')])
    self.assertEqual(serialize_field(data), "SpaceAfter=No")
    data = Token([('Translit', None)])
    # A None value serializes to the CoNLL-U placeholder "_".
    self.assertEqual(serialize_field(data), "Translit=_")
def test_invalid_metadata(self):
    data = dedent("""\
        # meta = data2
        # meta = data
        # newdoc
        # newpar
        # meta
        # = data
    """)
    _, metadata = parse_token_and_metadata(data)
    # Duplicate keys keep the last value; "newdoc"/"newpar" are valid without
    # a value, while a bare key or a bare value is discarded as invalid.
    self.assertEqual(metadata, Token([
        ("meta", "data"),
        ("newdoc", None),
        ("newpar", None),
    ]))
def test_xupos_to_xupostag(self):
    token = Token({"id": 1, "xpos": "DT", "upos": "DET"})
    self.assertEqual(token["xpos"], "DT")
    self.assertEqual(token["xpostag"], "DT")
    self.assertEqual(token["upos"], "DET")
    self.assertEqual(token["upostag"], "DET")
def test_parse_tree(self):
    # "data" is defined earlier in the original test module (a CoNLL-U
    # sentence whose root token is "jumps"); it is not shown in this excerpt.
    sentences = parse_tree(data)
    self.assertEqual(len(sentences), 1)
    root = sentences[0]
    self.assertEqual(type(root), TokenTree)
    self.assertEqual(
        root.token,
        Token([
            ('id', 5),
            ('form', 'jumps'),
            ('lemma', 'jump'),
            ('upos', 'VERB'),
            ('xpos', 'VBZ'),
            ('feats', Token([
                ("Mood", "Ind"),
                ("Number", "Sing"),
                ("Person", "3"),
                ("Tense", "Pres"),
                ("VerbForm", "Fin"),
            ])),
            ('head', 0),
            ('deprel', 'root'),
            ('deps', None),
            ('misc', None),
        ])
    )
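# Usage sketch (helper not in the original tests): a depth-first walk over a
# TokenTree using the .token / .children attributes from test_simple_tree.
def walk(node, depth=0):
    print("  " * depth + str(node.token["form"]))
    for child in node.children:
        walk(child, depth + 1)
# e.g. walk(sentences[0]) would print "jumps" and its subtree, indented.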
def test_parse_line_two_spaces(self):
    # Fields may be separated by runs of two or more spaces instead of tabs.
    line = "1  The  the  DET  DT  Definite=Def|PronType=Art  4  det  _  _"
    self.assertEqual(parse_line(line, fields=["id", "form"]), Token([
        ('id', 1),
        ('form', 'The'),
    ]))
def test_parse_token_and_metadata(self):  # hypothetical name; the original def line was truncated in this excerpt
    data = dedent("""\
        # meta = data
        1\thej
        2\tdå
        3\thej
        4\tdå
    """)
    tokens, metadata = parse_token_and_metadata(data)
    self.assertListEqual(tokens, [
        Token([("id", 1), ("form", "hej")]),
        Token([("id", 2), ("form", "då")]),
        Token([("id", 3), ("form", "hej")]),
        Token([("id", 4), ("form", "då")]),
    ])
    self.assertEqual(metadata, Token([("meta", "data")]))