            5\tregnante\tregno\tt\tt\tt|-|s|p|p|a|m|b|-\t0\tADV\t_\t_
        """)
        sentences = parse(
            data,
            fields=(
                'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc'
            ),
            field_parsers={
                "feats": lambda line, i: [feat for feat in line[i].split("|")]
            }
        )
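        # With the custom parser above, "feats" is split on "|" into a plain
        # list instead of being parsed into the default key=value mapping.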
        self.assertEqual(
            sentences[0][4],
            Token([
                ('id', 5),
                ('form', 'regnante'),
                ('lemma', 'regno'),
                ('upostag', 't'),
                ('xpostag', 't'),
                ('feats', ['t', '-', 's', 'p', 'p', 'a', 'm', 'b', '-']),
                ('head', 0),
                ('deprel', 'ADV'),
                ('deps', None),
                ('misc', None),
            ])
        )
        self.assertEqual(
            sentences[0].metadata,
            Token([
                ('id', "'1'-document_id='36:1047'-span='1'"),
            ])
        )

    def test_custom_metadata_parsers(self):
        data = dedent("""\
            # global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
            # newdoc id = mf920901-001
            # newpar id = mf920901-001-p1
            # sent_id = mf920901-001-p1s1A
            # text = Slovenská ústava: pro i proti
            # text_en = Slovak constitution: pros and cons
        """)
        _, metadata = parse_token_and_metadata(data)
        self.assertEqual(metadata, Token([
            ("global.columns", "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"),
            ("newdoc id", "mf920901-001"),
            ("newpar id", "mf920901-001-p1"),
            ("sent_id", "mf920901-001-p1s1A"),
            ("text", "Slovenská ústava: pro i proti"),
            ("text_en", "Slovak constitution: pros and cons"),
        ]))
        _, metadata = parse_token_and_metadata(
            data,
            metadata_parsers={"global.columns": lambda key, value: (key, value.split())}
        )
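        # A metadata parser receives the comment's key and value and returns a
        # (key, value) pair; here only "global.columns" is split into a list,
        # while the remaining comments go through the default parser.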
        self.assertEqual(metadata, Token([
            ("global.columns", ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]),
            ("newdoc id", "mf920901-001"),
("newpar id", "mf920901-001-p1"),
("sent_id", "mf920901-001-p1s1A"),
("text", "Slovenská ústava: pro i proti"),
("text_en", "Slovak constitution: pros and cons"),
]))

    def test_deep_filtering(self):
        tokenlist = TokenList([
            {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
            {"form": "quick", "feats": Token([('Degree', 'Pos')])},
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            {"form": "fox", "feats": Token([('Number', 'Sing')])},
        ])
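        # Double-underscore keyword arguments reach into nested fields:
        # feats__Degree="Pos" matches tokens whose token["feats"]["Degree"]
        # equals "Pos".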
        self.assertEqual(
            tokenlist.filter(feats__Degree="Pos"),
            TokenList([
                {"form": "quick", "feats": Token([('Degree', 'Pos')])},
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown", feats__Degree="Pos"),
            TokenList([
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown", feats__Degree="Pos", id=1),
            TokenList([])
        )
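        # Filtering on a path that does not exist matches nothing rather
        # than raising.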
        self.assertEqual(
            tokenlist.filter(unknown__property__value="undefined"),
            TokenList([])
        )

    def test_custom_field_parsers(self):
        data = dedent("""\
            1\tbackwards\tline
            2\tparis\tsirap
        """)
fields = ("id", "backwards")
# A field parser that takes all remaining field, reverses their letters and joins them
def parse_backwards(value):
return " ".join([part[::-1] for part in value])
# This overrides the default parsers, so the id is parsed as a string
field_parsers = {
"id": lambda line, i: line[i],
"backwards": lambda line, i: parse_backwards(line[i:len(line)])
}
tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
self.assertEqual(tokens, [
Token([("id", '1'), ("backwards", "sdrawkcab enil")]),
Token([("id", '2'), ("backwards", "sirap paris")]),
])

    def test_simple_tree(self):
        tokenlist = TokenList([
            Token([("id", 2), ("form", "dog"), ("head", 0)]),
            Token([("id", 1), ("form", "a"), ("head", 2)]),
        ])
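        # to_tree() should nest token 1 under token 2 (its head) and make
        # token 2 the root, since its head is 0.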
        tree = TokenTree(
            token=Token([("id", 2), ("form", "dog"), ("head", 0)]),
            children=[TokenTree(
                token=Token([("id", 1), ("form", "a"), ("head", 2)]),
                children=[]
            )]
        )
        self.assertTreeEqual(tokenlist.to_tree(), tree)

    def test_ordered_dict(self):
        data = Token()
        self.assertEqual(serialize_field(data), "")
        data = Token([('SpaceAfter', 'No')])
        self.assertEqual(serialize_field(data), "SpaceAfter=No")
        data = Token([('Translit', None)])
        self.assertEqual(serialize_field(data), "Translit=_")

    def test_invalid_metadata(self):
        data = dedent("""\
            # meta = data2
            # meta = data
            # newdoc
            # newpar
            # meta
            # = data
        """)
        _, metadata = parse_token_and_metadata(data)
        # A repeated key keeps the last value, bare "# newdoc"/"# newpar" are
        # kept with a None value, and malformed comments are dropped.
        self.assertEqual(metadata, Token([
            ("meta", "data"),
            ("newdoc", None),
            ("newpar", None),
        ]))

    def test_xupos_to_xupostag(self):
        token = Token({"id": 1, "xpos": "DT", "upos": "DET"})
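        # A token built with the short names "xpos"/"upos" should be readable
        # under both the short and the long ("xpostag"/"upostag") field names.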
self.assertEqual(token["xpos"], "DT")
self.assertEqual(token["xpostag"], "DT")
self.assertEqual(token["upos"], "DET")
self.assertEqual(token["upostag"], "DET")

    def test_parse_tree(self):
        sentences = parse_tree(data)
        self.assertEqual(len(sentences), 1)

        root = sentences[0]
        self.assertEqual(type(root), TokenTree)
        self.assertEqual(
            root.token,
            Token([
                ('id', 5),
                ('form', 'jumps'),
                ('lemma', 'jump'),
                ('upos', 'VERB'),
                ('xpos', 'VBZ'),
                ('feats', Token([
                    ("Mood", "Ind"),
                    ("Number", "Sing"),
                    ("Person", "3"),
                    ("Tense", "Pres"),
                    ("VerbForm", "Fin"),
                ])),
                ('head', 0),
                ('deprel', 'root'),
                ('deps', None),
                ('misc', None)