ssa-perin / mtool /codec /treex.py
larkkin's picture
Add code and readme
c45d283
raw
history blame contribute delete
No virus
6.62 kB
from operator import itemgetter;
import os.path;
import re;
import xml.etree.ElementTree as ET;
from graph import Graph;
def walk(id, node, parent, nodes, edges, ns):
    """Collect the nodes (and, optionally, edges) of a subtree, depth-first.

    Arguments:
      id      identifier of the tree being decoded; only used in the error
              message
      node    XML element to start from
      parent  value of the "id" attribute of the parent node, or None when
              .node. is the root of the traversal
      nodes   list that receives one (id, ord, element) tuple per visited
              node, in pre-order
      edges   list that receives one (parent id, node id, functor) tuple per
              non-root node that carries a "functor" child, or None to skip
              edge extraction altogether
      ns      namespace prefix (in "{uri}" form) prepended to all tag names

    Raises Exception when a node has no "id" attribute, or when a non-root
    node has no "ord" child.
    """
    i = node.get("id")
    o = node.findtext(ns + "ord")
    #
    # every node must have an identifier; only the root of the traversal (the
    # one without a parent) is allowed to lack an ‘ord’ value
    #
    if i is None or (o is None and parent is not None):
        raise Exception("treex.walk(): "
                        "missing ‘id’ or ‘ord’ values while decoding tree #{}; exit."
                        "".format(id))
    # a missing ‘ord’ (only possible on the root, see above) is recorded as 0
    nodes.append((i, int(o) if o is not None else 0, node))

    if edges is not None:
        functor = node.findtext(ns + "functor")
        if parent is not None and functor is not None:
            edges.append((parent, i, functor))

    children = node.find(ns + "children")
    if children is None:
        return
    #
    # a ‘children’ element either holds a list of ‘LM’ members or, when no
    # ‘LM’ is present, is itself treated as the one and only child node
    #
    members = children.findall(ns + "LM")
    if members:
        for child in members:
            walk(id, child, i, nodes, edges, ns)
    else:
        walk(id, children, i, nodes, edges, ns)
def read(fp, text = None):
    """Decode a Treex (PML) XML file into a stream of graphs.

    One graph (framework "ptg", flavor 0) is built for every "LM" element
    below the top-level "bundles" element, using the zone(s) of that bundle
    whose language is "en": the "a_tree" supplies the surface tokens, which
    are anchored (as character spans) in the zone "sentence"; the "t_tree"
    supplies the graph nodes and edges.

    Arguments:
      fp    file name or file object, handed on to ET.parse()
      text  not used by this function; kept for interface compatibility

    Yields (graph, None) pairs, one per bundle.

    Raises Exception when an English zone has "trees" but no usable "a_tree"
    or "t_tree", when a surface token cannot be located in the sentence, or
    (via walk()) when a node lacks its "id" or "ord" value.
    """
    ns = "{http://ufal.mff.cuni.cz/pdt/pml/}"

    #
    # quote normalizations applied to a token form (cumulatively, in this
    # order) when the form is not found verbatim at the current position.
    # these are ordered tuples rather than sets, so that the outcome does not
    # depend on the hash-based iteration order of a set.
    #
    replacements = (("‘", "`"), ("’", "'"))
    #
    # punctuation alternatives that are accepted in the input when a token form
    # cannot be anchored otherwise; groups and, within each group, candidates
    # are tried in order, with "---" before its prefix "--" so that the longer
    # match wins deterministically.
    # NOTE(review): the first two dash candidates are identical as written;
    # presumably one of them was meant to be a different dash --- confirm.
    #
    punctuation = (
        ("“", "\"", "``"),
        ("‘", "`"),
        ("”", "\"", "''"),
        ("’", "'"),
        ("—", "—", "---", "--"),
        ("…", "...", ". . ."),
    )

    #
    # _fix_me_
    # factor out the anchor()ing code into a reusable form.  (oe; 4-apr-20)
    #
    graph = None
    n = None
    i = 0

    def skip():
        # advance the cursor past spaces and tabs in the current input
        nonlocal i
        while i < n and graph.input[i] in {" ", "\t"}:
            i += 1

    def scan(candidates):
        # length of the first candidate found at the cursor, or None
        for candidate in candidates:
            if graph.input.startswith(candidate, i):
                return len(candidate)
        return None

    def anchor(form):
        # locate .form. at the cursor, consume it, and return its span
        nonlocal i
        skip()
        m = None
        if graph.input.startswith(form, i):
            m = len(form)
        else:
            for old, new in replacements:
                form = form.replace(old, new)
                if graph.input.startswith(form, i):
                    m = len(form)
                    break
        if not m:
            for candidates in punctuation:
                m = scan(candidates)
                if m:
                    break
        if not m:
            raise Exception("{}: failed to anchor |{}| in |{}| ({})"
                            "".format(graph.id, form, graph.input, i))
        span = {"from": i, "to": i + m}
        i += m
        skip()
        return span

    tree = ET.parse(fp).getroot()
    bundles = tree.find(ns + "bundles")
    for item in bundles.findall(ns + "LM"):
        bundle_id = item.get("id")
        graph = Graph(bundle_id, flavor = 0, framework = "ptg")
        surface = []
        nodes = []
        edges = []
        #
        # reset per bundle, so that a bundle without an English zone (or without
        # trees) neither fails on an unbound name nor inherits the sentence or
        # top node of the preceding bundle
        #
        sentence = None
        top = None
        for zone in item.iter(ns + "zone"):
            if zone.get("language") != "en":
                continue
            sentence = zone.findtext(ns + "sentence")
            trees = zone.find(ns + "trees")
            if trees is None:
                continue
            atree = trees.find(ns + "a_tree")
            ttree = trees.find(ns + "t_tree")
            root = atree.find(ns + "children") if atree is not None else None
            top = ttree.find(ns + "children") if ttree is not None else None
            if root is None or top is None:
                raise Exception("treex.read(): "
                                "missing ‘a_tree’ or ‘t_tree’ values while decoding tree #{}; exit."
                                "".format(bundle_id))
            walk(bundle_id, root, None, surface, None, ns)
            walk(bundle_id, top, None, nodes, edges, ns)

        #
        # determine character-based anchors for all .surface. (analytical) tokens
        #
        anchoring = dict()
        if sentence is not None:
            graph.add_input(sentence)
            n = len(graph.input)
            i = 0
            for node in sorted(surface, key = itemgetter(1)):
                anchoring[node[0]] = anchor(node[2].findtext(ns + "form"))

        #
        # now process tectogrammatical nodes in surface order (as indicated in the
        # annotations): map to consecutive numerical identifiers; retrieve anchors
        # from corresponding analytical nodes; and create actual (new) graph nodes.
        #
        mapping = {}
        to = 0
        for node in sorted(nodes, key = itemgetter(1)):
            # a separate name, so as to not clobber the anchoring cursor .i.
            mapping[node[0]] = node_id = len(mapping)
            properties = dict()
            a = node[2].find(ns + "a")
            if a is not None:
                anchors = list()
                for lex in a:
                    #
                    # an element without sub-elements holds a single reference
                    # as its text; otherwise, its ‘LM’ members each hold one
                    #
                    if len(lex) == 0:
                        anchors.append(anchoring[lex.text])
                    else:
                        for lm in lex.findall(ns + "LM"):
                            anchors.append(anchoring[lm.text])
                anchors = sorted(anchors, key = itemgetter("to"))
                to = anchors[-1]["to"]
            else:
                #
                # _fix_me_
                # discuss anchoring of generated nodes: currently, for uniformity, we
                # anchor them to an empty string immediately after the final character
                # of the preceding non-generated node.  but this arguably introduces a
                # vacuous piece of information, unless one were to argue that it rather
                # is an encoding of the node status for generated nodes? (oe; 4-apr-20)
                #
                anchors = [{"from": to, "to": to}]
            #
            # the node label comes from the tectogrammatical lemma
            #
            lemma = node[2].findtext(ns + "t_lemma")
            frame = node[2].findtext(ns + "val_frame.rf")
            #
            # where present (mostly on verbs), extract the valency frame identifier
            # _fix_me_
            # for compatibility with earlier PSD releases, strip prefix that seems to
            # identify the valency dictionary.  (oe; 4-apr-20)
            #
            if frame is not None:
                if "#" in frame:
                    properties["frame"] = frame[frame.index("#") + 1:]
                else:
                    properties["frame"] = frame
            #
            # selectively expose grammatemes as node-local properties, but ignore
            # (vanilla but very high-frequent) default values
            #
            grammatemes = node[2].find(ns + "gram")
            if grammatemes is not None:
                for name, default in [("tense", {"nil"}), ("negation", {"neg0"})]:
                    value = grammatemes.findtext(ns + name)
                    if value is not None and value not in default:
                        properties[name] = value
            #
            # hand over plain (parallel) lists rather than live views of a
            # dictionary that is discarded after this iteration
            #
            graph.add_node(id = node_id, label = lemma, anchors = anchors,
                           properties = list(properties.keys()),
                           values = list(properties.values()),
                           top = node[0] == top.get("id"))

        #
        # similarly, record all edges, now using mapped identifiers
        #
        for source, target, label in edges:
            graph.add_edge(mapping[source], mapping[target], label)

        #
        # in a second pass (so that all internal identifiers are mapped already),
        # create edges reflecting coreference annotations.
        #
        for node in nodes:
            coref = node[2].findtext(ns + "coref_gram.rf")
            if coref is not None:
                graph.add_edge(mapping[node[0]], mapping[coref], "coref_gram")

        yield graph, None