File size: 6,620 Bytes
c45d283 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
from operator import itemgetter;
import os.path;
import re;
import xml.etree.ElementTree as ET;
from graph import Graph;
def walk(id, node, parent, nodes, edges, ns):
    """Collect the nodes (and optionally the edges) of one PML tree, depth-first.

    Parameters:
      id: identifier of the tree being decoded; only used in error messages.
      node: the XML element to visit.
      parent: ‘id’ attribute of the parent node, or None when *node* is the
        root of the tree.
      nodes: list that is extended in place with one ``(id, ord, element)``
        triple per visited node; a root without an ‘ord’ child gets ord 0.
      edges: list that is extended in place with ``(parent, id, functor)``
        triples for every non-root node that has a ‘functor’ child, or None
        to skip edge collection altogether.
      ns: namespace prefix (in ``{uri}`` form) prepended to every tag name.

    Raises:
      ValueError: if a node lacks an ‘id’ attribute, or if a non-root node
        lacks an ‘ord’ child.
    """
    i = node.get("id")
    o = node.findtext(ns + "ord")
    # only the root (parent is None) may go without an ‘ord’ value
    if i is None or (o is None and parent is not None):
        raise ValueError("treex.walk(): "
                         "missing ‘id’ or ‘ord’ values while decoding tree #{}; exit."
                         "".format(id))
    nodes.append((i, int(o) if o is not None else 0, node))

    if edges is not None:
        functor = node.findtext(ns + "functor")
        if parent is not None and functor is not None:
            edges.append((parent, i, functor))

    children = node.find(ns + "children")
    if children is None:
        return
    members = children.findall(ns + "LM")
    if members:
        for member in members:
            walk(id, member, i, nodes, edges, ns)
    else:
        # a <children> element without any <LM> member is itself visited as
        # the one and only child node
        walk(id, children, i, nodes, edges, ns)
def read(fp, text=None):
    """Decode Treex PML bundles into graphs, one graph per bundle.

    For every ``LM`` member of the ‘bundles’ element, the English zone is
    inspected: the analytical tree (‘a_tree’) supplies surface tokens, which
    are anchored to character spans of the sentence string, and the
    tectogrammatical tree (‘t_tree’) supplies the graph nodes and edges.

    Parameters:
      fp: source handed to ``ET.parse()``.
      text: not used by this reader.

    Yields:
      ``(graph, None)`` pairs, one per bundle.

    Raises:
      ValueError: if an English zone with trees lacks a usable ‘a_tree’ or
        ‘t_tree’, or if a surface form cannot be located in the sentence.
    """
    ns = "{http://ufal.mff.cuni.cz/pdt/pml/}"

    #
    # _fix_me_
    # factor out the anchor()ing code into a reusable form.  (oe; 4-apr-20)
    #
    n = None
    i = 0

    def skip():
        # advance the cursor past spaces and tabs
        nonlocal i
        while i < n and graph.input[i] in {" ", "\t"}:
            i += 1

    def scan(candidates):
        # length of the first candidate found at the cursor, or None;
        # .candidates. must be ordered, with longer alternatives before any
        # of their prefixes, so that the result is deterministic
        for candidate in candidates:
            if graph.input.startswith(candidate, i):
                return len(candidate)
        return None

    def anchor(form):
        # locate .form. at the cursor; return its span and move the cursor on
        nonlocal i
        skip()
        m = None
        if graph.input.startswith(form, i):
            m = len(form)
        else:
            # try progressively ASCII-fied quote marks, in a fixed order
            for old, new in (("‘", "`"), ("’", "'")):
                form = form.replace(old, new)
                if graph.input.startswith(form, i):
                    m = len(form)
                    break
        if not m:
            # fall back to known spelling variants of quotes, dashes, and
            # ellipses that may differ between the token and the raw string
            m = (scan(("“", "\"", "``")) or scan(("‘", "`"))
                 or scan(("”", "\"", "''")) or scan(("’", "'"))
                 or scan(("---", "--", "—"))
                 or scan(("…", ". . .", "...")))
        if m:
            span = {"from": i, "to": i + m}
            i += m
            skip()
            return span
        raise ValueError("{}: failed to anchor |{}| in |{}| ({})"
                         "".format(graph.id, form, graph.input, i))

    tree = ET.parse(fp).getroot()
    bundles = tree.find(ns + "bundles")
    for item in bundles.findall(ns + "LM"):
        id = item.get("id")
        graph = Graph(id, flavor=0, framework="ptg")
        surface = []
        nodes = []
        edges = []
        # reset per bundle, so that nothing leaks over from the previous one
        sentence = None
        top = None
        for zone in item.iter(ns + "zone"):
            if zone.get("language") != "en":
                continue
            sentence = zone.findtext(ns + "sentence")
            trees = zone.find(ns + "trees")
            if trees is None:
                continue
            atree = trees.find(ns + "a_tree")
            ttree = trees.find(ns + "t_tree")
            root = atree.find(ns + "children") if atree is not None else None
            top = ttree.find(ns + "children") if ttree is not None else None
            if root is None or top is None:
                raise ValueError("treex.read(): "
                                 "missing ‘a_tree’ or ‘t_tree’ values while decoding tree #{}; exit."
                                 "".format(id))
            walk(id, root, None, surface, None, ns)
            walk(id, top, None, nodes, edges, ns)

        #
        # determine character-based anchors for all .surface. (analytical) tokens
        #
        anchoring = {}
        if sentence is not None:
            graph.add_input(sentence)
            n = len(graph.input)
            i = 0
            for node in sorted(surface, key=itemgetter(1)):
                anchoring[node[0]] = anchor(node[2].findtext(ns + "form"))

        #
        # now process tectogrammatical nodes in surface order (as indicated in the
        # annotations): map to consecutive numerical identifiers; retrieve anchors
        # from corresponding analytical nodes; and create actual (new) graph nodes.
        #
        mapping = {}
        to = 0
        for node in sorted(nodes, key=itemgetter(1)):
            index = len(mapping)
            mapping[node[0]] = index
            properties = {}
            anchors = []
            a = node[2].find(ns + "a")
            if a is not None:
                for lex in a:
                    if len(lex) == 0:
                        anchors.append(anchoring[lex.text])
                    else:
                        for lm in lex.findall(ns + "LM"):
                            anchors.append(anchoring[lm.text])
            if anchors:
                anchors = sorted(anchors, key=itemgetter("to"))
                to = anchors[-1]["to"]
            else:
                #
                # _fix_me_
                # discuss anchoring of generated nodes: currently, for uniformity, we
                # anchor them to an empty string immediately after the final character
                # of the preceding non-generated node.  but this arguably introduces a
                # vacuous piece of information, unless one were to argue that it rather
                # is an encoding of the node status for generated nodes?  (oe; 4-apr-20)
                #
                # nodes whose ‘a’ element lists no analytical nodes are treated alike.
                #
                anchors = [{"from": to, "to": to}]

            #
            # the node label comes from the tectogrammatical lemma
            #
            lemma = node[2].findtext(ns + "t_lemma")
            frame = node[2].findtext(ns + "val_frame.rf")
            #
            # where present (mostly on verbs), extract the valency frame identifier
            # _fix_me_
            # for compatibility with earlier PSD releases, strip prefix that seems to
            # identify the valency dictionary.  (oe; 4-apr-20)
            #
            if frame is not None:
                if "#" in frame:
                    properties["frame"] = frame[frame.index("#") + 1:]
                else:
                    properties["frame"] = frame
            #
            # selectively expose grammatemes as node-local properties, but ignore
            # (vanilla but very high-frequent) default values
            #
            grammatemes = node[2].find(ns + "gram")
            if grammatemes is not None:
                for property, default in [("tense", {"nil"}), ("negation", {"neg0"})]:
                    match = grammatemes.findtext(ns + property)
                    if match is not None and match not in default:
                        properties[property] = match
            graph.add_node(id=index, label=lemma, anchors=anchors,
                           properties=properties.keys(),
                           values=properties.values(),
                           top=node[0] == top.get("id"))

        #
        # similarly, record all edges, now using mapped identifiers
        #
        for source, target, label in edges:
            graph.add_edge(mapping[source], mapping[target], label)

        #
        # in a second pass (so that all internal identifiers are mapped already),
        # create edges reflecting coreference annotations.
        #
        for node in nodes:
            coref = node[2].findtext(ns + "coref_gram.rf")
            if coref is not None:
                graph.add_edge(mapping[node[0]], mapping[coref], "coref_gram")

        yield graph, None
|