Add code and readme

c45d283 6 months ago

No virus

6.62 kB

	from operator import itemgetter;
	import os.path;
	import re;
	import xml.etree.ElementTree as ET;

	from graph import Graph;

	def walk(id, node, parent, nodes, edges, ns):
		"""Collect the nodes (and, optionally, edges) of one PML tree, depth-first.

		Arguments:
		  id -- tree identifier; only used in the error message
		  node -- element representing the current tree node
		  parent -- ‘id’ value of the parent node, or None at the root
		  nodes -- list, extended in place with (id, ord, element) triples
		  edges -- list, extended in place with (parent, id, functor) triples;
		    pass None to skip edge extraction
		  ns -- namespace prefix (ElementTree ‘{uri}’ notation) for tag names

		Raises ValueError when a node lacks an ‘id’, or when a non-root node
		lacks an ‘ord’ value.
		"""
		i = node.get("id");
		o = node.findtext(ns + "ord");
		#
		# only the root (the one node without a parent) may lack an ‘ord’ value,
		# in which case it is ordered at position 0.
		#
		if i is None or (o is None and parent is not None):
			raise ValueError("treex.walk(): "
				"missing ‘id’ or ‘ord’ values while decoding tree #{}; exit."
				"".format(id));
		nodes.append((i, int(o) if o is not None else 0, node));

		if edges is not None:
			functor = node.findtext(ns + "functor");
			if parent is not None and functor is not None:
				edges.append((parent, i, functor));

		children = node.find(ns + "children");
		if children is not None:
			#
			# a ‘children’ element either wraps a list of ‘LM’ members or, when
			# there are no ‘LM’ members, is itself treated as the single child.
			#
			members = children.findall(ns + "LM");
			for child in members:
				walk(id, child, i, nodes, edges, ns);
			if not members:
				walk(id, children, i, nodes, edges, ns);

	def read(fp, text = None):
		"""Read Treex (PML) XML and generate one ‘ptg’ graph per bundle.

		For each ‘LM’ element below ‘bundles’, the English zone supplies the
		sentence string, the analytical tree (whose tokens are anchored into the
		sentence by character offsets), and the tectogrammatical tree (whose
		nodes, functor-labelled edges, and ‘coref_gram’ links form the graph).

		Arguments:
		  fp -- XML source, handed to ElementTree.parse()
		  text -- currently ignored by this function

		Yields (graph, None) pairs, one for each bundle.

		Raises ValueError when the English zone lacks a complete ‘a_tree’ or
		‘t_tree’, or when a surface token cannot be located in the sentence.
		"""
		ns = "{http://ufal.mff.cuni.cz/pdt/pml/}";

		#
		# _fix_me_
		# factor out the anchor()ing code into a reusable form. (oe; 4-apr-20)
		#
		graph = None;
		length = 0;
		cursor = 0;

		def skip():
			# advance the cursor past blanks and tabs in the sentence string
			nonlocal cursor;
			while cursor < length and graph.input[cursor] in {" ", "\t"}:
				cursor += 1;

		def scan(candidates):
			#
			# length of the longest candidate found at the cursor, or 0; taking
			# the longest match keeps ‘---’ from being consumed as just ‘--’.
			#
			return max((len(candidate) for candidate in candidates
				if graph.input.startswith(candidate, cursor)),
				default = 0);

		def anchor(form):
			#
			# locate one surface form at the cursor and return its character
			# span, consuming the form and any whitespace around it.
			#
			nonlocal cursor;
			skip();
			m = 0;
			#
			# besides the form itself, allow ASCII renditions of curly single
			# quotes; these are one-for-one character substitutions, so every
			# variant has the same length as the original form.
			#
			opening = form.replace("‘", "`");
			variants = (form, opening,
				form.replace("’", "'"), opening.replace("’", "'"));
			if any(graph.input.startswith(variant, cursor) for variant in variants):
				m = len(form);
			if not m:
				#
				# otherwise, presumably the form is a normalized rendition of
				# punctuation: accept common quote, dash, and ellipsis variants.
				#
				m = (scan(("“", "\"", "``")) or scan(("‘", "`"))
					or scan(("”", "\"", "''")) or scan(("’", "'"))
					or scan(("—", "---", "--"))
					or scan(("…", "...", ". . .")));
			if not m:
				raise ValueError("{}: failed to anchor |{}| in |{}| ({})"
					"".format(graph.id, form, graph.input, cursor));
			span = {"from": cursor, "to": cursor + m};
			cursor += m;
			skip();
			return span;

		tree = ET.parse(fp).getroot();
		bundles = tree.find(ns + "bundles");
		for item in bundles.findall(ns + "LM"):
			identifier = item.get("id");
			graph = Graph(identifier, flavor = 0, framework = "ptg");
			surface = list(); nodes = list(); edges = list();
			#
			# reset per-bundle state, so that a bundle without an English zone
			# cannot pick up the sentence or top node of its predecessor.
			#
			sentence = None;
			top = None;
			for zone in item.iter(ns + "zone"):
				if zone.get("language") != "en":
					continue;
				sentence = zone.findtext(ns + "sentence");
				trees = zone.find(ns + "trees");
				if trees is None:
					continue;
				atree = trees.find(ns + "a_tree");
				ttree = trees.find(ns + "t_tree");
				root = atree.find(ns + "children") if atree is not None else None;
				top = ttree.find(ns + "children") if ttree is not None else None;
				if root is None or top is None:
					raise ValueError("treex.read(): "
						"missing ‘a_tree’ or ‘t_tree’ values while decoding tree #{}; exit."
						"".format(identifier));
				walk(identifier, root, None, surface, None, ns);
				walk(identifier, top, None, nodes, edges, ns);

			#
			# determine character-based anchors for all .surface. (analytical) tokens
			#
			anchoring = dict();
			if sentence is not None:
				graph.add_input(sentence);
				length = len(graph.input);
				cursor = 0;
				for node in sorted(surface, key = itemgetter(1)):
					anchoring[node[0]] = anchor(node[2].findtext(ns + "form"));

			#
			# now process tectogrammatical nodes in surface order (as indicated in the
			# annotations): map to consecutive numerical identifiers; retrieve anchors
			# from corresponding analytical nodes; and create actual (new) graph nodes.
			#
			mapping = dict();
			to = 0;
			top_id = top.get("id") if top is not None else None;
			for node in sorted(nodes, key = itemgetter(1)):
				index = mapping[node[0]] = len(mapping);
				properties = dict();

				anchors = list();
				a = node[2].find(ns + "a");
				if a is not None:
					for lex in a:
						if len(lex) == 0:
							anchors.append(anchoring[lex.text]);
						else:
							anchors.extend(anchoring[lm.text]
								for lm in lex.findall(ns + "LM"));
				if anchors:
					anchors.sort(key = itemgetter("to"));
					to = anchors[-1]["to"];
				else:
					#
					# _fix_me_
					# discuss anchoring of generated nodes: currently, for uniformity, we
					# anchor them to an empty string immediately after the final character
					# of the preceding non-generated node. but this arguably introduces a
					# vacuous piece of information, unless one were to argue that it rather
					# is an encoding of the node status for generated nodes? (oe; 4-apr-20)
					#
					anchors = [{"from": to, "to": to}];

				#
				# the node label comes from the tectogrammatical lemma
				#
				lemma = node[2].findtext(ns + "t_lemma");

				#
				# where present (mostly on verbs), extract the valency frame identifier
				# _fix_me_
				# for compatibility with earlier PSD releases, strip prefix that seems to
				# identify the valency dictionary. (oe; 4-apr-20)
				#
				frame = node[2].findtext(ns + "val_frame.rf");
				if frame is not None:
					_, separator, suffix = frame.partition("#");
					properties["frame"] = suffix if separator else frame;

				#
				# selectively expose grammatemes as node-local properties, but ignore
				# (vanilla but very high-frequent) default values
				#
				grammatemes = node[2].find(ns + "gram");
				if grammatemes is not None:
					for name, defaults in (("tense", {"nil"}), ("negation", {"neg0"})):
						value = grammatemes.findtext(ns + name);
						if value is not None and value not in defaults:
							properties[name] = value;

				#
				# hand over plain lists rather than live dictionary views, which are
				# neither indexable nor serializable.
				#
				graph.add_node(id = index, label = lemma, anchors = anchors,
					properties = list(properties.keys()),
					values = list(properties.values()),
					top = node[0] == top_id);

			#
			# similarly, record all edges, now using mapped identifiers
			#
			for source, target, label in edges:
				graph.add_edge(mapping[source], mapping[target], label);

			#
			# in a second pass (so that all internal identifiers are mapped already),
			# create edges reflecting coreference annotations.
			#
			for node in nodes:
				coref = node[2].findtext(ns + "coref_gram.rf");
				if coref is not None:
					graph.add_edge(mapping[node[0]], mapping[coref], "coref_gram");

			yield graph, None;