File size: 6,620 Bytes

c45d283

from operator import itemgetter;
import os.path;
import re;
import xml.etree.ElementTree as ET;

from graph import Graph;

def walk(id, node, parent, nodes, edges, ns):
  """
  Recursively flatten one PML tree into node (and optionally edge) records.

  Arguments:
    id: identifier of the tree being decoded; only used in error messages.
    node: the XML element to visit.
    parent: the ‘id’ of the parent node, or None for the top-most call.
    nodes: output list, extended with (id, ord, element) triples in visiting
      order; a missing ‘ord’ is recorded as 0.
    edges: output list, extended with (parent id, node id, functor) triples
      for nodes that have both a parent and a ‘functor’; pass None to skip
      edge extraction altogether.
    ns: namespace prefix (in ‘{uri}’ notation) for element names.

  Raises:
    Exception: when a node lacks an ‘id’, or a non-top node lacks an ‘ord’.
  """
  identifier = node.get("id");
  order = node.findtext(ns + "ord");
  #
  # an identifier is always required; ‘ord’ is only optional on the top-most
  # node, i.e. the one visited without a parent.
  #
  if identifier is None or (parent is not None and order is None):
    raise Exception("treex.walk(): "
                    "missing ‘id’ or ‘ord’ values while decoding tree #{}; exit."
                    "".format(id));
  position = 0 if order is None else int(order);
  nodes.append((identifier, position, node));

  if edges is not None and parent is not None:
    functor = node.findtext(ns + "functor");
    if functor is not None:
      edges.append((parent, identifier, functor));

  children = node.find(ns + "children");
  if children is None:
    return;
  #
  # the ‘children’ element either wraps a list of ‘LM’ members or, when it
  # holds no ‘LM’ at all, is itself treated as the one and only child node.
  #
  members = [child for child in children if child.tag == ns + "LM"];
  if members:
    for member in members:
      walk(id, member, identifier, nodes, edges, ns);
  else:
    walk(id, children, identifier, nodes, edges, ns);

def read(fp, text = None):
  """
  Decode a Treex (PML) document into a stream of PTG graphs.

  For every ‘LM’ bundle below the top-level ‘bundles’ element, the English
  zone is inspected: its analytical tree (‘a_tree’) provides the surface
  tokens, which are anchored as character ranges into the zone sentence, and
  its tectogrammatical tree (‘t_tree’) provides the nodes and edges of the
  resulting graph.

  Arguments:
    fp: file name or file object, handed to ET.parse().
    text: not used by this reader.

  Yields:
    pairs of a Graph (framework ‘ptg’, flavor 0) and None, one per bundle.

  Raises:
    Exception: when an English zone has trees but lacks an ‘a_tree’ or
      ‘t_tree’ with children, or when a surface form cannot be located in
      the sentence string.
  """
  ns = "{http://ufal.mff.cuni.cz/pdt/pml/}";

  #
  # alternate spellings that are accepted as a last resort when a token form
  # cannot be found in the sentence.  these are ordered tuples rather than
  # sets, so that matching does not depend on hash ordering; in particular a
  # longer candidate must precede any of its own prefixes (‘---’ vs. ‘--’).
  #
  punctuation = (("“", "\"", "``"), ("‘", "`"),
                 ("”", "\"", "''"), ("’", "'"),
                 ("—", "—", "---", "--"),
                 ("…", "...", ". . ."));

  #
  # _fix_me_
  # factor out the anchor()ing code into a reusable form.        (oe; 4-apr-20)
  #
  # .n. is the length of the current sentence and .i. the anchoring cursor
  # into it; both are shared with the nested functions below.
  #
  n = None;
  i = 0;

  def skip():
    # advance the cursor past blanks and tabs
    nonlocal i;
    while i < n and graph.input[i] in {" ", "\t"}:
      i += 1;

  def scan(candidates):
    # length of the first candidate found at the cursor, or None
    for candidate in candidates:
      if graph.input.startswith(candidate, i):
        return len(candidate);
    return None;

  def anchor(form):
    # locate .form. at the cursor; return its character range and move on
    nonlocal i;
    skip();
    m = None;
    if form is not None:
      #
      # try the form as given, then with either or both typographic single
      # quotes replaced by their ASCII counterparts.
      #
      left = form.replace("‘", "`");
      variants = (form, left,
                  form.replace("’", "'"), left.replace("’", "'"));
      for variant in variants:
        if graph.input.startswith(variant, i):
          m = len(variant);
          break;
      else:
        for candidates in punctuation:
          m = scan(candidates);
          if m:
            break;
    if m:
      span = {"from": i, "to": i + m};
      i += m;
      skip();
      return span;
    raise Exception("{}: failed to anchor |{}| in |{}| ({})"
                    "".format(graph.id, form, graph.input, i));

  tree = ET.parse(fp).getroot();
  bundles = tree.find(ns + "bundles");
  for item in bundles.findall(ns + "LM"):
    identifier = item.get("id");
    graph = Graph(identifier, flavor = 0, framework = "ptg");
    surface = list(); nodes = list(); edges = list();
    #
    # reset per-bundle state, so that a bundle without an English zone (or
    # without trees) cannot pick up the sentence or top node of an earlier one
    #
    sentence = None;
    top = None;
    for zone in item.iter(ns + "zone"):
      if zone.get("language") != "en":
        continue;
      sentence = zone.findtext(ns + "sentence");
      trees = zone.find(ns + "trees");
      if trees is None:
        continue;
      atree = trees.find(ns + "a_tree");
      ttree = trees.find(ns + "t_tree");
      root = atree.find(ns + "children") if atree is not None else None;
      top = ttree.find(ns + "children") if ttree is not None else None;
      if root is None or top is None:
        raise Exception("treex.read(): "
                        "missing ‘a_tree’ or ‘t_tree’ values while decoding tree #{}; exit."
                        "".format(identifier));
      walk(identifier, root, None, surface, None, ns);
      walk(identifier, top, None, nodes, edges, ns);

    #
    # determine character-based anchors for all .surface. (analytical) tokens
    #
    anchoring = dict();
    if sentence is not None:
      graph.add_input(sentence);
      n = len(graph.input);
      i = 0;
      for token in sorted(surface, key = itemgetter(1)):
        anchoring[token[0]] = anchor(token[2].findtext(ns + "form"));

    #
    # now process tectogrammatical nodes in surface order (as indicated in the
    # annotations): map to consecutive numerical identifiers; retrieve anchors
    # from corresponding analytical nodes; and create actual (new) graph nodes.
    #
    mapping = dict();
    to = 0;
    for node in sorted(nodes, key = itemgetter(1)):
      #
      # use a separate name for the node index: .i. is the anchoring cursor
      #
      index = len(mapping);
      mapping[node[0]] = index;
      properties = dict();

      anchors = list();
      a = node[2].find(ns + "a");
      if a is not None:
        for lex in a:
          if len(lex) == 0:
            anchors.append(anchoring[lex.text]);
          else:
            for lm in lex.findall(ns + "LM"):
              anchors.append(anchoring[lm.text]);
      if anchors:
        anchors.sort(key = itemgetter("to"));
        to = anchors[-1]["to"];
      else:
        #
        # _fix_me_
        # discuss anchoring of generated nodes: currently, for uniformity, we
        # anchor them to an empty string immediately after the final character
        # of the preceding non-generated node.  but this arguably introduces a
        # vacuous piece of information, unless one were to argue that it rather
        # is an encoding of the node status for generated nodes? (oe; 4-apr-20)
        #
        # nodes whose ‘a’ element yields no references are treated the same.
        #
        anchors = [{"from": to, "to": to}];

      #
      # the node label comes from the tectogrammatical lemma
      #
      lemma = node[2].findtext(ns + "t_lemma");

      #
      # where present (mostly on verbs), extract the valency frame identifier
      # _fix_me_
      # for compatibility with earlier PSD releases, strip prefix that seems to
      # identify the valency dictionary.                         (oe; 4-apr-20)
      #
      frame = node[2].findtext(ns + "val_frame.rf");
      if frame is not None:
        _, separator, suffix = frame.partition("#");
        properties["frame"] = suffix if separator else frame;

      #
      # selectively expose grammatemes as node-local properties, but ignore
      # (vanilla but very high-frequent) default values
      #
      grammatemes = node[2].find(ns + "gram");
      if grammatemes is not None:
        for name, default in [("tense", {"nil"}), ("negation", {"neg0"})]:
          value = grammatemes.findtext(ns + name);
          if value is not None and value not in default:
            properties[name] = value;

      #
      # hand over plain lists rather than live views onto .properties.
      #
      graph.add_node(id = index, label = lemma, anchors = anchors,
                     properties = list(properties.keys()),
                     values = list(properties.values()),
                     top = top is not None and node[0] == top.get("id"));

    #
    # similarly, record all edges, now using mapped identifiers
    #
    for source, target, label in edges:
      graph.add_edge(mapping[source], mapping[target], label);

    #
    # in a second pass (so that all internal identifiers are mapped already),
    # create edges reflecting coreference annotations.
    #
    for node in nodes:
      coref = node[2].findtext(ns + "coref_gram.rf");
      if coref is not None:
        graph.add_edge(mapping[node[0]], mapping[coref], "coref_gram");

    yield graph, None;