|
|
|
|
|
|
|
|
|
import argparse; |
|
import json; |
|
import multiprocessing as mp; |
|
import re; |
|
import sys; |
|
import time; |
|
from pathlib import Path; |
|
from zipfile import ZipFile; |
|
|
|
import codec.amr; |
|
import codec.conllu; |
|
import codec.eds; |
|
import codec.mrp; |
|
import codec.norec; |
|
import codec.pmb; |
|
import codec.sdp; |
|
import codec.treex; |
|
import codec.ucca; |
|
import inspector; |
|
import score.edm; |
|
import score.mces; |
|
import score.sdp; |
|
import score.smatch; |
|
import score.ucca; |
|
import validate.core; |
|
from analyzer import analyze; |
|
|
|
__author__ = "oe"

# Character encoding used for every file the command-line interface opens.
ENCODING = "utf-8"

# Names accepted by the ``--normalize`` option.
NORMALIZATIONS = {
    "anchors",
    "case",
    "edges",
    "attributes",
}

# Names accepted by the ``--validate`` option.
VALIDATIONS = {
    "input",
    "anchors",
    "edges",
    "amr",
    "eds",
    "sdp",
    "ucca",
}
|
|
|
def read_graphs(stream, format = None,
                full = False, normalize = False, reify = False, node_centric = False,
                frameworks = None, prefix = None, text = None, filter = None,
                trace = 0, strict = 0, quiet = False, robust = False,
                alignment = None, anchors = None, pretty = False,
                id = None, n = None, i = None):
    """Read graphs from *stream* with the codec selected by *format*.

    When the ``name`` attribute of *stream* is a string ending in ``.zip``,
    the archive of that name is opened instead and its single ``.mrp`` entry
    becomes the input stream.

    Selection of graphs:
      * *frameworks*: when not None, graphs whose ``framework`` is not in it
        are skipped;
      * *filter*: when not None, graphs whose ``id`` is not in it are skipped;
      * *id*: when not None, only graphs with exactly this ``id`` are kept;
      * *i*: otherwise, when not None and non-negative, only the graph at
        zero-based position *i* (counted over graphs that passed the two
        filters above) is kept and reading stops;
      * *n*: when given and at least 1, reading stops once *n* graphs have
        passed the two filters above.

    The remaining keyword arguments are forwarded to the individual codecs
    (see the dispatch below for which codec receives which option).  After
    reading, graphs are prettified when *pretty* is true and normalized when
    *normalize* is true (it is passed on to ``graph.normalize()``).

    Returns a pair ``(graphs, overlays)`` of parallel lists, or
    ``(None, None)`` when the codec did not provide a generator.

    Exits the process with status 1 on an unknown *format*, or when a zip
    archive holds no or more than one ``.mrp`` entry.  Any other exception
    raised while reading is printed to standard error and reading goes on.
    """
    # the name of a stream is not necessarily a string (a file opened from
    # a file descriptor reports an integer), so test the type first.
    name = getattr(stream, "name", "")
    if isinstance(name, str) and name.endswith(".zip"):
        with ZipFile(name) as archive:
            stream = None
            for entry in archive.namelist():
                if entry.endswith(".mrp"):
                    if stream is not None:
                        print("read_graphs(): multiple MRP entries in ‘{}’; exit."
                              "".format(name), file = sys.stderr)
                        sys.exit(1)
                    stream = archive.open(entry)
            if stream is None:
                print("read_graphs(): missing MRP entry in ‘{}’; exit."
                      "".format(name), file = sys.stderr)
                sys.exit(1)

    generator = None
    if format in {"amr", "camr"}:
        generator = codec.amr.read(stream, full = full, reify = reify,
                                   text = text, camr = format == "camr",
                                   alignment = alignment, quiet = quiet,
                                   trace = trace)
    elif format in {"ccd", "dm", "pas", "psd"}:
        generator = codec.sdp.read(stream, framework = format, text = text)
    elif format == "eds":
        generator = codec.eds.read(stream, reify = reify, text = text)
    elif format == "mrp":
        generator = codec.mrp.read(stream, text = text, robust = robust)
    elif format == "norec":
        generator = codec.norec.read(stream, text = text,
                                     node_centric = node_centric)
    elif format == "pmb":
        generator = codec.pmb.read(stream, full = full,
                                   reify = reify, text = text,
                                   trace = trace, strict = strict)
    elif format == "treex":
        generator = codec.treex.read(stream)
    elif format == "ucca":
        generator = codec.ucca.read(stream, text = text, prefix = prefix)
    elif format == "conllu" or format == "ud":
        generator = codec.conllu.read(stream, framework = format, text = text,
                                      anchors = anchors, trace = trace)
    elif format == "eud":
        generator = codec.conllu.read(stream, framework = format, text = text,
                                      anchors = anchors, trace = trace,
                                      enhanced_graph = True)
    else:
        print("read_graphs(): invalid input codec {}; exit."
              "".format(format), file = sys.stderr)
        sys.exit(1)

    if generator is None:
        return None, None

    # drain the generator into plain lists; .j. counts the graphs that
    # passed the framework and identifier filters.
    graphs = []
    overlays = []
    j = 0
    while n is None or n < 1 or j < n:
        try:
            graph, overlay = next(generator)
            if frameworks is not None and graph.framework not in frameworks:
                continue
            if filter is not None and graph.id not in filter:
                continue
            if id is not None:
                if graph.id == id:
                    graphs.append(graph)
                    overlays.append(overlay)
            elif i is not None and i >= 0:
                if j == i:
                    graphs.append(graph)
                    overlays.append(overlay)
                    break
            else:
                graphs.append(graph)
                overlays.append(overlay)
            j += 1
        except StopIteration:
            break
        except Exception as error:
            # deliberate best effort: report the problem and keep reading
            print(error, file = sys.stderr)

    if pretty:
        for graph in graphs:
            graph.prettify(trace)
    if normalize:
        for graph in graphs:
            graph.normalize(normalize, trace)

    return graphs, overlays
|
|
|
def main(args=None):
    """Run the MRP Graph Toolkit command-line interface.

    *args* is the list of command-line arguments, without the program name;
    when None, ``sys.argv[1:]`` is used.

    The function parses and checks the options, reads the input graphs via
    ``read_graphs()``, optionally de-duplicates, annotates, validates, and
    analyzes them, and then does one of three things: inspection
    (``--inspect``), scoring against gold graphs (``--score``), or writing
    the graphs in the format selected by ``--write``.

    Invalid options terminate the process with exit status 1; the
    inspection and scoring modes terminate it with exit status 0 once their
    output has been written.
    """
    parser = argparse.ArgumentParser(description = "MRP Graph Toolkit")
    parser.add_argument("--inspect", action = "store_true")
    parser.add_argument("--analyze", action = "store_true")
    parser.add_argument("--normalize", action = "append", default = [])
    parser.add_argument("--full", action = "store_true")
    parser.add_argument("--reify", action = "store_true")
    parser.add_argument("--node_centric", action = "store_true")
    parser.add_argument("--unique", action = "store_true")
    parser.add_argument("--ids", action = "store_true")
    parser.add_argument("--strings", action = "store_true")
    parser.add_argument("--framework", action = "append", default = [])
    parser.add_argument("--gold",
                        type = argparse.FileType("r", encoding = ENCODING))
    parser.add_argument("--alignment",
                        type = argparse.FileType("r", encoding = ENCODING))
    parser.add_argument("--overlay",
                        type = argparse.FileType("w", encoding = ENCODING))
    parser.add_argument("--format")
    parser.add_argument("--score")
    parser.add_argument("--validate", action = "append", default = [])
    parser.add_argument("--limit")
    parser.add_argument("--read", required = True)
    parser.add_argument("--write")
    parser.add_argument("--text")
    parser.add_argument("--inverse", action = "store_true")
    parser.add_argument("--anchors",
                        type = argparse.FileType("r", encoding = ENCODING))
    parser.add_argument("--prefix")
    parser.add_argument("--source")
    parser.add_argument("--targets")
    parser.add_argument("--pretty", action = "store_true")
    parser.add_argument("--inject")
    parser.add_argument("--version", type = float, default = 1.1)
    parser.add_argument("--cores", type = int, default = 1)
    parser.add_argument("--i", type = int)
    parser.add_argument("--n", type = int)
    parser.add_argument("--id")
    parser.add_argument("--filter")
    parser.add_argument("--quiet", action = "store_true")
    parser.add_argument("--robust", action = "store_true")
    parser.add_argument("--trace", "-t", action = "count", default = 0)
    parser.add_argument("--strict", action = "count", default = 0)
    parser.add_argument("--errors",
                        type = argparse.FileType("w", encoding = ENCODING))
    parser.add_argument("input", nargs = "?",
                        type = argparse.FileType("r", encoding = ENCODING),
                        default = sys.stdin)
    parser.add_argument("output", nargs = "?",
                        type = argparse.FileType("w", encoding = ENCODING),
                        default = sys.stdout)
    if args is None:
        # skip the program name: parse_args() treats every element as an
        # argument, so passing sys.argv unchanged would bind the script path
        # to the .input. positional (and the actual input file to .output.,
        # which is opened for writing).
        args = sys.argv[1:]
    arguments = parser.parse_args(args)

    # --text is either a file of tab-separated (identifier, string) pairs,
    # loaded into a dictionary, or a directory, handed on as a path.
    text = None
    if arguments.text is not None:
        path = Path(arguments.text)
        if path.is_file():
            text = {}
            with path.open() as stream:
                for line in stream:
                    key, string = line.split("\t", maxsplit = 1)
                    if string.endswith("\n"):
                        string = string[:-1]
                    if arguments.inverse:
                        text[string] = key
                    else:
                        text[key] = string
        elif path.is_dir():
            text = path
    elif arguments.inverse:
        print("main.py(): option ‘--inverse’ requires ‘--text’; exit.",
              file = sys.stderr)
        sys.exit(1)

    if arguments.read not in {"mrp",
                              "ccd", "dm", "pas", "psd", "treex",
                              "eds", "ucca",
                              "amr", "camr", "pmb",
                              "conllu", "ud", "eud",
                              "norec"}:
        print("main.py(): invalid input format: {}; exit."
              "".format(arguments.read), file = sys.stderr)
        sys.exit(1)

    # --filter names a file whose first tab-separated column lists the
    # identifiers of the graphs to keep.
    selection = None
    if arguments.filter is not None:
        try:
            path = Path(arguments.filter)
            selection = set()
            with path.open() as stream:
                for line in stream:
                    # drop the line terminator first, so that lines without
                    # a tab do not yield identifiers ending in a newline
                    selection.add(
                        line.rstrip("\n").split("\t", maxsplit = 1)[0])
        except (OSError, UnicodeError):
            print("main.py(): invalid ‘--filter’: {}; exit."
                  "".format(arguments.filter), file = sys.stderr)
            sys.exit(1)
    if selection is not None and len(selection) == 0:
        selection = None

    if arguments.write is not None and \
       arguments.write not in \
       {"dot", "tikz", "displacy", "evaluation", "id", "json", "mrp",
        "source", "targets", "txt", "ucca", "norec"}:
        print("main.py(): invalid output format: {}; exit."
              "".format(arguments.write), file = sys.stderr)
        sys.exit(1)

    # .mces. is accepted as another name for the .mrp. metric
    if arguments.score == "mces":
        arguments.score = "mrp"
    if arguments.score is not None and \
       arguments.score not in {"mrp", "sdp", "edm", "ucca", "smatch"}:
        print("main.py(): invalid evaluation metric: {}; exit."
              "".format(arguments.score), file = sys.stderr)
        sys.exit(1)

    if arguments.format and \
       arguments.format not in {"mrp",
                                "ccd", "dm", "pas", "psd",
                                "eds", "ucca",
                                "amr", "camr", "pmb",
                                "conllu", "ud", "eud"}:
        print("main.py(): invalid gold format: {}; exit."
              "".format(arguments.format), file = sys.stderr)
        sys.exit(1)

    if len(arguments.normalize) == 1 and arguments.normalize[0] == "all":
        normalize = NORMALIZATIONS
    else:
        normalize = set()
        for action in arguments.normalize:
            if action in NORMALIZATIONS:
                normalize.add(action)
            else:
                print("main.py(): invalid type of normalization: {}; exit."
                      "".format(action), file = sys.stderr)
                sys.exit(1)
    # scoring without explicit normalization applies all normalizations
    if arguments.score is not None and len(normalize) == 0:
        normalize = NORMALIZATIONS

    if arguments.targets == "gather" and not arguments.unique:
        print("main.py(): option ‘--targets gather’ requires ‘--unique’; exit.",
              file = sys.stderr)
        sys.exit(1)

    if arguments.alignment is not None and arguments.overlay is None:
        print("main.py(): option ‘--alignment’ requires ‘--overlay’; exit.",
              file = sys.stderr)
        sys.exit(1)

    if len(arguments.framework) == 0:
        arguments.framework = None

    # a core count of zero requests all available processors
    if arguments.cores == 0:
        arguments.cores = mp.cpu_count()

    graphs, overlays \
        = read_graphs(arguments.input, format = arguments.read,
                      full = arguments.full, normalize = normalize,
                      reify = arguments.reify,
                      frameworks = arguments.framework,
                      text = text, filter = selection,
                      alignment = arguments.alignment,
                      anchors = arguments.anchors, pretty = arguments.pretty,
                      trace = arguments.trace, strict = arguments.strict,
                      node_centric = arguments.node_centric,
                      quiet = arguments.quiet, robust = arguments.robust,
                      id = arguments.id, n = arguments.n, i = arguments.i)
    if graphs is None:
        print("main.py(): unable to read input graphs: {}; exit."
              "".format(arguments.input.name), file = sys.stderr)
        sys.exit(1)

    if arguments.unique:
        # keep the first graph for each identifier; with .--targets gather.,
        # first collect the frameworks seen per identifier and record them
        # as targets on the graphs.
        targets = dict()
        if arguments.targets == "gather":
            for graph in graphs:
                if graph.id in targets:
                    targets[graph.id].add(graph.framework)
                else:
                    targets[graph.id] = {graph.framework}
            arguments.targets = None
        unique = list()
        ids = set()
        for graph in graphs:
            identifier = graph.id
            if identifier in targets:
                graph.targets(list(targets[identifier]))
            if identifier not in ids:
                ids.add(identifier)
                unique.append(graph)
        graphs = unique

    # apply additional information provided on the command line
    if arguments.source:
        for graph in graphs:
            graph.source(arguments.source)
    if arguments.inject:
        for graph in graphs:
            graph.inject(arguments.inject)

    if arguments.validate == ["all"]:
        actions = VALIDATIONS
    else:
        actions = set()
        for action in arguments.validate:
            if action in VALIDATIONS:
                actions.add(action)
            else:
                print("main.py(): invalid type of validation: {}; exit."
                      "".format(action), file = sys.stderr)
                sys.exit(1)

    if arguments.quiet:
        arguments.trace = 0

    if actions:
        for graph in graphs:
            validate.core.test(graph, actions, stream = sys.stderr)

    if arguments.analyze:
        analyze(graphs)

    gold = None
    # NOTE(review): with .--inspect. this branch is taken even when no
    # .--gold. file was given, in which case read_graphs() receives None as
    # its stream; confirm whether that is intended.
    if (arguments.gold and arguments.score) or arguments.inspect:
        if arguments.format is None:
            arguments.format = arguments.read
        gold, _ = read_graphs(arguments.gold, format = arguments.format,
                              full = arguments.full, normalize = normalize,
                              reify = arguments.reify,
                              node_centric = arguments.node_centric,
                              frameworks = arguments.framework,
                              text = text, filter = selection,
                              trace = arguments.trace,
                              quiet = arguments.quiet,
                              robust = arguments.robust,
                              id = arguments.id, n = arguments.n,
                              i = arguments.i)
        if gold is None:
            print("main.py(): unable to read gold graphs: {}; exit."
                  "".format(arguments.gold.name), file = sys.stderr)
            sys.exit(1)

    if arguments.inspect:
        result = inspector.summarize(graphs, gold)
        # the condition is always true: JSON is the only output format here
        if arguments.write == "json" or True:
            json.dump(result, arguments.output, indent = None)
            print(file = arguments.output)
        sys.exit(0)

    if arguments.score:
        limits = {"rrhc": None, "mces": None}
        for metric in arguments.score.split(","):
            if arguments.limit is not None:
                # --limit is either .<rrhc>:<mces>. or one integer, which
                # applies to .rrhc. for smatch and to .mces. otherwise
                try:
                    match = re.search(r"([0-9]+):([0-9]+)", arguments.limit)
                    if match:
                        limits["rrhc"] = int(match.group(1))
                        limits["mces"] = int(match.group(2))
                    elif metric == "smatch":
                        limits["rrhc"] = int(arguments.limit)
                    else:
                        limits["mces"] = int(arguments.limit)
                except ValueError:
                    print("main.py(): invalid ‘--limit’ {}; exit."
                          "".format(arguments.limit),
                          file = sys.stderr)
                    sys.exit(1)
            errors = dict() if arguments.errors else None
            result = None
            launch = time.time(), time.process_time()
            if metric == "edm":
                result = score.edm.evaluate(gold, graphs,
                                            format = arguments.write,
                                            trace = arguments.trace)
            elif metric == "mrp":
                result = score.mces.evaluate(gold, graphs,
                                             format = arguments.write,
                                             limits = limits,
                                             cores = arguments.cores,
                                             trace = arguments.trace,
                                             errors = errors,
                                             quiet = arguments.quiet)
            elif metric == "sdp":
                result = score.sdp.evaluate(gold, graphs,
                                            format = arguments.write,
                                            trace = arguments.trace)
            elif metric == "smatch":
                result = score.smatch.evaluate(gold, graphs,
                                               format = arguments.write,
                                               limit = limits["rrhc"],
                                               values = {"tops", "labels",
                                                         "properties",
                                                         "anchors",
                                                         "edges",
                                                         "attributes"},
                                               trace = arguments.trace)
            elif metric == "ucca":
                result = score.ucca.evaluate(gold, graphs,
                                             format = arguments.write,
                                             trace = arguments.trace)

            if result is not None:
                result["time"] = time.time() - launch[0]
                result["cpu"] = time.process_time() - launch[1]
                # the condition is always true: JSON is the only format
                if arguments.write == "json" or True:
                    # write one top-level key per line, with each value
                    # serialized compactly
                    print("{", file = arguments.output, end = "")
                    start = True
                    for key in result:
                        if start:
                            start = False
                        else:
                            print(",\n ", file = arguments.output, end = "")
                        print("\"{}\": ".format(key),
                              file = arguments.output, end = "")
                        json.dump(result[key], arguments.output,
                                  indent = None)
                    print("}", file = arguments.output)

            if errors is not None:
                if arguments.write == "dot":
                    for graph in gold:
                        graph.dot(arguments.errors,
                                  ids = arguments.ids,
                                  strings = arguments.strings,
                                  errors = errors[graph.framework][graph.id])
                elif arguments.write == "json" or True:
                    json.dump(errors, arguments.errors, indent = None)
        sys.exit(0)

    for graph in graphs:
        if arguments.write in {"mrp", "evaluation"}:
            if arguments.write == "evaluation":
                # the evaluation format carries no graph content
                graph.flavor = graph.framework \
                    = graph.nodes = graph.edges = None
            if arguments.targets is not None:
                graph.targets(arguments.targets.split(","))
            json.dump(graph.encode(arguments.version), arguments.output,
                      indent = None, ensure_ascii = False)
            print(file = arguments.output)
        elif arguments.write == "dot":
            graph.dot(arguments.output,
                      ids = arguments.ids, strings = arguments.strings)
            print(file = arguments.output)
        elif arguments.write == "tikz":
            graph.tikz(arguments.output)
        elif arguments.write == "displacy":
            graph.displacy(arguments.output)
        elif arguments.write == "id":
            print("{}".format(graph.id), file = arguments.output)
        elif arguments.write == "source":
            print("{}\t{}".format(graph.id, graph.source()),
                  file = arguments.output)
        elif arguments.write == "targets":
            for target in graph.targets() or (""):
                print("{}\t{}".format(graph.id, target),
                      file = arguments.output)
        elif arguments.write == "txt":
            print("{}\t{}".format(graph.id, graph.input),
                  file = arguments.output)
        elif arguments.write == "ucca":
            codec.ucca.write(graph, graph.input, file = arguments.output)

    # the NoReC format is written as one JSON list covering all graphs
    if arguments.write == "norec":
        norec_graphs = [codec.norec.write(graph, graph.input,
                                          node_centric = arguments.node_centric)
                        for graph in graphs]
        json.dump(norec_graphs, arguments.output, indent = None)

    if arguments.overlay:
        for graph in overlays:
            if graph:
                json.dump(graph.encode(arguments.version), arguments.overlay,
                          indent = None, ensure_ascii = False)
                print(file = arguments.overlay)
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
|