|
import re; |
|
import sys; |
|
|
|
from graph import Graph; |
|
|
|
# Comment line that carries the raw sentence text ("# text = ...").
TEXT = re.compile(r"^# text = (.+)$")

# Comment line that carries the sentence identifier ("# sent_id = ...").
ID = re.compile(r"^# sent_id = (.+)$")

# An ID column value of the range form "i-j"; groups are the two bounds.
RANGE = re.compile(r"^([0-9]+)-([0-9]+)$")

# A "TokenRange=<from>:<to>" entry anywhere in a string; groups are the
# two character offsets.
ANCHOR = re.compile(r".*TokenRange=([0-9]+):([0-9]+)")
|
|
|
def read_tuples(stream):
    """ Split a CoNLL-U stream into sentence blocks.

    Iterates over the lines of `stream` and yields one triple
    (id, input, tuples) per non-empty block, where

    - id is the value of the "# sent_id = ..." comment, or None;
    - input is the value of the "# text = ..." comment; when the block has
      no such comment it is rebuilt by reconstruct_input_from_tuples();
    - tuples is the list of the block's non-comment lines, each split on
      tab characters.

    A block ends at an empty line or at the end of the stream, so the last
    sentence is returned even when the stream lacks a trailing blank line.
    Comment lines other than the two above are ignored. """
    sentence_id, text = None, None
    rows = []
    for line in stream:
        line = line.rstrip()
        if line.startswith("#"):
            match = TEXT.match(line)
            if match:
                text = match.group(1)
                continue
            match = ID.match(line)
            if match:
                sentence_id = match.group(1)
            continue
        if line:
            rows.append(line.split("\t"))
            continue
        # Empty line: close the current block.  Blocks without any token
        # line are skipped, but the pending comments are still discarded.
        if rows:
            if text is None:
                # Without a `text` comment the sentence has to be rebuilt
                # from the token lines.
                text = reconstruct_input_from_tuples(rows)
            yield sentence_id, text, rows
        sentence_id, text = None, None
        rows = []
    # Flush a final block that was not terminated by an empty line.
    if rows:
        if text is None:
            text = reconstruct_input_from_tuples(rows)
        yield sentence_id, text, rows
|
|
|
def reconstruct_input_from_tuples(tuples):
    """ Reconstruct the input sentence from its CoNLL-U representation.

    Each element of `tuples` corresponds to one line of a sentence block
    (a sequence of column values).  Only surface tokens, as flagged by
    get_is_surface_token_indicator(), contribute: their second column is
    concatenated, followed by a single space unless the last column
    contains "SpaceAfter=No".  No space is ever appended after the last
    surface token.

    Returns the reconstructed string; '' when `tuples` is empty. """
    if not tuples:
        return ''

    # Keep surface tokens only: empty nodes and words covered by a
    # multi-word range are dropped.
    surface_flags = get_is_surface_token_indicator(tuples)
    surface_rows = [row for keep, row in zip(surface_flags, tuples) if keep]

    pieces = []
    last_position = len(surface_rows) - 1
    for position, row in enumerate(surface_rows):
        pieces.append(row[1])
        # The end of the sentence is the last *surface* token, which need
        # not be the last line of the block (the block may end with the
        # words of a multi-word token or with an empty node).
        if position != last_position and "SpaceAfter=No" not in row[-1]:
            pieces.append(' ')
    return ''.join(pieces)
|
|
|
def get_ids2range_tuple(tuples):
    """
    Map word ids covered by a multi-word token to that token's line.

    For every element of `tuples` whose first column has the range form
    "i-j", each integer k with i <= k <= j is mapped to that element.
    Elements whose first column is not a range contribute nothing; when
    two ranges cover the same k, the later element wins.

    Returns a dict {int: element of tuples}.
    """
    covered = {}
    for row in tuples:
        span = RANGE.match(row[0])
        if span is None:
            continue
        first, last = int(span.group(1)), int(span.group(2))
        covered.update(dict.fromkeys(range(first, last + 1), row))
    return covered
|
|
|
def get_is_surface_token_indicator(tuples):
    """
    Flag which lines of a sentence block are surface tokens.

    Returns a list of booleans with the same length as `tuples`; entry i
    tells whether tuples[i] is a surface token, i.e. a token needed to
    detokenize the input sentence.
    See https://universaldependencies.org/format.html#words-tokens-and-empty-nodes

    A line is a surface token when
    1. it is not an empty node (its id has no "." in it), and
    2. it is either a range line itself (id of the form "i-j") or a word
       whose id is not covered by any range line.
    """
    covered_ids = get_ids2range_tuple(tuples)
    flags = []
    for row in tuples:
        token_id = row[0]
        if "." in token_id:
            # empty node
            flags.append(False)
        elif "-" in token_id:
            # the multi-word token line itself
            flags.append(True)
        else:
            flags.append(int(token_id) not in covered_ids)
    return flags
|
|
|
def read_anchors(stream):
    """ Read per-sentence token anchors from `stream`.

    When `stream` is None the generator yields (None, None) forever.

    Otherwise the lines of `stream` are grouped into blocks separated by
    empty lines, and one pair (id, tokens) is yielded per block:

    - id is the text following the leading "#" of the most recent line
      starting with "#" in the block, or None;
    - tokens is a list of (int, int) pairs built from the first two fields
      of every line that has exactly three tab-separated fields; other
      lines are ignored.

    A pair is yielded at every empty line (even if no token was read) and
    once more at the end of the stream if tokens are pending.  Every
    yielded list is a distinct object, so callers may keep or modify it
    without affecting later blocks. """
    if stream is None:
        while True:
            yield None, None
    else:
        block_id = None
        tokens = []
        for line in stream:
            line = line.rstrip("\n")
            if not line:
                yield block_id, tokens
                block_id = None
                # Start a fresh list instead of clearing the one that was
                # just handed out: the consumer may still hold on to it.
                tokens = []
            elif line.startswith("#"):
                block_id = line[1:]
            else:
                fields = line.split("\t")
                if len(fields) == 3:
                    tokens.append((int(fields[0]), int(fields[1])))
        if tokens:
            yield block_id, tokens
|
|
|
def construct_graph_nodes(id, input, tuples, framework, text, anchors):
    """ Build a Graph with one node per line of a sentence block (no edges).

    Parameters:
      id         identifier passed to the Graph constructor
      input      sentence string; when None, `text` is used instead
      tuples     lines of the block, each a sequence of CoNLL-U columns
                 (columns 0-6 and 9 are read here)
      framework  passed to the Graph constructor
      text       fallback sentence string, used only if `input` is None
      anchors    stream handed to read_anchors(), or None

    Node identifiers are assigned 1, 2, ... in line order.  Each node gets
    the form as label, the lemma/upos/xpos columns plus all "name=value"
    features as properties, and `top` set when the head column is "0".

    Anchoring: lines that are not surface tokens get no anchors.  Surface
    tokens take the next pair from the anchors stream if one was given;
    otherwise a "TokenRange=from:to" entry of the MISC column is used, and
    failing that the form is searched for in the sentence string, moving
    left to right.

    Returns (graph, ids), where ids maps the first column of every line to
    its node identifier.

    Raises Exception when a form cannot be located in the sentence string. """
    # Alternative spellings tried when a form is not found verbatim: one
    # (character, replacement) pair is applied at a time.  The typographic
    # characters are written as escapes so that they survive re-encoding,
    # and the table is an ordered tuple so that lookups do not depend on
    # hash randomization.
    replacements = (
        ("\u2018", "`"),        # LEFT SINGLE QUOTATION MARK
        ("\u2018", "'"),
        ("\u2019", "'"),        # RIGHT SINGLE QUOTATION MARK
        ("`", "'"),
        ("\u201c", "\""),       # LEFT DOUBLE QUOTATION MARK
        ("\u201d", "\""),       # RIGHT DOUBLE QUOTATION MARK
        ("\u2013", "--"),       # EN DASH
        ("\u2013", "---"),
        ("\u2014", "---"),      # EM DASH
        ("\u2026", "..."),      # HORIZONTAL ELLIPSIS
        ("\u2026", ". . ."),
    )

    cursor = 0  # offset in `input` up to which forms have been anchored

    def compute(form):
        """ Locate `form` in `input` at or after `cursor`; return its span. """
        nonlocal cursor
        length = None
        start = input.find(form, cursor)
        if start >= cursor:
            cursor, length = start, len(form)
        else:
            # Pick the variant that occurs earliest; among variants found
            # at the same offset prefer the longest one.
            best_start, best_length = len(input), 0
            for old, new in replacements:
                variant = form.replace(old, new)
                start = input.find(variant, cursor)
                if start < cursor:
                    continue
                if start < best_start \
                   or (start == best_start and len(variant) > best_length):
                    best_start, best_length = start, len(variant)
            if best_start < len(input):
                cursor, length = best_start, best_length
        if length:
            match = {"from": cursor, "to": cursor + length}
            cursor += length
            return match
        # `graph` is bound below, before compute() is first called.
        raise Exception("[{}] failed to anchor |{}| in |{}|{}| ({})"
                        "".format(graph.id, form,
                                  input[:cursor], input[cursor:], cursor))

    graph = Graph(id, flavor = 0, framework = framework)
    if input is not None:
        graph.add_input(input)
    elif text is not None:
        graph.add_input(text)
    # From here on, work with the string as stored on the graph.
    input = graph.input

    # Only the first block of the anchors stream is consumed per call.
    anchors_generator = read_anchors(anchors)
    _, anchors_tokens = next(anchors_generator)

    ids = dict()
    ids2range_tuple = get_ids2range_tuple(tuples)
    surface_flags = get_is_surface_token_indicator(tuples)
    for node_id, (row, is_surface_token) \
            in enumerate(zip(tuples, surface_flags), start = 1):
        ids[row[0]] = node_id
        form, lemma, upos, xpos, features, head, misc = \
            row[1], row[2], row[3], row[4], row[5], row[6], row[9]
        properties = {"lemma": lemma, "upos": upos, "xpos": xpos}
        if features != "_":
            for feature in features.split("|"):
                name, value = feature.split("=", 1)
                properties[name] = value

        # Retrieve anchoring -- only for surface tokens.
        if not is_surface_token:
            node_anchors = []
        elif anchors_tokens is not None:
            start, end = anchors_tokens.pop(0)
            node_anchors = [{"from": start, "to": end}]
        else:
            tid = row[0]
            # NOTE(review): a surface token with a purely numeric id is by
            # definition not covered by a range, so this lookup looks
            # unreachable -- confirm before removing.
            if tid.isnumeric() and int(tid) in ids2range_tuple:
                range_tuple_misc = ids2range_tuple[int(tid)][9]
                if range_tuple_misc != "_":
                    misc = range_tuple_misc
            match = ANCHOR.match(misc)
            if match:
                node_anchors = [{"from": int(match.group(1)),
                                 "to": int(match.group(2))}]
            else:
                node_anchors = [compute(form)]
        graph.add_node(node_id, label = form,
                       properties = list(properties.keys()),
                       values = list(properties.values()),
                       top = head == "0",
                       anchors = node_anchors)
    return graph, ids
|
|
|
def construct_graph_edges(tuples, graph, ids):
    """ Add the basic dependency edges of a sentence block to `graph`.

    The graph's nodes and the id mapping `ids` (first column -> node id)
    must already exist.  For every line, columns 6 and 7 give the head and
    the edge label; an edge head -> line is added whenever the head is a
    key of `ids`, and the line is skipped otherwise.

    Modifies `graph`; returns nothing. """
    for row in tuples:
        dependent, governor, relation = row[0], row[6], row[7]
        if governor not in ids:
            continue
        graph.add_edge(ids[governor], ids[dependent], relation)
|
|
|
def construct_enhanced_graph_edges(tuples, graph, ids):
    """ Add the Enhanced UD edges of a sentence block to `graph`.

    The graph's nodes and the id mapping `ids` (first column -> node id)
    must already exist.  Unlike construct_graph_edges(), edges are taken
    from the DEPS column (index 8) rather than from HEAD and DEPREL; see
    https://universaldependencies.org/format.html#syntactic-annotation
    for the format, which is followed here.

    DEPS is a "|"-separated list of "head:label" entries; everything after
    the first ":" is the label.  A value of "_" means no edges.  Entries
    whose head is not a key of `ids` are skipped.

    Modifies `graph`; returns nothing. """
    for row in tuples:
        dependent, deps = row[0], row[8]
        if deps != "_":
            for entry in deps.split("|"):
                governor, relation = entry.split(":", 1)
                if governor in ids:
                    graph.add_edge(ids[governor], ids[dependent], relation)
|
|
|
|
|
def construct_graph(id, input, tuples, framework = None, text = None, anchors = None, enhanced_graph=False):
    """ Build the complete graph (nodes and edges) for one sentence block.

    Nodes are created by construct_graph_nodes().  Edges come from
    construct_enhanced_graph_edges() when `enhanced_graph` is true, and
    from construct_graph_edges() otherwise.

    Returns the graph. """
    graph, ids = construct_graph_nodes(id, input, tuples, framework, text, anchors)
    if enhanced_graph:
        # edges from the DEPS column
        construct_enhanced_graph_edges(tuples, graph, ids)
    else:
        # edges from the HEAD and DEPREL columns
        construct_graph_edges(tuples, graph, ids)
    return graph
|
|
|
def read(stream, framework = None, text = None, anchors = None, trace = 0, enhanced_graph=False):
    """ Read graphs from a CoNLL-U stream.

    For every sentence block returned by read_tuples(stream) a graph is
    built with construct_graph() and the pair (graph, None) is yielded.
    When `trace` is true, a progress line naming the sentence id is
    printed to standard error before each graph is built. """
    for sentence_id, sentence, rows in read_tuples(stream):
        if trace:
            message = "conllu.read(): processing graph #{} ...".format(sentence_id)
            print(message, file = sys.stderr)
        yield construct_graph(sentence_id, sentence, rows,
                              framework, text, anchors, enhanced_graph), None
|
|