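"""Evaluation script for DocRED-style document-level relation extraction.

Given an input directory containing a submitted result.json and the
ground-truth files, it reports relation F1, evidence F1, and the
"ignore train" F1 variants, and writes the scores to <output_dir>/scores.txt.
The script takes two command-line arguments: the input directory and the
output directory.
"""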
import json
import os
import sys

def gen_train_facts(data_file_name, truth_dir):
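    """Collect (head name, tail name, relation) facts from a training file.

    The resulting set is cached as a .fact file under ``truth_dir`` and is
    reloaded from that cache on later runs instead of being rebuilt.
    """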
    fact_file_name = data_file_name[data_file_name.find("train_"):]
    fact_file_name = os.path.join(truth_dir, fact_file_name.replace(".json", ".fact"))

    # Reuse the cached fact file if it already exists.
    if os.path.exists(fact_file_name):
        fact_in_train = set()
        triples = json.load(open(fact_file_name))
        for x in triples:
            fact_in_train.add(tuple(x))
        return fact_in_train

    # Otherwise collect every (head name, tail name, relation) triple
    # expressed by the labels in the training data.
    fact_in_train = set()
    ori_data = json.load(open(data_file_name))
    for data in ori_data:
        vertexSet = data["vertexSet"]
        for label in data["labels"]:
            rel = label["r"]
            for n1 in vertexSet[label["h"]]:
                for n2 in vertexSet[label["t"]]:
                    fact_in_train.add((n1["name"], n2["name"], rel))

    json.dump(list(fact_in_train), open(fact_file_name, "w"))

    return fact_in_train


input_dir = sys.argv[1]
output_dir = sys.argv[2]
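
# The input directory is expected to contain the submission under "res"
# and the ground truth under "ref".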
submit_dir = os.path.join(input_dir, "res")
truth_dir = os.path.join(input_dir, "ref")

if not os.path.isdir(submit_dir):
    print("%s doesn't exist" % submit_dir)

if os.path.isdir(submit_dir) and os.path.isdir(truth_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Facts expressed in the training data, used later for the
    # "ignore train" precision variants.
    fact_in_train_annotated = gen_train_facts("../data/train_annotated.json", truth_dir)
    fact_in_train_distant = gen_train_facts("../data/train_distant.json", truth_dir)
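
    # scores.txt receives the final metrics; dev_test.json is the gold data
    # the submission is scored against.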
    output_filename = os.path.join(output_dir, "scores.txt")
    output_file = open(output_filename, "w")

    truth_file = os.path.join(truth_dir, "dev_test.json")
    truth = json.load(open(truth_file))
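
    # Index the gold annotations: std maps each (title, relation, head, tail)
    # tuple to its set of evidence sentence indices.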
    std = {}
    tot_evidences = 0
    titleset = set()

    title2vectexSet = {}

    for x in truth:
        title = x["title"]
        titleset.add(title)

        vertexSet = x["vertexSet"]
        title2vectexSet[title] = vertexSet

        for label in x["labels"]:
            r = label["r"]
            h_idx = label["h"]
            t_idx = label["t"]
            std[(title, r, h_idx, t_idx)] = set(label["evidence"])
            tot_evidences += len(label["evidence"])

    tot_relations = len(std)
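
    # Load the submitted predictions and drop duplicate
    # (title, head, tail, relation) entries after sorting.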
    submission_answer_file = os.path.join(submit_dir, "result.json")
    tmp = json.load(open(submission_answer_file))
    tmp.sort(key=lambda x: (x["title"], x["h_idx"], x["t_idx"], x["r"]))
    submission_answer = tmp[:1]  # tolerate an empty submission
    for i in range(1, len(tmp)):
        x = tmp[i]
        y = tmp[i - 1]
        if (x["title"], x["h_idx"], x["t_idx"], x["r"]) != (y["title"], y["h_idx"], y["t_idx"], y["r"]):
            submission_answer.append(tmp[i])

    correct_re = 0
    correct_evidence = 0
    pred_evi = 0

    correct_in_train_annotated = 0
    correct_in_train_distant = 0
    titleset2 = set()
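
    # Score each predicted triple against the gold annotations.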
    for x in submission_answer:
        title = x["title"]
        h_idx = x["h_idx"]
        t_idx = x["t_idx"]
        r = x["r"]
        titleset2.add(title)
        if title not in title2vectexSet:
            continue
        vertexSet = title2vectexSet[title]

        if "evidence" in x:
            evi = set(x["evidence"])
        else:
            evi = set()
        pred_evi += len(evi)

        if (title, r, h_idx, t_idx) in std:
            correct_re += 1
            stdevi = std[(title, r, h_idx, t_idx)]
            correct_evidence += len(stdevi & evi)
            # A correct prediction counts as "in train" if any alias pair of
            # its head and tail entities expresses this relation in training.
            in_train_annotated = in_train_distant = False
            for n1 in vertexSet[h_idx]:
                for n2 in vertexSet[t_idx]:
                    if (n1["name"], n2["name"], r) in fact_in_train_annotated:
                        in_train_annotated = True
                    if (n1["name"], n2["name"], r) in fact_in_train_distant:
                        in_train_distant = True

            if in_train_annotated:
                correct_in_train_annotated += 1
            if in_train_distant:
                correct_in_train_distant += 1
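
    # Relation extraction precision / recall / F1.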
    re_p = 1.0 * correct_re / len(submission_answer) if submission_answer else 0
    re_r = 1.0 * correct_re / tot_relations
    if re_p + re_r == 0:
        re_f1 = 0
    else:
        re_f1 = 2.0 * re_p * re_r / (re_p + re_r)

    # Evidence prediction precision / recall / F1.
    evi_p = 1.0 * correct_evidence / pred_evi if pred_evi > 0 else 0
    evi_r = 1.0 * correct_evidence / tot_evidences if tot_evidences > 0 else 0
    if evi_p + evi_r == 0:
        evi_f1 = 0
    else:
        evi_f1 = 2.0 * evi_p * evi_r / (evi_p + evi_r)
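
    # "Ignore train" precision discounts correct predictions whose fact was
    # already seen in the annotated or distantly supervised training data.
    # The denominators can be zero, so guard the divisions.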
    re_p_ignore_train_annotated = (
        1.0 * (correct_re - correct_in_train_annotated) / (len(submission_answer) - correct_in_train_annotated)
        if len(submission_answer) - correct_in_train_annotated > 0
        else 0
    )
    re_p_ignore_train = (
        1.0 * (correct_re - correct_in_train_distant) / (len(submission_answer) - correct_in_train_distant)
        if len(submission_answer) - correct_in_train_distant > 0
        else 0
    )

    if re_p_ignore_train_annotated + re_r == 0:
        re_f1_ignore_train_annotated = 0
    else:
        re_f1_ignore_train_annotated = 2.0 * re_p_ignore_train_annotated * re_r / (re_p_ignore_train_annotated + re_r)

    if re_p_ignore_train + re_r == 0:
        re_f1_ignore_train = 0
    else:
        re_f1_ignore_train = 2.0 * re_p_ignore_train * re_r / (re_p_ignore_train + re_r)
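
    # Report all four scores on stdout and in scores.txt.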
    print("RE_F1:", re_f1)
    print("Evi_F1:", evi_f1)
    print("RE_ignore_annotated_F1:", re_f1_ignore_train_annotated)
    print("RE_ignore_distant_F1:", re_f1_ignore_train)

    output_file.write("RE_F1: %f\n" % re_f1)
    output_file.write("Evi_F1: %f\n" % evi_f1)
    output_file.write("RE_ignore_annotated_F1: %f\n" % re_f1_ignore_train_annotated)
    output_file.write("RE_ignore_distant_F1: %f\n" % re_f1_ignore_train)

    output_file.close()