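"""Aggregate per-protein evaluation scores into per-model averages.

Loads the pickled per-protein score dictionary for the given model, averages
the per-sentence BERTScore values within each protein, then averages every
metric (GPT score, PubMedBERT score, ROUGE, BERTScore, BLEU, METEOR) across
all proteins and writes the result to a JSON file.
"""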
import pickle
import os
import argparse
import tqdm
import json

# Directory holding the per-protein score pickles, and the directory where the
# averaged per-model scores are written.
SCORES_PATH = "/home/ubuntu/proteinchat/eval/results/scores"
AVGS_PATH = "/home/ubuntu/proteinchat/eval/results/avgs"

def parse_args():
    parser = argparse.ArgumentParser(description="Average per-protein evaluation scores for a model.")
    parser.add_argument("--model", type=str, required=True, help="name of the model whose scores should be averaged.")
    args = parser.parse_args()
    return args

args = parse_args()

with open(os.path.join(SCORES_PATH, f"{args.model}_score_output.pickle"), "rb") as f:
    prot_scores = pickle.load(f)

# First, average the per-sentence BERTScore values within each protein.
for prot in tqdm.tqdm(prot_scores):
    p_sum = 0
    r_sum = 0
    f1_sum = 0
    l = len(prot_scores[prot]["bert_score"]["precision"])  
    for i in range(0, l):
        p_sum += prot_scores[prot]["bert_score"]["precision"][i]
        r_sum += prot_scores[prot]["bert_score"]["recall"][i]
        f1_sum += prot_scores[prot]["bert_score"]["f1"][i]

    prot_scores[prot]["bert_score"]["precision"] = p_sum / l
    prot_scores[prot]["bert_score"]["recall"] = r_sum / l
    prot_scores[prot]["bert_score"]["f1"] = f1_sum / l

results = {
    "gpt_score": {},
    "pubmedbert_score": {},
    "rouge": {},
    "bert_score": {},
    "bleu": {},
    "meteor": {},
    "mauve": {},
}

# Running sums across all proteins for each metric.
gpt_p_sum = 0
gpt_r_sum = 0
gpt_f1_sum = 0

medbert_p_sum = 0
medbert_r_sum = 0
medbert_f1_sum = 0

rouge_1_sum = 0 
rouge_2_sum = 0 
rouge_L_sum = 0 
rouge_Ls_sum = 0 

bert_p_sum = 0
bert_r_sum = 0
bert_f1_sum = 0

bleu_sum = 0
bleu_p_1_sum = 0
bleu_p_2_sum = 0
bleu_p_3_sum = 0
bleu_p_4_sum = 0
bleu_bp_sum = 0
bleu_lr_sum = 0
bleu_tl_sum = 0
bleu_rl_sum = 0

meteor_sum = 0

# Accumulate every metric across proteins.
for prot in tqdm.tqdm(prot_scores):
    gpt_p_sum += prot_scores[prot]["gpt_score"]["precision"]
    gpt_r_sum += prot_scores[prot]["gpt_score"]["recall"]
    gpt_f1_sum += prot_scores[prot]["gpt_score"]["f1_score"]

    
    medbert_p_sum += prot_scores[prot]["pubmedbert_score"]["precision"]
    medbert_r_sum += prot_scores[prot]["pubmedbert_score"]["recall"]
    medbert_f1_sum += prot_scores[prot]["pubmedbert_score"]["f1_score"]

    rouge_1_sum += prot_scores[prot]["rouge"]["rouge1"]
    rouge_2_sum += prot_scores[prot]["rouge"]["rouge2"] 
    rouge_L_sum += prot_scores[prot]["rouge"]["rougeL"]
    rouge_Ls_sum += prot_scores[prot]["rouge"]["rougeLsum"]

    bert_p_sum += prot_scores[prot]["bert_score"]["precision"]
    bert_r_sum += prot_scores[prot]["bert_score"]["recall"]
    bert_f1_sum += prot_scores[prot]["bert_score"]["f1"]

    bleu_sum += prot_scores[prot]["bleu"]["bleu"]
    bleu_p_1_sum += prot_scores[prot]["bleu"]["precisions"][0]
    bleu_p_2_sum += prot_scores[prot]["bleu"]["precisions"][1]
    bleu_p_3_sum += prot_scores[prot]["bleu"]["precisions"][2]
    bleu_p_4_sum += prot_scores[prot]["bleu"]["precisions"][3]
    bleu_bp_sum += prot_scores[prot]["bleu"]["brevity_penalty"]
    bleu_lr_sum += prot_scores[prot]["bleu"]["length_ratio"]
    bleu_tl_sum += prot_scores[prot]["bleu"]["translation_length"]
    bleu_rl_sum += prot_scores[prot]["bleu"]["reference_length"]

    meteor_sum += prot_scores[prot]["meteor"]["meteor"]

# Average each running sum over the number of proteins.
l = len(prot_scores)
results["gpt_score"]["precision"] = gpt_p_sum / l
results["gpt_score"]["recall"] = gpt_r_sum / l
results["gpt_score"]["f1_score"] = gpt_f1_sum / l

results["pubmedbert_score"]["precision"] = medbert_p_sum / l
results["pubmedbert_score"]["recall"] = medbert_r_sum / l
results["pubmedbert_score"]["f1_score"] = medbert_f1_sum / l

results["rouge"]["rouge1"] = rouge_1_sum / l
results["rouge"]["rouge2"] = rouge_2_sum / l
results["rouge"]["rougeL"] = rouge_L_sum / l
results["rouge"]["rougeLsum"] = rouge_Ls_sum / l

results["bert_score"]["precision"] = bert_p_sum / l
results["bert_score"]["recall"] = bert_r_sum / l
results["bert_score"]["f1_score"] = bert_f1_sum / l

results["bleu"]["bleu"] = bleu_sum / l
results["bleu"]["precisions"] = []
results["bleu"]["precisions"].append(bleu_p_1_sum / l)
results["bleu"]["precisions"].append(bleu_p_2_sum / l)
results["bleu"]["precisions"].append(bleu_p_3_sum / l)
results["bleu"]["precisions"].append(bleu_p_4_sum / l)
results["bleu"]["brevity_penalty"] = bleu_bp_sum / l 
results["bleu"]["length_ratio"] = bleu_lr_sum / l
results["bleu"]["translation_length"] = bleu_tl_sum / l
results["bleu"]["reference_length"] = bleu_rl_sum / l

results["meteor"] = meteor_sum / l
# results["mauve"] = 

print(results)

with open(os.path.join(AVGS_PATH, f"{args.model}_avg_scores.json"), "w") as fp:
    json.dump(results, fp, indent=4)
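# Example invocation (the script and model names below are illustrative, not
# taken from the repository):
#   python avg_scores.py --model proteinchat
# This expects <SCORES_PATH>/<model>_score_output.pickle to exist and writes
# <AVGS_PATH>/<model>_avg_scores.json.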