File size: 1,943 Bytes
49079cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ca523c
 
49079cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re





def answer_cleansing_zero_shot(dataset, pred, must_choice=False):
    """Extract the final answer from a raw model prediction.

    The prediction is stripped, a list of answer candidates is built
    according to ``dataset``, and the first candidate is returned with a
    single trailing period removed.

    Args:
        dataset: Task type. One of ``"commonsense-mc"``, ``"arithmetic"``,
            ``"commonsense-verify"``, ``"symbolic-coin"``,
            ``"symbolic-letter"`` or ``"UNDEFINED"``. The name must match
            exactly; partial names are rejected.
        pred: Raw prediction text.
        must_choice: Only used for ``"arithmetic"``. When true, the answer
            is an option letter A-D instead of a number.

    Returns:
        The cleaned answer string, or ``""`` when no candidate is found.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    pred = pred.strip()

    # Compare with ``==``: ``dataset in ("commonsense-mc")`` is a substring
    # test against a plain string (the parentheses do not make a tuple), so
    # values such as "" or "mc" used to be accepted silently.
    if dataset == "commonsense-mc":
        candidates = re.findall(r"[A-E]", pred)
    elif dataset == "arithmetic":
        if must_choice:
            candidates = re.findall(r"[A-D]", pred)
        else:
            # Drop thousands separators so "1,234" is read as one number.
            candidates = re.findall(r"-?\d+\.?\d*", pred.replace(",", ""))
    elif dataset in ("commonsense-verify", "symbolic-coin"):
        # Turn quotes, punctuation and whitespace into spaces, then keep
        # only the standalone yes/no tokens.
        tokens = re.sub(r"[\"'.:,\s]", " ", pred.lower()).split(" ")
        candidates = [tok for tok in tokens if tok in ("yes", "no")]
    elif dataset == "symbolic-letter":
        # Remove quotes, periods and all whitespace; the rest is the answer.
        candidates = [re.sub(r"[\"'.\s]", "", pred)]
    elif dataset == "UNDEFINED":
        # NOTE(review): the string itself is used as the candidate sequence,
        # so only its first character is returned. Kept as-is for backward
        # compatibility -- confirm whether the full text was intended.
        candidates = pred
    else:
        raise ValueError("dataset is not properly defined ...")

    # No candidate found: the answer is empty.
    if not candidates:
        return ""

    # The first candidate wins.
    answer = candidates[0]

    # The number pattern can capture a sentence-ending period ("42.").
    if answer.endswith("."):
        answer = answer[:-1]

    return answer

def type_cleasing(type):
    """Return the first known task-type name found in ``type``.

    The text is scanned for one of ``arithmetic``, ``commonsense-mc``,
    ``commonsense-verify``, ``symbolic-coin`` or ``symbolic-letter``.
    The earliest occurrence is returned; ``"UNDEFINED"`` is returned when
    none of them appears.
    """
    found = re.search(
        r'arithmetic|commonsense-mc|commonsense-verify|symbolic-coin|symbolic-letter',
        type,
    )
    if found is None:
        return "UNDEFINED"
    return found.group(0)


def entity_cleansing(ent):
    """Split a raw entity listing into a list of clean entity strings.

    Newlines, hyphens (with any surrounding whitespace) and periods are
    treated as separators in addition to commas. Each piece is stripped of
    surrounding whitespace, and pieces that end up empty are dropped.

    Args:
        ent: Raw text listing entities.

    Returns:
        List of non-empty, stripped entity strings in their original order.
    """
    # Normalise every separator to a comma so one split handles them all.
    normalised = re.sub(r"\n|\s*-\s*|\.", ",", ent)
    # Strip before filtering: whitespace-only pieces (e.g. the space between
    # a period and a newline) would otherwise come back as "" entities.
    stripped = (piece.strip() for piece in normalised.split(","))
    return [piece for piece in stripped if piece]

def knowledge_cleansing(knowledge):
    """Normalise a generated knowledge statement.

    Surrounding whitespace is stripped, a leading ``"No, "`` is removed,
    and every remaining whitespace character (newline, tab, ...) is
    replaced by a single space. Runs of whitespace are not collapsed.

    Args:
        knowledge: Raw knowledge text.

    Returns:
        The cleaned knowledge string.
    """
    knowledge = knowledge.strip()
    prefix = "No, "
    if knowledge.startswith(prefix):
        # Remove only the leading prefix. Substituting the pattern over the
        # whole string also deleted later "No, " occurrences in the text.
        knowledge = knowledge[len(prefix):]
    return re.sub(r"\s", " ", knowledge)