Spaces:
Running
Running
File size: 5,801 Bytes
cc8997b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
import glob
import json
import os
import os
import torch
from tqdm import tqdm
import random
def open_content(path):
    """Load the JSON splits and the label list stored in a dataset directory.

    Every ``*.json`` file directly inside ``path`` is assigned to a slot by
    looking for the substrings ``train``, ``dev``, ``test`` and ``labels``
    (checked in that order) in its file name.

    Args:
        path: Directory containing the dataset's JSON files.

    Returns:
        A ``(train, dev, test, labels)`` tuple holding the parsed content of
        the matching files. A slot is ``None`` when no file matched it.
    """
    train, dev, test, labels = None, None, None, None
    for p in glob.glob(os.path.join(path, "*.json")):
        # Match on the file name only. Matching on the whole path would let a
        # directory called e.g. "train_sets" or "test_data" capture every
        # file, including labels.json.
        name = os.path.basename(p)
        if "train" in name:
            with open(p, "r", encoding="utf-8") as f:
                train = json.load(f)
        elif "dev" in name:
            with open(p, "r", encoding="utf-8") as f:
                dev = json.load(f)
        elif "test" in name:
            with open(p, "r", encoding="utf-8") as f:
                test = json.load(f)
        elif "labels" in name:
            with open(p, "r", encoding="utf-8") as f:
                labels = json.load(f)
    return train, dev, test, labels
def process(data):
    """Convert a character-span annotated sentence into a word-span sample.

    Args:
        data: Mapping with a ``"sentence"`` string and an ``"entities"`` list.
            Each entity is a mapping with ``"pos"``, a ``(start, end)`` pair
            of character offsets into the sentence (end exclusive), and
            ``"type"``.

    Returns:
        A dict ``{"tokenized_text": words, "ner": spans}`` where ``words`` is
        the whitespace-split sentence and ``spans`` is a list of
        ``(start_word, end_word, type)`` tuples with inclusive word indices.
        An index is ``None`` when the character offset does not fall exactly
        on a word boundary.
    """
    sentence = data['sentence']
    words = sentence.split()

    # Record the real character offsets of every word once. split() collapses
    # runs of whitespace and drops leading whitespace, so offsets cannot be
    # derived by assuming a single space between consecutive words.
    word_at_start = {}  # offset of a word's first character -> word index
    word_at_end = {}  # offset just past a word's last character -> word index
    cursor = 0
    for index, word in enumerate(words):
        # Only whitespace lies between `cursor` and the next word, so the
        # first hit from `cursor` is that word's true position.
        begin = sentence.find(word, cursor)
        cursor = begin + len(word)
        word_at_start[begin] = index
        word_at_end[cursor] = index

    entities = []  # List of entities (start, end, type)
    for entity in data['entities']:
        start_char, end_char = entity['pos']
        start_word = word_at_start.get(start_char)
        end_word = word_at_end.get(end_char)
        # A start that lies after the end word is reported as unmatched.
        if start_word is not None and end_word is not None and start_word > end_word:
            start_word = None
        entities.append((start_word, end_word, entity['type']))

    sample = {
        "tokenized_text": words,
        "ner": entities
    }
    return sample
# create dataset
def create_dataset(path):
    """Load a dataset directory and convert every split to word-span samples.

    Args:
        path: Dataset directory, forwarded to ``open_content``.

    Returns:
        A ``(train_dataset, dev_dataset, test_dataset, labels)`` tuple. The
        three datasets are lists holding the result of ``process`` for each
        raw record; ``labels`` is passed through from ``open_content``
        unchanged.
    """
    raw_train, raw_dev, raw_test, labels = open_content(path)

    # Splits are converted in train, dev, test order.
    train_dataset = [process(record) for record in raw_train]
    dev_dataset = [process(record) for record in raw_dev]
    test_dataset = [process(record) for record in raw_test]

    return train_dataset, dev_dataset, test_dataset, labels
@torch.no_grad()
def get_for_one_path(path, model):
    """Evaluate ``model`` on the test split of the dataset stored at ``path``.

    Args:
        path: Dataset directory; its last component is used as the dataset
            name.
        model: Object exposing ``evaluate(test_dataset, flat_ner=...,
            threshold=..., batch_size=..., entity_types=...)`` that returns a
            ``(results, f1)`` pair.

    Returns:
        A ``(data_name, results, f1)`` tuple, with ``results`` and ``f1``
        exactly as returned by ``model.evaluate``.
    """
    # load the dataset
    _, _, test_dataset, entity_types = create_dataset(path)

    # Take the last path component as the dataset name. normpath drops a
    # trailing separator and basename honours the platform separator, so
    # "data/ACE 2004/" and Windows-style paths resolve to the directory name
    # too.
    data_name = os.path.basename(os.path.normpath(path))

    # Datasets whose name contains one of these markers are evaluated with
    # flat_ner=False; all others with flat_ner=True.
    flat_ner = not any(marker in data_name for marker in ("ACE", "GENIA", "Corpus"))

    # evaluate the model
    results, f1 = model.evaluate(test_dataset, flat_ner=flat_ner, threshold=0.5, batch_size=12,
                                 entity_types=entity_types)
    return data_name, results, f1
def _format_results_table(scores):
    """Render ``{dataset_name: score}`` as aligned percentage rows plus an average row."""
    rows = [f"{name:20}: {score:.1%}" for name, score in scores.items()]
    if scores:
        average = sum(scores.values()) / len(scores)
        rows.append(f"{'Average':20}: {average:.1%}")
    else:
        # No dataset fell into this group, so there is nothing to average.
        rows.append(f"{'Average':20}: n/a")
    return "\n".join(rows)


def get_for_all_path(model, steps, log_dir, data_paths):
    """Evaluate ``model`` on every dataset under ``data_paths`` and log the scores.

    Per-dataset results are appended to ``<log_dir>/results.txt`` as soon as
    each dataset has been evaluated. Two summary tables (zero-shot benchmark
    datasets and all remaining datasets) are appended to
    ``<log_dir>/tables.txt`` at the end.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        steps: Value written in the ``step:`` header line of both log files.
        log_dir: Directory that receives ``results.txt`` and ``tables.txt``.
        data_paths: Directory whose entries are the individual dataset
            directories. Entries whose name contains ``sample_`` are skipped.
    """
    all_paths = sorted(glob.glob(f"{data_paths}/*"))

    # The target device is read from the model itself, so this leaves the
    # model on the device it already occupies.
    device = next(model.parameters()).device
    model.to(device)
    # set the model to eval mode
    model.eval()

    # log the results
    save_path = os.path.join(log_dir, "results.txt")
    with open(save_path, "a") as f:
        f.write("##############################################\n")
        # write step
        f.write("step: " + str(steps) + "\n")

    zero_shot_benc = ["mit-movie", "mit-restaurant", "CrossNER_AI", "CrossNER_literature", "CrossNER_music",
                      "CrossNER_politics", "CrossNER_science"]
    zero_shot_benc_results = {}
    all_results = {}  # every dataset that is not in zero_shot_benc

    for p in tqdm(all_paths):
        # Test the entry's own name, not the whole path, so a parent
        # directory containing "sample_" does not skip every dataset.
        if "sample_" in os.path.basename(p):
            continue
        data_name, results, f1 = get_for_one_path(p, model)
        # Append immediately so finished datasets stay logged even if a later
        # evaluation fails.
        with open(save_path, "a") as f:
            f.write(data_name + "\n")
            f.write(str(results) + "\n")
        if data_name in zero_shot_benc:
            zero_shot_benc_results[data_name] = f1
        else:
            all_results[data_name] = f1

    # An empty group renders with an "n/a" average instead of dividing by zero.
    table_bench_all = _format_results_table(all_results)
    table_bench_zeroshot = _format_results_table(zero_shot_benc_results)

    # write to file
    save_path_table = os.path.join(log_dir, "tables.txt")
    with open(save_path_table, "a") as f:
        f.write("##############################################\n")
        f.write("step: " + str(steps) + "\n")
        f.write("Table for all datasets except crossNER\n")
        f.write(table_bench_all + "\n\n")
        f.write("Table for zero-shot benchmark\n")
        f.write(table_bench_zeroshot + "\n")
        f.write("##############################################\n\n")
def sample_train_data(data_paths, sample_size=10000):
    """Build a training pool from a random subset of each dataset's train split.

    Args:
        data_paths: Directory whose entries are the individual dataset
            directories.
        sample_size: Maximum number of training samples kept per dataset.

    Returns:
        A list with up to ``sample_size`` shuffled training samples from every
        dataset whose path contains none of the excluded names. Each sample
        gets a ``"label"`` key holding that dataset's label list.
    """
    # Datasets whose path contains any of these names are left out.
    excluded_names = ["CrossNER_AI", "CrossNER_literature", "CrossNER_music",
                      "CrossNER_politics", "CrossNER_science", "ACE 2004"]

    pooled = []
    for dataset_dir in tqdm(sorted(glob.glob(f"{data_paths}/*"))):
        if any(name in dataset_dir for name in excluded_names):
            continue

        train_samples, _, _, label_list = create_dataset(dataset_dir)

        # Attach the dataset's label list to every training sample.
        for sample in train_samples:
            sample["label"] = label_list

        # Shuffle in place, then keep the first `sample_size` samples.
        random.shuffle(train_samples)
        pooled.extend(train_samples[:sample_size])

    return pooled
|