File size: 4,073 Bytes
bcc84fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import json
import os
import re
import sys

import pandas as pd
from google.cloud import storage

# Name of the Cloud Storage bucket the metric files are read from.
bucket_name = "nb-t5x-us-central2"

# Storage client (constructed with its default arguments) and a handle
# to the bucket named above.
client = storage.Client()
bucket = client.bucket(bucket_name)


# Experiments whose evaluation metrics are collected.
checkpoints = [
    "exp115_mt5_small",
    "exp116_north_t5_base_NCC",
    "exp117_north_t5_base_NCC_lm",
    "exp118_north_t5_base_scand3M",
    "exp119_mt5_base",
    "exp120_sab_base_2",
    "exp121_sab_base_3",
    "exp122_sab_base_4",
]

# Not part of any metric path built below. The name is kept defined, but it is
# no longer looped over: doing so only appended every path five times.
start = ["1500000", "1600000", "2000000", "3000000", "4000000"]

# Version suffixes (`_v1` ... `_v5`) of each experiment directory.
iterations = ["1", "2", "3", "4", "5"]

# One metrics path per (version, experiment) pair, without duplicates.
file_names = []

for i in iterations:
    for c in checkpoints:
        # The metrics file name depends on the experiment name. "scand" is
        # tested first, so an experiment containing both "north" and "scand"
        # uses the scand file.
        if "scand" in c:
            name = f'finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/translate_full_scand-metrics.jsonl'
        elif ("mt5" in c) or ("north" in c):
            name = f'finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/translate_full_mt5-metrics.jsonl'
        else:
            name = f'finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/translate_full-metrics.jsonl'
        file_names.append(name)


# Parsed metric records from all downloaded files, one dict per JSON line.
file_contents = []

# Counters for the summary line printed after the loop.
downloaded = 0
not_downloaded = 0

# Recorded as a constant for every record; the regex that would read it from
# the path is disabled:
#   re.search(r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name).group(2)
pretraining_steps = 0

for file_name in file_names:
    blob = bucket.get_blob(file_name)
    print(f'gs://{bucket_name}/{file_name}')

    # get_blob() yields None when the object is not present in the bucket.
    if blob is None:
        not_downloaded += 1
        continue
    downloaded += 1

    # download_as_bytes() replaces the deprecated download_as_string() alias.
    content = blob.download_as_bytes().decode("utf-8")

    # These fields are derived from the path only, so they are computed once
    # per file rather than once per record.
    vocab = re.search(r"_(\w+?)-metric", file_name).group(1)
    experiment = re.search(r"_exp(\w+?)_", file_name).group(1)
    version = re.search(r"_v(\w+?)/", file_name).group(1)
    experiment_name = re.search(r"exp\d+_(.*?)_v", file_name).group(1)

    # The file is JSON Lines: one JSON object per line.
    for line in content.split("\n"):
        # Skip empty and whitespace-only lines (e.g. after the final newline).
        if not line.strip():
            continue
        data = json.loads(line)
        data['base_file_name'] = file_name
        data['pretraining_steps'] = int(pretraining_steps)
        # Keeps only the last four digits of the recorded step.
        # NOTE(review): presumably the step counter includes the pretraining
        # steps and only the trailing part is fine-tuning - confirm.
        data['finetuning_steps'] = int(str(data['step'])[-4:])
        data['vocab'] = vocab
        data['experiment'] = experiment
        data['version'] = version
        data['experiment_name'] = experiment_name
        file_contents.append(data)

print(f"\nTotally {downloaded} files downloaded, {not_downloaded} files not downloaded")

# Columns kept for the per-experiment averages, and the keys they are grouped by.
_METRIC_COLUMNS = ["experiment_name", "experiment", "pretraining_steps", "accuracy", "f1_macro"]
_GROUP_KEYS = ["experiment", "experiment_name", "pretraining_steps"]


def _average_metrics(frame, finetuning_steps, count_column):
    """Average the metric columns per experiment at one fine-tuning step count.

    Returns a tuple of the filtered rows, the groupby object built from them,
    and the table of group means. The means table gets an extra column named
    ``count_column`` holding the number of rows in each group.
    """
    rows = frame[frame["finetuning_steps"] == finetuning_steps]
    groups = rows[_METRIC_COLUMNS].groupby(_GROUP_KEYS)
    means = groups.mean().reset_index()
    means = means.assign(**{count_column: groups.size().values})
    return rows, groups, means


# Flatten the records into a table and keep the first row for each
# (step, experiment, version) combination.
df = pd.json_normalize(file_contents)
df = df.drop_duplicates(subset=['step','experiment','version']).reset_index()

only_5000, grouped_at_5000, average_at_5000 = _average_metrics(df, 5000, "num_experiments")
only_3000, grouped, average_at_3000 = _average_metrics(df, 3000, "rows_count")

# Print the averages at 5000 fine-tuning steps without the row index.
summary_text = average_at_5000.to_string(index=False)
print(summary_text)

# List the groups that do not contain exactly five rows.
print("\nNot complete:")
incomplete_mask = average_at_5000['num_experiments'] != 5
uncomplete = average_at_5000[incomplete_mask]
print(uncomplete)

# Directory that receives every exported table. It is created if missing so
# the exports do not fail after all the downloads have already been done.
output_dir = "stats_various"
os.makedirs(output_dir, exist_ok=True)

# Each table is written as JSON Lines and as CSV under the same base name.
tables_to_export = (
    ("all", df),
    ("only_5000", only_5000),
    ("average_at_5000", average_at_5000),
)
for table_name, table in tables_to_export:
    table.to_json(f"{output_dir}/{table_name}.jsonl", orient="records", lines=True)
    table.to_csv(f"{output_dir}/{table_name}.csv", index=False)


print(f"Files exported to {output_dir}")