import json
import os
import re
import sys

import pandas as pd
from google.cloud import storage

# Storage client created with the library defaults (no explicit project or
# credentials are passed here).
client = storage.Client()

# Bucket that holds the fine-tuning evaluation output read by this script.
bucket_name = "nb-t5x-us-central2"
bucket = client.bucket(bucket_name)

# Experiment identifiers; each one is inserted verbatim into the object path
# built below.
checkpoints = [
    "exp115_mt5_small",
    "exp116_north_t5_base_NCC",
    "exp117_north_t5_base_NCC_lm",
    "exp118_north_t5_base_scand3M",
    "exp119_mt5_base",
    "exp120_sab_base_2",
    "exp121_sab_base_3",
    "exp122_sab_base_4",
]

# NOTE: `start` has no placeholder in the path template below. Looping over it
# only repeats identical paths (and therefore identical downloads), so it is
# kept as a plain constant and not iterated.
start = ["1500000", "1600000", "2000000", "3000000", "4000000"]

# Run suffixes: every experiment is looked up as ..._v1 through ..._v5.
iterations = ["1", "2", "3", "4", "5"]


def _metrics_file_name(checkpoint):
    """Return the metrics file name to read for *checkpoint*.

    The "scand" test comes first on purpose: a checkpoint whose name contains
    both "scand" and "north" must resolve to the scand metrics file.
    """
    if "scand" in checkpoint:
        return "translate_full_scand-metrics.jsonl"
    if "mt5" in checkpoint or "north" in checkpoint:
        return "translate_full_mt5-metrics.jsonl"
    return "translate_full-metrics.jsonl"


# One object path per (run, experiment) pair, ordered by run first and then by
# experiment, with no repeated entries.
file_names = [
    f"finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/{_metrics_file_name(c)}"
    for i in iterations
    for c in checkpoints
]

# One dict per JSONL line across all downloaded metrics files.
file_contents = []

downloaded = 0
not_downloaded = 0

# Every record is tagged with 0 pretraining steps; nothing is parsed from the
# path for this field.
pretraining_steps = 0

# Patterns applied to the object path. Compiled once because they are reused
# for every file.
_VOCAB_RE = re.compile(r"_(\w+?)-metric")
_EXPERIMENT_RE = re.compile(r"_exp(\w+?)_")
_VERSION_RE = re.compile(r"_v(\w+?)/")
_EXPERIMENT_NAME_RE = re.compile(r"exp\d+_(.*?)_v")

for file_name in file_names:
    blob = bucket.get_blob(file_name)
    print(f'gs://{bucket_name}/{file_name}')

    # A missing object is counted and skipped rather than treated as an error.
    if blob is None:
        not_downloaded += 1
        continue
    downloaded += 1

    content = blob.download_as_string().decode("utf-8")

    # These fields depend only on the path, so derive them once per file
    # instead of once per line.
    vocab = _VOCAB_RE.search(file_name).group(1)
    experiment = _EXPERIMENT_RE.search(file_name).group(1)
    version = _VERSION_RE.search(file_name).group(1)
    experiment_name = _EXPERIMENT_NAME_RE.search(file_name).group(1)

    for line in content.split("\n"):
        # Skip empty or whitespace-only lines (e.g. after the final newline).
        if not line.strip():
            continue

        data = json.loads(line)
        data['base_file_name'] = file_name
        data['pretraining_steps'] = pretraining_steps
        # Only the last four digits of `step` are kept as the fine-tuning step.
        data['finetuning_steps'] = int(str(data['step'])[-4:])
        data['vocab'] = vocab
        data['experiment'] = experiment
        data['version'] = version
        data['experiment_name'] = experiment_name
        file_contents.append(data)

print(f"\nTotally {downloaded} files downloaded, {not_downloaded} files not downloaded")

# Without any records the frame has none of the columns used below, so stop
# with a readable message instead of failing on a missing column.
if not file_contents:
    sys.exit("No metrics records were downloaded; nothing to summarise.")

df = pd.json_normalize(file_contents)
# Keep one row per (step, experiment, version); the same record can be loaded
# more than once.
df = df.drop_duplicates(subset=['step', 'experiment', 'version']).reset_index()

_SUMMARY_COLUMNS = ["experiment_name", "experiment", "pretraining_steps", "accuracy", "f1_macro"]
_GROUP_KEYS = ["experiment", "experiment_name", "pretraining_steps"]


def _average_by_experiment(rows, count_column):
    """Average accuracy and f1_macro per experiment.

    Returns one row per group in `_GROUP_KEYS`, plus a column named
    *count_column* holding the number of rows that went into each average.
    """
    grouped = rows[_SUMMARY_COLUMNS].groupby(_GROUP_KEYS)
    averaged = grouped.mean().reset_index()
    return averaged.assign(**{count_column: grouped.size().values})


only_5000 = df[df["finetuning_steps"] == 5000]
average_at_5000 = _average_by_experiment(only_5000, "num_experiments")

only_3000 = df[df["finetuning_steps"] == 3000]
average_at_3000 = _average_by_experiment(only_3000, "rows_count")

print(average_at_5000.to_string(index=False))

print("\nNot complete:")
# An experiment is complete when every run listed in `iterations` contributed
# a row at 5000 steps.
uncomplete = average_at_5000[average_at_5000['num_experiments'] != len(iterations)]
print(uncomplete)

# Write each table as both JSON Lines and CSV.
output_dir = "stats_various"
# Create the target directory up front so the export does not fail after all
# the download work has already been done.
os.makedirs(output_dir, exist_ok=True)

for stem, frame in (
    ("all", df),
    ("only_5000", only_5000),
    ("average_at_5000", average_at_5000),
):
    frame.to_json(os.path.join(output_dir, f"{stem}.jsonl"), orient="records", lines=True)
    frame.to_csv(os.path.join(output_dir, f"{stem}.csv"), index=False)

# Report the directory that was actually written to.
print(f"Files exported to {output_dir}")