diff --git a/app.py b/app.py
index ec7c43ad54c24e86f78a8aab4b55bdfdf6bf8dda..396d3d4728d90d35e874dacb09f682beb4ea8d6f 100644
--- a/app.py
+++ b/app.py
@@ -1,9 +1,10 @@
-import os
 import json
-import glob
 from collections import defaultdict
+from pathlib import Path
+
 import pandas as pd
 import gradio as gr
+
 from content import *
 from css import *
 import glob
@@ -16,74 +17,74 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
 
 METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
 
-LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')
+LANGS = "ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh".split(",")
 
 LANG_NAME = {
-    'ar': 'Arabic',
-    'bn': 'Bengali',
-    'ca': 'Catalan',
-    'da': 'Danish',
-    'de': 'German',
-    'es': 'Spanish',
-    'eu': 'Basque',
-    'fr': 'French',
-    'gu': 'Gujarati',
-    'hi': 'Hindi',
-    'hr': 'Croatian',
-    'hu': 'Hungarian',
-    'hy': 'Armenian',
-    'id': 'Indonesian',
-    'it': 'Italian',
-    'kn': 'Kannada',
-    'ml': 'Malayalam',
-    'mr': 'Marathi',
-    'ne': 'Nepali',
-    'nl': 'Dutch',
-    'pt': 'Portuguese',
-    'ro': 'Romanian',
-    'ru': 'Russian',
-    'sk': 'Slovak',
-    'sr': 'Serbian',
-    'sv': 'Swedish',
-    'ta': 'Tamil',
-    'te': 'Telugu',
-    'uk': 'Ukrainian',
-    'vi': 'Vietnamese',
-    'zh': 'Chinese'
+    "ar": "Arabic",
+    "bn": "Bengali",
+    "ca": "Catalan",
+    "da": "Danish",
+    "de": "German",
+    "es": "Spanish",
+    "eu": "Basque",
+    "fr": "French",
+    "gu": "Gujarati",
+    "hi": "Hindi",
+    "hr": "Croatian",
+    "hu": "Hungarian",
+    "hy": "Armenian",
+    "id": "Indonesian",
+    "it": "Italian",
+    "kn": "Kannada",
+    "ml": "Malayalam",
+    "mr": "Marathi",
+    "ne": "Nepali",
+    "nl": "Dutch",
+    "pt": "Portuguese",
+    "ro": "Romanian",
+    "ru": "Russian",
+    "sk": "Slovak",
+    "sr": "Serbian",
+    "sv": "Swedish",
+    "ta": "Tamil",
+    "te": "Telugu",
+    "uk": "Ukrainian",
+    "vi": "Vietnamese",
+    "zh": "Chinese",
 }
 
 
 def collect_results():
     performance_dict = defaultdict(dict)
     pretrained_models = set()
-    for file in glob.glob('evals/*/*.json'):
-        with open(file, 'r') as f:
-            data = json.load(f)
-        if 'results' not in data:
+    for pfin in Path("evals").rglob("*.json"):
+        data = json.loads(pfin.read_text(encoding="utf-8"))
+        if "results" not in data:
             continue
-        if 'config' not in data:
+        if "config" not in data:
             continue
-        results = data['results']
-        config = data['config']
-        if 'model_args' not in config:
+        results = data["results"]
+        config = data["config"]
+        if "model_args" not in config:
             continue
 
-        model_args = config['model_args'].split(',')
-        pretrained = [x for x in model_args if x.startswith('pretrained=')]
+        model_args = config["model_args"].split(",")
+        pretrained = [x for x in model_args if x.startswith("pretrained=")]
         if len(pretrained) != 1:
             continue
-        pretrained = pretrained[0].split('=')[1]
-        pretrained = pretrained.split('/')[-1]
+        pretrained = pretrained[0].split("=")[1]
+        pretrained = pretrained.split("/")[-1]
         pretrained_models.add(pretrained)
 
         for lang_task, perfs in results.items():
-            task, lang = lang_task.split('_')
+            task, lang = lang_task.split("_")
             assert task in BENCHMARKS
 
             if lang and task:
                 metric = METRICS[BENCHMARKS.index(task)]
                 p = round(perfs[metric] * 100, 1)
                 performance_dict[(pretrained, lang)][task] = p
+
     return performance_dict, pretrained_models
 
 
@@ -96,15 +97,13 @@ def get_leaderboard_df(performance_dict, pretrained_models):
         mmlu_perf = perfs.get(MMLU, 0.0)
         truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
 
-        if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
-            continue
         avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
-        notes = ' '.join([pretrained, lang_name])
-        row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
+        notes = " ".join([pretrained, lang_name])
+        row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
         df.append(row)
 
     df = pd.DataFrame.from_records(df, columns=COLS)
-    df = df.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False)
+    df = df.sort_values(by=[AVERAGE_COL], ascending=False)
     df = df[COLS]
 
     return df
@@ -115,10 +114,7 @@ def search_table(df, query):
     return filtered_df
 
 
-
 MODEL_COL = "Model"
-LANG_COL = "Language"
-CODE_COL = "Code"
 AVERAGE_COL = "Average"
 ARC_COL = "ARC (25-shot)"
 HELLASWAG_COL = "HellaSwag (10-shot)️"
@@ -126,8 +122,8 @@ MMLU_COL = "MMLU (5-shot)"
 TRUTHFULQA_COL = "TruthfulQA (0-shot)"
 NOTES_COL = "Notes"  # For search only
 
-COLS = [MODEL_COL, LANG_COL, CODE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
-TYPES = ["str", "str", "str", "number", "number", "number", "number", "number", "str"]
+COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
+TYPES = ["str", "number", "number", "number", "number", "number", "str"]
 
 args = collect_results()
 original_df = get_leaderboard_df(*args)
@@ -139,9 +135,7 @@ with demo:
     gr.Markdown(HOW_TO, elem_classes="markdown-text")
 
     with gr.Box():
-        search_bar = gr.Textbox(
-            placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
-        )
+        search_bar = gr.Textbox(placeholder="Search models and languages...", show_label=False, elem_id="search-bar")
 
         leaderboard_table = gr.components.Dataframe(
             value=original_df,
diff --git a/css.py b/css.py
index a476733d83bfe934665f06fb222097392e2db88c..d0af6b59ea2531da0624c61b07d250c76b787a70 100644
--- a/css.py
+++ b/css.py
@@ -1,4 +1,4 @@
-CUSTOM_CSS= """
+CUSTOM_CSS = """
 /* Hides the final column */
 table td:last-child,
 table th:last-child {
@@ -10,4 +10,4 @@ table th:last-child {
 #     overflow: auto;
 #     white-space: nowrap;
 # }
-"""
\ No newline at end of file
+"""
diff --git a/evals/arc/arc_ar-bloom-7b1.json b/evals/arc/arc_ar-bloom-7b1.json
deleted file mode 100644
index 66c115459f73a74be6bd4b1b3933509010a82342..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ar-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ar": {
-      "acc": 0.2634730538922156,
-      "acc_stderr": 0.012889646336321774,
-      "acc_norm": 0.31394354148845166,
-      "acc_norm_stderr": 0.013579515768185788
-    }
-  },
-  "versions": {
-    "arc_ar": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ar-llama-7B.json b/evals/arc/arc_ar-llama-7B.json
deleted file mode 100644
index 31293a19637055f69dbf3fb11cadfd2fde391402..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ar-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ar": {
-      "acc": 0.19760479041916168,
-      "acc_stderr": 0.011651221980953499,
-      "acc_norm": 0.24636441402908468,
-      "acc_norm_stderr": 0.012608059960468694
-    }
-  },
-  "versions": {
-    "arc_ar": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_bn-bloom-7b1.json b/evals/arc/arc_bn-bloom-7b1.json
deleted file mode 100644
index b7b877a4a649f59197b24de7b3ec917785979683..0000000000000000000000000000000000000000
--- a/evals/arc/arc_bn-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_bn": {
-      "acc": 0.22412318220701455,
-      "acc_stderr": 0.012201644195165715,
-      "acc_norm": 0.2617621899059025,
-      "acc_norm_stderr": 0.012862641889254466
-    }
-  },
-  "versions": {
-    "arc_bn": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_bn-llama-7B.json b/evals/arc/arc_bn-llama-7B.json
deleted file mode 100644
index 1dafcad0f0dbcae9d42395e2697e1ddc5c1ba0c2..0000000000000000000000000000000000000000
--- a/evals/arc/arc_bn-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_bn": {
-      "acc": 0.1899059024807528,
-      "acc_stderr": 0.011476660752315397,
-      "acc_norm": 0.2583404619332763,
-      "acc_norm_stderr": 0.012807875214816267
-    }
-  },
-  "versions": {
-    "arc_bn": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ca-bloom-7b1.json b/evals/arc/arc_ca-bloom-7b1.json
deleted file mode 100644
index f0a15e06750a49e5570198c619957cce3e35cf0c..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ca-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ca": {
-      "acc": 0.31989708404802747,
-      "acc_stderr": 0.01366562491926326,
-      "acc_norm": 0.34734133790737565,
-      "acc_norm_stderr": 0.013949489903701517
-    }
-  },
-  "versions": {
-    "arc_ca": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ca-llama-7B.json b/evals/arc/arc_ca-llama-7B.json
deleted file mode 100644
index f0e3b53912555842b913d4cc78b61de1b70a2380..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ca-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ca": {
-      "acc": 0.3276157804459691,
-      "acc_stderr": 0.01375080741597368,
-      "acc_norm": 0.3507718696397942,
-      "acc_norm_stderr": 0.013981316936172217
-    }
-  },
-  "versions": {
-    "arc_ca": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_da-bloom-7b1.json b/evals/arc/arc_da-bloom-7b1.json
deleted file mode 100644
index 1f4e588f7cac0716c4285f186e6d2aa122ee795d..0000000000000000000000000000000000000000
--- a/evals/arc/arc_da-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_da": {
-      "acc": 0.20137103684661525,
-      "acc_stderr": 0.011744154502532795,
-      "acc_norm": 0.24592973436161097,
-      "acc_norm_stderr": 0.012611366681285752
-    }
-  },
-  "versions": {
-    "arc_da": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_da-llama-7B.json b/evals/arc/arc_da-llama-7B.json
deleted file mode 100644
index 814a2fb017691ccd12afbf034c490e10a646843e..0000000000000000000000000000000000000000
--- a/evals/arc/arc_da-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_da": {
-      "acc": 0.286203941730934,
-      "acc_stderr": 0.013236574332463879,
-      "acc_norm": 0.3273350471293916,
-      "acc_norm_stderr": 0.013741887176251822
-    }
-  },
-  "versions": {
-    "arc_da": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_de-bloom-7b1.json b/evals/arc/arc_de-bloom-7b1.json
deleted file mode 100644
index 205cbe1e5a60177701994fa2eca97338da50bd02..0000000000000000000000000000000000000000
--- a/evals/arc/arc_de-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_de": {
-      "acc": 0.22241231822070145,
-      "acc_stderr": 0.012168377742629776,
-      "acc_norm": 0.262617621899059,
-      "acc_norm_stderr": 0.01287617552045283
-    }
-  },
-  "versions": {
-    "arc_de": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_de-llama-7B.json b/evals/arc/arc_de-llama-7B.json
deleted file mode 100644
index f13cfc00bfd0ac6e8b6e48a5c0bc3b99c3140b69..0000000000000000000000000000000000000000
--- a/evals/arc/arc_de-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_de": {
-      "acc": 0.2951240376390077,
-      "acc_stderr": 0.013345572865502645,
-      "acc_norm": 0.35072711719418304,
-      "acc_norm_stderr": 0.013962940383743043
-    }
-  },
-  "versions": {
-    "arc_de": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_es-bloom-7b1.json b/evals/arc/arc_es-bloom-7b1.json
deleted file mode 100644
index 39a5c5211ff20ef49014baa232a8ea2a9d8884be..0000000000000000000000000000000000000000
--- a/evals/arc/arc_es-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_es": {
-      "acc": 0.3316239316239316,
-      "acc_stderr": 0.013769752111910177,
-      "acc_norm": 0.3811965811965812,
-      "acc_norm_stderr": 0.01420507709573084
-    }
-  },
-  "versions": {
-    "arc_es": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_es-llama-7B.json b/evals/arc/arc_es-llama-7B.json
deleted file mode 100644
index 11544ff8942a30c3fb128aa473ea30d88443b0e6..0000000000000000000000000000000000000000
--- a/evals/arc/arc_es-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_es": {
-      "acc": 0.3606837606837607,
-      "acc_stderr": 0.014044746572948867,
-      "acc_norm": 0.3683760683760684,
-      "acc_norm_stderr": 0.014108074259155369
-    }
-  },
-  "versions": {
-    "arc_es": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_eu-bloom-7b1.json b/evals/arc/arc_eu-bloom-7b1.json
deleted file mode 100644
index 156fd60ab449125d255226262654e5337e4cb697..0000000000000000000000000000000000000000
--- a/evals/arc/arc_eu-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_eu": {
-      "acc": 0.22056239015817222,
-      "acc_stderr": 0.01229634886589257,
-      "acc_norm": 0.2521968365553603,
-      "acc_norm_stderr": 0.012879032347922939
-    }
-  },
-  "versions": {
-    "arc_eu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_eu-llama-7B.json b/evals/arc/arc_eu-llama-7B.json
deleted file mode 100644
index 10a039f055cb172c7978f840a54bec6cc724948c..0000000000000000000000000000000000000000
--- a/evals/arc/arc_eu-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_eu": {
-      "acc": 0.20738137082601055,
-      "acc_stderr": 0.012023662461166562,
-      "acc_norm": 0.2451669595782074,
-      "acc_norm_stderr": 0.012757811738008544
-    }
-  },
-  "versions": {
-    "arc_eu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_fr-bloom-7b1.json b/evals/arc/arc_fr-bloom-7b1.json
deleted file mode 100644
index 78cbf1e3cfc337f169be33735f919ab397b8d085..0000000000000000000000000000000000000000
--- a/evals/arc/arc_fr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_fr": {
-      "acc": 0.32677502138579984,
-      "acc_stderr": 0.01372407602199982,
-      "acc_norm": 0.3669803250641574,
-      "acc_norm_stderr": 0.014102904772197396
-    }
-  },
-  "versions": {
-    "arc_fr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_fr-llama-7B.json b/evals/arc/arc_fr-llama-7B.json
deleted file mode 100644
index c79866a45e043e6b6e5e139f5ac63dfb8b522f27..0000000000000000000000000000000000000000
--- a/evals/arc/arc_fr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_fr": {
-      "acc": 0.3473053892215569,
-      "acc_stderr": 0.013931226499492353,
-      "acc_norm": 0.3729683490162532,
-      "acc_norm_stderr": 0.014150093168782438
-    }
-  },
-  "versions": {
-    "arc_fr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_gu-bloom-7b1.json b/evals/arc/arc_gu-bloom-7b1.json
deleted file mode 100644
index c78878020cb8341b5adb388627ffa309dde3ad3a..0000000000000000000000000000000000000000
--- a/evals/arc/arc_gu-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_gu": {
-      "acc": 0.2206896551724138,
-      "acc_stderr": 0.012181604374453973,
-      "acc_norm": 0.2336206896551724,
-      "acc_norm_stderr": 0.012428989430945793
-    }
-  },
-  "versions": {
-    "arc_gu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_gu-llama-7B.json b/evals/arc/arc_gu-llama-7B.json
deleted file mode 100644
index afadd880b353d2482c13ab85d24811ac5ea5fd57..0000000000000000000000000000000000000000
--- a/evals/arc/arc_gu-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_gu": {
-      "acc": 0.2120689655172414,
-      "acc_stderr": 0.012007177871292825,
-      "acc_norm": 0.23189655172413792,
-      "acc_norm_stderr": 0.012396962423413033
-    }
-  },
-  "versions": {
-    "arc_gu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_hi-bloom-7b1.json b/evals/arc/arc_hi-bloom-7b1.json
deleted file mode 100644
index 70136df6c1f9731ab888c323fa0128c0beb43524..0000000000000000000000000000000000000000
--- a/evals/arc/arc_hi-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hi": {
-      "acc": 0.2363013698630137,
-      "acc_stderr": 0.012435369590403731,
-      "acc_norm": 0.2919520547945205,
-      "acc_norm_stderr": 0.013309191484613488
-    }
-  },
-  "versions": {
-    "arc_hi": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_hi-llama-7B.json b/evals/arc/arc_hi-llama-7B.json
deleted file mode 100644
index ddcd58ade570221ad656710d0944a241789b1d8b..0000000000000000000000000000000000000000
--- a/evals/arc/arc_hi-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hi": {
-      "acc": 0.21232876712328766,
-      "acc_stderr": 0.011971304657273123,
-      "acc_norm": 0.25,
-      "acc_norm_stderr": 0.012675503164084846
-    }
-  },
-  "versions": {
-    "arc_hi": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_hr-bloom-7b1.json b/evals/arc/arc_hr-bloom-7b1.json
deleted file mode 100644
index 80efc06ef94471b0b04935089a967e72d9e2095e..0000000000000000000000000000000000000000
--- a/evals/arc/arc_hr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hr": {
-      "acc": 0.19332763045337895,
-      "acc_stderr": 0.011555111310342437,
-      "acc_norm": 0.2369546621043627,
-      "acc_norm_stderr": 0.012441890624187792
-    }
-  },
-  "versions": {
-    "arc_hr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_hr-llama-7B.json b/evals/arc/arc_hr-llama-7B.json
deleted file mode 100644
index 9c50fa3252a0133486190ed9d5cbc497e1a17fe9..0000000000000000000000000000000000000000
--- a/evals/arc/arc_hr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hr": {
-      "acc": 0.2754491017964072,
-      "acc_stderr": 0.01307174925264165,
-      "acc_norm": 0.330196749358426,
-      "acc_norm_stderr": 0.013760638974726852
-    }
-  },
-  "versions": {
-    "arc_hr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_hu-bloom-7b1.json b/evals/arc/arc_hu-bloom-7b1.json
deleted file mode 100644
index 3c7e8773a07af63cf8522b314bbd0611c37c7b98..0000000000000000000000000000000000000000
--- a/evals/arc/arc_hu-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hu": {
-      "acc": 0.1969178082191781,
-      "acc_stderr": 0.011640913614197496,
-      "acc_norm": 0.2585616438356164,
-      "acc_norm_stderr": 0.0128169339627777
-    }
-  },
-  "versions": {
-    "arc_hu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_hu-llama-7B.json b/evals/arc/arc_hu-llama-7B.json
deleted file mode 100644
index ac3191180768a88cd6c937d51bf005adb11c7ccf..0000000000000000000000000000000000000000
--- a/evals/arc/arc_hu-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hu": {
-      "acc": 0.2517123287671233,
-      "acc_stderr": 0.012704310825494622,
-      "acc_norm": 0.2979452054794521,
-      "acc_norm_stderr": 0.013388079339102703
-    }
-  },
-  "versions": {
-    "arc_hu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_hy-bloom-7b1.json b/evals/arc/arc_hy-bloom-7b1.json
deleted file mode 100644
index d138545e18f6bb49f13d11bd9cd3b515db23815b..0000000000000000000000000000000000000000
--- a/evals/arc/arc_hy-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hy": {
-      "acc": 0.21181818181818182,
-      "acc_stderr": 0.01232525683396216,
-      "acc_norm": 0.26181818181818184,
-      "acc_norm_stderr": 0.013261197012809796
-    }
-  },
-  "versions": {
-    "arc_hy": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_hy-llama-7B.json b/evals/arc/arc_hy-llama-7B.json
deleted file mode 100644
index 35e46c981f8bc3bf9374fdf6ad4b483f4c65762b..0000000000000000000000000000000000000000
--- a/evals/arc/arc_hy-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hy": {
-      "acc": 0.19454545454545455,
-      "acc_stderr": 0.011940766785664334,
-      "acc_norm": 0.2718181818181818,
-      "acc_norm_stderr": 0.013420241182110736
-    }
-  },
-  "versions": {
-    "arc_hy": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_id-bloom-7b1.json b/evals/arc/arc_id-bloom-7b1.json
deleted file mode 100644
index a2cc8cf230eda88935959ff54b9ded1986940b84..0000000000000000000000000000000000000000
--- a/evals/arc/arc_id-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_id": {
-      "acc": 0.3128205128205128,
-      "acc_stderr": 0.013560492090917607,
-      "acc_norm": 0.3598290598290598,
-      "acc_norm_stderr": 0.014037469945597791
-    }
-  },
-  "versions": {
-    "arc_id": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_id-llama-7B.json b/evals/arc/arc_id-llama-7B.json
deleted file mode 100644
index 59fcc7ff10a29c0f82833ce5df7a260a8d4bbd42..0000000000000000000000000000000000000000
--- a/evals/arc/arc_id-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_id": {
-      "acc": 0.19316239316239317,
-      "acc_stderr": 0.011546413314069014,
-      "acc_norm": 0.26666666666666666,
-      "acc_norm_stderr": 0.012933850109759573
-    }
-  },
-  "versions": {
-    "arc_id": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_it-bloom-7b1.json b/evals/arc/arc_it-bloom-7b1.json
deleted file mode 100644
index 7eda117416da15b68b1713aa6ef9ff77e69fd826..0000000000000000000000000000000000000000
--- a/evals/arc/arc_it-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_it": {
-      "acc": 0.24037639007698888,
-      "acc_stderr": 0.01250327289928353,
-      "acc_norm": 0.28999144568006846,
-      "acc_norm_stderr": 0.01327709194338097
-    }
-  },
-  "versions": {
-    "arc_it": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_it-llama-7B.json b/evals/arc/arc_it-llama-7B.json
deleted file mode 100644
index 76b8875276c1b0078d3d087c16397df3b3ea9200..0000000000000000000000000000000000000000
--- a/evals/arc/arc_it-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_it": {
-      "acc": 0.31736526946107785,
-      "acc_stderr": 0.013619227292898307,
-      "acc_norm": 0.3575705731394354,
-      "acc_norm_stderr": 0.014024008839912006
-    }
-  },
-  "versions": {
-    "arc_it": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_kn-bloom-7b1.json b/evals/arc/arc_kn-bloom-7b1.json
deleted file mode 100644
index e92b7d0d555bc117110f34dbbc68d327f5092f5f..0000000000000000000000000000000000000000
--- a/evals/arc/arc_kn-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_kn": {
-      "acc": 0.2221254355400697,
-      "acc_stderr": 0.012273607270054452,
-      "acc_norm": 0.24738675958188153,
-      "acc_norm_stderr": 0.012740675198098838
-    }
-  },
-  "versions": {
-    "arc_kn": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_kn-llama-7B.json b/evals/arc/arc_kn-llama-7B.json
deleted file mode 100644
index 39ae5661b6403f677d4427689194c417f1f2f8b5..0000000000000000000000000000000000000000
--- a/evals/arc/arc_kn-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_kn": {
-      "acc": 0.20470383275261325,
-      "acc_stderr": 0.011913674295957856,
-      "acc_norm": 0.24738675958188153,
-      "acc_norm_stderr": 0.012740675198098834
-    }
-  },
-  "versions": {
-    "arc_kn": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ml-bloom-7b1.json b/evals/arc/arc_ml-bloom-7b1.json
deleted file mode 100644
index f7c83104b2f7701b8a7af344179886c58a0e89a0..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ml-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ml": {
-      "acc": 0.2075306479859895,
-      "acc_stderr": 0.01200575665793095,
-      "acc_norm": 0.2635726795096322,
-      "acc_norm_stderr": 0.013042844591075362
-    }
-  },
-  "versions": {
-    "arc_ml": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ml-llama-7B.json b/evals/arc/arc_ml-llama-7B.json
deleted file mode 100644
index fc465c13860754471e99430d5e6c5e1df5046b2e..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ml-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ml": {
-      "acc": 0.21628721541155868,
-      "acc_stderr": 0.012188522634632977,
-      "acc_norm": 0.27845884413309985,
-      "acc_norm_stderr": 0.013269918016014967
-    }
-  },
-  "versions": {
-    "arc_ml": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_mr-bloom-7b1.json b/evals/arc/arc_mr-bloom-7b1.json
deleted file mode 100644
index cb854d6690652622f9f24d8c241c70b1cab749f9..0000000000000000000000000000000000000000
--- a/evals/arc/arc_mr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_mr": {
-      "acc": 0.23376623376623376,
-      "acc_stderr": 0.012458582396003653,
-      "acc_norm": 0.2727272727272727,
-      "acc_norm_stderr": 0.013110221561502926
-    }
-  },
-  "versions": {
-    "arc_mr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_mr-llama-7B.json b/evals/arc/arc_mr-llama-7B.json
deleted file mode 100644
index 0755f8ce24bf655025ef6eb6414570573beb9858..0000000000000000000000000000000000000000
--- a/evals/arc/arc_mr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_mr": {
-      "acc": 0.2051948051948052,
-      "acc_stderr": 0.011888050053276677,
-      "acc_norm": 0.2545454545454545,
-      "acc_norm_stderr": 0.012823020964319998
-    }
-  },
-  "versions": {
-    "arc_mr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ne-bloom-7b1.json b/evals/arc/arc_ne-bloom-7b1.json
deleted file mode 100644
index 8642b825a874e720a4bb8c0f92ff6fc304357c9f..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ne-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ne": {
-      "acc": 0.21300256629597947,
-      "acc_stderr": 0.01198002307808546,
-      "acc_norm": 0.223267750213858,
-      "acc_norm_stderr": 0.012185048029719049
-    }
-  },
-  "versions": {
-    "arc_ne": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ne-llama-7B.json b/evals/arc/arc_ne-llama-7B.json
deleted file mode 100644
index e20341882d82d53d339ccb9e726250d842765069..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ne-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ne": {
-      "acc": 0.2172797262617622,
-      "acc_stderr": 0.012066782166932105,
-      "acc_norm": 0.24294268605645852,
-      "acc_norm_stderr": 0.012548588352773893
-    }
-  },
-  "versions": {
-    "arc_ne": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_nl_Llama-2-7b-chat-hf.json b/evals/arc/arc_nl_Llama-2-7b-chat-hf.json
new file mode 100644
index 0000000000000000000000000000000000000000..d95e22e17c312755971c6aec7f376d25ab3f159e
--- /dev/null
+++ b/evals/arc/arc_nl_Llama-2-7b-chat-hf.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_nl": {
+      "acc": 0.3609923011120616,
+      "acc_stderr": 0.014053373664144792,
+      "acc_norm": 0.3618477331052181,
+      "acc_norm_stderr": 0.014060593893704966
+    }
+  },
+  "versions": {
+    "arc_nl": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 8,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_nl_Llama-2-7b-hf.json b/evals/arc/arc_nl_Llama-2-7b-hf.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a203f2de2ff7ea6dca18093dfde7757ccf55eca
--- /dev/null
+++ b/evals/arc/arc_nl_Llama-2-7b-hf.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_nl": {
+      "acc": 0.33704020530367834,
+      "acc_stderr": 0.013831300903580639,
+      "acc_norm": 0.3567151411462789,
+      "acc_norm_stderr": 0.014016546277185005
+    }
+  },
+  "versions": {
+    "arc_nl": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 8,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_nl_Mistral-7B-v0.1.json b/evals/arc/arc_nl_Mistral-7B-v0.1.json
new file mode 100644
index 0000000000000000000000000000000000000000..e69cc570fa8ab39caac1d704af5c19c2a53baf3d
--- /dev/null
+++ b/evals/arc/arc_nl_Mistral-7B-v0.1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_nl": {
+      "acc": 0.42087254063301965,
+      "acc_stderr": 0.014445778557368833,
+      "acc_norm": 0.4294268605645851,
+      "acc_norm_stderr": 0.014483677397351059
+    }
+  },
+  "versions": {
+    "arc_nl": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 8,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_nl_zephyr-7b-beta.json b/evals/arc/arc_nl_zephyr-7b-beta.json
new file mode 100644
index 0000000000000000000000000000000000000000..af6a67755466fb649a9285fcd45d0ccdf6fa1116
--- /dev/null
+++ b/evals/arc/arc_nl_zephyr-7b-beta.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_nl": {
+      "acc": 0.43798118049615054,
+      "acc_stderr": 0.01451716231691793,
+      "acc_norm": 0.4328485885372113,
+      "acc_norm_stderr": 0.01449759923259859
+    }
+  },
+  "versions": {
+    "arc_nl": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 8,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_pt-bloom-7b1.json b/evals/arc/arc_pt-bloom-7b1.json
deleted file mode 100644
index 880d8570463408853523eec06407b3c8ed9e5b11..0000000000000000000000000000000000000000
--- a/evals/arc/arc_pt-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_pt": {
-      "acc": 0.3401709401709402,
-      "acc_stderr": 0.013856612397310694,
-      "acc_norm": 0.4,
-      "acc_norm_stderr": 0.014328422047021531
-    }
-  },
-  "versions": {
-    "arc_pt": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_pt-llama-7B.json b/evals/arc/arc_pt-llama-7B.json
deleted file mode 100644
index 0a856face8fef0cab72d3cda7305f6949d011ce3..0000000000000000000000000000000000000000
--- a/evals/arc/arc_pt-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_pt": {
-      "acc": 0.3367521367521368,
-      "acc_stderr": 0.01382247630777062,
-      "acc_norm": 0.37777777777777777,
-      "acc_norm_stderr": 0.014180244103534094
-    }
-  },
-  "versions": {
-    "arc_pt": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ro-bloom-7b1.json b/evals/arc/arc_ro-bloom-7b1.json
deleted file mode 100644
index 083766c1f50d79393939908a8f8837dcc7cb697d..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ro-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ro": {
-      "acc": 0.2099400171379606,
-      "acc_stderr": 0.011926921791273557,
-      "acc_norm": 0.26906598114824337,
-      "acc_norm_stderr": 0.012987310039914976
-    }
-  },
-  "versions": {
-    "arc_ro": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ro-llama-7B.json b/evals/arc/arc_ro-llama-7B.json
deleted file mode 100644
index eab2e4a70b967696417355b0d11bd69cabf3ddc5..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ro-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ro": {
-      "acc": 0.30077120822622105,
-      "acc_stderr": 0.013430077114209907,
-      "acc_norm": 0.32390745501285345,
-      "acc_norm_stderr": 0.013704533924425027
-    }
-  },
-  "versions": {
-    "arc_ro": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ru-bloom-7b1.json b/evals/arc/arc_ru-bloom-7b1.json
deleted file mode 100644
index 1ff9ed6089fca2642658a8e6f9f74471739e87e6..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ru-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ru": {
-      "acc": 0.21043627031650983,
-      "acc_stderr": 0.01192703439080346,
-      "acc_norm": 0.2754491017964072,
-      "acc_norm_stderr": 0.01307174925264165
-    }
-  },
-  "versions": {
-    "arc_ru": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ru-llama-7B.json b/evals/arc/arc_ru-llama-7B.json
deleted file mode 100644
index f62854eef188594fdc60a93341410fac7a49fa14..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ru-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ru": {
-      "acc": 0.2934131736526946,
-      "acc_stderr": 0.013322973103306575,
-      "acc_norm": 0.32078699743370404,
-      "acc_norm_stderr": 0.013658089444975752
-    }
-  },
-  "versions": {
-    "arc_ru": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_sk-bloom-7b1.json b/evals/arc/arc_sk-bloom-7b1.json
deleted file mode 100644
index 4404e57e2290a69cce8029b89f0939593bbe7d8e..0000000000000000000000000000000000000000
--- a/evals/arc/arc_sk-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sk": {
-      "acc": 0.20359281437125748,
-      "acc_stderr": 0.011782227020010716,
-      "acc_norm": 0.24893071000855432,
-      "acc_norm_stderr": 0.012651960282598879
-    }
-  },
-  "versions": {
-    "arc_sk": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_sk-llama-7B.json b/evals/arc/arc_sk-llama-7B.json
deleted file mode 100644
index b018df9a5453495bb3ff51f8908c88c064d888a4..0000000000000000000000000000000000000000
--- a/evals/arc/arc_sk-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sk": {
-      "acc": 0.23609923011120615,
-      "acc_stderr": 0.012426371635795894,
-      "acc_norm": 0.28999144568006846,
-      "acc_norm_stderr": 0.013277091943380979
-    }
-  },
-  "versions": {
-    "arc_sk": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_sr-bloom-7b1.json b/evals/arc/arc_sr-bloom-7b1.json
deleted file mode 100644
index ca68a7fae3c2920f66e9f6948396528ea7efe421..0000000000000000000000000000000000000000
--- a/evals/arc/arc_sr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sr": {
-      "acc": 0.2172797262617622,
-      "acc_stderr": 0.012066782166932079,
-      "acc_norm": 0.25149700598802394,
-      "acc_norm_stderr": 0.01269526466186626
-    }
-  },
-  "versions": {
-    "arc_sr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_sr-llama-7B.json b/evals/arc/arc_sr-llama-7B.json
deleted file mode 100644
index dbe0e415ecd651a7afbe25423df0f79ddbf30b59..0000000000000000000000000000000000000000
--- a/evals/arc/arc_sr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sr": {
-      "acc": 0.25748502994011974,
-      "acc_stderr": 0.012794024494042348,
-      "acc_norm": 0.30795551753635586,
-      "acc_norm_stderr": 0.013507954174822524
-    }
-  },
-  "versions": {
-    "arc_sr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_sv-bloom-7b1.json b/evals/arc/arc_sv-bloom-7b1.json
deleted file mode 100644
index e602b4d12926dbb93b567be032a836cb50b2ff51..0000000000000000000000000000000000000000
--- a/evals/arc/arc_sv-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sv": {
-      "acc": 0.20515021459227467,
-      "acc_stderr": 0.011835920197074948,
-      "acc_norm": 0.2515021459227468,
-      "acc_norm_stderr": 0.012717145410329311
-    }
-  },
-  "versions": {
-    "arc_sv": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_sv-llama-7B.json b/evals/arc/arc_sv-llama-7B.json
deleted file mode 100644
index 3cacd9bbf330b2d6be85b2903f5d124c0045cc94..0000000000000000000000000000000000000000
--- a/evals/arc/arc_sv-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sv": {
-      "acc": 0.303862660944206,
-      "acc_stderr": 0.013480613043590443,
-      "acc_norm": 0.34935622317596565,
-      "acc_norm_stderr": 0.013974278424227307
-    }
-  },
-  "versions": {
-    "arc_sv": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ta-bloom-7b1.json b/evals/arc/arc_ta-bloom-7b1.json
deleted file mode 100644
index 68a6f4875888d86505752626ba4a52fd12cc3c84..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ta-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ta": {
-      "acc": 0.22942206654991243,
-      "acc_stderr": 0.01244752638770244,
-      "acc_norm": 0.24168126094570927,
-      "acc_norm_stderr": 0.012673733216040754
-    }
-  },
-  "versions": {
-    "arc_ta": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_ta-llama-7B.json b/evals/arc/arc_ta-llama-7B.json
deleted file mode 100644
index d7c697739212d1bec5e84f1a4e6f0017d500ecc7..0000000000000000000000000000000000000000
--- a/evals/arc/arc_ta-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ta": {
-      "acc": 0.2075306479859895,
-      "acc_stderr": 0.012005756657930957,
-      "acc_norm": 0.27495621716287216,
-      "acc_norm_stderr": 0.013218161880960047
-    }
-  },
-  "versions": {
-    "arc_ta": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_te-bloom-7b1.json b/evals/arc/arc_te-bloom-7b1.json
deleted file mode 100644
index 1be31afe5307f0b3c626e305437b1932d4457b68..0000000000000000000000000000000000000000
--- a/evals/arc/arc_te-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_te": {
-      "acc": 0.20175438596491227,
-      "acc_stderr": 0.01189098690363561,
-      "acc_norm": 0.24298245614035088,
-      "acc_norm_stderr": 0.01270803987901337
-    }
-  },
-  "versions": {
-    "arc_te": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_te-llama-7B.json b/evals/arc/arc_te-llama-7B.json
deleted file mode 100644
index f84a1b907c92965f5829cbd68e89759d2d1ef9d7..0000000000000000000000000000000000000000
--- a/evals/arc/arc_te-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_te": {
-      "acc": 0.2026315789473684,
-      "acc_stderr": 0.011910259341316062,
-      "acc_norm": 0.2517543859649123,
-      "acc_norm_stderr": 0.012860230436368953
-    }
-  },
-  "versions": {
-    "arc_te": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_uk-bloom-7b1.json b/evals/arc/arc_uk-bloom-7b1.json
deleted file mode 100644
index 05233ff08727d5cac7dd74429dbc024eb5fd5f4f..0000000000000000000000000000000000000000
--- a/evals/arc/arc_uk-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_uk": {
-      "acc": 0.1958939264328486,
-      "acc_stderr": 0.011613035012800898,
-      "acc_norm": 0.2275449101796407,
-      "acc_norm_stderr": 0.012267293637033645
-    }
-  },
-  "versions": {
-    "arc_uk": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_uk-llama-7B.json b/evals/arc/arc_uk-llama-7B.json
deleted file mode 100644
index 717afd73b3550c42e809f9bdb7fac834e805b5ee..0000000000000000000000000000000000000000
--- a/evals/arc/arc_uk-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_uk": {
-      "acc": 0.28999144568006846,
-      "acc_stderr": 0.013277091943380968,
-      "acc_norm": 0.32934131736526945,
-      "acc_norm_stderr": 0.013751575689336035
-    }
-  },
-  "versions": {
-    "arc_uk": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_vi-bloom-7b1.json b/evals/arc/arc_vi-bloom-7b1.json
deleted file mode 100644
index 4bc8e4783cc71214d4ba57feef30a0bfee5774c2..0000000000000000000000000000000000000000
--- a/evals/arc/arc_vi-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_vi": {
-      "acc": 0.28974358974358977,
-      "acc_stderr": 0.013268054405378885,
-      "acc_norm": 0.3367521367521368,
-      "acc_norm_stderr": 0.01382247630777062
-    }
-  },
-  "versions": {
-    "arc_vi": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_vi-llama-7B.json b/evals/arc/arc_vi-llama-7B.json
deleted file mode 100644
index 7c14775b05df6587593cb1cbb921ee6ac86a8370..0000000000000000000000000000000000000000
--- a/evals/arc/arc_vi-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_vi": {
-      "acc": 0.20256410256410257,
-      "acc_stderr": 0.011754979539893694,
-      "acc_norm": 0.23675213675213674,
-      "acc_norm_stderr": 0.01243290160581911
-    }
-  },
-  "versions": {
-    "arc_vi": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_zh-bloom-7b1.json b/evals/arc/arc_zh-bloom-7b1.json
deleted file mode 100644
index c4deb085367a11032bec8e265cc4cb91fe75a0f5..0000000000000000000000000000000000000000
--- a/evals/arc/arc_zh-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_zh": {
-      "acc": 0.3076923076923077,
-      "acc_stderr": 0.013498970320941413,
-      "acc_norm": 0.37264957264957266,
-      "acc_norm_stderr": 0.014141587247061969
-    }
-  },
-  "versions": {
-    "arc_zh": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc/arc_zh-llama-7B.json b/evals/arc/arc_zh-llama-7B.json
deleted file mode 100644
index 9cca2a2335f34f3b9eb36c125304f260fc3f8cd9..0000000000000000000000000000000000000000
--- a/evals/arc/arc_zh-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_zh": {
-      "acc": 0.2564102564102564,
-      "acc_stderr": 0.012771065618749024,
-      "acc_norm": 0.2982905982905983,
-      "acc_norm_stderr": 0.013381080232166387
-    }
-  },
-  "versions": {
-    "arc_zh": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ar_bloom-7b1.json b/evals/hellaswag/hellaswag_ar_bloom-7b1.json
deleted file mode 100644
index 69248e00b845c50b1eb8379e9d0ec05aaffc075d..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ar_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ar": {
-      "acc": 0.3561464690496949,
-      "acc_stderr": 0.004999249661771764,
-      "acc_norm": 0.43341325196163905,
-      "acc_norm_stderr": 0.005173461992734505
-    }
-  },
-  "versions": {
-    "hellaswag_ar": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ar_llama-7B.json b/evals/hellaswag/hellaswag_ar_llama-7B.json
deleted file mode 100644
index 53797549241b15b072b9f0ce5f8b12ea57bce437..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ar_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ar": {
-      "acc": 0.28040540540540543,
-      "acc_stderr": 0.004689581635445738,
-      "acc_norm": 0.3085222319093287,
-      "acc_norm_stderr": 0.004822023322058258
-    }
-  },
-  "versions": {
-    "hellaswag_ar": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_bn_bloom-7b1.json b/evals/hellaswag/hellaswag_bn_bloom-7b1.json
deleted file mode 100644
index 7e6f1a343c04d236c977fa61b55e3bd8c74fa3f1..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_bn_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_bn": {
-      "acc": 0.28381302748322873,
-      "acc_stderr": 0.004689968075947356,
-      "acc_norm": 0.3277429127894395,
-      "acc_norm_stderr": 0.004882866652334284
-    }
-  },
-  "versions": {
-    "hellaswag_bn": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_bn_llama-7B.json b/evals/hellaswag/hellaswag_bn_llama-7B.json
deleted file mode 100644
index cb1676e09ecdce592c17a4ff25f63c87e2a2a971..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_bn_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_bn": {
-      "acc": 0.26011685782298205,
-      "acc_stderr": 0.00456358696087763,
-      "acc_norm": 0.28251460722787275,
-      "acc_norm_stderr": 0.004683467388784859
-    }
-  },
-  "versions": {
-    "hellaswag_bn": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ca_bloom-7b1.json b/evals/hellaswag/hellaswag_ca_bloom-7b1.json
deleted file mode 100644
index fa322ff2eccfdf62925b1b79ced281791b64de0e..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ca_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ca": {
-      "acc": 0.40186712983065564,
-      "acc_stderr": 0.005108421054557395,
-      "acc_norm": 0.5120495006513244,
-      "acc_norm_stderr": 0.005208233728494265
-    }
-  },
-  "versions": {
-    "hellaswag_ca": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ca_llama-7B.json b/evals/hellaswag/hellaswag_ca_llama-7B.json
deleted file mode 100644
index 4e0b22ebaf8ac031767a3f3ab1e4789d623a3c02..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ca_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ca": {
-      "acc": 0.38460703430308296,
-      "acc_stderr": 0.0050691072999641,
-      "acc_norm": 0.49565783760312637,
-      "acc_norm_stderr": 0.005209550302588167
-    }
-  },
-  "versions": {
-    "hellaswag_ca": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_da_bloom-7b1.json b/evals/hellaswag/hellaswag_da_bloom-7b1.json
deleted file mode 100644
index 248065e86f7721ea28ce5b176e014af8e2c365bf..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_da_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_da": {
-      "acc": 0.2806018269747448,
-      "acc_stderr": 0.00465795256586935,
-      "acc_norm": 0.31176786673831275,
-      "acc_norm_stderr": 0.004802289060894963
-    }
-  },
-  "versions": {
-    "hellaswag_da": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_da_llama-7B.json b/evals/hellaswag/hellaswag_da_llama-7B.json
deleted file mode 100644
index 158172ac8091f5c183cde64b120c8c32ef6b2da7..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_da_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_da": {
-      "acc": 0.3730252552391188,
-      "acc_stderr": 0.005013710932255912,
-      "acc_norm": 0.46695325094035467,
-      "acc_norm_stderr": 0.005172309453152385
-    }
-  },
-  "versions": {
-    "hellaswag_da": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_de_bloom-7b1.json b/evals/hellaswag/hellaswag_de_bloom-7b1.json
deleted file mode 100644
index 1a42078cb7cf48cd71502713357d1faa121702cc..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_de_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_de": {
-      "acc": 0.2982493595217763,
-      "acc_stderr": 0.004726948912322779,
-      "acc_norm": 0.32418872758326217,
-      "acc_norm_stderr": 0.004836279708509382
-    }
-  },
-  "versions": {
-    "hellaswag_de": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_de_llama-7B.json b/evals/hellaswag/hellaswag_de_llama-7B.json
deleted file mode 100644
index a027e43f548e49b4fd7dd60cc606b68dc314cb9d..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_de_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_de": {
-      "acc": 0.39427900523001386,
-      "acc_stderr": 0.005049108443939032,
-      "acc_norm": 0.49855907780979825,
-      "acc_norm_stderr": 0.005165885308732062
-    }
-  },
-  "versions": {
-    "hellaswag_de": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_es_bloom-7b1.json b/evals/hellaswag/hellaswag_es_bloom-7b1.json
deleted file mode 100644
index 7fd9710255ac60d17ca496eac2cdcfe416fd02be..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_es_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_es": {
-      "acc": 0.4372733091529763,
-      "acc_stderr": 0.0051237264293392815,
-      "acc_norm": 0.566567100490719,
-      "acc_norm_stderr": 0.005118554174253425
-    }
-  },
-  "versions": {
-    "hellaswag_es": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_es_llama-7B.json b/evals/hellaswag/hellaswag_es_llama-7B.json
deleted file mode 100644
index 571b2651d1c438f6d95ef828887f685893f506ff..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_es_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_es": {
-      "acc": 0.4311466666666667,
-      "acc_stderr": 0.005115053675969629,
-      "acc_norm": 0.5640533333333333,
-      "acc_norm_stderr": 0.0051217018246512425
-    }
-  },
-  "versions": {
-    "hellaswag_es": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_eu_bloom-7b1.json b/evals/hellaswag/hellaswag_eu_bloom-7b1.json
deleted file mode 100644
index aaa2bac442dd619e5e485a1c9bb7770c1aaad3e8..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_eu_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_eu": {
-      "acc": 0.27380695314187,
-      "acc_stderr": 0.004633608505053738,
-      "acc_norm": 0.31235154394299286,
-      "acc_norm_stderr": 0.00481588516396214
-    }
-  },
-  "versions": {
-    "hellaswag_eu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_eu_llama-7B.json b/evals/hellaswag/hellaswag_eu_llama-7B.json
deleted file mode 100644
index f969135230558d6c41262c2194dbbe0e29c848f6..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_eu_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_eu": {
-      "acc": 0.25847549125458863,
-      "acc_stderr": 0.004549288692503547,
-      "acc_norm": 0.28719499028287626,
-      "acc_norm_stderr": 0.004701591142825526
-    }
-  },
-  "versions": {
-    "hellaswag_eu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_fr_bloom-7b1.json b/evals/hellaswag/hellaswag_fr_bloom-7b1.json
deleted file mode 100644
index 737e5f885ea8810e330462182af605bac6f7338e..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_fr_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_fr": {
-      "acc": 0.4255729278218034,
-      "acc_stderr": 0.005116827391881862,
-      "acc_norm": 0.5656457485542943,
-      "acc_norm_stderr": 0.005129684120180618
-    }
-  },
-  "versions": {
-    "hellaswag_fr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_fr_llama-7B.json b/evals/hellaswag/hellaswag_fr_llama-7B.json
deleted file mode 100644
index 3f0fd2446e8e689f67cfb568e162f7b4dba1a617..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_fr_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_fr": {
-      "acc": 0.4255729278218034,
-      "acc_stderr": 0.00511682739188186,
-      "acc_norm": 0.5566502463054187,
-      "acc_norm_stderr": 0.005141155729141772
-    }
-  },
-  "versions": {
-    "hellaswag_fr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_gu_bloom-7b1.json b/evals/hellaswag/hellaswag_gu_bloom-7b1.json
deleted file mode 100644
index 0ef2b298131daf31fa9c77d37366818ba539e0bb..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_gu_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_gu": {
-      "acc": 0.2683176189935249,
-      "acc_stderr": 0.004722752779022285,
-      "acc_norm": 0.30625922980802,
-      "acc_norm_stderr": 0.0049130651137809294
-    }
-  },
-  "versions": {
-    "hellaswag_gu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_gu_llama-7B.json b/evals/hellaswag/hellaswag_gu_llama-7B.json
deleted file mode 100644
index a610259f2ef19c9db88847c04b399dfcbcc4a463..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_gu_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_gu": {
-      "acc": 0.2560490741792571,
-      "acc_stderr": 0.004652036002377334,
-      "acc_norm": 0.28899238895830964,
-      "acc_norm_stderr": 0.004831585233585411
-    }
-  },
-  "versions": {
-    "hellaswag_gu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_hi_bloom-7b1.json b/evals/hellaswag/hellaswag_hi_bloom-7b1.json
deleted file mode 100644
index 63eeb2a2481895efb7ecade2660f0911184073b6..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_hi_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_hi": {
-      "acc": 0.31202209005947323,
-      "acc_stderr": 0.004774960194792877,
-      "acc_norm": 0.36363636363636365,
-      "acc_norm_stderr": 0.004957653483174718
-    }
-  },
-  "versions": {
-    "hellaswag_hi": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_hi_llama-7B.json b/evals/hellaswag/hellaswag_hi_llama-7B.json
deleted file mode 100644
index 35969545033bac79e8237b539ab99ce740103734..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_hi_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_hi": {
-      "acc": 0.2729396771452846,
-      "acc_stderr": 0.0045910116736375154,
-      "acc_norm": 0.2917374681393373,
-      "acc_norm_stderr": 0.004684713934059222
-    }
-  },
-  "versions": {
-    "hellaswag_hi": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_hr_bloom-7b1.json b/evals/hellaswag/hellaswag_hr_bloom-7b1.json
deleted file mode 100644
index 2571f200efda69d65fed248bfa1462accaa0e80f..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_hr_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_hr": {
-      "acc": 0.27478095640240685,
-      "acc_stderr": 0.004586771132918674,
-      "acc_norm": 0.3000105563179563,
-      "acc_norm_stderr": 0.004708614858618206
-    }
-  },
-  "versions": {
-    "hellaswag_hr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_hr_llama-7B.json b/evals/hellaswag/hellaswag_hr_llama-7B.json
deleted file mode 100644
index 0c8aa308a99a4d9917300d2b6bca88d4fbd44a07..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_hr_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_hr": {
-      "acc": 0.3393856222949435,
-      "acc_stderr": 0.004865190903217322,
-      "acc_norm": 0.41148527393645096,
-      "acc_norm_stderr": 0.005056324888258699
-    }
-  },
-  "versions": {
-    "hellaswag_hr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_hu_bloom-7b1.json b/evals/hellaswag/hellaswag_hu_bloom-7b1.json
deleted file mode 100644
index cfb0859d6479ddd7e6caa9ab28436da9061fafe0..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_hu_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_hu": {
-      "acc": 0.2749780893952673,
-      "acc_stderr": 0.004673697346652944,
-      "acc_norm": 0.30127081507449605,
-      "acc_norm_stderr": 0.004802517407348953
-    }
-  },
-  "versions": {
-    "hellaswag_hu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_hu_llama-7B.json b/evals/hellaswag/hellaswag_hu_llama-7B.json
deleted file mode 100644
index 7f1300419e0e8727d8da4787e4074336d82c6d64..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_hu_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_hu": {
-      "acc": 0.31879929886064856,
-      "acc_stderr": 0.004877892181685683,
-      "acc_norm": 0.3785056967572305,
-      "acc_norm_stderr": 0.005076808255387223
-    }
-  },
-  "versions": {
-    "hellaswag_hu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_hy_bloom-7b1.json b/evals/hellaswag/hellaswag_hy_bloom-7b1.json
deleted file mode 100644
index b7aadfc69e7d37e7a69be9da8de7f6f479daa078..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_hy_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_hy": {
-      "acc": 0.2517377201112141,
-      "acc_stderr": 0.00467165233929534,
-      "acc_norm": 0.2761816496756256,
-      "acc_norm_stderr": 0.004812620824973181
-    }
-  },
-  "versions": {
-    "hellaswag_hy": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_hy_llama-7B.json b/evals/hellaswag/hellaswag_hy_llama-7B.json
deleted file mode 100644
index 85198baf9a0a8e2dcb229b74cd9c22b5421c95b3..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_hy_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_hy": {
-      "acc": 0.2545180722891566,
-      "acc_stderr": 0.004688644596808388,
-      "acc_norm": 0.2849860982391103,
-      "acc_norm_stderr": 0.004858906279128767
-    }
-  },
-  "versions": {
-    "hellaswag_hy": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_id_bloom-7b1.json b/evals/hellaswag/hellaswag_id_bloom-7b1.json
deleted file mode 100644
index b4bcc31e157c6a9c8fc29d08fd6088001c2a4e2b..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_id_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_id": {
-      "acc": 0.3894849785407725,
-      "acc_stderr": 0.005051366474018924,
-      "acc_norm": 0.49484978540772534,
-      "acc_norm_stderr": 0.005179195541251435
-    }
-  },
-  "versions": {
-    "hellaswag_id": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_id_llama-7B.json b/evals/hellaswag/hellaswag_id_llama-7B.json
deleted file mode 100644
index d408a6b8209abf2afa7b33e28f960ce7cf71596b..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_id_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_id": {
-      "acc": 0.3017167381974249,
-      "acc_stderr": 0.004754784760510309,
-      "acc_norm": 0.34431330472103006,
-      "acc_norm_stderr": 0.004921986658657097
-    }
-  },
-  "versions": {
-    "hellaswag_id": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_it_bloom-7b1.json b/evals/hellaswag/hellaswag_it_bloom-7b1.json
deleted file mode 100644
index f071bbb39cf2e6048f33a2ac1444d8d24657c9ab..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_it_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_it": {
-      "acc": 0.33380465520991953,
-      "acc_stderr": 0.004918337887582365,
-      "acc_norm": 0.40765716771807703,
-      "acc_norm_stderr": 0.005125137013353996
-    }
-  },
-  "versions": {
-    "hellaswag_it": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_it_llama-7B.json b/evals/hellaswag/hellaswag_it_llama-7B.json
deleted file mode 100644
index 2698d8e1b02654e67b142631369916d337041789..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_it_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_it": {
-      "acc": 0.3975851191123681,
-      "acc_stderr": 0.0051045551272873,
-      "acc_norm": 0.5201783966061133,
-      "acc_norm_stderr": 0.005210879697577827
-    }
-  },
-  "versions": {
-    "hellaswag_it": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_kn_bloom-7b1.json b/evals/hellaswag/hellaswag_kn_bloom-7b1.json
deleted file mode 100644
index ec110ed487575de37a4630739da2ee9264bd8d08..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_kn_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_kn": {
-      "acc": 0.26337169939065674,
-      "acc_stderr": 0.004679154494054024,
-      "acc_norm": 0.30275332881967953,
-      "acc_norm_stderr": 0.004880859653925846
-    }
-  },
-  "versions": {
-    "hellaswag_kn": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_kn_llama-7B.json b/evals/hellaswag/hellaswag_kn_llama-7B.json
deleted file mode 100644
index 219c76670fe5ee2040cfa43d6e6360e4684a6fe4..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_kn_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_kn": {
-      "acc": 0.25603701196118256,
-      "acc_stderr": 0.004636450973386679,
-      "acc_norm": 0.2887610020311442,
-      "acc_norm_stderr": 0.0048143280788988845
-    }
-  },
-  "versions": {
-    "hellaswag_kn": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ml_bloom-7b1.json b/evals/hellaswag/hellaswag_ml_bloom-7b1.json
deleted file mode 100644
index a4de930d07f3cb8e48668e5b5f1b53560c0ff7f1..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ml_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ml": {
-      "acc": 0.25444979290272024,
-      "acc_stderr": 0.004608558887983242,
-      "acc_norm": 0.2878092466136796,
-      "acc_norm_stderr": 0.004790448543019756
-    }
-  },
-  "versions": {
-    "hellaswag_ml": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ml_llama-7B.json b/evals/hellaswag/hellaswag_ml_llama-7B.json
deleted file mode 100644
index d0fff179c59dc9c44b1a6de207bcba30d72726a7..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ml_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ml": {
-      "acc": 0.2510914586365163,
-      "acc_stderr": 0.004588344357712618,
-      "acc_norm": 0.2890406358446211,
-      "acc_norm_stderr": 0.004796533523475371
-    }
-  },
-  "versions": {
-    "hellaswag_ml": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_mr_bloom-7b1.json b/evals/hellaswag/hellaswag_mr_bloom-7b1.json
deleted file mode 100644
index 5768dcee263277655dc8087f17858a884c937b53..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_mr_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_mr": {
-      "acc": 0.2701799762905486,
-      "acc_stderr": 0.004610067484763786,
-      "acc_norm": 0.3100549628192693,
-      "acc_norm_stderr": 0.004801748474056546
-    }
-  },
-  "versions": {
-    "hellaswag_mr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_mr_llama-7B.json b/evals/hellaswag/hellaswag_mr_llama-7B.json
deleted file mode 100644
index 6c3e2cc455a43fee3f289e2eab0831003b552a30..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_mr_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_mr": {
-      "acc": 0.2592951826705464,
-      "acc_stderr": 0.004549803334314971,
-      "acc_norm": 0.2879620648776808,
-      "acc_norm_stderr": 0.004701019162604622
-    }
-  },
-  "versions": {
-    "hellaswag_mr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ne_bloom-7b1.json b/evals/hellaswag/hellaswag_ne_bloom-7b1.json
deleted file mode 100644
index 3b95e1d5f31b1e69f29c233339889469700c84bd..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ne_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ne": {
-      "acc": 0.27441511053874224,
-      "acc_stderr": 0.004622852940386713,
-      "acc_norm": 0.30897188237819273,
-      "acc_norm_stderr": 0.004787064632332303
-    }
-  },
-  "versions": {
-    "hellaswag_ne": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ne_llama-7B.json b/evals/hellaswag/hellaswag_ne_llama-7B.json
deleted file mode 100644
index 8c4989d19a23d592896ca0b4e6fded1f62cc01f3..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ne_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ne": {
-      "acc": 0.264112470487229,
-      "acc_stderr": 0.004567327225923831,
-      "acc_norm": 0.28171281390856406,
-      "acc_norm_stderr": 0.00466030469849661
-    }
-  },
-  "versions": {
-    "hellaswag_ne": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json b/evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json
new file mode 100644
index 0000000000000000000000000000000000000000..6839e4e1493b60821794c1a23f7ba02b789cfc95
--- /dev/null
+++ b/evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "hellaswag_nl": {
+      "acc": 0.38467350242849435,
+      "acc_stderr": 0.005054749888300686,
+      "acc_norm": 0.4823529411764706,
+      "acc_norm_stderr": 0.005191586180318448
+    }
+  },
+  "versions": {
+    "hellaswag_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 64,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json b/evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json
new file mode 100644
index 0000000000000000000000000000000000000000..745826da641f1d99a1211c46e7cdb0d94765fe6f
--- /dev/null
+++ b/evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "hellaswag_nl": {
+      "acc": 0.3878035617916892,
+      "acc_stderr": 0.005062348307428708,
+      "acc_norm": 0.5000539665407447,
+      "acc_norm_stderr": 0.005194822688012659
+    }
+  },
+  "versions": {
+    "hellaswag_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 64,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json b/evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json
new file mode 100644
index 0000000000000000000000000000000000000000..36155d9b2cec2a6c48e7e134b99f453d80e9b75f
--- /dev/null
+++ b/evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "hellaswag_nl": {
+      "acc": 0.43486238532110094,
+      "acc_stderr": 0.005150551758279897,
+      "acc_norm": 0.5676200755531571,
+      "acc_norm_stderr": 0.005147097096977192
+    }
+  },
+  "versions": {
+    "hellaswag_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 64,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_nl_zephyr-7b-beta.json b/evals/hellaswag/hellaswag_nl_zephyr-7b-beta.json
new file mode 100644
index 0000000000000000000000000000000000000000..5fa9f92c70efa809176a3aff63d8a88be3e78172
--- /dev/null
+++ b/evals/hellaswag/hellaswag_nl_zephyr-7b-beta.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "hellaswag_nl": {
+      "acc": 0.4478143550998381,
+      "acc_stderr": 0.005166450687025188,
+      "acc_norm": 0.575067458175931,
+      "acc_norm_stderr": 0.005135942094754352
+    }
+  },
+  "versions": {
+    "hellaswag_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 64,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_pt_bloom-7b1.json b/evals/hellaswag/hellaswag_pt_bloom-7b1.json
deleted file mode 100644
index 5050ad2ec66e4750cc93be5c7e0c4c942051e7a9..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_pt_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_pt": {
-      "acc": 0.4227977028930545,
-      "acc_stderr": 0.005142526543466809,
-      "acc_norm": 0.5511973128182902,
-      "acc_norm_stderr": 0.005177587858629525
-    }
-  },
-  "versions": {
-    "hellaswag_pt": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_pt_llama-7B.json b/evals/hellaswag/hellaswag_pt_llama-7B.json
deleted file mode 100644
index 7ec9536f323c0aa592fcadb1d9e1333cd323941d..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_pt_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_pt": {
-      "acc": 0.4037273810813739,
-      "acc_stderr": 0.005107551363682552,
-      "acc_norm": 0.532343699209015,
-      "acc_norm_stderr": 0.005194044440586472
-    }
-  },
-  "versions": {
-    "hellaswag_pt": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ro_bloom-7b1.json b/evals/hellaswag/hellaswag_ro_bloom-7b1.json
deleted file mode 100644
index dafe7356bdb6ae258020ac1efcc6169d4f31dd20..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ro_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ro": {
-      "acc": 0.2795024337479719,
-      "acc_stderr": 0.00466744369483023,
-      "acc_norm": 0.3182260681449432,
-      "acc_norm_stderr": 0.004844601996973363
-    }
-  },
-  "versions": {
-    "hellaswag_ro": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ro_llama-7B.json b/evals/hellaswag/hellaswag_ro_llama-7B.json
deleted file mode 100644
index 03cce6eee60bd007c3835cca157c9f654b0774a7..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ro_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ro": {
-      "acc": 0.36041103299080585,
-      "acc_stderr": 0.004993666697380137,
-      "acc_norm": 0.4491076257436452,
-      "acc_norm_stderr": 0.005173430588992903
-    }
-  },
-  "versions": {
-    "hellaswag_ro": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ru_bloom-7b1.json b/evals/hellaswag/hellaswag_ru_bloom-7b1.json
deleted file mode 100644
index a1114c4bc91539820ff9a813a92206eb0b0aaf89..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ru_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ru": {
-      "acc": 0.2975625539257981,
-      "acc_stderr": 0.004748207348707273,
-      "acc_norm": 0.32538826574633306,
-      "acc_norm_stderr": 0.004865915900810558
-    }
-  },
-  "versions": {
-    "hellaswag_ru": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ru_llama-7B.json b/evals/hellaswag/hellaswag_ru_llama-7B.json
deleted file mode 100644
index 9da4ad4e94c2effcad5429b563495f832b369727..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ru_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ru": {
-      "acc": 0.370685936151855,
-      "acc_stderr": 0.005016184279255606,
-      "acc_norm": 0.4568593615185505,
-      "acc_norm_stderr": 0.005173496063169706
-    }
-  },
-  "versions": {
-    "hellaswag_ru": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_sk_bloom-7b1.json b/evals/hellaswag/hellaswag_sk_bloom-7b1.json
deleted file mode 100644
index a452682d669ca439c37ef65351b2482280cb6a25..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_sk_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_sk": {
-      "acc": 0.27053241960991037,
-      "acc_stderr": 0.004561596675422169,
-      "acc_norm": 0.2981549815498155,
-      "acc_norm_stderr": 0.004697273773957717
-    }
-  },
-  "versions": {
-    "hellaswag_sk": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_sk_llama-7B.json b/evals/hellaswag/hellaswag_sk_llama-7B.json
deleted file mode 100644
index 7720fc7912fd16392b3c8ddc4e66fd5530405fce..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_sk_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_sk": {
-      "acc": 0.30173958882445967,
-      "acc_stderr": 0.004713343422332119,
-      "acc_norm": 0.35888244596731683,
-      "acc_norm_stderr": 0.004925486913523139
-    }
-  },
-  "versions": {
-    "hellaswag_sk": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_sr_bloom-7b1.json b/evals/hellaswag/hellaswag_sr_bloom-7b1.json
deleted file mode 100644
index 2d4dc8c27d8e4e36fa4c1a0c3d9a5431716e620d..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_sr_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_sr": {
-      "acc": 0.27748968144777225,
-      "acc_stderr": 0.004606546970716383,
-      "acc_norm": 0.29855011112287017,
-      "acc_norm_stderr": 0.004708005935082949
-    }
-  },
-  "versions": {
-    "hellaswag_sr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_sr_llama-7B.json b/evals/hellaswag/hellaswag_sr_llama-7B.json
deleted file mode 100644
index 05dc0fdc8921cb49fe2182f475f6d81e20eb5990..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_sr_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_sr": {
-      "acc": 0.3437400783151656,
-      "acc_stderr": 0.004886333271945336,
-      "acc_norm": 0.41147211345115886,
-      "acc_norm_stderr": 0.005062718548853834
-    }
-  },
-  "versions": {
-    "hellaswag_sr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_sv_bloom-7b1.json b/evals/hellaswag/hellaswag_sv_bloom-7b1.json
deleted file mode 100644
index 4ebba6534a5e9e09a423a4422dbf2aae81a1bc02..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_sv_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_sv": {
-      "acc": 0.27647445735584303,
-      "acc_stderr": 0.0046830976447929905,
-      "acc_norm": 0.3101293575970182,
-      "acc_norm_stderr": 0.0048432182915872585
-    }
-  },
-  "versions": {
-    "hellaswag_sv": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_sv_llama-7B.json b/evals/hellaswag/hellaswag_sv_llama-7B.json
deleted file mode 100644
index ee471bcb53bf2f1459136089da2f6e7ae0cdafd1..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_sv_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_sv": {
-      "acc": 0.3857706643279982,
-      "acc_stderr": 0.005096929762325147,
-      "acc_norm": 0.5051523788642841,
-      "acc_norm_stderr": 0.005235108858635741
-    }
-  },
-  "versions": {
-    "hellaswag_sv": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ta_bloom-7b1.json b/evals/hellaswag/hellaswag_ta_bloom-7b1.json
deleted file mode 100644
index 584724a6119d2433aab6e11c1971faf29ca9ce8f..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ta_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ta": {
-      "acc": 0.2588850588375134,
-      "acc_stderr": 0.004775805657688067,
-      "acc_norm": 0.29406870319743256,
-      "acc_norm_stderr": 0.0049677071891109335
-    }
-  },
-  "versions": {
-    "hellaswag_ta": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_ta_llama-7B.json b/evals/hellaswag/hellaswag_ta_llama-7B.json
deleted file mode 100644
index 2d69d8dc8c743704b031d2ef3894db3a70bb4c9a..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_ta_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_ta": {
-      "acc": 0.25329846665874245,
-      "acc_stderr": 0.004741766564082548,
-      "acc_norm": 0.28313324616664687,
-      "acc_norm_stderr": 0.004912075369610396
-    }
-  },
-  "versions": {
-    "hellaswag_ta": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_te_bloom-7b1.json b/evals/hellaswag/hellaswag_te_bloom-7b1.json
deleted file mode 100644
index 5052ea04c62ee79955014621885295121b68fc76..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_te_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_te": {
-      "acc": 0.26123337918386064,
-      "acc_stderr": 0.00470365034659896,
-      "acc_norm": 0.2922971114167813,
-      "acc_norm_stderr": 0.004869729181749992
-    }
-  },
-  "versions": {
-    "hellaswag_te": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_te_llama-7B.json b/evals/hellaswag/hellaswag_te_llama-7B.json
deleted file mode 100644
index 7bce32700aa0c1c9e176ddae4994c8d3a2b22f3b..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_te_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_te": {
-      "acc": 0.25767996331957815,
-      "acc_stderr": 0.0046827716491321504,
-      "acc_norm": 0.28931682714351215,
-      "acc_norm_stderr": 0.004855030101325898
-    }
-  },
-  "versions": {
-    "hellaswag_te": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_uk_bloom-7b1.json b/evals/hellaswag/hellaswag_uk_bloom-7b1.json
deleted file mode 100644
index cd933afdab71857ec060d16116e0f044d23e7a50..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_uk_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_uk": {
-      "acc": 0.2781379530237007,
-      "acc_stderr": 0.004619644722138738,
-      "acc_norm": 0.30035072802635776,
-      "acc_norm_stderr": 0.004726132393644123
-    }
-  },
-  "versions": {
-    "hellaswag_uk": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_uk_llama-7B.json b/evals/hellaswag/hellaswag_uk_llama-7B.json
deleted file mode 100644
index 545af16e16507026332c8c8c7836ef6d20ccae00..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_uk_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_uk": {
-      "acc": 0.3544720628850648,
-      "acc_stderr": 0.0049304266046324334,
-      "acc_norm": 0.4412577012959422,
-      "acc_norm_stderr": 0.005117854029524533
-    }
-  },
-  "versions": {
-    "hellaswag_uk": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_vi_bloom-7b1.json b/evals/hellaswag/hellaswag_vi_bloom-7b1.json
deleted file mode 100644
index 686132db373135123f1b9720642bfd294f99f328..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_vi_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_vi": {
-      "acc": 0.3836498581095831,
-      "acc_stderr": 0.0050805394682356675,
-      "acc_norm": 0.4827548570181183,
-      "acc_norm_stderr": 0.005220836527919318
-    }
-  },
-  "versions": {
-    "hellaswag_vi": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_vi_llama-7B.json b/evals/hellaswag/hellaswag_vi_llama-7B.json
deleted file mode 100644
index 816307d9258b275603ae30ffb36851a8b3475dd9..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_vi_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_vi": {
-      "acc": 0.27865094957432873,
-      "acc_stderr": 0.004684158200782215,
-      "acc_norm": 0.31608819035145164,
-      "acc_norm_stderr": 0.0048577229826674215
-    }
-  },
-  "versions": {
-    "hellaswag_vi": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_zh_bloom-7b1.json b/evals/hellaswag/hellaswag_zh_bloom-7b1.json
deleted file mode 100644
index 30ac380919e1d6d2c44c46e941ae3dc9929982e1..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_zh_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_zh": {
-      "acc": 0.38851715950787824,
-      "acc_stderr": 0.005063776486157121,
-      "acc_norm": 0.5115475933520397,
-      "acc_norm_stderr": 0.005193156826942953
-    }
-  },
-  "versions": {
-    "hellaswag_zh": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/hellaswag/hellaswag_zh_llama-7B.json b/evals/hellaswag/hellaswag_zh_llama-7B.json
deleted file mode 100644
index b0d393a5879535e46dfb92d3361d469fc71f97b7..0000000000000000000000000000000000000000
--- a/evals/hellaswag/hellaswag_zh_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_zh": {
-      "acc": 0.32358653431160983,
-      "acc_stderr": 0.004859949552176753,
-      "acc_norm": 0.3945835131635736,
-      "acc_norm_stderr": 0.0050772319918162435
-    }
-  },
-  "versions": {
-    "hellaswag_zh": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ar-bloom-7b1.json b/evals/mmlu/mmlu_ar-bloom-7b1.json
deleted file mode 100644
index b6e593af4922000fb94fdaab7a48477f593319ba..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ar-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ar": {
-      "acc": 0.26531559405940597,
-      "acc_stderr": 0.0038831388933726414,
-      "acc_norm": 0.2754486386138614,
-      "acc_norm_stderr": 0.003929217133330591
-    }
-  },
-  "versions": {
-    "mmlu_ar": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ar-llama-7B.json b/evals/mmlu/mmlu_ar-llama-7B.json
deleted file mode 100644
index f601d0a0a213c652ffd5519a7454ba2a537af3fc..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ar-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ar": {
-      "acc": 0.2589727722772277,
-      "acc_stderr": 0.0038529667515366556,
-      "acc_norm": 0.2797803217821782,
-      "acc_norm_stderr": 0.003948136869379606
-    }
-  },
-  "versions": {
-    "mmlu_ar": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_bn-bloom-7b1.json b/evals/mmlu/mmlu_bn-bloom-7b1.json
deleted file mode 100644
index 89c8ade0841c9df16a86355a7b703e726726acfa..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_bn-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_bn": {
-      "acc": 0.2671137646192852,
-      "acc_stderr": 0.004001512896559074,
-      "acc_norm": 0.28150813772797906,
-      "acc_norm_stderr": 0.004067374934957544
-    }
-  },
-  "versions": {
-    "mmlu_bn": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_bn-llama-7B.json b/evals/mmlu/mmlu_bn-llama-7B.json
deleted file mode 100644
index da3322aaf303ad70cf3667aba1a4d73764af5fdc..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_bn-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_bn": {
-      "acc": 0.2501022327635561,
-      "acc_stderr": 0.0039166757490002955,
-      "acc_norm": 0.28461601374008344,
-      "acc_norm_stderr": 0.0040809105667388166
-    }
-  },
-  "versions": {
-    "mmlu_bn": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ca-bloom-7b1.json b/evals/mmlu/mmlu_ca-bloom-7b1.json
deleted file mode 100644
index b760f91f32565b551455d9bf715837b34540ec24..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ca-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ca": {
-      "acc": 0.2785041045910611,
-      "acc_stderr": 0.003908294722890792,
-      "acc_norm": 0.28785345089692915,
-      "acc_norm_stderr": 0.003947525835346328
-    }
-  },
-  "versions": {
-    "mmlu_ca": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ca-llama-7B.json b/evals/mmlu/mmlu_ca-llama-7B.json
deleted file mode 100644
index 5183b4df5346ae0e0aa74c3166323602507c4598..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ca-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ca": {
-      "acc": 0.3038917604134995,
-      "acc_stderr": 0.004010074337091965,
-      "acc_norm": 0.3022955305564001,
-      "acc_norm_stderr": 0.004004111747979521
-    }
-  },
-  "versions": {
-    "mmlu_ca": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_da-bloom-7b1.json b/evals/mmlu/mmlu_da-bloom-7b1.json
deleted file mode 100644
index 5b81f4f5ab7529c0d7efd0c3b2c040d9e4643cc2..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_da-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_da": {
-      "acc": 0.2557170982886567,
-      "acc_stderr": 0.0037964676375075402,
-      "acc_norm": 0.2705588368923217,
-      "acc_norm_stderr": 0.003865954982495375
-    }
-  },
-  "versions": {
-    "mmlu_da": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_da-llama-7B.json b/evals/mmlu/mmlu_da-llama-7B.json
deleted file mode 100644
index f4957b8b53a4880a0eac49ccabcab4a8c6a584c2..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_da-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_da": {
-      "acc": 0.2997122520066636,
-      "acc_stderr": 0.003986771176689293,
-      "acc_norm": 0.2995608056943813,
-      "acc_norm_stderr": 0.003986194743561357
-    }
-  },
-  "versions": {
-    "mmlu_da": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_de-bloom-7b1.json b/evals/mmlu/mmlu_de-bloom-7b1.json
deleted file mode 100644
index 40c8412a571fbf0d4f63f6290e66bfbbab5fa943..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_de-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_de": {
-      "acc": 0.2670085985819882,
-      "acc_stderr": 0.0038422837632401587,
-      "acc_norm": 0.2812641424045859,
-      "acc_norm_stderr": 0.003904983582450586
-    }
-  },
-  "versions": {
-    "mmlu_de": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_de-llama-7B.json b/evals/mmlu/mmlu_de-llama-7B.json
deleted file mode 100644
index 48403f057f5a6bffdb9e4cb2644c286f80b5ccf0..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_de-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_de": {
-      "acc": 0.3045708251621662,
-      "acc_stderr": 0.003997127255569371,
-      "acc_norm": 0.2988384371700106,
-      "acc_norm_stderr": 0.003975618018830569
-    }
-  },
-  "versions": {
-    "mmlu_de": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_es-bloom-7b1.json b/evals/mmlu/mmlu_es-bloom-7b1.json
deleted file mode 100644
index 1ca552b581fe950c76b7e801b8922438a03f50b6..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_es-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_es": {
-      "acc": 0.2846857657117144,
-      "acc_stderr": 0.00390811532232558,
-      "acc_norm": 0.28926053697315135,
-      "acc_norm_stderr": 0.003926773662056655
-    }
-  },
-  "versions": {
-    "mmlu_es": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_es-llama-7B.json b/evals/mmlu/mmlu_es-llama-7B.json
deleted file mode 100644
index 6c5c8136a88729662690739c773310e7e60685c7..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_es-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_es": {
-      "acc": 0.30808459577021147,
-      "acc_stderr": 0.00399850416060033,
-      "acc_norm": 0.30268486575671216,
-      "acc_norm_stderr": 0.0039787436578546075
-    }
-  },
-  "versions": {
-    "mmlu_es": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_eu-bloom-7b1.json b/evals/mmlu/mmlu_eu-bloom-7b1.json
deleted file mode 100644
index bd26e106ebaee3484061fd6d78bd4e9d52579fcd..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_eu-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_eu": {
-      "acc": 0.2576611914684972,
-      "acc_stderr": 0.003953719493412054,
-      "acc_norm": 0.2735147503473073,
-      "acc_norm_stderr": 0.0040298051028790725
-    }
-  },
-  "versions": {
-    "mmlu_eu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_eu-llama-7B.json b/evals/mmlu/mmlu_eu-llama-7B.json
deleted file mode 100644
index cbf5d4151c1d0c86b7232d6cbc1cc4623fafce36..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_eu-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_eu": {
-      "acc": 0.2668954809185258,
-      "acc_stderr": 0.003998838127920185,
-      "acc_norm": 0.27923510664378526,
-      "acc_norm_stderr": 0.00405566512057356
-    }
-  },
-  "versions": {
-    "mmlu_eu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_fr-bloom-7b1.json b/evals/mmlu/mmlu_fr-bloom-7b1.json
deleted file mode 100644
index 518cf70d5d420bdf6c38c7dc1d83ad8289360cb0..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_fr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_fr": {
-      "acc": 0.2887479948055916,
-      "acc_stderr": 0.0039609687595635185,
-      "acc_norm": 0.29860209304102053,
-      "acc_norm_stderr": 0.003999989334139082
-    }
-  },
-  "versions": {
-    "mmlu_fr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_fr-llama-7B.json b/evals/mmlu/mmlu_fr-llama-7B.json
deleted file mode 100644
index e22bb03037c1bf7eebd47d64ccb10e43eca00210..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_fr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_fr": {
-      "acc": 0.318997784737606,
-      "acc_stderr": 0.004073786574740586,
-      "acc_norm": 0.3054006569398824,
-      "acc_norm_stderr": 0.00402561598834305
-    }
-  },
-  "versions": {
-    "mmlu_fr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_gu-bloom-7b1.json b/evals/mmlu/mmlu_gu-bloom-7b1.json
deleted file mode 100644
index 08db474bfffcd53c11f37cca5a5523de19ab27b2..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_gu-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_gu": {
-      "acc": 0.24933390631714655,
-      "acc_stderr": 0.004010971174274014,
-      "acc_norm": 0.26566394499355395,
-      "acc_norm_stderr": 0.004094955673385403
-    }
-  },
-  "versions": {
-    "mmlu_gu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_gu-llama-7B.json b/evals/mmlu/mmlu_gu-llama-7B.json
deleted file mode 100644
index 2236b1f5ac01a2de4772fb6fde41222398119985..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_gu-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_gu": {
-      "acc": 0.24391920928233776,
-      "acc_stderr": 0.003981461991912142,
-      "acc_norm": 0.27382896433175763,
-      "acc_norm_stderr": 0.0041342298983896774
-    }
-  },
-  "versions": {
-    "mmlu_gu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hi-bloom-7b1.json b/evals/mmlu/mmlu_hi-bloom-7b1.json
deleted file mode 100644
index 8402e114c7f1914a4c05f4a1f91ecb4aad9df2d8..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_hi-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_hi": {
-      "acc": 0.2666237838707084,
-      "acc_stderr": 0.00396526756671177,
-      "acc_norm": 0.2751467395674198,
-      "acc_norm_stderr": 0.004004671316183439
-    }
-  },
-  "versions": {
-    "mmlu_hi": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hi-llama-7B.json b/evals/mmlu/mmlu_hi-llama-7B.json
deleted file mode 100644
index b9c9d981a7d61e96d94d6c128b4ccfc3f3b0f0e6..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_hi-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_hi": {
-      "acc": 0.2549650237195465,
-      "acc_stderr": 0.003908303467263245,
-      "acc_norm": 0.27860416499155743,
-      "acc_norm_stderr": 0.0040201315154066415
-    }
-  },
-  "versions": {
-    "mmlu_hi": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hr-bloom-7b1.json b/evals/mmlu/mmlu_hr-bloom-7b1.json
deleted file mode 100644
index 11c2e3822a0ada199f63dd7adb04e6c604d3151e..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_hr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_hr": {
-      "acc": 0.25448737450562825,
-      "acc_stderr": 0.0037988075329188904,
-      "acc_norm": 0.26954669911773654,
-      "acc_norm_stderr": 0.0038699014491549413
-    }
-  },
-  "versions": {
-    "mmlu_hr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hr-llama-7B.json b/evals/mmlu/mmlu_hr-llama-7B.json
deleted file mode 100644
index b2f5ca1c97a96e3d94fd3ae5c2603632e633b975..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_hr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_hr": {
-      "acc": 0.294721630666261,
-      "acc_stderr": 0.003976243355939721,
-      "acc_norm": 0.2931244295710374,
-      "acc_norm_stderr": 0.003969942004520753
-    }
-  },
-  "versions": {
-    "mmlu_hr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hu-bloom-7b1.json b/evals/mmlu/mmlu_hu-bloom-7b1.json
deleted file mode 100644
index b5cd6a42f13e7a2790a24766a0455177825ac001..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_hu-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_hu": {
-      "acc": 0.25,
-      "acc_stderr": 0.0037944175097970817,
-      "acc_norm": 0.269041769041769,
-      "acc_norm_stderr": 0.0038859804834747223
-    }
-  },
-  "versions": {
-    "mmlu_hu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hu-llama-7B.json b/evals/mmlu/mmlu_hu-llama-7B.json
deleted file mode 100644
index b74a19de5e6654aef46cf40427dc362a330fa08e..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_hu-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_hu": {
-      "acc": 0.27794840294840295,
-      "acc_stderr": 0.0039256419656824035,
-      "acc_norm": 0.29000307125307123,
-      "acc_norm_stderr": 0.0039762530331634354
-    }
-  },
-  "versions": {
-    "mmlu_hu": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hy-bloom-7b1.json b/evals/mmlu/mmlu_hy-bloom-7b1.json
deleted file mode 100644
index 5b33b978463855a30343b21fc48c4d5eeefe9ed4..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_hy-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_hy": {
-      "acc": 0.24754384354053807,
-      "acc_stderr": 0.004135735206626923,
-      "acc_norm": 0.2570930125791938,
-      "acc_norm_stderr": 0.004187920399106458
-    }
-  },
-  "versions": {
-    "mmlu_hy": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hy-llama-7B.json b/evals/mmlu/mmlu_hy-llama-7B.json
deleted file mode 100644
index c10ca85321ddad4c7be01b48cec4e49a1e214777..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_hy-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_hy": {
-      "acc": 0.24800293820585806,
-      "acc_stderr": 0.004138305469907604,
-      "acc_norm": 0.2746304287944174,
-      "acc_norm_stderr": 0.004277007917763834
-    }
-  },
-  "versions": {
-    "mmlu_hy": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_id-bloom-7b1.json b/evals/mmlu/mmlu_id-bloom-7b1.json
deleted file mode 100644
index eab2b6f207224be214da56e0b7642b6e08ab6522..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_id-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_id": {
-      "acc": 0.26631554843141747,
-      "acc_stderr": 0.0038620444798720234,
-      "acc_norm": 0.28058926799480954,
-      "acc_norm_stderr": 0.003925439934317792
-    }
-  },
-  "versions": {
-    "mmlu_id": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_id-llama-7B.json b/evals/mmlu/mmlu_id-llama-7B.json
deleted file mode 100644
index b6135824ebca4f3650da00511c33cc4a21bfb152..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_id-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_id": {
-      "acc": 0.2795969773299748,
-      "acc_stderr": 0.003921194198043396,
-      "acc_norm": 0.2895962140294634,
-      "acc_norm_stderr": 0.003962902849695825
-    }
-  },
-  "versions": {
-    "mmlu_id": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_it-bloom-7b1.json b/evals/mmlu/mmlu_it-bloom-7b1.json
deleted file mode 100644
index f1fd4d72695bef88e7d84fea1cef3fe7a204b1d4..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_it-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_it": {
-      "acc": 0.26161516960036263,
-      "acc_stderr": 0.0038202735800333108,
-      "acc_norm": 0.2760444209413009,
-      "acc_norm_stderr": 0.0038856803174993136
-    }
-  },
-  "versions": {
-    "mmlu_it": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_it-llama-7B.json b/evals/mmlu/mmlu_it-llama-7B.json
deleted file mode 100644
index 4911cc10b24667a5ceebaa64adfc01511364c093..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_it-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_it": {
-      "acc": 0.29848152904736724,
-      "acc_stderr": 0.003977405833855968,
-      "acc_norm": 0.29901034977713986,
-      "acc_norm_stderr": 0.003979426926074157
-    }
-  },
-  "versions": {
-    "mmlu_it": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_kn-bloom-7b1.json b/evals/mmlu/mmlu_kn-bloom-7b1.json
deleted file mode 100644
index cdc6e7a6340ce902630293fdf1c6020b92559efd..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_kn-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_kn": {
-      "acc": 0.24622316459051152,
-      "acc_stderr": 0.0040494962676919264,
-      "acc_norm": 0.26716141001855287,
-      "acc_norm_stderr": 0.004159165326445932
-    }
-  },
-  "versions": {
-    "mmlu_kn": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_kn-llama-7B.json b/evals/mmlu/mmlu_kn-llama-7B.json
deleted file mode 100644
index 606fb0050e37b38e833800c8c6787674d6157cca..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_kn-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_kn": {
-      "acc": 0.23933209647495363,
-      "acc_stderr": 0.004010635314254899,
-      "acc_norm": 0.27096033218482196,
-      "acc_norm_stderr": 0.004177761014860752
-    }
-  },
-  "versions": {
-    "mmlu_kn": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ml-bloom-7b1.json b/evals/mmlu/mmlu_ml-bloom-7b1.json
deleted file mode 100644
index 0dfd9c349dd00e3ccd1fece3fcf4c414525835bb..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ml-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ml": {
-      "acc": 0.24646354733405876,
-      "acc_stderr": 0.0041039285720239,
-      "acc_norm": 0.26414581066376497,
-      "acc_norm_stderr": 0.0041984507173371734
-    }
-  },
-  "versions": {
-    "mmlu_ml": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ml-llama-7B.json b/evals/mmlu/mmlu_ml-llama-7B.json
deleted file mode 100644
index 1dc1ffa8a7a5300db7121f13be137d91ddd33088..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ml-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ml": {
-      "acc": 0.24492201668480232,
-      "acc_stderr": 0.0040952567017621564,
-      "acc_norm": 0.27529923830250275,
-      "acc_norm_stderr": 0.004253566006101179
-    }
-  },
-  "versions": {
-    "mmlu_ml": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_mr-bloom-7b1.json b/evals/mmlu/mmlu_mr-bloom-7b1.json
deleted file mode 100644
index de6dc10fd113d66213dca64afc3849f020f6285e..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_mr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_mr": {
-      "acc": 0.2495736213757817,
-      "acc_stderr": 0.003900219801135433,
-      "acc_norm": 0.26289287744660117,
-      "acc_norm_stderr": 0.003967257688070526
-    }
-  },
-  "versions": {
-    "mmlu_mr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_mr-llama-7B.json b/evals/mmlu/mmlu_mr-llama-7B.json
deleted file mode 100644
index a68274469ffcdac51ed2534e328a082e752259d5..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_mr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_mr": {
-      "acc": 0.24941119142369853,
-      "acc_stderr": 0.0038993723464080766,
-      "acc_norm": 0.2784861528465849,
-      "acc_norm_stderr": 0.004039799718714403
-    }
-  },
-  "versions": {
-    "mmlu_mr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ne-bloom-7b1.json b/evals/mmlu/mmlu_ne-bloom-7b1.json
deleted file mode 100644
index 63db04e7a0d9e7387ac032f7c649cd67f1996ea4..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ne-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ne": {
-      "acc": 0.2568858909499719,
-      "acc_stderr": 0.003915419717331052,
-      "acc_norm": 0.2658797077009556,
-      "acc_norm_stderr": 0.0039591928340292366
-    }
-  },
-  "versions": {
-    "mmlu_ne": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ne-llama-7B.json b/evals/mmlu/mmlu_ne-llama-7B.json
deleted file mode 100644
index 5f6048f4b5b7f57e7bc90c0226fb4fb987b1f1b5..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ne-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ne": {
-      "acc": 0.245483016140689,
-      "acc_stderr": 0.0038567872193795804,
-      "acc_norm": 0.2774431863807918,
-      "acc_norm_stderr": 0.004012393111736023
-    }
-  },
-  "versions": {
-    "mmlu_ne": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_pt-bloom-7b1.json b/evals/mmlu/mmlu_pt-bloom-7b1.json
deleted file mode 100644
index 3887b3366a9810116b594c74c02905628ee78fcf..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_pt-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_pt": {
-      "acc": 0.2809216451516061,
-      "acc_stderr": 0.0038938542873620118,
-      "acc_norm": 0.287676373461423,
-      "acc_norm_stderr": 0.0039218389764563225
-    }
-  },
-  "versions": {
-    "mmlu_pt": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_pt-llama-7B.json b/evals/mmlu/mmlu_pt-llama-7B.json
deleted file mode 100644
index d5ff15ab450754ca303e55e1503611a1b7fd3d44..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_pt-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_pt": {
-      "acc": 0.3016361453017112,
-      "acc_stderr": 0.003976322071656026,
-      "acc_norm": 0.3007355148604023,
-      "acc_norm_stderr": 0.003972940683152965
-    }
-  },
-  "versions": {
-    "mmlu_pt": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ro-bloom-7b1.json b/evals/mmlu/mmlu_ro-bloom-7b1.json
deleted file mode 100644
index b9ced8c74d8ae4d628e7fe9168ff402ce98cd279..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ro-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ro": {
-      "acc": 0.2555891238670695,
-      "acc_stderr": 0.003790966515146354,
-      "acc_norm": 0.2737160120845921,
-      "acc_norm_stderr": 0.0038750360364507622
-    }
-  },
-  "versions": {
-    "mmlu_ro": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ro-llama-7B.json b/evals/mmlu/mmlu_ro-llama-7B.json
deleted file mode 100644
index 7474e610db1236709be35a3a648960d8b40a838e..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ro-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ro": {
-      "acc": 0.29342900302114805,
-      "acc_stderr": 0.003957326026204448,
-      "acc_norm": 0.2965256797583082,
-      "acc_norm_stderr": 0.003969425800928827
-    }
-  },
-  "versions": {
-    "mmlu_ro": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ru-bloom-7b1.json b/evals/mmlu/mmlu_ru-bloom-7b1.json
deleted file mode 100644
index 597b21a215ebd9c9d442c41b7c7577008553e896..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ru-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ru": {
-      "acc": 0.2525563158299377,
-      "acc_stderr": 0.0038097500220131194,
-      "acc_norm": 0.2695471669101253,
-      "acc_norm_stderr": 0.0038908241231695112
-    }
-  },
-  "versions": {
-    "mmlu_ru": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ru-llama-7B.json b/evals/mmlu/mmlu_ru-llama-7B.json
deleted file mode 100644
index 1cc8eed486b867ef15f762b1387fd29a6cf4416b..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ru-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ru": {
-      "acc": 0.29445683093718766,
-      "acc_stderr": 0.0039966925205054795,
-      "acc_norm": 0.3016068270931037,
-      "acc_norm_stderr": 0.004024377402999243
-    }
-  },
-  "versions": {
-    "mmlu_ru": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sk-bloom-7b1.json b/evals/mmlu/mmlu_sk-bloom-7b1.json
deleted file mode 100644
index c5c41d03419b8a4038c58ab0e4166ce0e96c28d9..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_sk-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_sk": {
-      "acc": 0.24927269943347113,
-      "acc_stderr": 0.003785212350164864,
-      "acc_norm": 0.26672791303016385,
-      "acc_norm_stderr": 0.003869711564658995
-    }
-  },
-  "versions": {
-    "mmlu_sk": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sk-llama-7B.json b/evals/mmlu/mmlu_sk-llama-7B.json
deleted file mode 100644
index 309a344b59b192e0dbc8e50b499a16b67538c1ef..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_sk-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_sk": {
-      "acc": 0.28127392436074106,
-      "acc_stderr": 0.003934216199449274,
-      "acc_norm": 0.2944418925126321,
-      "acc_norm_stderr": 0.003988209639409228
-    }
-  },
-  "versions": {
-    "mmlu_sk": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sr-bloom-7b1.json b/evals/mmlu/mmlu_sr-bloom-7b1.json
deleted file mode 100644
index 88c6699b6f71aadafabd08193c19c50d25887e85..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_sr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_sr": {
-      "acc": 0.25650952706293173,
-      "acc_stderr": 0.0038050782551146203,
-      "acc_norm": 0.27245122599256055,
-      "acc_norm_stderr": 0.003879266167871199
-    }
-  },
-  "versions": {
-    "mmlu_sr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sr-llama-7B.json b/evals/mmlu/mmlu_sr-llama-7B.json
deleted file mode 100644
index fbe389b6b884d3a9692413dc84031d5ea2363b31..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_sr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_sr": {
-      "acc": 0.2902907462233356,
-      "acc_stderr": 0.003954858675409034,
-      "acc_norm": 0.2920367418203902,
-      "acc_norm_stderr": 0.003961851981605455
-    }
-  },
-  "versions": {
-    "mmlu_sr": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sv-bloom-7b1.json b/evals/mmlu/mmlu_sv-bloom-7b1.json
deleted file mode 100644
index 90ee3cd4e9733639263cdcf04b82e171f8485253..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_sv-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_sv": {
-      "acc": 0.26122788446998335,
-      "acc_stderr": 0.003820033520031446,
-      "acc_norm": 0.27491305005292604,
-      "acc_norm_stderr": 0.0038823517609477554
-    }
-  },
-  "versions": {
-    "mmlu_sv": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sv-llama-7B.json b/evals/mmlu/mmlu_sv-llama-7B.json
deleted file mode 100644
index d962d7acbb38d8ae28b5d3c396c6389a2ae6bf49..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_sv-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_sv": {
-      "acc": 0.30024194767881446,
-      "acc_stderr": 0.003985765983480769,
-      "acc_norm": 0.29321034326326934,
-      "acc_norm_stderr": 0.003958556933478504
-    }
-  },
-  "versions": {
-    "mmlu_sv": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ta-bloom-7b1.json b/evals/mmlu/mmlu_ta-bloom-7b1.json
deleted file mode 100644
index 227c87597c1eb663c59c29f3eb1d52a08a3d189d..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ta-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ta": {
-      "acc": 0.2531252694197776,
-      "acc_stderr": 0.00403738422854994,
-      "acc_norm": 0.2664884903871023,
-      "acc_norm_stderr": 0.004105359016847502
-    }
-  },
-  "versions": {
-    "mmlu_ta": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ta-llama-7B.json b/evals/mmlu/mmlu_ta-llama-7B.json
deleted file mode 100644
index c47ddc1d3941b02c8ef307b03e1af7c3f33d41f8..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_ta-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_ta": {
-      "acc": 0.24743512371756185,
-      "acc_stderr": 0.004006923901271705,
-      "acc_norm": 0.27752392447624796,
-      "acc_norm_stderr": 0.004157865121797154
-    }
-  },
-  "versions": {
-    "mmlu_ta": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_te-bloom-7b1.json b/evals/mmlu/mmlu_te-bloom-7b1.json
deleted file mode 100644
index 6dda2185b223b03895db5556e33db9db1733d107..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_te-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_te": {
-      "acc": 0.2502857142857143,
-      "acc_stderr": 0.004061713740284853,
-      "acc_norm": 0.2618901098901099,
-      "acc_norm_stderr": 0.00412252643604891
-    }
-  },
-  "versions": {
-    "mmlu_te": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_te-llama-7B.json b/evals/mmlu/mmlu_te-llama-7B.json
deleted file mode 100644
index d495ac0b0d562ef0467a6d5a79b03bb80ccfc6a4..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_te-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_te": {
-      "acc": 0.24562637362637363,
-      "acc_stderr": 0.00403621353648515,
-      "acc_norm": 0.26874725274725275,
-      "acc_norm_stderr": 0.004156704581054155
-    }
-  },
-  "versions": {
-    "mmlu_te": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_uk-bloom-7b1.json b/evals/mmlu/mmlu_uk-bloom-7b1.json
deleted file mode 100644
index 7ad6aa7c934875a8ffa40228178610c089842e74..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_uk-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_uk": {
-      "acc": 0.24719188163296923,
-      "acc_stderr": 0.0037969053429642604,
-      "acc_norm": 0.2663258191959098,
-      "acc_norm_stderr": 0.003890709230487387
-    }
-  },
-  "versions": {
-    "mmlu_uk": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_uk-llama-7B.json b/evals/mmlu/mmlu_uk-llama-7B.json
deleted file mode 100644
index 2ac08620ea865817dc03d2021d1c2a89e95bd091..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_uk-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_uk": {
-      "acc": 0.2894104888062592,
-      "acc_stderr": 0.003991508434906801,
-      "acc_norm": 0.2939809435277713,
-      "acc_norm_stderr": 0.004009944142684111
-    }
-  },
-  "versions": {
-    "mmlu_uk": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_vi-bloom-7b1.json b/evals/mmlu/mmlu_vi-bloom-7b1.json
deleted file mode 100644
index 3b29824403bc095477d8a6a0acdb87f1e76c4dfb..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_vi-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_vi": {
-      "acc": 0.26726381871076405,
-      "acc_stderr": 0.003872181345366132,
-      "acc_norm": 0.281427040269484,
-      "acc_norm_stderr": 0.003934867675165376
-    }
-  },
-  "versions": {
-    "mmlu_vi": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_vi-llama-7B.json b/evals/mmlu/mmlu_vi-llama-7B.json
deleted file mode 100644
index 194b2dd47470bee66f0c97bb28f1a825707dccea..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_vi-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_vi": {
-      "acc": 0.26052671872607563,
-      "acc_stderr": 0.0038406007591986315,
-      "acc_norm": 0.28579084366865715,
-      "acc_norm_stderr": 0.003953198731610307
-    }
-  },
-  "versions": {
-    "mmlu_vi": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_zh-bloom-7b1.json b/evals/mmlu/mmlu_zh-bloom-7b1.json
deleted file mode 100644
index e98a766b006fc2ceed3e7d766f77be6fdaf5abe6..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_zh-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_zh": {
-      "acc": 0.27884542347132546,
-      "acc_stderr": 0.003908427008060506,
-      "acc_norm": 0.29137865552601594,
-      "acc_norm_stderr": 0.003960427300065885
-    }
-  },
-  "versions": {
-    "mmlu_zh": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_zh-llama-7B.json b/evals/mmlu/mmlu_zh-llama-7B.json
deleted file mode 100644
index 963997e00a6c8204be6df0d19adfe241fd53d094..0000000000000000000000000000000000000000
--- a/evals/mmlu/mmlu_zh-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "mmlu_zh": {
-      "acc": 0.2769464489175845,
-      "acc_stderr": 0.003900220811105949,
-      "acc_norm": 0.2883402962400304,
-      "acc_norm_stderr": 0.003948161607934338
-    }
-  },
-  "versions": {
-    "mmlu_zh": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ar-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ar-bloom-7b1.json
deleted file mode 100644
index 4ecb61811afa7d48353c2bef8d82befffceceb07..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ar-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ar": {
-      "mc1": 0.26002587322121606,
-      "mc1_stderr": 0.015787301353849415,
-      "mc2": 0.4256353881905651,
-      "mc2_stderr": 0.015737567507798107
-    }
-  },
-  "versions": {
-    "truthfulqa_ar": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ar-gpt2.json b/evals/truthfulqa/truthfulqa_ar-gpt2.json
deleted file mode 100644
index f83b2bef80b7c2c4a74c05764b7e0d0996d4b489..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ar-gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ar": {
-      "mc1": 0.23932729624838292,
-      "mc1_stderr": 0.015356292760819215,
-      "mc2": 0.44027391572034885,
-      "mc2_stderr": 0.01696958534622728
-    }
-  },
-  "versions": {
-    "truthfulqa_ar": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ar-llama-7B.json b/evals/truthfulqa/truthfulqa_ar-llama-7B.json
deleted file mode 100644
index 8eaf03b60bf7c8428a848aa8ce0dceeb1b8649da..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ar-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ar": {
-      "mc1": 0.278137128072445,
-      "mc1_stderr": 0.016126799456170973,
-      "mc2": 0.4510826498021589,
-      "mc2_stderr": 0.01621099626555797
-    }
-  },
-  "versions": {
-    "truthfulqa_ar": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_bn-bloom-7b1.json b/evals/truthfulqa/truthfulqa_bn-bloom-7b1.json
deleted file mode 100644
index 3f0f5acb8958dae16338d6f3538d1c45fd1d5be8..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_bn-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_bn": {
-      "mc1": 0.26248399487836105,
-      "mc1_stderr": 0.015753963575796108,
-      "mc2": 0.48383834952509674,
-      "mc2_stderr": 0.01620495508989729
-    }
-  },
-  "versions": {
-    "truthfulqa_bn": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_bn-llama-7B.json b/evals/truthfulqa/truthfulqa_bn-llama-7B.json
deleted file mode 100644
index 3c9c3b9489ea6ca298a17d5e7f442b2a42217543..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_bn-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_bn": {
-      "mc1": 0.2765685019206146,
-      "mc1_stderr": 0.016015952210618845,
-      "mc2": 0.5123820777474262,
-      "mc2_stderr": 0.01680032112327857
-    }
-  },
-  "versions": {
-    "truthfulqa_bn": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ca-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ca-bloom-7b1.json
deleted file mode 100644
index ef3e258e39add637921d92a92ce41f916a905cce..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ca-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ca": {
-      "mc1": 0.24324324324324326,
-      "mc1_stderr": 0.015401665455019378,
-      "mc2": 0.4007618819736215,
-      "mc2_stderr": 0.015273518926419462
-    }
-  },
-  "versions": {
-    "truthfulqa_ca": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ca-llama-7B.json b/evals/truthfulqa/truthfulqa_ca-llama-7B.json
deleted file mode 100644
index 279d4a6dd8300c3fdf93c1251995060f831d8f3d..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ca-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ca": {
-      "mc1": 0.23423423423423423,
-      "mc1_stderr": 0.015203455154765249,
-      "mc2": 0.3889981216363435,
-      "mc2_stderr": 0.015057090749567676
-    }
-  },
-  "versions": {
-    "truthfulqa_ca": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_da-bloom-7b1.json b/evals/truthfulqa/truthfulqa_da-bloom-7b1.json
deleted file mode 100644
index 74bcde7ba97432b4b569a73b77198ee611a380d0..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_da-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_da": {
-      "mc1": 0.26248399487836105,
-      "mc1_stderr": 0.01575396357579612,
-      "mc2": 0.4375025988127945,
-      "mc2_stderr": 0.01662443223981383
-    }
-  },
-  "versions": {
-    "truthfulqa_da": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_da-llama-7B.json b/evals/truthfulqa/truthfulqa_da-llama-7B.json
deleted file mode 100644
index 08c1d956bd1de9206944f2438d9f56022794d2d5..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_da-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_da": {
-      "mc1": 0.2573623559539053,
-      "mc1_stderr": 0.01565358047400349,
-      "mc2": 0.4161317873775415,
-      "mc2_stderr": 0.015138516880476807
-    }
-  },
-  "versions": {
-    "truthfulqa_da": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_de-bloom-7b1.json b/evals/truthfulqa/truthfulqa_de-bloom-7b1.json
deleted file mode 100644
index 068e8c49c1d499f40d02aeb1b4037569845e3f39..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_de-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_de": {
-      "mc1": 0.24746192893401014,
-      "mc1_stderr": 0.015382646812261825,
-      "mc2": 0.43516734073709074,
-      "mc2_stderr": 0.015914493454090475
-    }
-  },
-  "versions": {
-    "truthfulqa_de": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_de-llama-7B.json b/evals/truthfulqa/truthfulqa_de-llama-7B.json
deleted file mode 100644
index 870d9cc5a8bc73c2ca376de43d027b704b474970..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_de-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_de": {
-      "mc1": 0.233502538071066,
-      "mc1_stderr": 0.015080432502225448,
-      "mc2": 0.38322430555832593,
-      "mc2_stderr": 0.014662714095687
-    }
-  },
-  "versions": {
-    "truthfulqa_de": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_es-bloom-7b1.json b/evals/truthfulqa/truthfulqa_es-bloom-7b1.json
deleted file mode 100644
index ff2caf3355fd7554ac124714fa094f7631c4b942..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_es-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_es": {
-      "mc1": 0.24714828897338403,
-      "mc1_stderr": 0.015366339219335662,
-      "mc2": 0.4037104105160595,
-      "mc2_stderr": 0.014621192787404666
-    }
-  },
-  "versions": {
-    "truthfulqa_es": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_es-llama-7B.json b/evals/truthfulqa/truthfulqa_es-llama-7B.json
deleted file mode 100644
index 57d59d5a6d7fcd5e98b4558ed333d506ab551069..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_es-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_es": {
-      "mc1": 0.22686945500633712,
-      "mc1_stderr": 0.014919398735157142,
-      "mc2": 0.3704736235055417,
-      "mc2_stderr": 0.014441434139778718
-    }
-  },
-  "versions": {
-    "truthfulqa_es": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_eu-bloom-7b1.json b/evals/truthfulqa/truthfulqa_eu-bloom-7b1.json
deleted file mode 100644
index 0af0c1ab614e35a49f6251d7b28e594279fd4640..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_eu-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_eu": {
-      "mc1": 0.26098191214470284,
-      "mc1_stderr": 0.015795849655411115,
-      "mc2": 0.4458532690626118,
-      "mc2_stderr": 0.016282676760451684
-    }
-  },
-  "versions": {
-    "truthfulqa_eu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_eu-llama-7B.json b/evals/truthfulqa/truthfulqa_eu-llama-7B.json
deleted file mode 100644
index 173bbf1cdee4e48adcce1026ba92eea153711152..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_eu-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_eu": {
-      "mc1": 0.22739018087855298,
-      "mc1_stderr": 0.015075655972442521,
-      "mc2": 0.4067861653338961,
-      "mc2_stderr": 0.016617765169363637
-    }
-  },
-  "versions": {
-    "truthfulqa_eu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_fr-bloom-7b1.json b/evals/truthfulqa/truthfulqa_fr-bloom-7b1.json
deleted file mode 100644
index 59d411be1a435aa79d393d5234b98b20153fa489..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_fr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_fr": {
-      "mc1": 0.2604828462515883,
-      "mc1_stderr": 0.015654976408037494,
-      "mc2": 0.40875422704780084,
-      "mc2_stderr": 0.014771598297171899
-    }
-  },
-  "versions": {
-    "truthfulqa_fr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_fr-llama-7B.json b/evals/truthfulqa/truthfulqa_fr-llama-7B.json
deleted file mode 100644
index f2cf1301239dab8cdd09c7e41a803f442a37aaff..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_fr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_fr": {
-      "mc1": 0.2388818297331639,
-      "mc1_stderr": 0.015209198584184304,
-      "mc2": 0.3992160965584639,
-      "mc2_stderr": 0.014275541507345014
-    }
-  },
-  "versions": {
-    "truthfulqa_fr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_gu-bloom-7b1.json b/evals/truthfulqa/truthfulqa_gu-bloom-7b1.json
deleted file mode 100644
index 2e428d6ce6e3db9502a089fe9c54da6bd4d4e2fa..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_gu-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_gu": {
-      "mc1": 0.2585499316005472,
-      "mc1_stderr": 0.016205100857272815,
-      "mc2": 0.4553767987804663,
-      "mc2_stderr": 0.01727282663518889
-    }
-  },
-  "versions": {
-    "truthfulqa_gu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_gu-llama-7B.json b/evals/truthfulqa/truthfulqa_gu-llama-7B.json
deleted file mode 100644
index a439f0578967f86f0d5cd4f63d5c8655fa596680..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_gu-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_gu": {
-      "mc1": 0.2612859097127223,
-      "mc1_stderr": 0.016260532228493024,
-      "mc2": 0.42794967344995166,
-      "mc2_stderr": 0.017270715140237876
-    }
-  },
-  "versions": {
-    "truthfulqa_gu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hi-bloom-7b1.json b/evals/truthfulqa/truthfulqa_hi-bloom-7b1.json
deleted file mode 100644
index 8576765f053944525c9eb8954a99cd9ce76a4d1c..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_hi-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hi": {
-      "mc1": 0.2613195342820181,
-      "mc1_stderr": 0.01581268409688839,
-      "mc2": 0.44399239540333224,
-      "mc2_stderr": 0.015881067623592954
-    }
-  },
-  "versions": {
-    "truthfulqa_hi": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hi-llama-7B.json b/evals/truthfulqa/truthfulqa_hi-llama-7B.json
deleted file mode 100644
index e21366d36ceaf8601da21d648ee943852d911560..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_hi-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hi": {
-      "mc1": 0.2794307891332471,
-      "mc1_stderr": 0.016149769533382482,
-      "mc2": 0.47236250377441935,
-      "mc2_stderr": 0.016709755014514986
-    }
-  },
-  "versions": {
-    "truthfulqa_hi": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hr-bloom-7b1.json b/evals/truthfulqa/truthfulqa_hr-bloom-7b1.json
deleted file mode 100644
index 672cbb9e39a1a7e019ee45709b90eec7588d5235..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_hr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hr": {
-      "mc1": 0.2808842652795839,
-      "mc1_stderr": 0.016217447153754203,
-      "mc2": 0.4793142433106635,
-      "mc2_stderr": 0.01663884163172186
-    }
-  },
-  "versions": {
-    "truthfulqa_hr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hr-llama-7B.json b/evals/truthfulqa/truthfulqa_hr-llama-7B.json
deleted file mode 100644
index 3d1d11b77357870c8e0a53dcbafb4e8980c01f9f..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_hr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hr": {
-      "mc1": 0.24187256176853056,
-      "mc1_stderr": 0.015451967985505181,
-      "mc2": 0.41709863857620866,
-      "mc2_stderr": 0.01546097371205123
-    }
-  },
-  "versions": {
-    "truthfulqa_hr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hu-bloom-7b1.json b/evals/truthfulqa/truthfulqa_hu-bloom-7b1.json
deleted file mode 100644
index 54432301293d130afd643eb21b0db15d9f209b67..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_hu-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hu": {
-      "mc1": 0.26718547341115434,
-      "mc1_stderr": 0.015946232556288537,
-      "mc2": 0.49994152241197887,
-      "mc2_stderr": 0.01703257765685213
-    }
-  },
-  "versions": {
-    "truthfulqa_hu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hu-llama-7B.json b/evals/truthfulqa/truthfulqa_hu-llama-7B.json
deleted file mode 100644
index ccaefb69215b32c9208f055af2f3a1cf9c8760bc..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_hu-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hu": {
-      "mc1": 0.24643320363164722,
-      "mc1_stderr": 0.015529773657188122,
-      "mc2": 0.4311628343540659,
-      "mc2_stderr": 0.01555491548978951
-    }
-  },
-  "versions": {
-    "truthfulqa_hu": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hy-bloom-7b1.json b/evals/truthfulqa/truthfulqa_hy-bloom-7b1.json
deleted file mode 100644
index debcc1a8876d402702e3c9c496eb89bc3ad0f709..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_hy-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hy": {
-      "mc1": 0.2585895117540687,
-      "mc1_stderr": 0.018636539619637415,
-      "mc2": 0.44943643103428205,
-      "mc2_stderr": 0.02033094239607556
-    }
-  },
-  "versions": {
-    "truthfulqa_hy": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hy-llama-7B.json b/evals/truthfulqa/truthfulqa_hy-llama-7B.json
deleted file mode 100644
index 433e953ddf49c551d21da840cc57c95f665a192a..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_hy-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hy": {
-      "mc1": 0.2585895117540687,
-      "mc1_stderr": 0.018636539619637415,
-      "mc2": 0.4550713950263578,
-      "mc2_stderr": 0.020036965332656535
-    }
-  },
-  "versions": {
-    "truthfulqa_hy": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_id-bloom-7b1.json b/evals/truthfulqa/truthfulqa_id-bloom-7b1.json
deleted file mode 100644
index d6ab9911631d5cf4f7387d705739f249f1da7de2..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_id-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_id": {
-      "mc1": 0.2532133676092545,
-      "mc1_stderr": 0.01560023256901984,
-      "mc2": 0.4031249320049949,
-      "mc2_stderr": 0.015031705347347539
-    }
-  },
-  "versions": {
-    "truthfulqa_id": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_id-llama-7B.json b/evals/truthfulqa/truthfulqa_id-llama-7B.json
deleted file mode 100644
index 0967fc5439ed4e2c5217256c546b2f76aa443e6b..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_id-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_id": {
-      "mc1": 0.2570694087403599,
-      "mc1_stderr": 0.015677933234808462,
-      "mc2": 0.3981714076698207,
-      "mc2_stderr": 0.015520404506158571
-    }
-  },
-  "versions": {
-    "truthfulqa_id": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_it-bloom-7b1.json b/evals/truthfulqa/truthfulqa_it-bloom-7b1.json
deleted file mode 100644
index 9599a6d59070c187811a37aa2dcaec596f4e300c..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_it-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_it": {
-      "mc1": 0.2707535121328225,
-      "mc1_stderr": 0.015889888362560486,
-      "mc2": 0.4374801864181257,
-      "mc2_stderr": 0.015955762711633903
-    }
-  },
-  "versions": {
-    "truthfulqa_it": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_it-llama-7B.json b/evals/truthfulqa/truthfulqa_it-llama-7B.json
deleted file mode 100644
index 221af91b2b82bf70d904265c27c0279db93872af..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_it-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_it": {
-      "mc1": 0.24521072796934865,
-      "mc1_stderr": 0.015384352284543929,
-      "mc2": 0.39642666716879443,
-      "mc2_stderr": 0.01483705265700183
-    }
-  },
-  "versions": {
-    "truthfulqa_it": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_kn-bloom-7b1.json b/evals/truthfulqa/truthfulqa_kn-bloom-7b1.json
deleted file mode 100644
index b116af421e76c9c9f0d685f0a1156de33d48fa41..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_kn-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_kn": {
-      "mc1": 0.28466076696165193,
-      "mc1_stderr": 0.017343050775840425,
-      "mc2": 0.49109028617714945,
-      "mc2_stderr": 0.017608862092749467
-    }
-  },
-  "versions": {
-    "truthfulqa_kn": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_kn-llama-7B.json b/evals/truthfulqa/truthfulqa_kn-llama-7B.json
deleted file mode 100644
index f05f0339406ac5574d7a1dc62bddacb292f097eb..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_kn-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_kn": {
-      "mc1": 0.275811209439528,
-      "mc1_stderr": 0.017176612615872052,
-      "mc2": 0.4635130117214921,
-      "mc2_stderr": 0.01825683954680752
-    }
-  },
-  "versions": {
-    "truthfulqa_kn": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ml-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ml-bloom-7b1.json
deleted file mode 100644
index d2ada8ce66115bbf7e7e2ac501b996bc7b9ab3a1..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ml-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ml": {
-      "mc1": 0.260806916426513,
-      "mc1_stderr": 0.01667907195342198,
-      "mc2": 0.47996911862138697,
-      "mc2_stderr": 0.017778690252427683
-    }
-  },
-  "versions": {
-    "truthfulqa_ml": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ml-llama-7B.json b/evals/truthfulqa/truthfulqa_ml-llama-7B.json
deleted file mode 100644
index 4dd3caeb8a76c583e812d275589a2c18156d6935..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ml-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ml": {
-      "mc1": 0.2824207492795389,
-      "mc1_stderr": 0.01710080754090615,
-      "mc2": 0.5024391989231584,
-      "mc2_stderr": 0.017936047828800445
-    }
-  },
-  "versions": {
-    "truthfulqa_ml": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_mr-bloom-7b1.json b/evals/truthfulqa/truthfulqa_mr-bloom-7b1.json
deleted file mode 100644
index 181033bdf126dc47bfc09557ea24531f4fead727..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_mr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_mr": {
-      "mc1": 0.2761780104712042,
-      "mc1_stderr": 0.016186321628712155,
-      "mc2": 0.4765064151203332,
-      "mc2_stderr": 0.016772466571288412
-    }
-  },
-  "versions": {
-    "truthfulqa_mr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_mr-llama-7B.json b/evals/truthfulqa/truthfulqa_mr-llama-7B.json
deleted file mode 100644
index a1fcd59738ae0b14a296aba32a13e2bda55370e3..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_mr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_mr": {
-      "mc1": 0.2905759162303665,
-      "mc1_stderr": 0.016436922328865435,
-      "mc2": 0.49306373435254724,
-      "mc2_stderr": 0.016980148211258952
-    }
-  },
-  "versions": {
-    "truthfulqa_mr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ne-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ne-bloom-7b1.json
deleted file mode 100644
index 89defff7cdf83326b83aee4c35f6b7ab666393c0..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ne-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ne": {
-      "mc1": 0.28811369509043927,
-      "mc1_stderr": 0.0162891162717815,
-      "mc2": 0.46164155205805624,
-      "mc2_stderr": 0.016689007834004295
-    }
-  },
-  "versions": {
-    "truthfulqa_ne": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ne-llama-7B.json b/evals/truthfulqa/truthfulqa_ne-llama-7B.json
deleted file mode 100644
index b18b50165478e2f5e3938b2978e51ae65ffb09b0..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ne-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ne": {
-      "mc1": 0.29198966408268734,
-      "mc1_stderr": 0.016353615824015625,
-      "mc2": 0.4636310825029969,
-      "mc2_stderr": 0.016928691048242774
-    }
-  },
-  "versions": {
-    "truthfulqa_ne": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json b/evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc50ce18862c928b9b51cbda3ac9cc0c13b71b40
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_nl": {
+      "mc1": 0.2764331210191083,
+      "mc1_stderr": 0.01597262688062874,
+      "mc2": 0.4103755310313891,
+      "mc2_stderr": 0.014811313488625848
+    }
+  },
+  "versions": {
+    "truthfulqa_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 8,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json b/evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json
new file mode 100644
index 0000000000000000000000000000000000000000..ece12d09b26076267e338648ee7a6d36c649199c
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_nl": {
+      "mc1": 0.2917197452229299,
+      "mc1_stderr": 0.016234071293195287,
+      "mc2": 0.4462996697687161,
+      "mc2_stderr": 0.016161710042968205
+    }
+  },
+  "versions": {
+    "truthfulqa_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 64,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json b/evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab4837ac43ff277111882cfd0e0a9fc9b5e05518
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_nl": {
+      "mc1": 0.28152866242038216,
+      "mc1_stderr": 0.016062309899461683,
+      "mc2": 0.41626070733921117,
+      "mc2_stderr": 0.014914193769419527
+    }
+  },
+  "versions": {
+    "truthfulqa_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 64,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json b/evals/truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json
new file mode 100644
index 0000000000000000000000000000000000000000..f63906c6941530a86f973e4da2bce0c2ca6724e4
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_nl": {
+      "mc1": 0.3070063694267516,
+      "mc1_stderr": 0.01647328769082192,
+      "mc2": 0.45280570817630444,
+      "mc2_stderr": 0.015014728029135574
+    }
+  },
+  "versions": {
+    "truthfulqa_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 64,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json b/evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae35054ea6bbd7a2539e1f37f6066ebf947d5354
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_nl": {
+      "mc1": 0.310828025477707,
+      "mc1_stderr": 0.016529733724696277,
+      "mc2": 0.4460845208916539,
+      "mc2_stderr": 0.01476856418537487
+    }
+  },
+  "versions": {
+    "truthfulqa_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=BramVanroy/falcon-40b-ft-alpaca-dolly-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 8,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_nl_falcon-40b.json b/evals/truthfulqa/truthfulqa_nl_falcon-40b.json
new file mode 100644
index 0000000000000000000000000000000000000000..48f1f48a671435726595dc9daa36cccf2f0a1daf
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_nl_falcon-40b.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_nl": {
+      "mc1": 0.2764331210191083,
+      "mc1_stderr": 0.01597262688062875,
+      "mc2": 0.4091336161450544,
+      "mc2_stderr": 0.014605140809282338
+    }
+  },
+  "versions": {
+    "truthfulqa_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=tiiuae/falcon-40b,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 8,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json b/evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json
new file mode 100644
index 0000000000000000000000000000000000000000..649cd0e84504b0783d539643aa070c34a5218f96
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_nl": {
+      "mc1": 0.2751592356687898,
+      "mc1_stderr": 0.0159498029022655,
+      "mc2": 0.41816127879466414,
+      "mc2_stderr": 0.01474120131034505
+    }
+  },
+  "versions": {
+    "truthfulqa_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=BramVanroy/llama2-13b-ft-mc4_nl_cleaned_tiny,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 8,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json b/evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json
new file mode 100644
index 0000000000000000000000000000000000000000..66cbc553a965b6d420caf50420f7c0a71edff7e9
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_nl": {
+      "mc1": 0.3719745222929936,
+      "mc1_stderr": 0.0172618443903749,
+      "mc2": 0.5294532108691418,
+      "mc2_stderr": 0.016221848481192833
+    }
+  },
+  "versions": {
+    "truthfulqa_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
+    "batch_size": 64,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_pt-bloom-7b1.json b/evals/truthfulqa/truthfulqa_pt-bloom-7b1.json
deleted file mode 100644
index d9c6cefe30e562acfb981870f9e593f27f720a3d..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_pt-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_pt": {
-      "mc1": 0.23857868020304568,
-      "mc1_stderr": 0.015192910034567013,
-      "mc2": 0.38894722340741417,
-      "mc2_stderr": 0.014531269277587645
-    }
-  },
-  "versions": {
-    "truthfulqa_pt": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_pt-llama-7B.json b/evals/truthfulqa/truthfulqa_pt-llama-7B.json
deleted file mode 100644
index 1ae678becb49d878dc30174f2c390f2c1b5a1f49..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_pt-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_pt": {
-      "mc1": 0.22842639593908629,
-      "mc1_stderr": 0.014964922033138022,
-      "mc2": 0.3823261607330551,
-      "mc2_stderr": 0.014633193983144183
-    }
-  },
-  "versions": {
-    "truthfulqa_pt": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ro-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ro-bloom-7b1.json
deleted file mode 100644
index e9d6490be6beab45fd85e68d9df1e301bf2dff28..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ro-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ro": {
-      "mc1": 0.26187419768934533,
-      "mc1_stderr": 0.015762378425124946,
-      "mc2": 0.4605371384706094,
-      "mc2_stderr": 0.016307442681458683
-    }
-  },
-  "versions": {
-    "truthfulqa_ro": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ro-llama-7B.json b/evals/truthfulqa/truthfulqa_ro-llama-7B.json
deleted file mode 100644
index 26abd62509f8f15981ca8051421f879ea16ddc2f..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ro-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ro": {
-      "mc1": 0.22849807445442877,
-      "mc1_stderr": 0.015052893222788351,
-      "mc2": 0.37047262828252514,
-      "mc2_stderr": 0.015022205435273333
-    }
-  },
-  "versions": {
-    "truthfulqa_ro": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ru-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ru-bloom-7b1.json
deleted file mode 100644
index 3347a51ef0c14c0658f692111f9112b52f876a5c..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ru-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ru": {
-      "mc1": 0.30710659898477155,
-      "mc1_stderr": 0.016443354533552747,
-      "mc2": 0.49874761323987404,
-      "mc2_stderr": 0.016167778359600482
-    }
-  },
-  "versions": {
-    "truthfulqa_ru": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ru-llama-7B.json b/evals/truthfulqa/truthfulqa_ru-llama-7B.json
deleted file mode 100644
index 54b06b11d61f59c9f47d987a96a9290c09921a27..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ru-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ru": {
-      "mc1": 0.24619289340101522,
-      "mc1_stderr": 0.015356084872692898,
-      "mc2": 0.40938277991151933,
-      "mc2_stderr": 0.015252017769860154
-    }
-  },
-  "versions": {
-    "truthfulqa_ru": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sk-bloom-7b1.json b/evals/truthfulqa/truthfulqa_sk-bloom-7b1.json
deleted file mode 100644
index 1132cb125d8848afa4abc9ecef17405375f5ccc0..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_sk-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_sk": {
-      "mc1": 0.2390745501285347,
-      "mc1_stderr": 0.015301260856408254,
-      "mc2": 0.43782616190313467,
-      "mc2_stderr": 0.01657761354751216
-    }
-  },
-  "versions": {
-    "truthfulqa_sk": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sk-llama-7B.json b/evals/truthfulqa/truthfulqa_sk-llama-7B.json
deleted file mode 100644
index 71e866145020816a8524a8bd50cddf94af5042ea..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_sk-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_sk": {
-      "mc1": 0.2275064267352185,
-      "mc1_stderr": 0.015039512631474048,
-      "mc2": 0.40729144857566124,
-      "mc2_stderr": 0.015845697731465
-    }
-  },
-  "versions": {
-    "truthfulqa_sk": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sr-bloom-7b1.json b/evals/truthfulqa/truthfulqa_sr-bloom-7b1.json
deleted file mode 100644
index 75efa51eca0c0d99414987b87632f9c19f581a21..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_sr-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_sr": {
-      "mc1": 0.2878980891719745,
-      "mc1_stderr": 0.016170834614246097,
-      "mc2": 0.4604993074094113,
-      "mc2_stderr": 0.01649631560714403
-    }
-  },
-  "versions": {
-    "truthfulqa_sr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sr-llama-7B.json b/evals/truthfulqa/truthfulqa_sr-llama-7B.json
deleted file mode 100644
index a65b681172e15a187d19448a27058ec125e2b1f1..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_sr-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_sr": {
-      "mc1": 0.26878980891719745,
-      "mc1_stderr": 0.01583322873155152,
-      "mc2": 0.422701657829082,
-      "mc2_stderr": 0.015374851085961157
-    }
-  },
-  "versions": {
-    "truthfulqa_sr": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sv-bloom-7b1.json b/evals/truthfulqa/truthfulqa_sv-bloom-7b1.json
deleted file mode 100644
index 85698716bf120fd641d6dfcb551bdb145d17bc87..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_sv-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_sv": {
-      "mc1": 0.2622739018087855,
-      "mc1_stderr": 0.015821052272364522,
-      "mc2": 0.44572489319670916,
-      "mc2_stderr": 0.016517364176123605
-    }
-  },
-  "versions": {
-    "truthfulqa_sv": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sv-llama-7B.json b/evals/truthfulqa/truthfulqa_sv-llama-7B.json
deleted file mode 100644
index f2f88649e17469e2a7fdc44f296619fe407feac6..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_sv-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_sv": {
-      "mc1": 0.2596899224806202,
-      "mc1_stderr": 0.01577046983489191,
-      "mc2": 0.4052891370296314,
-      "mc2_stderr": 0.01500679891573553
-    }
-  },
-  "versions": {
-    "truthfulqa_sv": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ta-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ta-bloom-7b1.json
deleted file mode 100644
index 956d773e26ebf10fc23669bb18d5b9df924be462..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ta-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ta": {
-      "mc1": 0.2651413189771198,
-      "mc1_stderr": 0.016204613164182584,
-      "mc2": 0.48348066773619114,
-      "mc2_stderr": 0.016887213348384833
-    }
-  },
-  "versions": {
-    "truthfulqa_ta": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ta-llama-7B.json b/evals/truthfulqa/truthfulqa_ta-llama-7B.json
deleted file mode 100644
index 3edaa546d22cbb705a02af8433a7b3ecb4f29213..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_ta-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ta": {
-      "mc1": 0.28263795423956933,
-      "mc1_stderr": 0.016530366611189357,
-      "mc2": 0.5032626048969708,
-      "mc2_stderr": 0.01719880976895468
-    }
-  },
-  "versions": {
-    "truthfulqa_ta": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_te-bloom-7b1.json b/evals/truthfulqa/truthfulqa_te-bloom-7b1.json
deleted file mode 100644
index d139c759617d41dd724dc54443b08c7eba5c2a83..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_te-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_te": {
-      "mc1": 0.2652482269503546,
-      "mc1_stderr": 0.016638349265004355,
-      "mc2": 0.4612285746093752,
-      "mc2_stderr": 0.017504699336599025
-    }
-  },
-  "versions": {
-    "truthfulqa_te": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_te-llama-7B.json b/evals/truthfulqa/truthfulqa_te-llama-7B.json
deleted file mode 100644
index b7371487cfa5f3b205258d0c63aa1d722e304a75..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_te-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_te": {
-      "mc1": 0.2851063829787234,
-      "mc1_stderr": 0.01701523103469595,
-      "mc2": 0.4821795923320059,
-      "mc2_stderr": 0.01784811574301116
-    }
-  },
-  "versions": {
-    "truthfulqa_te": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_uk-bloom-7b1.json b/evals/truthfulqa/truthfulqa_uk-bloom-7b1.json
deleted file mode 100644
index da866d1706ae757888bf53041acc427c30e98a06..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_uk-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_uk": {
-      "mc1": 0.3090909090909091,
-      "mc1_stderr": 0.01666442755255745,
-      "mc2": 0.5143873310692731,
-      "mc2_stderr": 0.016755211041268873
-    }
-  },
-  "versions": {
-    "truthfulqa_uk": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_uk-llama-7B.json b/evals/truthfulqa/truthfulqa_uk-llama-7B.json
deleted file mode 100644
index 3a420b35b0478fcc320798e8287f213c061df4fe..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_uk-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_uk": {
-      "mc1": 0.23636363636363636,
-      "mc1_stderr": 0.015320412612327241,
-      "mc2": 0.4141829984231552,
-      "mc2_stderr": 0.01560702677887637
-    }
-  },
-  "versions": {
-    "truthfulqa_uk": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_vi-bloom-7b1.json b/evals/truthfulqa/truthfulqa_vi-bloom-7b1.json
deleted file mode 100644
index f21113c3d005bd269763438b047147bb50ac5125..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_vi-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_vi": {
-      "mc1": 0.2968152866242038,
-      "mc1_stderr": 0.016316229722585934,
-      "mc2": 0.44721474578334436,
-      "mc2_stderr": 0.015073430494043749
-    }
-  },
-  "versions": {
-    "truthfulqa_vi": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_vi-llama-7B.json b/evals/truthfulqa/truthfulqa_vi-llama-7B.json
deleted file mode 100644
index bc5992da0821ee82c8ce26e99fb73e6e2f872651..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_vi-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_vi": {
-      "mc1": 0.2445859872611465,
-      "mc1_stderr": 0.015351480770855935,
-      "mc2": 0.42975481561967727,
-      "mc2_stderr": 0.01625176801732652
-    }
-  },
-  "versions": {
-    "truthfulqa_vi": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_zh-bloom-7b1.json b/evals/truthfulqa/truthfulqa_zh-bloom-7b1.json
deleted file mode 100644
index 7496dee8d8893c925eac3f5a5de1723f69d1ad77..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_zh-bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_zh": {
-      "mc1": 0.22842639593908629,
-      "mc1_stderr": 0.014964922033138017,
-      "mc2": 0.38822244050439564,
-      "mc2_stderr": 0.014953544130092178
-    }
-  },
-  "versions": {
-    "truthfulqa_zh": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_zh-llama-7B.json b/evals/truthfulqa/truthfulqa_zh-llama-7B.json
deleted file mode 100644
index eeab4eff270462460733b050ac068062679cc507..0000000000000000000000000000000000000000
--- a/evals/truthfulqa/truthfulqa_zh-llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_zh": {
-      "mc1": 0.26649746192893403,
-      "mc1_stderr": 0.015760136800242356,
-      "mc2": 0.43598966702035913,
-      "mc2_stderr": 0.015850355717645676
-    }
-  },
-  "versions": {
-    "truthfulqa_zh": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file