Yeyito committed on
Commit
b56563d
•
1 Parent(s): c13858c

Meant to do last commit on run.py not app.py

Files changed (1)
  1. app.py +307 -231
app.py CHANGED
@@ -1,235 +1,311 @@
- import logging
- logging.basicConfig(level='ERROR')
- import numpy as np
- from pathlib import Path
- import openai
- import torch
- import zlib
- import statistics
- from torch.utils.data import DataLoader
- from transformers import AutoTokenizer, AutoModelForCausalLM
- from tqdm import tqdm
- import math
- import numpy as np
- from datasets import load_dataset
- from options import Options
- from ipdb import set_trace as bp
- from eval import *
- from utils import evaluate_model
- from analyze import analyze_data
- import argparse
  import os
  import sys
- import gc
- import pickle
-
- models = {}
-
- def save_data(filename, data):
-     with open(filename, 'wb') as filehandle:
-         # store the data as binary data stream
-         pickle.dump(data, filehandle)
-
- def load_data(filename):
-     with open(filename, 'rb') as filehandle:
-         # read the data as binary data stream
-         loaded_data = pickle.load(filehandle)
-
-     return loaded_data
-
- def unload_model(model,tokenizer):
-     print("[X] Cannot unload model! Functionality not implemented!")
-
- def load_model(name1):
-     if name1 not in models:
-         model1 = AutoModelForCausalLM.from_pretrained(name1, return_dict=True, device_map='auto')
-         model1.eval()
-         tokenizer1 = AutoTokenizer.from_pretrained(name1)
-
-         tokenizer1.pad_token = tokenizer1.eos_token
-         models[name1] = model1
-         models[name1 + "_tokenizer"] = tokenizer1
-     return models[name1], models[name1 + "_tokenizer"]
-
- def calculatePerplexity(sentence, model, tokenizer, gpu):
-     """
-     exp(loss)
-     """
-     input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
-     input_ids = input_ids.to(gpu)
-     with torch.no_grad():
-         outputs = model(input_ids, labels=input_ids)
-     loss, logits = outputs[:2]
-
-     '''
-     extract logits:
-     '''
-     # Apply softmax to the logits to get probabilities
-     probabilities = torch.nn.functional.log_softmax(logits, dim=-1)
-     # probabilities = torch.nn.functional.softmax(logits, dim=-1)
-     all_prob = []
-     input_ids_processed = input_ids[0][1:]
-
-     for i, token_id in enumerate(input_ids_processed):
-         probability = probabilities[0, i, token_id].item()
-         all_prob.append(probability)
-     return torch.exp(loss).item(), all_prob, loss.item()
-
- def sample_generation(sentence, model, tokenizer, args,data_name):
-     half_sentence_index = math.ceil(len(sentence.split())*args['prefix_length'])
-
-     if half_sentence_index > 0:
-         prefix = " ".join(sentence.split()[:half_sentence_index])
-     else:
-         prefix = '<|startoftext|> '
-
-     input_ids = torch.tensor(tokenizer.encode(prefix)).unsqueeze(0)
-     input_ids = input_ids.to(model.device)
-
-     output = None
-     if data_name != "cais/mmlu" or data_name != "gsm8k":
-         output = model.generate(input_ids, max_new_tokens=len(sentence.split())-half_sentence_index, min_new_tokens=1, num_return_sequences=args['num_z'], pad_token_id=tokenizer.eos_token_id, **args['generate_args'])
      else:
-         output = model.generate(input_ids, max_new_tokens=(len(sentence.split())-half_sentence_index)/2, min_new_tokens=1, num_return_sequences=args['num_z'], pad_token_id=tokenizer.eos_token_id, **args['generate_args'])
-     # print(output)
-     complete_generated_text = tokenizer.batch_decode(output, skip_special_tokens=True)
-
-     return complete_generated_text
-
-
- def RMIA_1(text,target_loss,ref_loss,model1,tokenizer1,ratio_gen,neighbors_dl):
-     target_losses_z = evaluate_model(model1,tokenizer1,neighbors_dl)
-     result = torch.count_nonzero(target_losses_z < target_loss).item() / len(target_losses_z)
-     return result
-
- def get_neighbors(text,ref_loss,model2,tokenizer2,ratio_gen,data_name):
-     cur_args = {'prefix_length': ratio_gen, 'num_z': 100, 'generate_args': {'do_sample': True}}
-     neighbors = sample_generation(text, model2, tokenizer2, cur_args,data_name)
-     neighbors_dl = DataLoader(neighbors, batch_size=32, shuffle=False)
-     return neighbors_dl
-
- def evaluate_data(test_data, col_name, target_model, ref_model, ratio_gen, data_name):
-     global model1,model2,tokenizer1,tokenizer2
-     print(f"all data size: {len(test_data)}")
-     random.seed(0)
-     random.shuffle(test_data)
-     test_data = test_data[:100]
-
-     inference2_pass = None
-     neighbors_dls = None
-     ref_model_clean = ref_model.replace("/","-")
-     data_name_clean = data_name.replace("/","-")
-     os.makedirs(os.path.join(f"saves/{ref_model_clean}",f"{data_name_clean}"),exist_ok=True)
-     try:
-         inference2_pass = load_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt')
-         neighbors_dls = load_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt')
-     except:
-         ### MODEL 2 likelihoods
-         model2, tokenizer2 = load_model(ref_model)
-         inference2_pass = [] #0: p_ref, #1: all_prob_ref, #2: p_ref_likelihood
-         for ex in tqdm(test_data):
-             text = ex[col_name]
-             new_ex = inference_model2(model2, tokenizer2, text)
-             inference2_pass.append(new_ex)
-         # Invariant. Doesn't take in model1 so I'm good
-
-         ### Neighbors:
-         neighbors_dls = []
-         counter = 0
-         for ex in tqdm(test_data):
-             text = ex[col_name]
-             new_ex = get_neighbors(text,inference2_pass[counter][2],model2,tokenizer2,ratio_gen,data_name)
-             counter = counter + 1
-             neighbors_dls.append(new_ex)
-         unload_model(model2,tokenizer2)
-         # Because it uses temp it is not invariant, however taking a snapshot in time should be just fine.
-         save_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt',inference2_pass)
-         save_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt',neighbors_dls)
-         print("Saved ref data, exiting.")
-
-     ### MODEL 1 likelihoods
-     model1, tokenizer1 = load_model(target_model)
-     inference1_pass = [] #0: p1, #1: all_prob, #2: p1_likelihood, #3: p_lower, #4: p_lower_likelihood
-     for ex in tqdm(test_data):
-         text = ex[col_name]
-         new_ex = inference_model1(model1,tokenizer1,text)
-         inference1_pass.append(new_ex)
-
-     ### RIMA results
-     model1, tokenizer1 = load_model(target_model)
-     counter = 0
-     results = []
-     for ex in tqdm(test_data):
-         text = ex[col_name]
-         new_ex = RMIA_1(text,inference1_pass[counter][2],inference2_pass[counter][2],model1,tokenizer1,ratio_gen,neighbors_dls[counter])
-         counter = counter + 1
-         results.append(new_ex)
-     unload_model(model1,tokenizer1)
-
-     ### Inference ex
-     all_output = []
-     counter = 0
-     for ex in tqdm(test_data):
-         text = ex[col_name]
-         pred = {}
-         pred["minkprob_w/_ref"] = results[counter]
-         pred["ppl"] = inference1_pass[counter][0]
-         pred["ppl/Ref_ppl (calibrate PPL to the reference model)"] = inference1_pass[counter][2]-inference2_pass[counter][2]
-         pred["ppl/lowercase_ppl"] = -(np.log(inference1_pass[counter][3]) / np.log(inference1_pass[counter][0])).item()
-         zlib_entropy = len(zlib.compress(bytes(text, 'utf-8')))
-         pred["ppl/zlib"] = np.log(inference1_pass[counter][0])/zlib_entropy
-         ex["pred"] = pred
-         counter = counter + 1
-         all_output.append(ex)
-     return all_output
-
- def inference_model1 (model1, tokenizer1, text):
-     p1, all_prob, p1_likelihood = calculatePerplexity(text, model1, tokenizer1, gpu=model1.device)
-     p_lower, _, p_lower_likelihood = calculatePerplexity(text.lower(), model1, tokenizer1, gpu=model1.device)
-     return [p1, all_prob, p1_likelihood, p_lower, p_lower_likelihood]
-
- def inference_model2 (model2, tokenizer2, text):
-     p_ref, all_prob_ref, p_ref_likelihood = calculatePerplexity(text, model2, tokenizer2, gpu=model2.device)
-     return [p_ref,all_prob_ref,p_ref_likelihood]
-
- def main(target_model,ref_model,output_dir,data,length,key_name,ratio_gen):
-     output_dir = f"{output_dir}/{target_model}_{ref_model}/{key_name}"
-     Path(output_dir).mkdir(parents=True, exist_ok=True)
-     # load model and data
-     data_name = data
-     if "jsonl" in data:
-         data = load_jsonl(f"{data}")
-     elif data == "truthful_qa":
-         # bp()
-         dataset = load_dataset(data, "multiple_choice", split="validation")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_truthful_qa(data)
-     elif data == "cais/mmlu":
-         dataset = load_dataset(data, "all", split="test")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_mmlu(data)
-     elif data == "ai2_arc":
-         dataset = load_dataset(data, "ARC-Challenge", split="test")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_arc(data)
-     elif data == "gsm8k":
-         dataset = load_dataset(data, "main", split="test")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_gsm8k(data)
-     elif data == "Rowan/hellaswag":
-         dataset = load_dataset(data, "default", split="validation")
-         # We use validation since labels for the test set are not available?
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_hellaswag(data)
-     elif data == "winogrande":
-         dataset = load_dataset(data,"winogrande_debiased", split="validation")
-         data = convert_huggingface_data_to_list_dic(dataset)
-         data = process_winogrande(data)
-
-     #model1, model2, tokenizer1, tokenizer2 = load_model(target_model, ref_model)
-
-     all_output = evaluate_data(data,key_name, target_model, ref_model,ratio_gen,data_name)
-     dump_jsonl(all_output, f"{output_dir}/all_output.jsonl")
-     return analyze_data(all_output)
-     # fig_fpr_tpr(all_output, output_dir)
+ import gradio as gr
+ import subprocess
  import os
  import sys
+ import time
+ import pandas as pd
+ from threading import Thread
+
+ # Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
+ src_dir = os.path.join(project_root, "src")
+ sys.path.insert(0, src_dir)
+
+ import run as evaluator # Import the run module
+ from src.css_html import custom_css
+ from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT, SUBMISSION_TEXT_2
+ from src.envs import API, H4_TOKEN, REPO_ID
+ from huggingface_hub import HfApi
+ from src.utils import (
+     AutoEvalColumn,
+     fields,
+     is_model_on_hub,
+     make_clickable_names,
+     styled_error,
+     styled_message,
+ )
+
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+
+ # CONFIGURATION:
+ ref_model = "huggyllama/llama-7b"
+ test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
+ modelQueue = []
+
+ def restart_space(): # Dumbest update function to ever exist; I'm sobbing in tears as I've tried to make gradio update the leaderboard literally any other way.
+     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
+
+
+ def save_to_txt(model, results, model_type):
+     file_path = "data/code_eval_board.csv"
+
+     with open(file_path, "a") as f:
+         f.write(f"\n{model_type},{model}," + str(results["arc"]) + "," + str(results["hellaswag"]) + "," + str(results["mmlu"]) + "," + str(results["truthfulQA"]) + "," + str(results["winogrande"]) + "," + str(results["gsm8k"]))
+         f.close()
+
+     restart_space()
+
+ def run_test(model,ref_model,data):
+     print(f"|| TESTING {data} ||")
+     return evaluator.main(
+         target_model=f"{model}",
+         ref_model=f"{ref_model}",
+         output_dir="out",
+         data=f"{data}",
+         length=64,
+         key_name="input",
+         ratio_gen=0.4
+     ) # Call the main function in detect-pretrain-code-contamination/src/run.py
+
+ def evaluate(model,model_type):
+     global ref_model
+     print(f"|| EVALUATING {model} ||")
+     results = {
+         "arc": run_test(model, ref_model, test_datasets[2]),
+         "hellaswag": run_test(model, ref_model, test_datasets[4]),
+         "mmlu": run_test(model, ref_model, test_datasets[1]),
+         "truthfulQA": run_test(model, ref_model, test_datasets[0]),
+         "winogrande": run_test(model, ref_model, test_datasets[5]),
+         "gsm8k": run_test(model, ref_model, test_datasets[3]),
+         "ref_model": ref_model,
+     }
+
+     # Save to .txt file in /Evaluations/{model}
+     save_to_txt(model, results, model_type)
+     return "\n".join([f"{k}:{results[k]}" for k in results])
+
+ def worker_thread():
+     global modelQueue, server
+     while True:
+         for submission in modelQueue:
+             evaluate(submission[0],submission[1].split(" ")[0])
+             modelQueue.pop(modelQueue.index(submission))
+             time.sleep(1)
+         time.sleep(1)
+
+ def queue(model,model_type):
+     global modelQueue
+     modelQueue.append([model,model_type])
+     print(f"QUEUE:\n{modelQueue}")
+
+
+ ### bigcode/bigcode-models-leaderboard
+ def add_new_eval(
+     model: str,
+     revision: str,
+     precision: str,
+     model_type: str,
+ ):
+     precision = precision
+
+     if model_type is None or model_type == "" or model_type == []:
+         return styled_error("Please select a model type.")
+     print(model_type)
+     # check the model actually exists before adding the eval
+     if revision == "":
+         revision = "main"
+
+     model_on_hub, error = is_model_on_hub(model, revision)
+     if not model_on_hub:
+         return styled_error(f'Model "{model}" {error}')
+
+     print("Adding new eval")
+     queue(model,model_type)
+     return styled_message("Your request has been submitted to the evaluation queue!\n")
+
+ def select_columns(df, columns):
+     always_here_cols = [
+         AutoEvalColumn.model_type_symbol.name,
+         AutoEvalColumn.model.name,
+     ]
+     # We use COLS to maintain sorting
+     filtered_df = df[
+         always_here_cols + [c for c in COLS if c in df.columns and c in columns]
+     ]
+     return filtered_df
+
+
+ def filter_items(df, leaderboard_table, query):
+     if query == "All":
+         return df[leaderboard_table.columns]
      else:
+         query = query[0] # take only the emoji character
+     filtered_df = df[(df["T"] == query)]
+     return filtered_df[leaderboard_table.columns]
+
+ def search_table(df, leaderboard_table, query):
+     filtered_df = df[(df["Models"].str.contains(query, case=False))]
+     return filtered_df[leaderboard_table.columns]
+
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     with gr.Row():
+         gr.Markdown(
+             """<div style="text-align: center;"><h1> 📄 LLM Contamination Detector </h1></div>\
+             <br>\
+             <p>Inspired by the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and the <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">🤗 Big Code Models Leaderboard</a>, we use the implementation of the <a href="https://huggingface.co/papers/2310.16789">Detecting Pretraining Data from Large Language Models</a> paper found in <a href="https://github.com/swj0419/detect-pretrain-code-contamination/tree/master">this GitHub repo</a> to provide contamination scores for LLMs on the datasets used by the Open LLM Leaderboard.\
+             This space should NOT be used to flag or accuse models of cheating / being contaminated; instead, it should form part of a holistic assessment by the parties involved.</p>""",
+             elem_classes="markdown-text",
+         )
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.Column():
+             with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
+                 with gr.TabItem("🔍 Evaluations", id=0):
+                     with gr.Column():
+                         with gr.Accordion("➡️ See filters", open=False):
+                             shown_columns = gr.CheckboxGroup(
+                                 choices=[
+                                     c
+                                     for c in COLS
+                                     if c
+                                     not in [
+                                         AutoEvalColumn.dummy.name,
+                                         AutoEvalColumn.model.name,
+                                         AutoEvalColumn.model_type_symbol.name,
+                                     ]
+                                 ],
+                                 value=[
+                                     c
+                                     for c in COLS_LITE
+                                     if c
+                                     not in [
+                                         AutoEvalColumn.dummy.name,
+                                         AutoEvalColumn.model.name,
+                                         AutoEvalColumn.model_type_symbol.name,
+                                     ]
+                                 ],
+                                 label="",
+                                 elem_id="column-select",
+                                 interactive=True,
+                             )
+                         # with gr.Column(min_width=780):
+                         with gr.Row():
+                             search_bar = gr.Textbox(
+                                 placeholder="🔍 Search for a model and press ENTER...",
+                                 show_label=False,
+                                 elem_id="search-bar",
+                             )
+                             filter_columns = gr.Radio(
+                                 label="⏚ Filter model types",
+                                 choices=["All", "🟢 Base", "🔶 Finetuned"],
+                                 value="All",
+                                 elem_id="filter-columns",
+                             )
+
+                         df = pd.read_csv("data/code_eval_board.csv")
+                         leaderboard_df = gr.components.Dataframe(
+                             value=df[
+                                 [
+                                     AutoEvalColumn.model_type_symbol.name,
+                                     AutoEvalColumn.model.name,
+                                 ]
+                                 + shown_columns.value
+                             ],
+                             headers=[
+                                 AutoEvalColumn.model_type_symbol.name,
+                                 AutoEvalColumn.model.name,
+                             ]
+                             + shown_columns.value,
+                             datatype=TYPES,
+                             elem_id="leaderboard-table",
+                             interactive=False,
+                         )
+
+                         hidden_leaderboard_df = gr.components.Dataframe(
+                             value=df,
+                             headers=COLS,
+                             datatype=["str" for _ in range(len(COLS))],
+                             visible=False,
+                         )
+
+                         search_bar.submit(
+                             search_table,
+                             [hidden_leaderboard_df, leaderboard_df, search_bar],
+                             leaderboard_df,
+                         )
+
+                         filter_columns.change(
+                             filter_items,
+                             [hidden_leaderboard_df, leaderboard_df, filter_columns],
+                             leaderboard_df,
+                         )
+
+                         shown_columns.change(
+                             select_columns,
+                             [hidden_leaderboard_df, shown_columns],
+                             leaderboard_df,
+                         )
+
+                         gr.Markdown(
+                             """
+                             **Notes:**
+                             - The Hugging Face team is working on their own implementation of this paper as a space; I'll be leaving this space up until that's available.
+                             - Some scores may not be entirely accurate according to the paper cited while I work out the kinks and inaccuracies of this implementation.
+                             - For any issues, questions, or comments, either open a discussion in this space's community tab or message me directly on Discord: yeyito777.
+                             - Make sure to check the pinned discussion in this space's community tab for implementation details I'm not 100% sure about.
+                             """,
+                             elem_classes="markdown-text",
+                         )
+
+                 with gr.TabItem("📝 About", id=2):
+                     gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
+                 with gr.TabItem("🛠️ Submit models", id=3):
+                     gr.Markdown(SUBMISSION_TEXT)
+                     gr.Markdown(
+                         "## 📤 Submit a model here:", elem_classes="markdown-text"
+                     )
+                     with gr.Column():
+                         with gr.Row():
+                             model_name = gr.Textbox(label="Model name")
+                             revision_name = gr.Textbox(
+                                 label="revision", placeholder="main"
+                             )
+                         with gr.Row():
+                             precision = gr.Dropdown(
+                                 choices=[
+                                     "float16",
+                                     "bfloat16",
+                                     "8bit",
+                                     "4bit",
+                                 ],
+                                 label="Precision",
+                                 multiselect=False,
+                                 value="float16",
+                                 interactive=True,
+                             )
+                             model_type = gr.Dropdown(
+                                 choices=["🟢 base", "🔶 instruction-tuned"],
+                                 label="Model type",
+                                 multiselect=False,
+                                 value=None,
+                                 interactive=True,
+                             )
+                         submit_button = gr.Button("Submit Eval")
+                         submission_result = gr.Markdown()
+                         submit_button.click(
+                             add_new_eval,
+                             inputs=[model_name, revision_name, precision, model_type],
+                             outputs=[submission_result],
+                         )
+                     gr.Markdown(SUBMISSION_TEXT_2)
+
+ thread = Thread(target=worker_thread)
+ thread.start()
+ demo.launch(share=True)
+
+ # Some worries:
+ # 1. Am I testing things correctly in eval.py, following the template format?
+
+ # 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
+ # (As in: if test exists, I go with that, then validation, then the default split. See the sketch after these notes.)
+
+ # 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
+ # (Not sure which one the Open LLM Leaderboard uses, or what the standard is.)
+
+ # 4. I'm unsure why in eval.py we append the output at the end of the input.

+ # 5. Currently I'm using huggyllama/llama-7b as ref_model. Should I switch to llama2-7B? Maybe Mistral-7B?
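
A minimal sketch of the split preference described in worry #2 (prefer test, then validation, then train). This is an illustration only, not the actual logic in run.py; the helper name pick_split and the use of datasets.load_dataset to inspect the available splits are assumptions.

# Hypothetical sketch of the "test > validation > train" preference from worry #2.
# Not the actual run.py code; pick_split is an assumed helper name.
from typing import Optional

from datasets import load_dataset


def pick_split(dataset_name: str, config_name: Optional[str] = None) -> str:
    """Return the preferred split name, checking test, then validation, then train."""
    splits = load_dataset(dataset_name, config_name)  # DatasetDict keyed by split name
    for preferred in ("test", "validation", "train"):
        if preferred in splits:
            return preferred
    return next(iter(splits))  # fall back to whatever split the dataset exposes


# Example (assumed config): pick_split("gsm8k", "main") would return "test".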