Spaces:
Build error
Build error
try 5-shot for open source models
Browse files
- data/best_metrics.csv +2 -1
- data/best_results.csv +0 -0
- data/openai_metrics.csv +3 -0
- llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-176/merges.txt +0 -0
- llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-264/merges.txt +0 -0
- llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-88/merges.txt +0 -0
- llama-factory/saves/qwen2_7b/lora/sft_4bit/merges.txt +0 -0
- llm_toolkit/eval_shots.py +1 -1
- llm_toolkit/logical_reasoning_utils.py +11 -6
- notebooks/00_Data Analysis.ipynb +0 -0
- notebooks/04b_OpenAI-Models_analysis.ipynb +0 -0
- scripts/eval-mgtv-shots_4bit.sh +1 -1
data/best_metrics.csv
CHANGED
@@ -7,6 +7,7 @@ index,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
|
7 |
6,Llama3.1-70B-Chinese-Chat,Llama3.1-70B-Chinese-Chat,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
|
8 |
7,Qwen2-72B-Instruct,Qwen2-72B-Instruct,0.784,0.8354349234761956,0.784,0.804194683154365,1.0
|
9 |
8,Ensemble Model,Ensemble Model,0.8193333333333334,0.8407464756633664,0.8193333333333334,0.828054127213081,1.0
|
10 |
-
9,gpt-4o-mini (
|
11 |
10,o1-mini (10-shot),o1-mini (10-shot),0.725,0.7892485648334764,0.725,0.7485623974683336,1.0
|
12 |
11,gpt-4o (10-shot),gpt-4o (10-shot),0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
|
|
|
|
7 |
6,Llama3.1-70B-Chinese-Chat,Llama3.1-70B-Chinese-Chat,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
|
8 |
7,Qwen2-72B-Instruct,Qwen2-72B-Instruct,0.784,0.8354349234761956,0.784,0.804194683154365,1.0
|
9 |
8,Ensemble Model,Ensemble Model,0.8193333333333334,0.8407464756633664,0.8193333333333334,0.828054127213081,1.0
|
10 |
+
9,gpt-4o-mini (0-shot),gpt-4o-mini (0-shot),0.7176666666666667,0.785706730193659,0.7176666666666667,0.7296061848734905,1.0
|
11 |
10,o1-mini (10-shot),o1-mini (10-shot),0.725,0.7892485648334764,0.725,0.7485623974683336,1.0
|
12 |
11,gpt-4o (10-shot),gpt-4o (10-shot),0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
|
13 |
+
12,o1-preview (10-shot),o1-preview (10-shot),0.749,0.7964482186234537,0.749,0.7677316493549238,1.0
|
data/best_results.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/openai_metrics.csv
CHANGED
@@ -13,5 +13,8 @@ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
|
13 |
30,gpt-4o,gpt-4o/shots-30,0.7886666666666666,0.8260847852316618,0.7886666666666666,0.8030949295928699,0.999
|
14 |
40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
|
15 |
50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
|
|
|
16 |
10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
|
|
|
|
|
17 |
10,gpt-4o-mini_batch,gpt-4o-mini_batch/shots-10,0.6576666666666666,0.7689201800674901,0.6576666666666666,0.6748319385295091,0.996
|
|
|
13 |
30,gpt-4o,gpt-4o/shots-30,0.7886666666666666,0.8260847852316618,0.7886666666666666,0.8030949295928699,0.999
|
14 |
40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
|
15 |
50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
|
16 |
+
0,o1-mini,o1-mini/shots-00,0.7083333333333334,0.7848098266888749,0.7083333333333334,0.7377068425566796,0.999
|
17 |
10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
|
18 |
+
0,o1-preview,o1-preview/shots-00,0.721,0.7849371317342158,0.721,0.7451207069815194,0.998
|
19 |
+
10,o1-preview,o1-preview/shots-10,0.749,0.7964482186234537,0.749,0.7677316493549238,0.9873333333333333
|
20 |
10,gpt-4o-mini_batch,gpt-4o-mini_batch/shots-10,0.6576666666666666,0.7689201800674901,0.6576666666666666,0.6748319385295091,0.996
|
llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-176/merges.txt
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-264/merges.txt
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-88/merges.txt
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
llama-factory/saves/qwen2_7b/lora/sft_4bit/merges.txt
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
llm_toolkit/eval_shots.py
CHANGED
@@ -99,7 +99,7 @@ def evaluate_model_with_num_shots(
|
|
99 |
model_name,
|
100 |
data_path,
|
101 |
start_num_shots=0,
|
102 |
-
range_num_shots=[0,
|
103 |
batch_size=1,
|
104 |
max_new_tokens=2048,
|
105 |
device="cuda",
|
|
|
99 |
model_name,
|
100 |
data_path,
|
101 |
start_num_shots=0,
|
102 |
+
range_num_shots=[0, 5],
|
103 |
batch_size=1,
|
104 |
max_new_tokens=2048,
|
105 |
device="cuda",
|
llm_toolkit/logical_reasoning_utils.py
CHANGED
@@ -429,11 +429,15 @@ def get_metrics_df(df, variant="epoch"):
|
|
429 |
perf_df = pd.DataFrame(
|
430 |
columns=[variant, "model", "run", "accuracy", "precision", "recall", "f1"]
|
431 |
)
|
432 |
-
columns =
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
|
|
|
|
|
|
|
|
437 |
print("columns:", columns)
|
438 |
for i, col in enumerate(columns):
|
439 |
metrics = calc_metrics(df["label"], df[col], debug=False)
|
@@ -445,7 +449,8 @@ def get_metrics_df(df, variant="epoch"):
|
|
445 |
if variant == "shots":
|
446 |
parts = col.split("/shots-")
|
447 |
new_model_metrics["shots"] = int(parts[1])
|
448 |
-
|
|
|
449 |
|
450 |
new_model_metrics.update(metrics)
|
451 |
|
|
|
429 |
perf_df = pd.DataFrame(
|
430 |
columns=[variant, "model", "run", "accuracy", "precision", "recall", "f1"]
|
431 |
)
|
432 |
+
columns = (
|
433 |
+
df.columns[5:]
|
434 |
+
if variant == "index"
|
435 |
+
else [
|
436 |
+
col
|
437 |
+
for col in df.columns[5:]
|
438 |
+
if variant in col or variant == "epoch" and "_torch." in col
|
439 |
+
]
|
440 |
+
)
|
441 |
print("columns:", columns)
|
442 |
for i, col in enumerate(columns):
|
443 |
metrics = calc_metrics(df["label"], df[col], debug=False)
|
|
|
449 |
if variant == "shots":
|
450 |
parts = col.split("/shots-")
|
451 |
new_model_metrics["shots"] = int(parts[1])
|
452 |
+
if variant in new_model_metrics["model"]:
|
453 |
+
new_model_metrics["model"] = parts[0]
|
454 |
|
455 |
new_model_metrics.update(metrics)
|
456 |
|
notebooks/00_Data Analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/04b_OpenAI-Models_analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
scripts/eval-mgtv-shots_4bit.sh
CHANGED
@@ -13,7 +13,7 @@ cat /etc/os-release
|
|
13 |
lscpu
|
14 |
grep MemTotal /proc/meminfo
|
15 |
|
16 |
-
export START_NUM_SHOTS=
|
17 |
|
18 |
#$BASEDIR/scripts/eval-mgtv-internlm-20b.sh
|
19 |
|
|
|
13 |
lscpu
|
14 |
grep MemTotal /proc/meminfo
|
15 |
|
16 |
+
export START_NUM_SHOTS=5
|
17 |
|
18 |
#$BASEDIR/scripts/eval-mgtv-internlm-20b.sh
|
19 |
|