Spaces:

inflaton-ai
/

logical-reasoning

Build error

dh-mc committed on Sep 18, 2024

Commit

d2150e8

1 Parent(s): 545719f

try 5-shot for open source models

Files changed (12) hide show

data/best_metrics.csv CHANGED Viewed

@@ -7,6 +7,7 @@ index,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 6,Llama3.1-70B-Chinese-Chat,Llama3.1-70B-Chinese-Chat,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
 7,Qwen2-72B-Instruct,Qwen2-72B-Instruct,0.784,0.8354349234761956,0.784,0.804194683154365,1.0
 8,Ensemble Model,Ensemble Model,0.8193333333333334,0.8407464756633664,0.8193333333333334,0.828054127213081,1.0
-9,gpt-4o-mini (10-shot),gpt-4o-mini (10-shot),0.6793333333333333,0.7728086050218999,0.6793333333333333,0.6916749681933937,0.9996666666666667
 10,o1-mini (10-shot),o1-mini (10-shot),0.725,0.7892485648334764,0.725,0.7485623974683336,1.0
 11,gpt-4o (10-shot),gpt-4o (10-shot),0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667

 6,Llama3.1-70B-Chinese-Chat,Llama3.1-70B-Chinese-Chat,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
 7,Qwen2-72B-Instruct,Qwen2-72B-Instruct,0.784,0.8354349234761956,0.784,0.804194683154365,1.0
 8,Ensemble Model,Ensemble Model,0.8193333333333334,0.8407464756633664,0.8193333333333334,0.828054127213081,1.0
+9,gpt-4o-mini (0-shot),gpt-4o-mini (0-shot),0.7176666666666667,0.785706730193659,0.7176666666666667,0.7296061848734905,1.0
 10,o1-mini (10-shot),o1-mini (10-shot),0.725,0.7892485648334764,0.725,0.7485623974683336,1.0
 11,gpt-4o (10-shot),gpt-4o (10-shot),0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
+12,o1-preview (10-shot),o1-preview (10-shot),0.749,0.7964482186234537,0.749,0.7677316493549238,1.0

data/best_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/openai_metrics.csv CHANGED Viewed

@@ -13,5 +13,8 @@ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 30,gpt-4o,gpt-4o/shots-30,0.7886666666666666,0.8260847852316618,0.7886666666666666,0.8030949295928699,0.999
 40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
 50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
 10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
 10,gpt-4o-mini_batch,gpt-4o-mini_batch/shots-10,0.6576666666666666,0.7689201800674901,0.6576666666666666,0.6748319385295091,0.996

 30,gpt-4o,gpt-4o/shots-30,0.7886666666666666,0.8260847852316618,0.7886666666666666,0.8030949295928699,0.999
 40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
 50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
+0,o1-mini,o1-mini/shots-00,0.7083333333333334,0.7848098266888749,0.7083333333333334,0.7377068425566796,0.999
 10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
+0,o1-preview,o1-preview/shots-00,0.721,0.7849371317342158,0.721,0.7451207069815194,0.998
+10,o1-preview,o1-preview/shots-10,0.749,0.7964482186234537,0.749,0.7677316493549238,0.9873333333333333
 10,gpt-4o-mini_batch,gpt-4o-mini_batch/shots-10,0.6576666666666666,0.7689201800674901,0.6576666666666666,0.6748319385295091,0.996

llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-176/merges.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff

llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-264/merges.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff

llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-88/merges.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff

llama-factory/saves/qwen2_7b/lora/sft_4bit/merges.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff

llm_toolkit/eval_shots.py CHANGED Viewed

@@ -99,7 +99,7 @@ def evaluate_model_with_num_shots(
     model_name,
     data_path,
     start_num_shots=0,
-    range_num_shots=[0, 10],
     batch_size=1,
     max_new_tokens=2048,
     device="cuda",

     model_name,
     data_path,
     start_num_shots=0,
+    range_num_shots=[0, 5],
     batch_size=1,
     max_new_tokens=2048,
     device="cuda",

llm_toolkit/logical_reasoning_utils.py CHANGED Viewed

@@ -429,11 +429,15 @@ def get_metrics_df(df, variant="epoch"):
     perf_df = pd.DataFrame(
         columns=[variant, "model", "run", "accuracy", "precision", "recall", "f1"]
     )
-    columns = [
-        col
-        for col in df.columns[5:]
-        if variant in col or variant == "epoch" and "_torch." in col
-    ]
     print("columns:", columns)
     for i, col in enumerate(columns):
         metrics = calc_metrics(df["label"], df[col], debug=False)
@@ -445,7 +449,8 @@ def get_metrics_df(df, variant="epoch"):
         if variant == "shots":
             parts = col.split("/shots-")
             new_model_metrics["shots"] = int(parts[1])
-            # new_model_metrics["model"] = parts[0]
         new_model_metrics.update(metrics)

     perf_df = pd.DataFrame(
         columns=[variant, "model", "run", "accuracy", "precision", "recall", "f1"]
     )
+    columns = (
+        df.columns[5:]
+        if variant == "index"
+        else [
+            col
+            for col in df.columns[5:]
+            if variant in col or variant == "epoch" and "_torch." in col
+        ]
+    )
     print("columns:", columns)
     for i, col in enumerate(columns):
         metrics = calc_metrics(df["label"], df[col], debug=False)
         if variant == "shots":
             parts = col.split("/shots-")
             new_model_metrics["shots"] = int(parts[1])
+            if variant in new_model_metrics["model"]:
+                new_model_metrics["model"] = parts[0]
         new_model_metrics.update(metrics)

notebooks/00_Data Analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/04b_OpenAI-Models_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

scripts/eval-mgtv-shots_4bit.sh CHANGED Viewed

@@ -13,7 +13,7 @@ cat /etc/os-release
 lscpu
 grep MemTotal /proc/meminfo
-export START_NUM_SHOTS=10
 #$BASEDIR/scripts/eval-mgtv-internlm-20b.sh

 lscpu
 grep MemTotal /proc/meminfo
+export START_NUM_SHOTS=5
 #$BASEDIR/scripts/eval-mgtv-internlm-20b.sh