openai zero-shot results
- .env.example +2 -2
- data/openai_results.csv +0 -0
- llm_toolkit/eval_openai.py +91 -0
- llm_toolkit/logical_reasoning_utils.py +54 -3
- notebooks/04_Few-shot_Prompting_OpenAI.ipynb +0 -0
.env.example
CHANGED
```diff
@@ -5,7 +5,7 @@ HF_TOKEN=
 WANDB_API_KEY=
 
 LOAD_IN_4BIT=false
-NUM_TRAIN_EPOCHS=
+NUM_TRAIN_EPOCHS=2
 
 LOGICAL_REASONING_DATA_PATH=datasets/mgtv
-LOGICAL_REASONING_RESULTS_PATH=results/mgtv-
+LOGICAL_REASONING_RESULTS_PATH=results/mgtv-results.csv
```
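Both variables now have concrete defaults. A minimal sketch of how they are picked up, mirroring the `python-dotenv` fallback logic in `llm_toolkit/eval_openai.py` below:

```python
import os
from dotenv import find_dotenv, load_dotenv

# Fall back to .env.example when no .env exists; don't override
# variables already exported in the shell.
load_dotenv(find_dotenv(".env") or find_dotenv(".env.example"), override=False)

data_path = os.getenv("LOGICAL_REASONING_DATA_PATH")        # datasets/mgtv
results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")  # results/mgtv-results.csv
```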
data/openai_results.csv
ADDED
The diff for this file is too large to render.
llm_toolkit/eval_openai.py
ADDED
```python
import os
import sys
from dotenv import find_dotenv, load_dotenv

found_dotenv = find_dotenv(".env")

if len(found_dotenv) == 0:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=False)

path = os.path.dirname(found_dotenv)
print(f"Adding {path} to sys.path")
sys.path.append(path)

from llm_toolkit.llm_utils import *
from llm_toolkit.logical_reasoning_utils import *

model_name = os.getenv("MODEL_NAME")
data_path = os.getenv("LOGICAL_REASONING_DATA_PATH")
results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")
max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 2048))

print(
    model_name,
    data_path,
    results_path,
    max_new_tokens,
)


def on_num_shots_step_completed(model_name, dataset, predictions, results_path):
    save_results(
        model_name,
        results_path,
        dataset,
        predictions,
    )

    metrics = calc_metrics(dataset["label"], predictions, debug=True)
    print(f"{model_name} metrics: {metrics}")


def evaluate_model_with_num_shots(
    model_name,
    datasets,
    results_path=None,
    range_num_shots=[0],
    max_new_tokens=2048,
    result_column_name=None,
):
    print(f"Evaluating model: {model_name}")

    eval_dataset = datasets["test"].to_pandas()
    print_row_details(eval_dataset)

    for num_shots in range_num_shots:
        print(f"*** Evaluating with num_shots: {num_shots}")

        predictions = eval_openai(
            eval_dataset, model=model_name, max_new_tokens=max_new_tokens
        )
        model_name_with_shorts = (
            result_column_name
            if result_column_name
            else f"{model_name}/shots-{num_shots:02d}"
        )

        try:
            on_num_shots_step_completed(
                model_name_with_shorts, eval_dataset, predictions, results_path
            )
        except Exception as e:
            print(e)


if __name__ == "__main__":
    datasets = load_logical_reasoning_dataset(
        data_path,
    )

    if len(sys.argv) > 1:
        num = int(sys.argv[1])
        if num > 0:
            print(f"--- evaluating {num} entries")
            datasets["test"] = datasets["test"].select(range(num))

    evaluate_model_with_num_shots(
        model_name,
        datasets,
        results_path=results_path,
        max_new_tokens=max_new_tokens,
    )
```
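For a quick smoke test, the `__main__` block accepts an optional argument that caps the number of test entries. A rough programmatic equivalent of `python llm_toolkit/eval_openai.py 10` might look like this (a sketch: the model name and paths below are the assumed defaults, and importing the module runs its top-level env loading):

```python
from llm_toolkit.eval_openai import evaluate_model_with_num_shots
from llm_toolkit.logical_reasoning_utils import load_logical_reasoning_dataset

datasets = load_logical_reasoning_dataset("datasets/mgtv")
datasets["test"] = datasets["test"].select(range(10))  # evaluate only 10 entries

evaluate_model_with_num_shots(
    "gpt-4o-mini",  # assumed MODEL_NAME; any OpenAI chat model id works
    datasets,
    results_path="results/mgtv-results.csv",
    max_new_tokens=2048,
)
```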
llm_toolkit/logical_reasoning_utils.py
CHANGED
```diff
@@ -1,5 +1,7 @@
 import os
 import re
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -8,6 +10,7 @@ from matplotlib.ticker import MultipleLocator
 from datasets import load_dataset
 import numpy as np
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+from tqdm import tqdm
 
 print(f"loading {__file__}")
 
@@ -86,6 +89,7 @@ Please strictly follow these rules when answering the participant's questions.
 **Participant's question:** {}
 """
 
+system_prompt = "You are an expert in logical reasoning."
 
 def get_prompt_template(using_p1=True, chinese_prompt=True):
     if using_p1:
@@ -177,8 +181,12 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
 
         # Create all directories in the path (if they don't exist)
         os.makedirs(dir_path, exist_ok=True)
-        df = dataset.to_pandas()
-        df.drop(columns=["answer", "prompt", "train_text"], inplace=True)
+
+        if isinstance(dataset, pd.DataFrame):
+            df = dataset
+        else:
+            df = dataset.to_pandas()
+        df.drop(columns=["answer", "prompt", "train_text"], inplace=True, errors="ignore")
     else:
         df = pd.read_csv(results_path, on_bad_lines="warn")
 
@@ -215,7 +223,7 @@ def load_logical_reasoning_dataset(
     messages = [
         {
             "role": "system",
-            "content":
+            "content": system_prompt,
         },
         None,
     ]
@@ -419,3 +427,46 @@ def plot_metrics(perf_df, model_name):
     plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5))
 
     plt.show()
+
+
+def reasoning_with_openai(
+    row, user_prompt, max_tokens=None, model="gpt-4o-mini", base_url=None
+):
+    llm = ChatOpenAI(
+        model=model,
+        temperature=0,
+        max_tokens=max_tokens,
+        timeout=None,
+        max_retries=2,
+        base_url=base_url,
+    )
+
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                system_prompt,
+            ),
+            (
+                "human",
+                user_prompt.format(row["puzzle"], row["truth"], row["text"]),
+            ),
+        ]
+    )
+
+    chain = prompt | llm
+    response = chain.invoke(input={})
+
+    return response.content
+
+
+def eval_openai(eval_dataset, model="gpt-4o-mini", max_new_tokens=300):
+    user_prompt = get_prompt_template(using_p1=False, chinese_prompt=True)
+    total = len(eval_dataset)
+    predictions = []
+
+    for i in tqdm(range(total)):
+        output = reasoning_with_openai(eval_dataset.iloc[i], user_prompt, model=model, max_tokens=max_new_tokens)
+        predictions.append(output)
+
+    return predictions
```
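At the heart of the new helpers is a small LangChain (LCEL) chain. A self-contained sketch of the same pattern, with placeholder prompt text and row values rather than the repo's actual Chinese template (requires `OPENAI_API_KEY` in the environment):

```python
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=300)

# reasoning_with_openai pre-formats the user prompt with str.format and
# invokes the chain with {}; this sketch passes the row fields as
# template variables instead, which is the more common LCEL idiom.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert in logical reasoning."),
        ("human", "Puzzle: {puzzle}\nTruth: {truth}\nQuestion: {text}"),
    ]
)

chain = prompt | llm  # pipe the rendered messages into the chat model
reply = chain.invoke({"puzzle": "...", "truth": "...", "text": "..."})
print(reply.content)
```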
notebooks/04_Few-shot_Prompting_OpenAI.ipynb
ADDED
The diff for this file is too large to render.