inflaton committed
Commit 8b9bb19 · 1 Parent(s): 090acf8

openai zero-shot results

.env.example CHANGED
@@ -5,7 +5,7 @@ HF_TOKEN=
 WANDB_API_KEY=
 
 LOAD_IN_4BIT=false
-NUM_TRAIN_EPOCHS=3
+NUM_TRAIN_EPOCHS=2
 
 LOGICAL_REASONING_DATA_PATH=datasets/mgtv
-LOGICAL_REASONING_RESULTS_PATH=results/mgtv-results_l40.csv
+LOGICAL_REASONING_RESULTS_PATH=results/mgtv-results.csv
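
For reference, a minimal sketch of how these settings are consumed via python-dotenv (the results path is read by `eval_openai.py` below; the epoch count is consumed by the training scripts, which are not part of this commit, so that usage is an assumption):

```python
import os
from dotenv import load_dotenv

load_dotenv()  # pulls .env (or .env.example) into the process environment

# Assumed consumers of the updated settings.
num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS", 2))  # training script (not in this commit)
results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH", "results/mgtv-results.csv")
print(num_train_epochs, results_path)
```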
data/openai_results.csv ADDED
The diff for this file is too large to render. See raw diff
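
The raw file can still be inspected locally; a minimal sketch, assuming the layout written by `save_results` (the dataset fields plus one prediction column per evaluated model/shot configuration, e.g. `gpt-4o-mini/shots-00` — exact column names depend on the runs recorded):

```python
import pandas as pd

df = pd.read_csv("data/openai_results.csv", on_bad_lines="warn")
print(df.columns.tolist())  # dataset fields plus one column per model/shot run
print(df.head())
```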
 
llm_toolkit/eval_openai.py ADDED
@@ -0,0 +1,91 @@
+import os
+import sys
+from dotenv import find_dotenv, load_dotenv
+
+found_dotenv = find_dotenv(".env")
+
+if len(found_dotenv) == 0:
+    found_dotenv = find_dotenv(".env.example")
+print(f"loading env vars from: {found_dotenv}")
+load_dotenv(found_dotenv, override=False)
+
+path = os.path.dirname(found_dotenv)
+print(f"Adding {path} to sys.path")
+sys.path.append(path)
+
+from llm_toolkit.llm_utils import *
+from llm_toolkit.logical_reasoning_utils import *
+
+model_name = os.getenv("MODEL_NAME")
+data_path = os.getenv("LOGICAL_REASONING_DATA_PATH")
+results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")
+max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 2048))
+
+print(
+    model_name,
+    data_path,
+    results_path,
+    max_new_tokens,
+)
+
+
+def on_num_shots_step_completed(model_name, dataset, predictions, results_path):
+    save_results(
+        model_name,
+        results_path,
+        dataset,
+        predictions,
+    )
+
+    metrics = calc_metrics(dataset["label"], predictions, debug=True)
+    print(f"{model_name} metrics: {metrics}")
+
+
+def evaluate_model_with_num_shots(
+    model_name,
+    datasets,
+    results_path=None,
+    range_num_shots=[0],
+    max_new_tokens=2048,
+    result_column_name=None,
+):
+    print(f"Evaluating model: {model_name}")
+
+    eval_dataset = datasets["test"].to_pandas()
+    print_row_details(eval_dataset)
+
+    for num_shots in range_num_shots:
+        print(f"*** Evaluating with num_shots: {num_shots}")
+
+        predictions = eval_openai(eval_dataset, model=model_name, max_new_tokens=max_new_tokens)
+        model_name_with_shots = (
+            result_column_name
+            if result_column_name
+            else f"{model_name}/shots-{num_shots:02d}"
+        )
+
+        try:
+            on_num_shots_step_completed(
+                model_name_with_shots, eval_dataset, predictions, results_path
+            )
+        except Exception as e:
+            print(e)
+
+
+if __name__ == "__main__":
+    datasets = load_logical_reasoning_dataset(
+        data_path,
+    )
+
+    if len(sys.argv) > 1:
+        num = int(sys.argv[1])
+        if num > 0:
+            print(f"--- evaluating {num} entries")
+            datasets["test"] = datasets["test"].select(range(num))
+
+    evaluate_model_with_num_shots(
+        model_name,
+        datasets,
+        results_path=results_path,
+        max_new_tokens=max_new_tokens,
+    )
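
A hypothetical smoke-test invocation (not part of the commit): with `MODEL_NAME`, `OPENAI_API_KEY`, and the paths above set in `.env`, the optional positional argument limits evaluation to the first N test entries:

```python
import subprocess

# Evaluate only the first 10 entries before committing to a full run.
subprocess.run(["python", "llm_toolkit/eval_openai.py", "10"], check=True)
```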
llm_toolkit/logical_reasoning_utils.py CHANGED
@@ -1,5 +1,7 @@
 import os
 import re
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -8,6 +10,7 @@ from matplotlib.ticker import MultipleLocator
 from datasets import load_dataset
 import numpy as np
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+from tqdm import tqdm
 
 print(f"loading {__file__}")
 
@@ -86,6 +89,7 @@ Please strictly follow these rules when answering the participant's questions.
 **Participant's question:** {}
 """
 
+system_prompt = "You are an expert in logical reasoning."
 
 def get_prompt_template(using_p1=True, chinese_prompt=True):
     if using_p1:
@@ -177,8 +181,12 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
 
         # Create all directories in the path (if they don't exist)
         os.makedirs(dir_path, exist_ok=True)
-        df = dataset.to_pandas()
-        df.drop(columns=["answer", "prompt", "train_text"], inplace=True)
+
+        if isinstance(dataset, pd.DataFrame):
+            df = dataset
+        else:
+            df = dataset.to_pandas()
+        df.drop(columns=["answer", "prompt", "train_text"], inplace=True, errors="ignore")
     else:
         df = pd.read_csv(results_path, on_bad_lines="warn")
 
@@ -215,7 +223,7 @@ def load_logical_reasoning_dataset(
     messages = [
         {
             "role": "system",
-            "content": "You are an expert in logical reasoning.",
+            "content": system_prompt,
         },
         None,
     ]
@@ -419,3 +427,46 @@ def plot_metrics(perf_df, model_name):
     plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5))
 
     plt.show()
+
+
+def reasoning_with_openai(
+    row, user_prompt, max_tokens=None, model="gpt-4o-mini", base_url=None
+):
+    llm = ChatOpenAI(
+        model=model,
+        temperature=0,
+        max_tokens=max_tokens,
+        timeout=None,
+        max_retries=2,
+        base_url=base_url,
+    )
+
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                system_prompt,
+            ),
+            (
+                "human",
+                user_prompt.format(row["puzzle"], row["truth"], row["text"]),
+            ),
+        ]
+    )
+
+    chain = prompt | llm
+    response = chain.invoke(input={})
+
+    return response.content
+
+
+def eval_openai(eval_dataset, model="gpt-4o-mini", max_new_tokens=300):
+    user_prompt = get_prompt_template(using_p1=False, chinese_prompt=True)
+    total = len(eval_dataset)
+    predictions = []
+
+    for i in tqdm(range(total)):
+        output = reasoning_with_openai(eval_dataset.iloc[i], user_prompt, model=model, max_tokens=max_new_tokens)
+        predictions.append(output)
+
+    return predictions
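
A minimal sketch of how the new helpers compose (the `puzzle`, `truth`, and `text` fields match what `reasoning_with_openai` reads from each row; the single-row DataFrame is illustrative only, and `OPENAI_API_KEY` must be set in the environment):

```python
import pandas as pd
from llm_toolkit.logical_reasoning_utils import eval_openai

# Illustrative row; real rows come from load_logical_reasoning_dataset.
sample = pd.DataFrame([{"puzzle": "...", "truth": "...", "text": "..."}])

predictions = eval_openai(sample, model="gpt-4o-mini", max_new_tokens=300)
print(predictions[0])
```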
notebooks/04_Few-shot_Prompting_OpenAI.ipynb ADDED
The diff for this file is too large to render. See raw diff