Spaces:
Build error
Build error
openai batch
Browse files- .gitattributes +3 -0
- data/best_metrics.csv +10 -9
- data/best_results.csv +0 -0
- data/gpt-4o-mini-10-shots_batch_results.jsonl +0 -0
- data/internlm2_5-7b-chat_metrics.csv +11 -0
- data/openai_metrics.csv +16 -15
- datasets/mgtv/gpt-4o-mini.jsonl +3 -0
- datasets/mgtv/o1-mini.jsonl +3 -0
- llm_toolkit/llm_utils.py +43 -0
- llm_toolkit/logical_reasoning_utils.py +54 -27
- notebooks/00_Data Analysis.ipynb +0 -0
- notebooks/01a_internlm2_5-7b-chat_analysis.ipynb +0 -0
- notebooks/04b_OpenAI-Models_analysis.ipynb +0 -0
- notebooks/04c_OpenAI-o1.ipynb +1 -0
- notebooks/04d_OpenAI-batch.ipynb +1 -0
.gitattributes
CHANGED
@@ -105,3 +105,6 @@ results/test_b-results_r5.csv filter=lfs diff=lfs merge=lfs -text
|
|
105 |
results/test_b-results_r6.csv filter=lfs diff=lfs merge=lfs -text
|
106 |
mgtv_train_p1.json filter=lfs diff=lfs merge=lfs -text
|
107 |
mgtv_train_p2.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
108 |
filter=lfs diff=lfs merge=lfs -text
|
|
|
|
105 |
results/test_b-results_r6.csv filter=lfs diff=lfs merge=lfs -text
|
106 |
mgtv_train_p1.json filter=lfs diff=lfs merge=lfs -text
|
107 |
mgtv_train_p2.json filter=lfs diff=lfs merge=lfs -text
|
108 |
+
datasets/mgtv/o1-mini.jsonl filter=lfs diff=lfs merge=lfs -text
|
109 |
+
datasets/mgtv/Icon
|
110 |
filter=lfs diff=lfs merge=lfs -text
|
111 |
+
datasets/mgtv/gpt-4o-mini.jsonl filter=lfs diff=lfs merge=lfs -text
|
data/best_metrics.csv
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
index,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
-
1,internlm2_5-7b-chat
|
3 |
-
2,
|
4 |
-
3,
|
5 |
-
4,
|
6 |
-
5,Llama3.1-
|
7 |
-
6,
|
8 |
-
7,
|
9 |
-
8,
|
10 |
-
9,gpt-4o,gpt-4o,0.
|
|
|
|
1 |
index,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
1,internlm2_5-7b-chat,internlm2_5-7b-chat,0.7496666666666667,0.8041871978859686,0.7496666666666667,0.7660159670998776,1.0
|
3 |
+
2,internlm2_5-7b-chat-1m,internlm2_5-7b-chat-1m,0.803,0.8031411888150441,0.803,0.8028064320197301,1.0
|
4 |
+
3,Mistral-7B-v0.3-Chinese-Chat,Mistral-7B-v0.3-Chinese-Chat,0.75,0.7885868317699068,0.75,0.7648234347578796,1.0
|
5 |
+
4,Qwen2-7B-Instruct,Qwen2-7B-Instruct,0.759,0.8005303465799652,0.759,0.7748745026535183,1.0
|
6 |
+
5,Llama3.1-8B-Chinese-Chat,Llama3.1-8B-Chinese-Chat,0.78,0.810582723471486,0.78,0.7924651054056209,1.0
|
7 |
+
6,Llama3.1-70B-Chinese-Chat,Llama3.1-70B-Chinese-Chat,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
|
8 |
+
7,Qwen2-72B-Instruct,Qwen2-72B-Instruct,0.784,0.8354349234761956,0.784,0.804194683154365,1.0
|
9 |
+
8,Ensemble Model,Ensemble Model,0.8193333333333334,0.8407464756633664,0.8193333333333334,0.828054127213081,1.0
|
10 |
+
9,gpt-4o-mini (0-shot),gpt-4o-mini (0-shot),0.7176666666666667,0.785706730193659,0.7176666666666667,0.7296061848734905,1.0
|
11 |
+
10,gpt-4o (10-shot),gpt-4o (10-shot),0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
|
data/best_results.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/gpt-4o-mini-10-shots_batch_results.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/internlm2_5-7b-chat_metrics.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0.0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-35_torch.bfloat16_lf,0.7193333333333334,0.7863486093365692,0.7193333333333334,0.7330498811142795,1.0
|
3 |
+
0.2,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-70_torch.bfloat16_lf,0.726,0.7900250828103491,0.726,0.7396583495246526,1.0
|
4 |
+
0.4,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-105_torch.bfloat16_lf,0.6736666666666666,0.8044565554629858,0.6736666666666666,0.7104123104529902,1.0
|
5 |
+
0.6,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-140_torch.bfloat16_lf,0.7496666666666667,0.8041871978859686,0.7496666666666667,0.7660159670998776,1.0
|
6 |
+
0.8,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-175_torch.bfloat16_lf,0.726,0.8094634420846424,0.726,0.751394838822856,1.0
|
7 |
+
1.0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-210_torch.bfloat16_lf,0.7276666666666667,0.8039673699820601,0.7276666666666667,0.7488653386949028,1.0
|
8 |
+
1.2,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-245_torch.bfloat16_lf,0.747,0.8055537753403307,0.747,0.76527383722639,1.0
|
9 |
+
1.4,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-280_torch.bfloat16_lf,0.7166666666666667,0.8059535682746547,0.7166666666666667,0.7432427946178835,1.0
|
10 |
+
1.6,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-315_torch.bfloat16_lf,0.6983333333333334,0.8119110469658597,0.6983333333333334,0.7347246872892312,1.0
|
11 |
+
1.8,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-350_torch.bfloat16_lf,0.7076666666666667,0.8120132783051135,0.7076666666666667,0.7408145046817652,1.0
|
data/openai_metrics.csv
CHANGED
@@ -1,15 +1,16 @@
|
|
1 |
-
shots,model,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
-
0,gpt-4o-mini,0.7176666666666667,0.785706730193659,0.7176666666666667,0.7296061848734905,0.9916666666666667
|
3 |
-
5,gpt-4o-mini,0.7176666666666667,0.7767294185987051,0.7176666666666667,0.7181068311028772,0.9996666666666667
|
4 |
-
10,gpt-4o-mini,0.6793333333333333,0.7728086050218999,0.6793333333333333,0.6916749681933937,0.9983333333333333
|
5 |
-
20,gpt-4o-mini,0.6623333333333333,0.7686706009175459,0.6623333333333333,0.6798015109939115,0.998
|
6 |
-
30,gpt-4o-mini,0.6873333333333334,0.7684209723431035,0.6873333333333334,0.6913018667081989,0.999
|
7 |
-
40,gpt-4o-mini,0.6923333333333334,0.7639874967862498,0.6923333333333334,0.6924934068935911,0.9986666666666667
|
8 |
-
50,gpt-4o-mini,0.717,0.7692638634416518,0.717,0.7105227254860433,0.9993333333333333
|
9 |
-
0,gpt-4o,0.782,0.8204048322982596,0.782,0.7953019682198627,0.066
|
10 |
-
5,gpt-4o,0.7873333333333333,0.8230974205170392,0.7873333333333333,0.8000290527498529,0.998
|
11 |
-
10,gpt-4o,0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
|
12 |
-
20,gpt-4o,0.7816666666666666,0.8204541793856629,0.7816666666666666,0.7967017169880498,0.9993333333333333
|
13 |
-
30,gpt-4o,0.7886666666666666,0.8260847852316618,0.7886666666666666,0.8030949295928699,0.999
|
14 |
-
40,gpt-4o,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
|
15 |
-
50,gpt-4o,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
|
|
|
|
1 |
+
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0,gpt-4o-mini,gpt-4o-mini/shots-00,0.7176666666666667,0.785706730193659,0.7176666666666667,0.7296061848734905,0.9916666666666667
|
3 |
+
5,gpt-4o-mini,gpt-4o-mini/shots-05,0.7176666666666667,0.7767294185987051,0.7176666666666667,0.7181068311028772,0.9996666666666667
|
4 |
+
10,gpt-4o-mini,gpt-4o-mini/shots-10,0.6793333333333333,0.7728086050218999,0.6793333333333333,0.6916749681933937,0.9983333333333333
|
5 |
+
20,gpt-4o-mini,gpt-4o-mini/shots-20,0.6623333333333333,0.7686706009175459,0.6623333333333333,0.6798015109939115,0.998
|
6 |
+
30,gpt-4o-mini,gpt-4o-mini/shots-30,0.6873333333333334,0.7684209723431035,0.6873333333333334,0.6913018667081989,0.999
|
7 |
+
40,gpt-4o-mini,gpt-4o-mini/shots-40,0.6923333333333334,0.7639874967862498,0.6923333333333334,0.6924934068935911,0.9986666666666667
|
8 |
+
50,gpt-4o-mini,gpt-4o-mini/shots-50,0.717,0.7692638634416518,0.717,0.7105227254860433,0.9993333333333333
|
9 |
+
0,gpt-4o,gpt-4o/shots-00,0.782,0.8204048322982596,0.782,0.7953019682198627,0.066
|
10 |
+
5,gpt-4o,gpt-4o/shots-05,0.7873333333333333,0.8230974205170392,0.7873333333333333,0.8000290527498529,0.998
|
11 |
+
10,gpt-4o,gpt-4o/shots-10,0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
|
12 |
+
20,gpt-4o,gpt-4o/shots-20,0.7816666666666666,0.8204541793856629,0.7816666666666666,0.7967017169880498,0.9993333333333333
|
13 |
+
30,gpt-4o,gpt-4o/shots-30,0.7886666666666666,0.8260847852316618,0.7886666666666666,0.8030949295928699,0.999
|
14 |
+
40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
|
15 |
+
50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
|
16 |
+
10,gpt-4o-mini_batch,gpt-4o-mini_batch/shots-10,0.6576666666666666,0.7689201800674901,0.6576666666666666,0.6748319385295091,0.996
|
datasets/mgtv/gpt-4o-mini.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe16a3556e380afaac83c39454f07f58925a1854c3ff3e09ca29a38d3cb08862
|
3 |
+
size 57935228
|
datasets/mgtv/o1-mini.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d3642807801ac5467fbb0812c81e6f3628e6013a5e5c2dc80f013543ccffb9c3
|
3 |
+
size 57923228
|
llm_toolkit/llm_utils.py
CHANGED
@@ -8,6 +8,8 @@ from transformers import (
|
|
8 |
TextStreamer,
|
9 |
)
|
10 |
from tqdm import tqdm
|
|
|
|
|
11 |
|
12 |
|
13 |
def get_template(model_name):
|
@@ -229,3 +231,44 @@ def print_row_details(df, indices=[0]):
|
|
229 |
for col in df.columns:
|
230 |
print("-" * 50)
|
231 |
print(f"{col}: {df[col].iloc[index]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
TextStreamer,
|
9 |
)
|
10 |
from tqdm import tqdm
|
11 |
+
from langchain_openai import ChatOpenAI
|
12 |
+
from langchain_core.prompts import ChatPromptTemplate
|
13 |
|
14 |
|
15 |
def get_template(model_name):
|
|
|
231 |
for col in df.columns:
|
232 |
print("-" * 50)
|
233 |
print(f"{col}: {df[col].iloc[index]}")
|
234 |
+
|
235 |
+
|
236 |
+
def invoke_openai_api(
    user_prompt,
    system_prompt=None,
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=None,
    base_url=None,
    using_system_prompt=True,
):
    """Send a single chat request through ``ChatOpenAI`` and return the reply text.

    Args:
        user_prompt: Text sent as the "human" message.
        system_prompt: Optional text sent as a leading "system" message.
            Skipped when falsy.
        model: Model name handed to ``ChatOpenAI``.
        temperature: Sampling temperature handed to ``ChatOpenAI``.
        max_tokens: Completion token limit handed to ``ChatOpenAI``.
        base_url: Optional API base URL handed to ``ChatOpenAI``.
        using_system_prompt: When false, ``system_prompt`` is dropped even if
            one was supplied. Defaults to True, so existing callers that do
            not pass it behave as before.

    Returns:
        The ``content`` attribute of the response returned by the chain.
    """
    llm = ChatOpenAI(
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        timeout=None,
        max_retries=2,
        base_url=base_url,
    )

    messages = [("human", user_prompt)]

    # The system message goes first, and only when the caller both supplied
    # one and has not switched system prompts off.
    if system_prompt and using_system_prompt:
        messages.insert(0, ("system", system_prompt))

    # NOTE(review): from_messages appears to treat these strings as templates,
    # so literal braces in a prompt would be read as placeholders and fail
    # with the empty input below -- confirm prompts are brace-free or escaped.
    prompt = ChatPromptTemplate.from_messages(messages)

    chain = prompt | llm
    response = chain.invoke(input={})

    return response.content
|
llm_toolkit/logical_reasoning_utils.py
CHANGED
@@ -1,7 +1,5 @@
|
|
1 |
import os
|
2 |
import re
|
3 |
-
from langchain_openai import ChatOpenAI
|
4 |
-
from langchain_core.prompts import ChatPromptTemplate
|
5 |
import pandas as pd
|
6 |
from tqdm import tqdm
|
7 |
import seaborn as sns
|
@@ -512,35 +510,23 @@ def plot_metrics(perf_df, model_name, variant="epoch", offset=0.01):
|
|
512 |
|
513 |
|
514 |
def reasoning_with_openai(
|
515 |
-
row,
|
|
|
|
|
|
|
|
|
|
|
|
|
516 |
):
|
517 |
-
|
518 |
-
|
519 |
-
temperature=0,
|
520 |
max_tokens=max_tokens,
|
521 |
-
|
522 |
-
max_retries=2,
|
523 |
base_url=base_url,
|
|
|
|
|
524 |
)
|
525 |
|
526 |
-
prompt = ChatPromptTemplate.from_messages(
|
527 |
-
[
|
528 |
-
(
|
529 |
-
"system",
|
530 |
-
system_prompt,
|
531 |
-
),
|
532 |
-
(
|
533 |
-
"human",
|
534 |
-
user_prompt.format(row["puzzle"], row["truth"], row["text"]),
|
535 |
-
),
|
536 |
-
]
|
537 |
-
)
|
538 |
-
|
539 |
-
chain = prompt | llm
|
540 |
-
response = chain.invoke(input={})
|
541 |
-
|
542 |
-
return response.content
|
543 |
-
|
544 |
|
545 |
def eval_openai(
|
546 |
eval_dataset,
|
@@ -557,10 +543,16 @@ def eval_openai(
|
|
557 |
print("user_prompt:", user_prompt)
|
558 |
total = len(eval_dataset)
|
559 |
predictions = []
|
|
|
560 |
|
561 |
for i in tqdm(range(total)):
|
562 |
output = reasoning_with_openai(
|
563 |
-
eval_dataset.iloc[i],
|
|
|
|
|
|
|
|
|
|
|
564 |
)
|
565 |
predictions.append(output)
|
566 |
|
@@ -605,3 +597,38 @@ def majority_vote(r1, r2, r3):
|
|
605 |
label = r1
|
606 |
|
607 |
return label
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import re
|
|
|
|
|
3 |
import pandas as pd
|
4 |
from tqdm import tqdm
|
5 |
import seaborn as sns
|
|
|
510 |
|
511 |
|
512 |
def reasoning_with_openai(
    row,
    user_prompt,
    max_tokens=None,
    model="gpt-4o-mini",
    base_url=None,
    temperature=0,
    using_system_prompt=True,
):
    """Fill the prompt template from one dataset row and query the OpenAI API.

    Args:
        row: Mapping-like record; its "puzzle", "truth" and "text" entries are
            substituted positionally into ``user_prompt``.
        user_prompt: ``str.format`` template with three positional fields.
        max_tokens: Completion token limit forwarded to ``invoke_openai_api``.
        model: Model name forwarded to ``invoke_openai_api``.
        base_url: Optional API base URL forwarded to ``invoke_openai_api``.
        temperature: Sampling temperature forwarded to ``invoke_openai_api``.
        using_system_prompt: When false, no system prompt is sent.

    Returns:
        Whatever ``invoke_openai_api`` returns for the request.
    """
    # invoke_openai_api has no `using_system_prompt` keyword, so the flag is
    # expressed through the `system_prompt` argument it does accept.
    # NOTE(review): relies on a module-level `system_prompt`, as the earlier
    # inline implementation of this function did -- confirm it is still
    # defined in this module.
    return invoke_openai_api(
        user_prompt.format(row["puzzle"], row["truth"], row["text"]),
        system_prompt=system_prompt if using_system_prompt else None,
        max_tokens=max_tokens,
        model=model,
        base_url=base_url,
        temperature=temperature,
    )
|
529 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
530 |
|
531 |
def eval_openai(
|
532 |
eval_dataset,
|
|
|
543 |
print("user_prompt:", user_prompt)
|
544 |
total = len(eval_dataset)
|
545 |
predictions = []
|
546 |
+
is_using_o1 = "o1" in model
|
547 |
|
548 |
for i in tqdm(range(total)):
|
549 |
output = reasoning_with_openai(
|
550 |
+
eval_dataset.iloc[i],
|
551 |
+
user_prompt,
|
552 |
+
model=model,
|
553 |
+
max_tokens=None if is_using_o1 else max_new_tokens,
|
554 |
+
temperature=1 if is_using_o1 else 0,
|
555 |
+
using_system_prompt=not is_using_o1,
|
556 |
)
|
557 |
predictions.append(output)
|
558 |
|
|
|
597 |
label = r1
|
598 |
|
599 |
return label
|
600 |
+
|
601 |
+
|
602 |
+
def load_openai_batch_data(data_path, num_shots=10, model="o1-mini", debug=True):
    """Return the batch-request table for ``model``, building it on first use.

    If ``{data_path}/{model}.jsonl`` already exists it is read back and
    returned as is. The file name depends only on ``data_path`` and ``model``,
    so an existing file is reused whatever ``num_shots`` is.

    Otherwise one chat-completions request is built per row of the "test"
    split, using a few-shot prompt derived from the "train" split. The
    requests are written to that path as JSON lines and returned.

    Args:
        data_path: Directory holding the dataset and the cached JSONL file.
        num_shots: Number of shots passed to ``get_few_shot_prompt_template``.
        model: Model name, used both in each request body and in the file name.
        debug: Forwarded to ``get_few_shot_prompt_template``.

    Returns:
        A DataFrame with one request per row.
    """
    openai_data_path = f"{data_path}/{model}.jsonl"

    if os.path.exists(openai_data_path):
        print("loading existing data from:", openai_data_path)
        return pd.read_json(openai_data_path, orient="records", lines=True)

    datasets = load_logical_reasoning_dataset(data_path)
    few_shot_template = get_few_shot_prompt_template(
        num_shots, datasets["train"].to_pandas(), debug=debug
    )
    test_frame = datasets["test"].to_pandas()

    def build_request(label, record):
        # One batch entry; the id is derived from the row's index label.
        filled = few_shot_template.format(
            record["puzzle"], record["truth"], record["text"]
        )
        return {
            "custom_id": f"request-{label + 1}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [
                    {"role": "user", "content": filled},
                ],
            },
        }

    batch_frame = pd.DataFrame(
        [build_request(label, record) for label, record in test_frame.iterrows()]
    )
    batch_frame.to_json(openai_data_path, orient="records", lines=True)
    return batch_frame
|
notebooks/00_Data Analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/01a_internlm2_5-7b-chat_analysis.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/04b_OpenAI-Models_analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/04c_OpenAI-o1.ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","if \"workding_dir\" not in locals():\n"," try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n"," except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/code/engd/projects/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/code/engd/projects/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from 
dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["internlm/internlm2_5-7b-chat-1m datasets/mgtv data/openai_results.csv 2048\n"]}],"source":["import os\n","\n","model_name = os.getenv(\"MODEL_NAME\")\n","data_path = os.getenv(\"LOGICAL_REASONING_DATA_PATH\")\n","results_path = os.getenv(\"LOGICAL_REASONING_RESULTS_PATH\")\n","max_new_tokens = int(os.getenv(\"MAX_NEW_TOKENS\", 2048))\n","\n","print(model_name, data_path, results_path, max_new_tokens)"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/code/engd/projects/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n"]}],"source":["from llm_toolkit.logical_reasoning_utils import *"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[],"source":["df = 
pd.read_csv(results_path)"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAAAqgAAAGeCAYAAABYc/NxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABUFklEQVR4nO3deVxU5f4H8M/AwCwgoCgSCiqCuJC5oOaS5ZYmFaJomQsK7ruWlqnklUpNzVsu16xccMkUSQWVLFfcLfNqKkiCbAnKOiwzw2y/P7zMr7lglndgDszn/Xqdl87znHPmezg6fOY5m8hgMBhARERERCQQNpYugIiIiIjojxhQiYiIiEhQGFCJiIiISFAYUImIiIhIUBhQiYiIiEhQGFCJiIiISFAYUImIiIhIUBhQiYiIiEhQqi2g7t69G8nJydW1eiIiIiKqo0RP8ySpPn364PTp0/jkk0/wzjvvVOrPyspCp06dcOnSJTRv3tzYvmzZMmzYsAFKpRIhISH4/PPP4ejoCABQKBSYOnUqDh48CCcnJ8yePRvvvvvuX65Jr9fj999/R7169SASif7uJhERERFRNTMYDCguLoaHhwdsbB4/Tip+mpWfPHkSS5cufWz/3LlzMXfuXJNwumnTJhw4cAAXL16Eq6sr5s6di4kTJ+Kbb74BAEyaNAlSqRRZWVkoLCxESEgIXFxcMHny5L9U0++//w5PT8+n2RwiIiIiqkEZGRlo2rTpY/ufKqD+mePHj+PWrVvYtWuXSfvatWuxe/dutGjRAsCjwNqyZUvcvXsXUqkUJ0+eRFpaGqRSKZydnREVFYVXXnnlLwfUevXqAXi0wU5OTubdKCIiIiL6nykUCnh6ehpz2+OYNaBqNBrMmjULmzZtgp2dnbH9wYMHKC0tRefOnY1tdnZ2eP3113HixAk4OztjwIABkEqlxv42bdrA0dERycnJ8PX1feJ7VxzWd3JyYkAlIiIiErAnnY5p1oukPvvsM9y9exfDhw+Hr68vjh07BgBIT0+Hj49Ppfn9/PyQkpLyxP6qqNVqKBQKk4mIiIiIaj+zBdTS0lKsWLECGzZswN27d7Fq1SqMGjUK9+/fh1KphFwur7SMTCaDUql8Yn9Vli9fDmdnZ+Mk9PNP+/TpAxsbG6xevbpSX5MmTSAWi43TJ598UuU6UlNTsXnzZuPr3NxcvPHGG3BxcUGzZs3w+eefV1v9RERERDXFbAH12LFj6Nu3L8LDw+Hg4IAhQ4YgNDQUO3bsgEwmg0qlqrRMXl4e5HL5E/ursnDhQhQVFRmnjIwMc21KtTh58iQiIiIqtRcWFkIikUCr1RqnBQsWVJpPr9djzJgxJiPFoaGhcHd3R1ZWFk6cOIFt27Zh37591bodRERERNXNbAH1t99+g7+/v0lb+/btkZaWBi8vL6SmplZaJjU1Fd7e3k/sr4pEIjGeb1qbzzu9efMm/Pz8njjfypUrcfHiReNrnU6Hl19+GWvWrIGDgwNatmyJsWPH4vTp09VZLhEREVG1M1tAbdq0KX799VeTths3bqBFixZwc3ODvb09kpKSjH1arRYHDx5Ev3790LNnTxw/fhxardbY/+uvv0KtVld5bmpdcuvWrScG1GvXrmH9+vWYMGGCsc3W1hazZ8+GWCyGVqvFpUuXsGnTJrz88svVXTIRERFRtTJbQH399ddx6dIlfP311ygrK8OhQ4ewe/duhIaGAnh0b9QJEyYgMzMTxcXFmDlzJvr3748WLVqgSZMm6NGjB+bOnYuSkhJkZGRg4sSJWLx4sbnKE6ybN29iz549aNCgATp06IDvv//epF+lUmH06NFYv3493N3dq1zH8OHD8fzzz8PDwwODBg2qibKJiIiIqo3ZAqqDgwO+
//57REVFwd3dHStWrEBcXBwaNWoEAJgyZQoGDBiATp06oUmTJigvL8emTZuMy3/xxRfIz8+Hu7s7unbtiqFDhyI8PNxc5QlWYGAgzp8/j+zsbKxZswZhYWG4deuWsX/hwoXo2rUrgoODH7uOvXv34t///jckEgmmT59eE2UTERERVZunetSpECkUCjg7O6OoqEiw56MuXboUjo6OVT4etsLnn3+OtLQ0rFmzBidOnMCkSZPwyy+/oF69ek9cvrS0FM2bN0dSUhIaNGhQXZtBRERE9FT+al4z+5Ok6H/j7e2Nc+fOQaFQGE+JqBiF1mq1EIlEuHz5Mj777DOoVCrjk7mAR6PYzZs3x4MHDxhQiYiIqNYy64366e+JjIzEoUOHTNrOnj2LVq1awcnJCSkpKSgvL4dKpYJKpcLixYuxfPly7N27FykpKRg9erTJsgqFAunp6Y+98wERERFRbcCAakGdOnXCnDlzcO3aNZSVlWHbtm34+uuvMXXq1Ccu2717d+j1eqxYscJ4YdmoUaMwdepU2Nvb10D1RERERNWDh/gtKDAwEL///juGDRuGrKwsdOzYEQcPHoSHh8cTl7WxscH+/fsxc+ZMNG3aFBKJBBMnTsSSJUtqoHIiIiKi6sOLpP6m5u8drrZ1W8K9FYGWLoGIiIisxF/NazzET0RERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgvJUAbVPnz6wsbHB6tWrHzuPRqPBkiVLTNo2b94MLy8vODg4IDg4GDk5OSbzz5o1C/Xr14erqyvmz58PrVb7NOURERERUS32VAH15MmTiIiI+NN5IiIi8Msvvxhfx8fH4+OPP0ZcXBxyc3Ph7++P4OBgY/+SJUtw584dJCYmIjExEbdu3aoUcImIiIio7quWQ/znzp3Dp59+atK2du1arFmzBu3bt4dMJkNkZCT0ej1OnDiB8vJyfPnll9i2bRsaN26MRo0aISoqCl999RVKS0uro0QiIiIiEiizB9SSkhKEhobinXfeMbYZDAZcunQJgYGBJvOGhITg+PHjuHbtGtq0aQN3d3djn6urK7p164bz58+bu0QiIiIiEjCzB9TZs2cjMDAQAwYMMLbl5eXBxcUFUqnUZF4/Pz+kpKQgPT0dPj4+ldZV0V8VtVoNhUJhMhERERFR7WfWgHrw4EFcvHgRK1euNGlXKpWQy+WV5pfJZFAqlU/sr8ry5cvh7OxsnDw9Pc2zEURERERkUWYLqA8ePMD06dOxc+fOSiOlMpkMKpWq0jJ5eXmQy+VP7K/KwoULUVRUZJwyMjLMsyFEREREZFFic61o6tSpyMnJQffu3QEAer0eOp0O7u7uuH//PgoLC6HRaGBnZ2dcJjU1Fd7e3vDy8kJqamqldaampmLYsGFVvp9EIoFEIjFX+UREREQkEGYbQd2/fz80Gg1UKhVUKhWOHTuGV155BdnZ2RCJRAgICMCJEydMlomOjkb//v3RoUMH3Lp1CwUFBca+/Px8XLp0CT169DBXiURERERUC9TYk6TmzZuHOXPmIDExESqVCpGRkZDJZHjppZdgb2+P8ePHY+LEicjLy0Nubi7CwsIwY8aMxx7iJyIiIqK6qcYC6qBBgzBnzhwMGDAADRo0wNWrV7F//35j/7Jly/DMM8+gZcuWaNWqFXx9fbF06dKaKo+IiIiIBEJkMBgMli7CHBQKBZydnVFUVAQnJ6dqe5/m7x2utnVbwr0VgU+eiYiIiMgM/mpeq7ERVCIiIiKiv4IBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASF
AZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgE5akCap8+fWBjY4PVq1ebtJ89exbdunWDk5MTnn/+eVy8eNGkf9myZWjcuDGcnJwQFhaGkpISY59CocCoUaPg6OgIDw8PrFy58mlKIyIiIqJa7qkC6smTJxEREWHSlpycjBEjRuCjjz5CTk4OFixYgCFDhiAzMxMAsGnTJhw4cAAXL15EZmYmRCIRJk6caFx+0qRJsLOzQ1ZWFi5cuIDo6Gh88cUX/8OmEREREVFtJDbXitavX4958+ahf//+AIChQ4fi0qVL2Lt3L+bNm4e1a9di9+7daNGiBYBHgbVly5a4e/cupFIpTp48ibS0NEilUjg7OyMqKgqvvPIKJk+ebK4SiYiIiKgWMFtAbdGiBQIDA03aPD09kZGRgQcPHqC0tBSdO3c29tnZ2eH111/HiRMn4OzsjAEDBkAqlRr727RpA0dHRyQnJ8PX19dcZRIRERGRwJntIqk5c+ZUCpJxcXEICAhAeno6fHx8Ki3j5+eHlJSUJ/ZXRa1WQ6FQmExEREREVPtV21X827ZtQ3Z2NoKDg6FUKiGXyyvNI5PJoFQqn9hfleXLl8PZ2dk4eXp6mn0biIiIiKjmVUtAvXLlCt5991188803EIvFkMlkUKlUlebLy8uDXC5/Yn9VFi5ciKKiIuOUkZFh9u0gIiIioppntnNQK6SnpyM4OBhff/012rRpAwDw8vJCampqpXlTU1MREBAAJycnHD16tMp+b2/vKt9HIpFAIpGYt3giIiIisjizjqAWFxfj1Vdfxfz58/Hqq68a293c3GBvb4+kpCRjm1arxcGDB9GvXz/07NkTx48fh1arNfb/+uuvUKvVVZ6bSkRERER1l9kCqk6nw4gRI9C7d2/Mnj27Uv/cuXMxYcIEZGZmori4GDNnzkT//v3RokULNGnSBD169MDcuXNRUlKCjIwMTJw4EYsXLzZXeURERERUS5gtoM6cORPx8fHYtGkTxGKxcerXrx8AYMqUKRgwYAA6deqEJk2aoLy8HJs2bTIu/8UXXyA/Px/u7u7o2rUrhg4divDwcHOVR0RERES1hMhgMBgsXYQ5KBQKODs7o6ioCE5OTtX2Ps3fO1xt67aEeysCnzwTERERkRn81bxWbbeZIiIiIiJ6GgyoRERERCQoDKhEREREJCgMqEREREQkKAyoZPX69OkDGxsbrF692qT93Llz6NChA2QyGbp06YLLly+b9G/cuBHe3t5wcXHB8OHDkZOTY+xTqVSYO3cuPDw84OHhgXfeeQdqtbpGtoeIiKi2Y0Alq3fy5ElERESYtOXk5CA4OBgRERFQKBRYtGgRgoKCkJ2dDQCIjo5GVFQUjh07hpycHLzyyisYOXKkcflp06YhPz8fN27cwJUrV5CUlISFCxfW6HYRERHVVgyoRFXYsmUL3njjDQwdOhR2dnYYMmQIQkNDsXHjRgDAqVOnMHHiRPj4+EAikSAsLAzJyckoLCxEfn4+4uLi8MUXX8DV1RVNmjTBzp07ERUVZeGtIiIiqh0YUImqkJCQgKCgIJO2kJAQHD9+HAAQGBiIzz//HDdv3oRSqcSnn34KhUIBmUwGhUKB+fPnQyqVGpd1dnaGSCSCUqms0e0gIiKqjcSWLoBIiNLT0+Hj42PS5ufnh5SUFADAK6+8gtjYWPj7+wMApFIptm7dColEgubNm2P+/Pkmy164cAFubm6QyWQ1swFERES1GAMqURWUSiXkcrlJm0wmM46Abty4EZcvX8b169fRokUL7N271+QiqT8qLS3FhAkT8I9//KPa6yYiIqoLeIif
qAoymQwqlcqkLS8vD3K5HDqdDkuXLsX+/fvx7LPPwtHREWFhYUhKSkJcXJzJMgaDAePGjUOPHj0QEhJSk5tARERUazGgElXBy8sLqampJm2pqanw9vZGbm4uHBwc0KxZM5P+bt264fvvvzdpW7x4MXJycrBhw4Zqr5mIiKiuYEAlqkKvXr0QHx9v0hYdHY3+/fvDzc0NJSUlKCoqMum/evUq3N3dja+3bduGvXv34rvvvoO9vX2N1E1ERFQX8BxUoiqEh4ejY8eO6N27NwYMGID4+Hjs2bMHV69ehUgkwpQpU/DWW29h/fr1cHNzw759+7B79278/PPPAB7dW/W9997DmTNn4OrqauGtISIiql04gkpUhcaNG2Pfvn1YuHAhHB0dsXTpUhw4cABubm4AgA8++AD+/v54/vnn8cwzzyAqKgrff/89vLy8kJSUhGHDhuHBgwdo27YtxGKxcTp9+rSFt4yIiEj4RAaDwWDpIsxBoVDA2dkZRUVFcHJyqrb3af7e4WpbtyXcWxFo6RKIiIjISvzVvMZD/FSn1KUvEPzyQERE1oqH+ImIiIhIUBhQiYiIiEhQGFCJiIiISFAYUImIiIhIUBhQiYiIiEhQGFCJiIiISFAYUImIiIhIUBhQiYiIiEhQGFCJiIiISFCeKqD26dMHNjY2WL16tUn7uXPn0KFDB8hkMnTp0gWXL1826d+8eTO8vLzg4OCA4OBg5OTkGPs0Gg1mzZqF+vXrw9XVFfPnz4dWq32a8oiIiIioFnuqgHry5ElERESYtOXk5CA4OBgRERFQKBRYtGgRgoKCkJ2dDQCIj4/Hxx9/jLi4OOTm5sLf3x/BwcHG5ZcsWYI7d+4gMTERiYmJuHXrFpYsWfI/bBoRERER1UZmO8S/ZcsWvPHGGxg6dCjs7OwwZMgQhIaGYuPGjQCAtWvXYs2aNWjfvj1kMhkiIyOh1+tx4sQJlJeX48svv8S2bdvQuHFjNGrUCFFRUfjqq69QWlpqrhKJiIiIqBYwW0BNSEhAUFCQSVtISAiOHz8Og8GAS5cuITAwsMr+a9euoU2bNnB3dzf2ubq6olu3bjh//ry5SiQiIiKiWsBsATU9PR0+Pj4mbX5+fkhJSUFeXh5cXFwglUqr7K9q2T/2V0WtVkOhUJhMRERERFT7mS2gKpVKyOVykzaZTAalUlll39/pr8ry5cvh7OxsnDw9Pc2zIURERERkUWYLqDKZDCqVyqQtLy8Pcrm8yr6/01+VhQsXoqioyDhlZGSYZ0OIiIiIyKLMFlC9vLyQmppq0paamgpvb2+4urqisLAQGo2myv6qlv1jf1UkEgmcnJxMJiIiIiKq/cwWUHv16oX4+HiTtujoaPTv3x8ikQgBAQE4ceJElf0dOnTArVu3UFBQYOzLz8/HpUuX0KNHD3OVSERERES1gNkCanh4OLZv346jR49Cq9UiLi4Oe/bswbRp0wAA8+bNw5w5c5CYmAiVSoXIyEjIZDK89NJLsLe3x/jx4zFx4kTk5eUhNzcXYWFhmDFjxmMP8RMRERFR3WS2gNq4cWPs27cPCxcuhKOjI5YuXYoDBw7Azc0NADBo0CDMmTMHAwYMQIMGDXD16lXs37/fuPyyZcvwzDPPoGXLlmjVqhV8fX2xdOlSc5VHRERERLWEyGAwGCxdhDkoFAo4OzujqKioWs9Hbf7e4WpbtyXcWxH45Jlqkbq0f+raviEiIvqrec1sI6hERERERObAgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpEREREgsKASkRERESCwoBKRERERILCgEpE
REREgmLWgJqXl4fRo0ejQYMG8PLywpo1a4x9t2/fRq9evSCTydCuXTscOXLEZNkDBw6gVatWkMlk6Nu3L5KTk81ZGhERERHVEmYNqKGhofDx8UFGRgauXLmChIQEbN++HWq1GoMHD8bIkSNRVFSETZs2YcKECbhx4wYA4MaNG5g8eTK+/PJLFBUVYcSIERg0aBBUKpU5yyMiIiKiWsCsAfXMmTN4//334eDggMaNG2PmzJn47rvvEBMTA39/f0yfPh329vZ44YUXEBERgVWrVgEA1q9fjwULFuDFF1+Evb09pkyZgi5dumD37t3mLI+IiIiIagGzBtTAwEAsWLAACoUCGRkZ+Pjjj9G4cWMkJCQgKCjIZN6QkBAcP34cAJ7YT0RERETWw6wBdcOGDdi/fz+cnZ3h5eWF+/fv44MPPkB6ejp8fHxM5m3YsCFUKhXUajWysrLg7e1t0u/n54eUlJTHvpdarYZCoTCZiIiIiKj2M1tA1Wq1eO211/DWW28hLy8PaWlpGDRoEB48eAClUgm5XF5pGZlMBqVSCb1eDxsbmyr7Hmf58uVwdnY2Tp6enubaFCIiIiKyILMF1Li4OMhkMqxcudJ4FX9kZCTGjx8Pe3v7Ki94ys/Ph1wuh42NDQwGg0lfXl5elaG2wsKFC1FUVGScMjIyzLUpRERERGRBZguoSUlJeOGFF0zaHBwc4OLiAgBITU016cvOzkaDBg1gb2+PJk2aID093aQ/NTW10mH/P5JIJHBycjKZiIiIiKj2M1tA9fb2RmJiokmbSqXC7du3MXr0aMTHx5v0RUdHo3///gCAXr16/Wk/EREREVkPswXU1157DVevXsWGDRtQXFyMrKwshIaGonv37hg2bBguX76MqKgoaDQaXLhwAZ988gnmz58PAJg5cyY+/PBDXLp0CRqNBlu2bMH169cxcuRIc5VHRERERLWE2QKqVCpFXFwcDh06hMaNG6Nbt25wc3NDVFQUpFIpYmNjsXnzZtSrVw/h4eHYvHkz2rVrBwB49tlnsW7dOowZMwZOTk7YuXMnjhw5AolEYq7yiIiIiKiWEJtzZb6+vvj++++r7Gvbti3Onj372GWHDBmCIUOGmLMcIiIiIqqFzHofVCIiIiKi/xUDKhEREREJCgMqEREREQkKAyoRERERCQoDKhEREREJCgMqEREREQkKAyoRERERCQoDKhEREREJCgMqEREREQkKAyoRERERCQoDKhEREREJCgMqEREREQkKAyoRERERCQoDKhEREREJCgMqEREREQkKAyoRERERCQoDKhEREREJCgMqEREREQkKAyoRERERCQoDKhEREREJCgMqEREREQkKAyoRERERCQoDKhEREREJCgMqEREREQkKAyoRERERCUq1BtS0tDRs3769Ot+CiIiIiOqYag2os2fPRk5OjvH1uXPn0KFDB8hkMnTp0gWXL182mX/z5s3w8vKCg4MDgoODTZYlIiIiIutQbQH1yJEjuHv3LubNmwcAyMnJQXBwMCIiIqBQKLBo0SIEBQUhOzsbABAfH4+PP/4YcXFxyM3Nhb+/P4KDg6urPCIiIiISqGoJqGq1GnPmzMGmTZsgFosBAFu2bMEbb7yBoUOHws7ODkOGDEFoaCg2btwIAFi7di3WrFmD9u3bQyaTITIyEnq9HidOnKiOEomIiIhIoKoloH7yySfo3bs3evbsaWxLSEhAUFCQyXwhISE4fvw4DAYDLl26hMDAwCr7iYiIiMh6mD2gZmRkYPny5fj+++9Rv359vP3229Dr9UhPT4ePj4/JvH5+fkhJSUFeXh5cXFwglUqr7K+KWq2GQqEwmYiIiIio9jN7QI2MjMSAAQPw888/4+rVqzh79iw2bNgApVIJuVxuMq9MJoNSqayy74/9VVm+fDmcnZ2Nk6enp7k3hYiIiIgsQGzuFR48eBCJiYmoX78+AODrr7/GiBEjIJPJoFKpTObNy8uDXC6vsu+P/VVZuHCh8QIs
AFAoFAypRERERHWAWUdQHz58CAcHB2M4BYB27dohMzMTXl5eSE1NNZk/NTUV3t7ecHV1RWFhITQaTZX9VZFIJHBycjKZiIiIiKj2M2tAdXV1RUFBAQoKCoxtt27dgpeXF3r16oX4+HiT+aOjo9G/f3+IRCIEBARUumK/op+IiIiIrIdZA6qNjQ3GjRuHcePG4cGDB0hNTcWECRMwe/ZshIeHY/v27Th69Ci0Wi3i4uKwZ88eTJs2DQAwb948zJkzB4mJiVCpVIiMjIRMJsNLL71kzhKJiIiISODMfg7qihUr8O6778Lf3x8ODg6YNWsWJk6cCADYt28fpk+fjuDgYPj7++PAgQNwc3MDAAwaNAhpaWkYMGAA8vLyMHDgQOzfv9/c5RERERGRwIkMBoPB0kWYg0KhgLOzM4qKiqr1fNTm7x2utnVbwr0VgU+eqRapS/unru0bIiKiv5rXqu1Rp0RERERET4MBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEhQGViIiIiASFAZWIiIiIBIUBlYiIiIgEpVoD6qpVq5CbmwsAuH37Nnr16gWZTIZ27drhyJEjJvMeOHAArVq1gkwmQ9++fZGcnFydpRERERGRQFVbQL1+/TqWLFkCAFCr1Rg8eDBGjhyJoqIibNq0CRMmTMCNGzcAADdu3MDkyZPx5ZdfoqioCCNGjMCgQYOgUqmqqzwiIiIiEqhqCajl5eUYM2YMtFotACAmJgb+/v6YPn067O3t8cILLyAiIgKrVq0CAKxfvx4LFizAiy++CHt7e0yZMgVdunTB7t27q6M8IiIiIhKwagmoixYtwrPPPoumTZsCABISEhAUFGQyT0hICI4fP/6X+omIiIjIepg9oJ45cwbR0dHYsGGDsS09PR0+Pj4m8zVs2BAqlQpqtRpZWVnw9vY26ffz80NKSspj30etVkOhUJhMRERERFT7mTWgKhQKjBs3Dlu3boWzs7OxXalUQi6XV5pfJpNBqVRCr9fDxsamyr7HWb58OZydnY2Tp6en+TaEiIiIiCzGrAF11qxZGDZsGF566SWTdplMVuUFT/n5+ZDL5bCxsYHBYDDpy8vLqzLUVli4cCGKioqMU0ZGhlm2gYiIiIgsS2yuFR08eBC7du2Cra0t1q1bB+DRYfimTZvimWeeQWpqKnr37m2cPzs7Gw0aNIC9vT2aNGmC9PR0NGvWzNifmppa6bD/H0kkEkgkEnOVT0REREQCYbYR1KCgIGg0GqhUKuPUrFkzZGZm4qOPPkJ8fLzJ/NHR0ejfvz8AoFevXn/aT0RERETWo0aeJDV06FBcvnwZUVFR0Gg0uHDhAj755BPMnz8fADBz5kx8+OGHuHTpEjQaDbZs2YLr169j5MiRNVEeEREREQlIjQRUqVSK2NhYbN68GfXq1UN4eDg2b96Mdu3aAQCeffZZrFu3DmPGjIGTkxN27tyJI0eO8BA+ERERkRUy2zmoVbl3757x723btsXZs2cfO++QIUMwZMiQ6iyHiIiIiGqBGhlBJSIiIiL6qxhQiYiIiEhQGFCJiIiISFAYUImIiIhIUBhQiYiIiEhQGFCJiIiISFAYUImIiIhIUBhQiYiIiEhQGFCJiIiISFAYUImIiIhIUBhQiYiIiEhQGFCJiIiISFAYUImIiIhIUBhQiahWKCgoQGhoKBo1aoTmzZtjxYoV0Ov1lebTaDRYsmSJSdvmzZvh5eUFBwcHBAcHIycnp6bKJiKip8CASkS1QkhICJo0aYK7d+/ihx9+wIEDB7B+/fpK80VEROCXX34xvo6P
j8fHH3+MuLg45Obmwt/fH8HBwTVZOhER/U0MqEQkeL/88gsePHiAjz76CE5OTvD19cXWrVvx1Vdfmcx37tw5fPrppyZta9euxZo1a9C+fXvIZDJERkZCr9fjxIkTNbkJRET0NzCgEpHg6XQ6zJs3DyKRyNjm6elpcqi+pKQEoaGheOedd4xtBoMBly5dQmBgoMn6QkJCcPz48eovnIiIngoDKhEJXkBAAMaPH2/SFhcXh4CAAOPr2bNnIzAwEAMGDDC25eXlwcXFBVKp1GRZPz8/pKSkVG/RRET01MSWLoCI6O/Kzs7G22+/jZiYGADAwYMHcfHiRfz888+4ePGicT6lUgm5XF5peZlMBqVSWWP1EhHR38OASkS1ilqtxrBhwzBt2jR069YNDx48wPTp0xEbG1tppFQmk0GlUlVaR15eXpXBlYiIhIEBlYhqlfDwcHh6euL9998HAEydOhU5OTno3r07AECv10On08Hd3R33799HYWEhNBoN7OzsjOtITU2Ft7e3ReonIqIn4zmoRFRrLF26FCkpKdi2bZvxgqn9+/dDo9FApVJBpVLh2LFjeOWVV5CdnQ2RSISAgIBKV+xHR0ejf//+ltgEIiL6CxhQiahW2LVrF3bs2IEDBw5UOpT/Z+bNm4c5c+YgMTERKpUKkZGRkMlkeOmll6qvWCIi+p/wED8RCV5CQgLCwsKg0Wjg4eFh0nf37l00a9bsscsOGjQIaWlpGDBgAPLy8jBw4EDs37+/uksmIqL/gchgMBgsXYQ5KBQKODs7o6ioCE5OTtX2Ps3fO1xt67aEeysCnzxTLVKX9g/3jbDVtf1DRFQT/mpe4yF+IiIiIhIUswbUgoIChIaGolGjRmjevDlWrFgBvV4P4NEjCDt06ACZTIYuXbrg8uXLJstu3rwZXl5ecHBwQHBwsMkTYoiIiIjIepg1oIaEhKBJkya4e/cufvjhBxw4cADr169HTk4OgoODERERAYVCgUWLFiEoKAjZ2dkAgPj4eHz88ceIi4tDbm4u/P39ERwcbM7SiIiIiKiWMFtA/eWXX/DgwQN89NFHcHJygq+vL7Zu3YqvvvoKW7ZswRtvvIGhQ4fCzs4OQ4YMQWhoKDZu3AgAWLt2LdasWYP27dtDJpMhMjISer2+0q1hiIiIiKjuM1tA1el0mDdvnvHehADg6emJnJwcJCQkICgoyGT+kJAQHD9+HAaDAZcuXUJgYGCV/URERERkXcx2m6mAgAAEBASYtMXFxSEgIABpaWnw8fEx6fPz80NKSgry8vLg4uJS6b6Gfn5+2L1792PfT61WQ61WG18rFAozbAURERERWVq1XcWfnZ2Nt99+GxEREVAqlZWeey2TyaBUKqvs+2P/4yxfvhzOzs7GydPT0+zbQEREREQ1r1oCqlqtxrBhwzBt2jR069YNMpkMKpXKZJ68vDzI5fIq+/7Y/zgLFy5EUVGRccrIyDD7dhARERFRzauWJ0mFh4fD09MT77//PgDAy8sLqamp8PLyMs6TmpoKb29vuLq6orCwEBqNBnZ2dpX6H0cikUAikVRH+URERERkQWYfQV26dClSUlKwbds24wVTvXr1Qnx8vMl80dHR6N+/P0QiEQICAipdsV/RT0RERETWxawjqLt27cKOHTtw4cIFk4uewsPD0bFjR/Tu3RsDBgxAfHw89uzZg6tXrwIA5s2bhzlz5uC7775D8+bNsWrVKshkMrz00kvmLI+IiIiIagGzBdSEhASEhYVBo9HAw8PDpO/u3bvYt28fpk+fjuDgYPj7++PAgQNwc3MDAAwaNAhpaWkYMGAA8vLyMHDgQOzfv99cpRERERFRLWK2gPrCCy+Y3PbpvzVr1gzXrl17bP/kyZMxefJkc5VDRERERLVUtd1mioiIiIjoaTCgEhEREZGgMKASERERkaAwoBIRkVnl5eVh9OjRaNCgAby8vLBmzRpj3w8//IDOnTujXr166NGjB37++WcLVkpEQlUtN+on
IiLrFRoaioCAAGRkZKCkpASTJ09Gw4YN0aFDB4SFhWHnzp3o1q0bjh49iiFDhuDq1ato1KiRpcsmIgFhQCUiIrM6c+YMYmJiYG9vDwcHB8ycORPr1q1DYmIi3n77bbz44osAgODgYJw6dQqHDh1CeHi4hasmIiHhIX4iIjKrwMBALFiwAAqFAhkZGfj444/RuHFj6HS6KucvKCio4QqJSOg4gkpERGa1YcMGPPfcc/jss88AAG3atMGOHTuQnp6OYcOGISAgAF26dMGRI0ewefNmnD592sIVE5HQcASViIjMRqvV4rXXXsNbb72FvLw8pKWlYdCgQXjw4AGef/55rFq1CuHh4fD09MTIkSPRq1cvdO3a1dJlE5HAMKASEZHZxMXFQSaTYeXKlcar+CMjIzF+/Hio1Wq89dZbSEpKQmxsLMRiMTZu3GjpkolIgBhQiYjIbJKSkvDCCy+YtDk4OMDFxQW//vorAECn02HKlClYunQpfH19LVEmEQkcAyoREZmNt7c3EhMTTdpUKhVu374Nd3d3AMDnn38OOzs7zJ071xIlElEtwIBKRERm89prr+Hq1avYsGEDiouLkZWVhdDQUHTv3h1NmjRBVlYWPvroI2zZsgW2traWLtfqrVq1Crm5ucbXPXv2hFgsNk7Tp0+3YHVkzRhQiYjIbKRSKeLi4nDo0CE0btwY3bp1g5ubG6KiogAAZWVlWLlyJfz9/S1cKV2/fh1LliwxaUtOToZKpYJWq4VWq8WGDRssVB1ZO95miojIyjV/77D5V9pxFtw6zgIAxAKI/ejMHzrdEVkd7/kf91YEVtu664ry8nKMGTMGWq3W2JaVlQVXV1eIxYwGZHn8V0hERGRlFi1ahGeffRZFRUXGtlu3bsHPz8+CVRH9Px7iJyIisiJnzpxBdHR0pcP3N2/exNmzZ9GwYUO0bt0aO3bssFCFRBxBJSIishoKhQLjxo3D1q1b4ezsbNIXEBCAM2fOwMfHB9evX8fIkSPh7u6OAQMGWKhasmYcQSUiIrISs2bNwrBhw/DSSy9V6uvVqxfatm0Le3t7BAQEYM2aNdi0aVPNF0kEBlQiIiKrcPDgQezatQvr1q2DVCqFVCpFWloamjZtitjY2Erze3t7IzMz0wKVEjGgEhERWYWgoCBoNBqoVCrj1KxZM2RmZiInJwdffPGFyfxnz55Fq1atLFQtVejTpw9sbGywevVqS5dSo3gOKhERkZULCAjAwIED0bp1a3Tr1g3Hjh3DkiVL8MMPP1i6NKt38uRJLF261NJl1DgGVCIiIivXoUMHbN68GdOmTcPdu3fRqlUrbNmyBR06dLB0aWSlGFCJiIgErFoepFDhzQ0IWH3pPy/EwGufwB2AAsDMc8DMc+Z9bz5Egf4qnoNKRERERILCgEpEREREgiKogKpQKDBq1Cg4OjrCw8MDK1eutHRJRERERFTDBHUO6qRJkyCVSpGVlYXCwkKEhITAxcUFkydPtnRpRERERFRDBBNQs7KycPLkSaSlpUEqlcLZ2RlRUVF45ZVXGFCJiIiIrIhgDvGfO3cOAwYMgFQqNba1adMGjo6OSE5OtmBlRERERFSTBDOCmp6eDh8fn0rtfn5+SElJga+vr0m7Wq2GWq02vi4qKgLw6DzW6qRXl1Xr+mtadf+8alpd2j/cN8JWl/YP942w1aX9U9f2TU1Rq9UQi8V14udXsQ0Gg+FP5xNMQFUqlZDL5ZXaZTIZlEplpfbly5fjH//4R6V2T0/PaqmvrnL+p6UroMfhvhE27h/h4r4RLu6b/82SJUssXYLZFBcXw9nZ+bH9ggmoMpkMZWWVvyXm5eVVGVwXLlyIefPmGV/r9Xrk5+fD1dUVIpGoWmutbgqFAp6ensjIyICTk5Oly6H/wv0jXNw3wsV9I2zcP8JV1/aNwWBAcXExPDw8/nQ+wQRULy8vHD16tFJ7amoqvL29K7VLJBJIJBKTNhcXl+oqzyKcnJzqxD/Guor7R7i4
b4SL+0bYuH+Eqy7tmz8bOa0gmIukevbsiePHj0Or1Rrbfv31V6jV6irPTSUiIiKiukkwAbVJkybo0aMH5s6di5KSEmRkZGDixIlYvHixpUsjIiIiohokmIAKAF988QXy8/Ph7u6Orl27YujQoQgPD7d0WTVOIpHggw8+qHQKAwkD949wcd8IF/eNsHH/CJe17huR4UnX+RMRERER1SBBjaASERERETGgEhEREZGgMKASERERkaAwoBIRERGRoDCgEhEREZGgMKASERERkaAwoBL9TSUlJXBycjJ56tmRI0cwatQolJeXW7AyIiKqizIyMmBtdwVlQBWg0tJStG/f3iQAnTp1CosWLTJpI8uQyWQoKyuDTqdDZGQkAKBevXrYs2cPA2oNKCoqwuzZsx/b7+DggIcPHxpfl5aW1kRZ9BdotVqEhYWhuLgYAHDjxg3cvHnTwlVZtyNHjuDUqVM4c+ZMpSkhIQHJycmWLtEqFBQUYOjQoY/tb9u2LfLz842vs7KyaqIsi+KN+gVIq9VCKpVCqVRi9+7dCA0NxalTp9CvXz8UFRXB0dHR0iVahZKSEvj5+Rmf3mEwGODq6oqffvoJ9erVMz71LC8vD//+97/RqVMnKBQKODg4WLjyuk2hUKBdu3bIyMjAl19+ibKyMohEIshkMkycOBENGzZEeno65HI5cnJy0KlTJ5w/fx7NmjWzdOlWQa1W41//+hdkMhlsbW1RVlaG4OBg9OvXD1evXoWLiwsKCwvh6OiIt956C+fPn0dycjLs7OwsXbpVSU9Ph5eXF5555hl06dIFBoMBp0+fxosvvoiff/4Z/v7+sLe3R0JCAm7cuAEvLy9Ll1ynFRcXo127dkhPT8cHH3wAhUIBkUiEevXq4R//+Afc3NyQlpYGmUyGe/fuoV27drh58yaaN29u6dKrDUdQLai0tBQ9e/ZE37590bdvX/Tp0wdDhgyBWCyGVCqFwWDA3LlzAQAuLi4AAJFIZMGKrYtMJoNUKsWOHTsgEomwc+dOlJWVAXj06Dk7OzuIxWIAgL29vbGdqpdEIoFUKgUArFmzBr/++itu3LiBlStXAng0giqXywEA0dHRaNy4McNpDTIYDJg3bx527dqF7du3Y968eVCr1UhJSYFUKoVEIoFEIkFsbCz279+Pr776iuHUArp3746dO3dCLpfj0KFDiI2NRdOmTREbG4tu3bph27ZtiIuLw/jx442fe1R9/vi5tnfvXjRs2BCurq7YtWsXgEefaxW/X/bt24eAgIA6HU4BQGzpAqyZRCJBRkYGli9fjoULF2L58uX46KOPAABSqRT29vbGAFTxD7ciCFH1s7W1hYODA3r27Gn808bm0Xe6/w6m5eXlkEqlxnaqPmKx2OTn/+WXXwIA/Pz8jG2vv/46vvzyS+zfvx/vvPOOxWq1RhKJBLa2tjhz5gwAQC6Xm3yhs7GxwdatW7F48WLs3r0b/fv3t3DF1kkmk+HgwYNIS0vDsmXLAAC5ublYtmwZkpKS8Omnn8LJyQlyuRytW7e2cLV1n1gshq2tLQDAzs4OixYtAgDs2LHD2N+5c2fs3r0bMTExeP/99y1Wa03hb1MLEovFcHFxwahRo7By5UqMGjUKy5cvN/YBMI4saDQa2NnZcaTBwioCqlKpRFhYGAoKChAWFoaHDx/imWeesXB11sHW1hY5OTmYNm0asrOzMW3aNAAw+fLQt29fvPDCCygoKMCwYcMsWa7VEYlElY702NrawsbGBosXL4ZGo8G5c+dw+fLlOj8CJEQ//fQTYmNjIZFIsG/fPjRr1swYjEQiEWxtbY1/2tjYQK1WW7hi62BjY4PMzEwMHjwY6enpGDx4MAwGg/H/klgsxgcffIBXXnkF5eXlCAwMtHDF1Y8BVWAqfsmq1WosW7YMJSUlWLZsGbKysuDu7m7h6qjilG0bGxs0adLEeApGVlYWAgICLFmaVZHJZOjZsyeOHDmCnj17wmAw4OTJkwAe/ZKd
M2cOiouLceTIEZ52YQH/fWmDXq+HwWDA3bt3odPpcPDgQfj6+mLRokU8bamGaTQaHDt2DGlpacjMzIRIJEJYWBj0ej22b9+O8ePH49y5c5gxYwY8PT0tXa5VcXV1xaJFizB+/HgsWrQIBoMBEydOhFarhUgkwpAhQ5CcnIzdu3cbs0JdVve3sJap+GDX6/VITk7G66+/juTkZFy7dg3PPfechaujiv1jb2+PyMhIREZGYtmyZSguLsb48eMtXJ31cHZ2xqhRo1C/fn2MGjUKo0ePBgDMmTMHCoUCAPD777/j+vXr+OWXXyxZqtXR6/WVQqdWq4VOp8M333wDe3t7nDp1Cnv37sW4ceMsU6QV6969Oy5cuIDIyEgMGjQIYrEYzz//PHr16gWNRoPu3bsjISEBfn5+eO211/j/pwbJ5XL07NkTjo6O6NmzJ3r16gUAGDNmjPEK/lu3biEtLc14Ck1dxhFUgakIQDKZzHjuCQAMHjzY+EuYaoZOp0NJSQmOHTtm/LPiNl86nQ7nz59HcXExDh48iBYtWmDgwIEWrth63Lt3D127dkVycjK6du1qbHdzc0NBQQGOHj2K06dPY+LEiYiOjkbHjh0tWK11UalU0Gq1sLW1NR6iLC4uhkajQXl5OTQaDVq3bo1Tp06hQ4cO2Lx5MyZNmmTpsq3Kw4cPkZiYiI0bN6J3795VzpOWloaoqCj07dsXx44dQ5cuXWq4SuuTnJwMNzc3FBYWws3NDQaDAQ0bNsTgwYNx+PBhbNq0yTi6feDAgcfuu7qCI6gWpNPpUF5ejjt37hj/rAhAWq0WN27cwO3bt7FixQqoVCoMHz7cwhVbF6VSCZVKhcmTJ8NgMGDy5MnGW0ipVCpkZ2cjNDQU+/btw7x58yxcrXVxd3fHjh074OXlhZ07dyIqKgoGgwHvv/8+PDw8sHjxYkilUgQFBeHcuXOWLteqSCQS4/1Nb9++jRs3bqBZs2ZYt24dysvLodVqUVpaCqlUig0bNuD999/nvWprmI2NDQ4dOmQMON26dUPz5s3h7e1tnHr06IF79+4hOjqa5wrXkJYtWyIjIwNt2rRBZmam8eb8Y8aMQePGjREbGwtXV1e8/PLLuHDhgqXLrXYcQbUglUqFO3fuoE2bNjAYDGjdujWaNm0K4NE5qL/88gsmT54MR0dHXLx40cLVWh9HR0f8/vvvldp1Oh3UajWGDh2KoKAgbN26FQMHDsSoUaPwySef8JzHaqbX6yEWi+Hn52e8oEMkEkGv1wN49OUuISEBP/zwAzw8PPDvf//bwhVbF1tbW7Rr1w7ffvstgoODjXe6mD59OgDg/v37OHXqFGbPno2jR49i/fr1vHdwDZNIJFCr1Th06BD69OkDGxsbvPnmm5g5cyaAR0fyunbtijVr1hhvcUjVS6fTQa/XQyKRQK/X4/Tp0zAYDMbPNY1Gg0OHDuHy5cto2LAhfv31VwtXXP0YUC3IwcEBarXa5Mr88vJy6PV6qNVqjB07FoGBgVi7di26du2KxYsXGy/KIctRKpXQ6XTQarUQi8WYMGEC+vTpg/DwcGi1WgbUalZxmBgA6tevj1dffRUajQaNGzcG8OiD3N7eHkFBQcjNzUVRUREKCgpQv359S5ZtNdLT0zFs2DBcu3YN0dHRKCsrw/Lly423ygMeBaDi4mK89NJL2Lt3rwWrtU56vR7l5eXYuHEjJk+ejOLiYohEIpP7nRYXFyMiIgIA8Pnnn1uqVKtRXl5uvGOCv78/Vq9eDY1Gg/bt2xv79Xo9unfvjry8PJSWluLhw4do1KiRJcuuVgyoFvbft42yt7dHWVmZ8aICV1dXfPjhhwgMDERYWBjGjh0LV1dXC1VLwKOR1eLiYpN7nrZs2RInT57kFck1oKysDIWFhQBQ5YUCxcXFKCsrg5OTEyQSCdatW8enr9Ugd3d3BAQEYM+ePWjZsiV2796NsrIyvP322ybz6XQ6XLp0yThCRDWntLQUEokE8fHx
0Ol0OHHiBNavX4/169cjMDAQQ4YMwT//+U+UlJQgLy/P0uVaBZVKBWdnZwDAN998U6m/pKQEZWVlcHZ2hlQqRWxsbJ0OpwAfdWpxOp0OWVlZ8PT0NIYbvV6P27dvo23btiaBp6yszPiEHCKq2tGjR9G/f3/eM1ggvv/+e8TFxWHdunWWLoX+Q6VS4erVq+jRo4dJ+8mTJ7Fz506sW7eOv2ssSK/XV7qN1K1bt9C6dWuruL1UBQZUC8vIyEDz5s2RnZ1d6dvQhAkT8Oyzz2L27NkWqo6IiIhqSk5ODjw8PPDgwYPHHi1NTU3FoEGDkJSUVMPV1SzrieIC5eDgAIPBUOnb6pQpU7Bjxw7Uq1fPQpURCdvDhw/h5eWFgoKCx86TkZGBl19+uQarogpKpRIxMTEAAC8vL2RkZFi4Ivo7Hjx4gE6dOtX5ECQkmZmZcHR0hMFggKOjIxISEvDTTz/hypUrOHPmDO7evYvJkydDKpXi4cOHli632jGgWphEIoFIJDJe6QoAs2bNQnR0NH788UeEhYVZsDoiYSouLoZMJkNmZiZkMhkyMjKQm5uLhw8fIj09HQ8fPsTHH38MW1tb/PTTT5Yu1yppNBrMmDEDwKNz6xs2bIhz585h4MCBGDx4MAYPHowBAwbg1VdftXCl1ufatWtwcnIy3qWkqKgICxYsQG5urnEemUyGa9eu8bz6GrJmzRp88MEHkEqlEIlEkEgkCAwMxPDhw9G7d2+EhITg9OnT2LNnD5ycnCCTySxdcrVjQLWwivPkKi64iYmJwdatW3HkyBG88MILliyNSJC2b9+Od955x/hBLpVK0apVKzRu3Bju7u5o0aIFDh48iOXLl8PJycnk6nGqfjqdDu3atcPrr7+OgoIC9O3bF/fv34dEIoFMJkNBQQFat26Nvn374pdffsH7779v6ZKtjlwuR0lJifH2Xvb29lizZo3JVfwV/2/+OHhC1aesrAw2NjawtbWFra0tAKBp06ZITU1Fhw4dsGrVKsjlcri6ukIikZhcpFtXMaBaWMXTVnbt2gUACAoKwo8//mjydBwi+n937twx3gu14oKB5s2bQ6fToWvXrtiyZQscHBys6oNcSDQaDRYvXox58+bB1dUV7777LurXrw8bGxvIZDK4urqidevW6NixI+zt7StdqEPVr+LLXcUonEwmg8FgMPm/UvH3irBE1att27ZQqVQA/n/g6o9fDiqOtDo6OkIsFlvF51rd38JaYvbs2fj000/h7+8PANiwYYNJv1arRWFhIY4cOWKJ8ogEo3379jh8+DCA//8A/+O9Zys+yOvVq2c1H+RCIpVK0bdvXzRs2BByuRwDBw40nmPPw8XCUBGA/nini4oHXvzxNQCrumrcklq0aIFbt24hKioKer0eUVFRKCwsRFRUFB4+fIizZ8/C0dERCoUCUVFRVrFf+MktACKRCElJSVi8eDG+/PJLtGzZEp07dzaZp+I51kTWztvbG6mpqcZ7oJ45cwZlZWU4c+YMFAoFEhMT4eTkBJVKhYSEBKv4IBcSjUaDZ555BmKxGDqdDnZ2dtDr9dBoNFCpVFAqlcjLy4OjoyN0Oh0yMjLg6elp6bKtio2NDQwGA/r162dsMxgMCAkJqfRlj2qGm5sbUlNTERsbC51Oh9jYWBQVFSE2NhaFhYW4du0a7O3toVAoEBsbaxX7hreZsrCKJw/pdDoAwM6dOzF9+nTExMSYfHgQ0SPp6elo27Yt2rRpg2vXrqFDhw64desW2rZti8TERDRq1AhisRi///472rRpg6KiIty5c8fSZVsNnU6H7Oxs4ykXd+/eha+vL27evImrV69iypQpxlFtg8GAsrIypKenW7hq65KTk4NnnnkG77777mPnMRgMWLVqFTIyMuDh4VGD1Vmn0tJSdO7cGYmJiXB0dERJSQk6dOiAa9euoXv37pg6dSrq16+P999/Hzdu3EDLli1x9+5dS5ddrTiCamFarRbAow91W1tbjB49
Gk5OTggODsYPP/yAbt26WbhCImFp0KABvLy8cOXKFTg6OuLKlSvo0KEDrly5YvJBvmjRIly5cgUtW7a0dMlWxdbWFk2aNIFSqcTmzZvh4OCACRMmGAPr6tWrMWjQIJw7dw59+/a1dLlWyWAwQCQSYfny5X863yeffAKOYdUMmUyGoqIiAP+fCyr+rKDT6VBaWgqDwWAVT2DjsS8Lq3im+B//Ib7++uuYMWMGhg8f/qf3eCSyRjKZzPio04ojDxV/VtDr9SgrK7OaD3IhMhgMGD9+PLKyslBaWoqGDRtCp9Nhzpw5kEgk+Pjjj7Fv3z5Ll2mV/jgwAgAXLlxAenp6pUkkEjGg1hAbGxsolUrj6TDAo3s9jx07Fnfv3sXmzZtRVlaGgoICqNVq4zx1GUdQLaziqj21Wm1y7k9kZCT279+P0NBQHDp0yFLlEQmOra0tlEolDAaD8UM6Pz8fERERyMzMRExMDN58803jB/l/j0JQ9dLr9ejbty8WLFhgbDt79iyioqKg0+mgUqkQFRWFzp07Izw8HG3btkW7du0sWLH1UavVxj/VajV69uxpEkYr/i4SiSp9+aPqUXGNSUUm0Gg0iIyMhJ2dHfr06QOVSoXWrVvD19cXxcXFxn1YlzGgWlh5eTnatm1b6duQra0tIiIicPTo0Sqfy0tkrTQaDfR6PVQqFQwGA3Q6HcaNGweRSISwsDCoVCrjRTolJSVW8UEuJCUlJejYsSNGjBgBW1tblJaWIiMjA9999x0MBgNKSkrw3XffAQCeeeYZfPHFF/j8888tXLV1KSkpMe6LBg0a4LfffjMZIAEe/T9r2bIlv+DVEJFIhIsXLxr3TVlZGSZNmlRpvsuXLyM1NRWlpaUWqLJm8SIpATMYDNBqtSa3AiGyduXl5Th9+jQ6duwINzc3FBYWwsnJqcp509LS0LZtW6v4MBeazMxMhIeHo6ioCJMnT8b48eNx//59dOnSBZmZmQAe7UveCL7mlZaW4tdff0WnTp0e+/slLy8PjRo1wvXr1423P6Tqp1arcfbsWfTu3fux+6a8vBz37t1Dq1atari6msWASkS1jk6nQ1JSEs6ePYvQ0NBKoz8VSktLce7cObz88ss1XCFV+OyzzxAeHg5HR0eoVCokJSXhueees3RZ9AQajQZ37tyBj4/PY/9/EVUnBlQiqnUyMjLQvHlzZGdno1GjRiZ9EyZMwLPPPovZs2dbqDoiIvpf8cRGIqp1HBwcYDAYjE8oqjBlyhTs2LED9erVs1BlVEGn0yElJQUAoFAo4OTkhOzsbJN5MjMzkZGRYYnyrF6DBg0qtYWEhBgfgEFkaQyoRFTrSCQS4yNNK8yaNQvR0dH48ccfERYWZsHqCHgUSn19faHVaiGXy1FSUmLyxeGrr76Cv78/li5darkirZiLiwsMBoPxIQlKpRJHjhyxitsXUe3Aq/iJqNapuHig4olEMTEx2Lp1K44fP46uXbtasjT6D0dHR4jFYuM+srGxgUQiQWFhIcLCwnDkyBH4+Phg48aNFq7U+hQXF6OsrAzNmjVDu3btcPToUezZswdNmjRBp06dEBQUBFtbWxgMBri4uGDr1q2WLpmsEEdQiajWqfjluWvXLgBAUFAQfvzxR4ZTgSgrK4OdnZ3JCLeNjQ3EYjEyMjKg1Wpx4cIFFBQU8AKcGpaTk4OOHTsCAOLj43H06FEUFBQgIiIC4eHhEIlEOHv2LIKCgnDp0iUEBQVZuGKyVrxIiohqHZ1OBzs7O9SvXx/NmjV77G1wtFotCgsLceTIkRqu0HoZDAa4u7ujWbNm+OWXX4xh6Oeff8aPP/6Ipk2bwtfXFwAglUpRUFAAmUxmyZKtSklJCXbs2IHly5dj3LhxmD9/PoYMGYIePXrg8OHD2LdvH/r06YP09HS0adMGt2/ftnTJZKU4gkpEtZJIJEJSUhK6du2KXbt24eLFi9BoNCZTeXm58QktVDM0Gg3ee+89TJo0CTY2Npg+fTqm
TZsGAPjnP/9pDEIA4O7ujt9++82S5VodR0dHTJ06Fbt370ZycjJatGiBV199FZGRkTAYDBzRJsHgCCoR1TparRYSicT4GMadO3di+vTpiImJQb9+/SxcHQGP9pGDgwPUajUMBgPEYjHKysoQHR2NJUuWIDAwEOfOncP777+PkJAQS5drdXx9fREXF4eRI0fC19cXe/bsQefOnREbG4vu3btzBJUsjiOoRFTrVDx+sSKgjh49Gjt27EBwcDAuXbpkydLoP4qKiqDRaKBWq41Xhms0GowaNQq3bt2CWCzGtWvXcPHiRQtXap2kUimys7OxatUqKBQKLFy4EAB4xIEEgwGViGqdisDzx+eEv/7665gxYwaGDx+OgoICS5VG/yGTyZCWlgaJRAKlUgkPDw+UlZUBeBSO1q5di127dmHq1KkWrtT6HDt2DAAwdepUJCYmYt++fYiOjkaXLl3g4OAApVKJQ4cOoaSkBLGxsRaulqwVD/ETUa3z8OFDuLu7o6CgAE5OTsZ2nU6Htm3bws/PD4cOHbJghTR27Fjk5eXh8OHDSEhIgJ+fHxwcHPDPf/4Tw4cPr/PPEReqsrIyNG7cGCqVCg8fPoSLiwsA4PDhw5gyZQrOnz+PkSNHws7ODjqdDmVlZfjpp58sWzRZJY6gElGtU15ejrZt21a6qbitrS0iIiLg5OQEvV5voero4MGDiI+Px/r16wEAM2bMwLfffoubN28iISEB7du3R9u2bbFgwQJ8//33Fq7Wusjlcty5cwfDhw9Hp06dcOPGDQBAYGAgWrdujY0bN+Ls2bM4efIkzpw5w3BKFsMRVCKqUwwGA7RarfFm/lTztFotEhMT4e/vj+vXr+O1115DSkoKbG1tAQB5eXnYsmUL1q9fj7CwMHzwwQcWrtg6/etf/8KOHTtw/vx5AI/uizp27Fjcv3/fuK+ILIUBlYiIqtVvv/0GHx+fSu0ajQa2trawseHBPEvJz89HgwYNjK9//vlndO7c2YIVET3CgEpERGaj0+nwz3/+8y+NYNvY2KB9+/bo3bt3DVRGFUpKSnDy5EnUq1fvT78cVNxPeNCgQTVYHdEjDKhERGRWYrEYXbt2feJN37Ozs5GVlYWCggIeUq5B2dnZ8PDwgKur65/Ol5+fj1atWvFeqGQRDKhERGRWcrkcv/32Gzw8PP50vqSkJLRp0waJiYm8qr8GFRUVoX79+igpKYFcLq9yHpVKBblczosNyWLEli6AiIjqlj+OhqrVavzrX/+CSCSCSCRCxZjI9OnT0aBBA1y4cIHhtIaJxWKIRCIAjw73f/PNN8bRbpVKhbFjxxr3F5GlcASViIjMysHBAcnJyfDw8IBarYZMJsPo0aNha2sLg8GAHTt2QK1WQyzmGIklVIyOlpSUQKfTwcXFBZMnT4bBYMCWLVtQVFQEkUgEuVxufFobUU1jQCUiIrOyt7fHjRs34OfnB4PBAFtbW+PhZK1WC3t7ex46tqCKLw0lJSWQSqWQSCTGewrXq1cPxcXFUKvVDKhkUby3BxERmY1Wq0Xjxo2hVCoBwHiouOJwMQ8dC4uNjY3J/vjjaRhElsTjK0REZDZisRgZGRnGEdLy8nIYDAYsWLAAdnZ2xnatVstD/BbyxwCqUqmg0+nw1ltvwWAwQK1WQ6lU8q4KZHH8dCAiIrMpKSnBP/7xDxw/fhxXrlyBTqfDkiVLIJFIYGNjA71ej8WLF/MQvwXpdDrjqKnBYMD69euNV/P36dOHD04gQeA5qEREZDaTJ0/G6dOnMX/+fIwaNQpSqdTSJdF/KSoqQoMGDXDr1i3IZLIq51Gr1WjdujXu3bsHT0/PGq6QiAGViIjMqKCgAPXq1YNYLMbly5fxyiuvwMHB4U+XSU9Pr6HqCAB+//13NG3a9C+dbyoSiXihFFkED/ETEZHZ1K9f3/j35s2bIyoq6rHnmqrVahQUFNRUafQfbm5uyM3NhaOj45+ea1peXo78/PwarIzo/3EElYiI
iIgEhWdCExEREZGgMKASERERkaAwoBIRERGRoDCgEhEREZGgMKASET3GqVOn8Oqrr/6tZS5evIgDBw489Xtu3LgRoaGhZq3pcZo3b47c3NynWjY0NBS2traYMWNGpb7bt2+jV69ekMlkaNeuHY4cOfK/lkpEVoYBlYjITEpLSzFmzBhotdqnXse+ffswfPhwM1ZVPbZv346vv/66UrtarcbgwYMxcuRIFBUVYdOmTZgwYQJu3LhhgSqJqLZiQCUiMpN58+YhNTX1qZfPycnB9evX8fLLL5uxqpoVExMDf39/TJ8+Hfb29njhhRcQERGBVatWWbo0IqpFGFCJqE6Ki4tD586dIZPJ0KZNG+zZswfvvfceVq9ejaVLl2L16tXYvHkzfH19IZVK0bFjR8TFxRmXf/fdd9GvXz8cOXIEYrEYP//885++3+HDh3Hq1CkMHTq0Ul9MTAyee+45SCQStGjRAqtWraryCT4xMTEIDAyEvb09AODQoUPw8vJCo0aNsG7dOpN5z549i4CAAMjlcvTp06dSMD59+jR69OgBqVQKDw8PvPfeeygvLwcA3Lx5E2KxGGlpaXB3d8fs2bMBAEuXLkX9+vXh5+eHU6dOPfmHXIWEhAQEBQWZtIWEhOD48eNPtT4isk4MqERU53z77beYOnUqli1bhtzcXMTExGDr1q0m54Zu2bIF8fHxOHz4MPLz8/Hhhx9i2rRp2LdvHwBg5cqVOH78OAYPHgytVovOnTs/9v1yc3MxZcoU7NixA3K53KRvx44dmDdvHlatWoWCggLExMQgJiYGc+fOrbSeffv2ISQkxPh61qxZOHbsGC5duoSioiJjqE1KSsL8+fMRFRWFnJwc9OvXD2+++aZxuRMnTuCNN97AnDlzkJubi5MnT+LmzZsYMWIEAKBdu3bQarVo1qwZsrOz8dlnnyExMREHDhxAamoqNm/ejHv37v3tnzvw6LGlPj4+Jm0NGzaESqWCWq1+qnUSkfVhQCWiOkWj0WDOnDn47rvvEBgYCAcHB7Rp0wbfffedSUCSSqXYt28fWrVqBblcjsDAQOzfvx/z5s2DXq//W+85adIkTJw4EV27djVpLy8vx/z583Hw4EG8/PLLkMvl6NixI44ePYp9+/YhKSnJOO/Dhw9x7do1DBw40NhmZ2cHJycneHt7Y/HixRCJRACAtLQ07N27F23btkW9evWwaNEiZGZm4uHDhwCAOXPm4Ouvv8aIESPg6OgIPz8/xMTEIDU1FceOHatyG2xtbWFnZwcHBwe8+OKLGDdu3N/6GVRQKpWVQjoAyGQyKJXKp1onEVkfBlQiqlNu3LgBd3d3BAQEmLRXhNAKISEhlZ5D3qVLFzg4OCA5Ofkvv9+2bdtw//59LFq0qMpamjRpgueee86k3cXFBa+++ipOnz5tbIuJicHgwYMhkUiMbatXr8YLL7yA7du3V6rT09PT+FokEqF58+bIzs5GXl4esrKyTLYVeBR233zzzcceuvf19cXAgQPRrVs3nD9//i9v/3+TyWRQqVSV2vPz86sMrkREVWFAJaI6RafTQSaTVdknFouNf6/qHNAncXd3h1gshlgsxvPPP4/09HS8/fbbuHr1KhwcHCCVSrFjxw689dZbmD9/PvR6vcl7/pGtra3JSG1VV+8HBQXh9OnTOHjwICZMmGBsd3Z2rrQ+Ozs7qNXqv/We/+3DDz/Exo0bMXXqVERFRaG0tNS4vWKxGG+88caf/nwAwMvLq9L5sNnZ2WjQoIHx3FoioidhQCWiOuXZZ59FSkpKpXMoy8vLER8fb3y9f//+SreDunLlCkpLS+Hr61vlurOzs6HVaqHVanHx4kV4eXkhLy8ParUaKpUKKpUKY8aMwe7du7Fq1Sq0b98eaWlp+O2330zWU1JSgsOHD6N3794AHh3ev3r1qsnh/QpNmzZFdHQ0jh49ivz8fACAjc3jP7obNWoEV1dXJCQkmLTrdDrs27cPL7744mOXBYDnn38e
3377LVavXg0HBwfj9mq1Wnz77bd/uiwA9OrVy+TnDADR0dHo37//E5clIqrAgEpEdYpUKsXixYvx2muv4ezZs1CpVEhKSsKIESOgVquNo4tarRYjRozAnTt3UFZWhri4OAwdOhSrV682BkAbGxtoNJqnrkUikeDDDz/Ea6+9hjNnzkClUuH69et49dVXMXjwYLRt2xYA8N1332HQoEGQSqXGZXNycvDuu+8iPz8fFy5cgFarhZOT019637Vr12L06NGIi4tDWVkZfvvtN7z55pto1KgRBg0aZJzvj9t36tQpbNiwAWVlZThy5AiaNWv2VNs8dOhQXL58GVFRUdBoNLhw4QI++eQTzJ8//6nWR0TWqerjQEREtdiMGTMgl8sRHh6Oe/fuwcfHBwsWLICnpyecnZ1RWFiIsWPHwsPDA0OGDMHdu3fRunVrrFu3DkOGDDGup3Xr1rh58yY6dOiAa9euPVUtEyZMgJOTE2bMmIGkpCS4u7tj8uTJeO+994zzREdHY+rUqSbLNWrUCAaDAS1atECDBg2wbdu2xx66/28DBw7Eli1bsGjRIgwbNgwNGjTAW2+9hY8++sh4oRUA9OnTBz4+Pvjxxx/Rvn17fPrpp5g/fz6ee+457Ny586m2VyqVIjY2FpMmTcKkSZPg7e2NzZs3o127dk+1PiKyTiLD05yIRUQkUBqNxnge5X+f89i9e3ds2LABhw4dgqOjI9555x0LVfn/8vLy4OPjg/v375uMoBIRWTMe4ieiOsXOzg43b97EyJEjkZmZCQAoKCjA/Pnz4eDggE6dOlm4QlO5ubn47LPPGE6JiP6AAZWI6px//etfCAgIQL9+/dCoUSP4+/vDzs7O5ElRQuHn54exY8daugwiIkHhIX4iIiIiEhSOoBIRERGRoDCgEhEREZGgMKASERERkaAwoBIRERGRoDCgEhEREZGgMKASERERkaAwoBIRERGRoDCgEhEREZGg/B+ogpbmggvhqAAAAABJRU5ErkJggg==","text/plain":["<Figure size 800x400 with 1 Axes>"]},"metadata":{},"output_type":"display_data"}],"source":["best_run = \"gpt-4o/shots-10\"\n","plot_value_counts(df, best_run)"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["--------------------------------------------------\n","text: 哭的人和死者的关系重要吗 \n","--------------------------------------------------\n","label: 是\n","--------------------------------------------------\n","title: 甄庄哭声\n","--------------------------------------------------\n","puzzle: 在一个安静的夜晚,小村庄的湖边突然传来了阵阵哭泣声。第二天早晨,村长甄锐发现湖边的石头上放着一顶破旧的帽子,但没有人知道这顶帽子是从哪里来的,哭泣声又是为何。请还原故事真相。\n","--------------------------------------------------\n","truth: 
原来,这顶破旧的帽子属于一个小男孩,他小时候与爷爷在湖边生活。爷爷教他钓鱼、游泳,还告诉他湖中的海龟是他们的朋友。后来,小男孩随父母去了城市生活,但每年夏天都会回到村子探望爷爷。然而,去年夏天,爷爷因病去世,小男孩伤心欲绝。今年夏天,他回到村子,来到湖边,想起和爷爷的美好回忆,忍不住哭泣。他将爷爷的帽子放在湖边的石头上,希望能让爷爷的在天之灵得到安慰。那晚的哭泣声正是小男孩在祭莫他亲爱的爷爷。\n","--------------------------------------------------\n","gpt-4o-mini/shots-00: 不重要\n","--------------------------------------------------\n","gpt-4o-mini/shots-05: 不重要\n","--------------------------------------------------\n","gpt-4o-mini/shots-10: 不重要\n","--------------------------------------------------\n","gpt-4o-mini/shots-20: 不重要\n","--------------------------------------------------\n","gpt-4o-mini/shots-30: 不重要\n","--------------------------------------------------\n","gpt-4o-mini/shots-40: 不重要\n","--------------------------------------------------\n","gpt-4o-mini/shots-50: 不重要\n","--------------------------------------------------\n","gpt-4o/shots-00: 是。\n","--------------------------------------------------\n","gpt-4o/shots-05: 不重要\n","--------------------------------------------------\n","gpt-4o/shots-10: 重要\n","--------------------------------------------------\n","gpt-4o/shots-20: 不重要\n","--------------------------------------------------\n","gpt-4o/shots-30: 不重要\n","--------------------------------------------------\n","gpt-4o/shots-40: 不重要\n","--------------------------------------------------\n","gpt-4o/shots-50: 重要\n"]}],"source":["from llm_toolkit.llm_utils import print_row_details\n","\n","invalid_rows = df[\"重要\" == df[best_run]]\n","print_row_details(invalid_rows)"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","num_shots: 10\n","labels: ['不是' '不重要' '是' '问法错误' 
'回答正确']\n","P2_few_shot: 你是一个情景猜谜游戏的主持人。游戏规则如下:\n","\n","1. 参与者会得到一个谜面,谜面会描述一��简单又难以理解的事件。\n","2. 主持人知道谜底,谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n"," - 若谜面和谜底能找到问题的答案,回答:是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n"," - 若参与者提问基本还原了谜底真相,回答:回答正确\n","5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","示例输入和输出: \n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 偷的人信神吗\n","回答: 不是\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 村庄里的人喜欢南瓜嘛\n","回答: 不重要\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 是村里的人偷的么\n","回答: 是\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 挖地道\n","回答: 问法错误\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 鸟觅食时发现甄家大院有西瓜,飞入大院一颗一颗把西瓜带走\n","回答: 回答正确\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 
真���是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人身亡吗?\n","回答: 不是\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人跟甄大勇有仇吗\n","回答: 不重要\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 他仅仅是在修钟楼吗\n","回答: 是\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 是自然意外还是人为意外\n","回答: 问法错误\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 因为甄在钟楼里维修然后昏迷了导致钟楼停止报时\n","回答: 回答正确\n","\n","\n","谜面: {}\n","谜底: {}\n","参与者提出的问题: {}\n","回答: \n","\n"]}],"source":["datasets = load_logical_reasoning_dataset(data_path)\n","\n","prompt = get_few_shot_prompt_template(10, datasets[\"train\"].to_pandas(), debug=True)"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["你是一个情景猜谜游戏的主持人。游戏规则如下:\n","\n","1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底,谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 
对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n"," - 若谜面和谜底能找到问题的答案,回答:是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n"," - 若参与者提问基本还原了谜底真相,回答:回答正确\n","5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提���的问题。\n","\n","示例输入和输出: \n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 偷的人信神吗\n","回答: 不是\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 村庄里的人喜欢南瓜嘛\n","回答: 不重要\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 是村里的人偷的么\n","回答: 是\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 挖地道\n","回答: 问法错误\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 鸟觅食时发现甄家大院有西瓜,飞入大院一颗一颗把西瓜带走\n","回答: 回答正确\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人身亡吗?\n","回答: 不是\n","\n","谜面: 
在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是���迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人跟甄大勇有仇吗\n","回答: 不重要\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 他仅仅是在修钟楼吗\n","回答: 是\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 是自然意外还是人为意外\n","回答: 问法错误\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 因为甄在钟楼里维修然后昏迷了导致钟楼停止报时\n","回答: 回答正确\n","\n","\n","谜面: 在一个安静的夜晚,小村庄的湖边突然传来了阵阵哭泣声。第二天早晨,村长甄锐发现湖边的石头上放着一顶破旧的帽子,但没有人知道这顶帽子是从哪里来的,哭泣声又是为何。请还原故事真相。\n","谜底: 原来,这顶破旧的帽子属于一个小男孩,他小时候与爷爷在湖边生活。爷爷教他钓鱼、游泳,还告诉他湖中的海龟是他们的朋友。后来,小男孩随父母去了城市生活,但每年夏天都会回到村子探望爷爷。然而,去年夏天,爷爷因病去世,小男孩伤心欲绝。今年夏天,他回到村子,来到湖边,想起和爷爷的美好回忆,忍不住哭泣。他将爷爷的帽子放在湖边的石头上,希望能让爷爷的在天之灵得到安慰。那晚的哭泣声正是小男孩在祭莫他亲爱的爷爷。\n","参与者提出的问题: 哭的人和死者的关系重要吗 \n","回答: \n","\n"]}],"source":["row = invalid_rows.iloc[0]\n","print(prompt.format(row[\"puzzle\"], row[\"truth\"], row[\"text\"]))"]},{"cell_type":"code","execution_count":35,"metadata":{},"outputs":[{"data":{"text/plain":["'不重要'"]},"execution_count":35,"metadata":{},"output_type":"execute_result"}],"source":["%%time\n","\n","reasoning_with_openai(row, prompt, max_tokens=max_new_tokens, 
model=\"gpt-4o\")"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["CPU times: user 29 ms, sys: 14.9 ms, total: 43.9 ms\n","Wall time: 761 ms\n"]},{"data":{"text/plain":["'不重要'"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["%%time\n","\n","reasoning_with_openai(row, prompt, max_tokens=max_new_tokens, model=\"gpt-4o\", temperature=1)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["CPU times: user 19.2 ms, sys: 3.09 ms, total: 22.3 ms\n","Wall time: 632 ms\n"]},{"data":{"text/plain":["'不重要'"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["%%time\n","\n","reasoning_with_openai(row, prompt, max_tokens=max_new_tokens, model=\"gpt-4o-mini\")"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["CPU times: user 18.2 ms, sys: 2.49 ms, total: 20.7 ms\n","Wall time: 1.97 s\n"]},{"data":{"text/plain":["'不重要'"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["%%time\n","\n","reasoning_with_openai(\n"," row, prompt, max_tokens=max_new_tokens, model=\"gpt-4o-mini\", temperature=1\n",")"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["CPU times: user 21.3 ms, sys: 5 ms, total: 26.3 ms\n","Wall time: 10.7 s\n"]},{"data":{"text/plain":["'是'"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["%%time\n","\n","reasoning_with_openai(\n"," row,\n"," prompt,\n"," max_tokens=None,\n"," model=\"o1-preview\",\n"," temperature=1,\n"," using_system_prompt=False,\n",")"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["CPU times: user 17.1 ms, sys: 2.2 ms, total: 19.3 ms\n","Wall time: 4.49 
s\n"]},{"data":{"text/plain":["'是'"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["%%time\n","\n","reasoning_with_openai(\n"," row,\n"," prompt,\n"," max_tokens=None,\n"," model=\"o1-mini\",\n"," temperature=1,\n"," using_system_prompt=False,\n",")"]},{"cell_type":"markdown","metadata":{},"source":["## Run Batch Inference"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/code/engd/projects/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n","loading existing data from: datasets/mgtv/o1-mini.jsonl\n"]}],"source":["df_batch = load_openai_batch_data(data_path)"]},{"cell_type":"code","execution_count":20,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["--------------------------------------------------\n","custom_id: request-1\n","--------------------------------------------------\n","method: POST\n","--------------------------------------------------\n","url: /v1/chat/completions\n","--------------------------------------------------\n","body: {'model': 'o1-mini', 'messages': [{'role': 'user', 'content': '你是一个情景猜谜游戏的主持人。游戏规则如下:\\n\\n1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\\n2. 主持人知道谜底,谜底是谜面的答案。\\n3. 参与者可以询问任何封闭式问题来找寻事件的真相。\\n4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\\n - 若谜面和谜底能找到问题的答案,回答:是或者不是\\n - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\\n - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\\n - 若参与者提问基本还原了谜底真相,回答:回答正确\\n5. 
回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\\n\\n请严格按照这些规则回答参与者提出的问题。\\n\\n示例输入和输出: \\n谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\\n谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\\n参与者提出的问题: 偷的人信神吗\\n回答: 不是\\n\\n谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\\n谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\\n参与者提出的问题: 村庄里的人喜欢南瓜嘛\\n回答: 不重要\\n\\n谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\\n谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结��。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\\n参与者提出的问题: 是村里的人偷的么\\n回答: 是\\n\\n谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\\n谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\\n参与者提出的问题: 挖地道\\n回答: 问法错误\\n\\n谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\\n谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\\n参与者提出的问题: 鸟觅食时发现甄家大院有西瓜,飞入大院一颗一颗把西瓜带走\\n回答: 回答正确\\n\\n谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 有人身亡吗?\\n回答: 不是\\n\\n谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 有人跟甄大勇有仇吗\\n回答: 不重要\\n\\n谜面: 
在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 他仅仅是在修钟楼吗\\n回答: 是\\n\\n谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 是自然意外还是人为意外\\n回答: 问法错误\\n\\n谜面: 在一个安���的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 因为甄在钟楼里维修然后昏迷了导致钟楼停止报时\\n回答: 回答正确\\n\\n\\n谜面: 在远离城市喧嚣的海边小屋,一天清晨,邻居发现甄加索僵卧在沙滩上,已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么?\\n谜底: 甄加索是一位热爱自然的画家,他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天,他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上,他骑着自行车外出,打算在海边观赏夜景。然而,他在沙滩上意外发现了一只搁浅的海豚,为了救助这只海豚,他耗费了极大的体力,最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了,由于他患有严重的心脏病,却未告知旁人,在寒冷的海风中,他的心脏停止了跳动。因此,警方在现场只发现了车轮痕迹和未完成的画作,而没有发现任何他杀的迹象。\\n参与者提出的问题: 甄加索是自杀吗\\n回答: \\n'}]}\n"]}],"source":["from llm_toolkit.llm_utils import print_row_details\n","\n","print_row_details(df_batch)"]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[],"source":["from openai import OpenAI\n","\n","client = OpenAI()\n","\n","batch_input_file = client.files.create(\n"," file=open(\"datasets/mgtv/o1-mini.jsonl\", \"rb\"), purpose=\"batch\"\n",")"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"data":{"text/plain":["FileObject(id='file-kMFciebTZVHFQtiORp7tEqWN', bytes=57923228, created_at=1726205589, filename='o1-mini.jsonl', object='file', purpose='batch', status='processed', 
status_details=None)"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["batch_input_file"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[{"data":{"text/plain":["Batch(id='batch_QPokplhBDjc980UBURltmmbm', completion_window='24h', created_at=1726205603, endpoint='/v1/chat/completions', input_file_id='file-kMFciebTZVHFQtiORp7tEqWN', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1726292003, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'nightly eval job - o1-mini'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["batch_input_file_id = batch_input_file.id\n","\n","client.batches.create(\n"," input_file_id=batch_input_file_id,\n"," endpoint=\"/v1/chat/completions\",\n"," completion_window=\"24h\",\n"," metadata={\n"," \"description\": \"nightly eval job - o1-mini\",\n"," }\n",")"]},{"cell_type":"code","execution_count":25,"metadata":{},"outputs":[{"data":{"text/plain":["Batch(id='batch_QPokplhBDjc980UBURltmmbm', completion_window='24h', created_at=1726205603, endpoint='/v1/chat/completions', input_file_id='file-kMFciebTZVHFQtiORp7tEqWN', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='model_not_found', line=1, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=2, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=3, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=4, message=\"The provided model 
'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=5, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=6, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=7, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=8, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=9, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=10, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=11, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=12, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=13, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=14, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=15, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=16, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=17, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=18, message=\"The provided model 'o1-mini' is not 
supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=19, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model'), BatchError(code='model_not_found', line=20, message=\"The provided model 'o1-mini' is not supported by the Batch API.\", param='body.model')], object='list'), expired_at=None, expires_at=1726292003, failed_at=1726205604, finalizing_at=None, in_progress_at=None, metadata={'description': 'nightly eval job - o1-mini'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["from openai import OpenAI\n","\n","client = OpenAI()\n","\n","client.batches.retrieve(\"batch_QPokplhBDjc980UBURltmmbm\")"]},{"cell_type":"markdown","metadata":{},"source":["## Run Completion Endpoints"]},{"cell_type":"code","execution_count":27,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/code/engd/projects/logical-reasoning/.env\n","Adding /Users/inflaton/code/engd/projects/logical-reasoning to sys.path\n","internlm/internlm2_5-7b-chat-1m datasets/mgtv data/openai_results.csv 2048\n"]}],"source":["from llm_toolkit.eval_openai import evaluate_model_with_num_shots"]},{"cell_type":"code","execution_count":44,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n"]}],"source":["datasets = load_logical_reasoning_dataset(data_path)"]},{"cell_type":"code","execution_count":45,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Evaluating model: 
o1-mini\n","--------------------------------------------------\n","text: 甄加索是自杀吗\n","--------------------------------------------------\n","label: 不是\n","--------------------------------------------------\n","answer: nan\n","--------------------------------------------------\n","title: 海岸之谜\n","--------------------------------------------------\n","puzzle: 在远离城市喧嚣的海边小屋,一天清晨,邻居发现甄加索僵卧在沙滩上,已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么?\n","--------------------------------------------------\n","truth: 甄加索是一位热爱自然的画家,他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天,他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上,他骑着自行车外出,打算在海边观赏夜景。然而,他在沙滩上意外发现了一只搁浅的海豚,为了救助这只海豚,他耗费了极大的体力,最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了,由于他患有严重的心脏病,却未告知旁人,在寒冷的海风中,他的心脏停止了跳动。因此,警方在现场只发现了车轮痕迹和未完成的画作,而没有发现任何他杀的迹象。\n","*** Evaluating with num_shots: 10\n","user_prompt: 你是一个情景猜谜游戏的主持人。游戏规则如下:\n","\n","1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底,谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n"," - 若谜面和谜底能找到问题的答案,回答:是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n"," - 若参与者提问基本还原了谜底真相,回答:回答正确\n","5. 
回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","示例输入和输出: \n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 偷的人信神吗\n","回答: 不是\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 村庄里的人喜欢南瓜嘛\n","回答: 不重要\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 是村里的人偷的么\n","回答: 是\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 挖地道\n","回答: 问法错误\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 鸟觅食时发现甄家大院有西瓜,飞入大院一颗一颗把西瓜带走\n","回答: 回答正确\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面��人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人身亡吗?\n","回答: 不是\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 
真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人跟甄大勇有仇吗\n","回答: 不重要\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 他仅仅是在修钟楼吗\n","回答: 是\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 是自然意外还是人为意外\n","回答: 问法错误\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 因为甄在钟楼里维修然后昏迷了导致钟楼停止报时\n","回答: 回答正确\n","\n","\n","谜面: {}\n","谜底: {}\n","参与者提出的问题: {}\n","回答: \n","\n"]},{"name":"stderr","output_type":"stream","text":[" 45%|████▌ | 1353/3000 [3:26:20<5:21:36, 11.72s/it] "]}],"source":["%%time\n","\n","model_name =\"o1-mini\"\n","\n","evaluate_model_with_num_shots(\n"," model_name,\n"," datasets,\n"," results_path=results_path,\n"," range_num_shots=[10],\n"," max_new_tokens=max_new_tokens,\n",")"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 
3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
|
notebooks/04d_OpenAI-batch.ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","if \"workding_dir\" not in locals():\n"," try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n"," except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/code/engd/projects/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/code/engd/projects/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from 
dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["internlm/internlm2_5-7b-chat-1m datasets/mgtv data/openai_results.csv 2048\n"]}],"source":["import os\n","\n","model_name = os.getenv(\"MODEL_NAME\")\n","data_path = os.getenv(\"LOGICAL_REASONING_DATA_PATH\")\n","results_path = os.getenv(\"LOGICAL_REASONING_RESULTS_PATH\")\n","max_new_tokens = int(os.getenv(\"MAX_NEW_TOKENS\", 2048))\n","\n","print(model_name, data_path, results_path, max_new_tokens)"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/code/engd/projects/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n"]}],"source":["from llm_toolkit.logical_reasoning_utils import *"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","num_shots: 10\n","labels: ['不是' '不重要' '是' '问法错误' '回答正确']\n","P2_few_shot: 你是一个情景猜谜游戏的主持人。游戏规则如下:\n","\n","1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底,谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n"," - 若谜面和谜底能找到问题的答案,回答:是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n"," - 若参与者提问基本还原了谜底真相,回答:回答正确\n","5. 
回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","示例输入和输出: \n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 偷的人信神吗\n","回答: 不是\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 村庄里的人喜欢南瓜嘛\n","回答: 不重要\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 是村里的人偷的么\n","回答: 是\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 挖地道\n","回答: 问法错误\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 鸟觅食时发现甄家大院有西瓜,飞入大院一颗一颗把西瓜带走\n","回答: 回答正确\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人身亡吗?\n","回答: 不是\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 
真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人跟甄大勇有仇吗\n","回答: 不重要\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 他仅仅是在修钟楼吗\n","回答: 是\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 是自然意外还是人为意外\n","回答: 问法错误\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 因为甄在钟楼里维修然后昏迷了导致钟楼停止报时\n","回答: 回答正确\n","\n","\n","谜面: {}\n","谜底: {}\n","参与者提出的问题: {}\n","回答: \n","\n"]}],"source":["datasets = load_logical_reasoning_dataset(data_path)\n","\n","prompt = get_few_shot_prompt_template(10, datasets[\"train\"].to_pandas(), debug=True)"]},{"cell_type":"markdown","metadata":{},"source":["## Run Batch Inference"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","num_shots: 10\n","labels: ['不是' '不重要' '是' '问法错误' '回答正确']\n","P2_few_shot: 你是一个情景猜谜游戏的主持人。游戏规则如下:\n","\n","1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底,谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 
对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n"," - 若谜面和谜底能找到问题的答案,回答:是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n"," - 若参与者提问基本还原了谜底真相,回答:回答正确\n","5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","示例输入和输出: \n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 偷的人信神吗\n","回答: 不是\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 村庄里的人喜欢南瓜嘛\n","回答: 不重要\n","\n","谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n","谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n","参与者提出的问题: 是村里的人偷的么\n","回答: 是\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 挖地道\n","回答: 问法错误\n","\n","谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\n","谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\n","参与者提出的问题: 鸟觅食时发现甄家大院有西瓜,飞入大院一颗一颗把西瓜带走\n","回答: 回答正确\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人身亡吗?\n","回答: 不是\n","\n","谜面: 
在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 有人跟甄大勇有仇吗\n","回答: 不重要\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 他仅仅是在修钟楼吗\n","回答: 是\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 是自然意外还是人为意外\n","回答: 问法错误\n","\n","谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\n","谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\n","参与者提出的问题: 因为甄在钟楼里维修然后昏迷了导致钟楼停止报时\n","回答: 回答正确\n","\n","\n","谜面: {}\n","谜底: {}\n","参与者提出的问题: {}\n","回答: \n","\n"]}],"source":["df_batch = load_openai_batch_data(data_path, model=\"gpt-4o-mini\")"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["--------------------------------------------------\n","custom_id: request-1\n","--------------------------------------------------\n","method: POST\n","--------------------------------------------------\n","url: /v1/chat/completions\n","--------------------------------------------------\n","body: {'model': 'gpt-4o-mini', 'messages': [{'role': 'user', 'content': '你是一个情景猜谜游戏的主持人。游戏规则如下:\\n\\n1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\\n2. 主持人知道谜底,谜底是谜面的答案。\\n3. 参与者可以询问任何封闭式问题来找寻事件的真相。\\n4. 
对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\\n - 若谜面和谜底能找到问题的答案,回答:是或者不是\\n - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\\n - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\\n - 若参与者提问基本还原了谜底真相,回答:回答正确\\n5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\\n\\n请严格按照这些规则回答参与者提出的问题。\\n\\n示例输入和输出: \\n谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\\n谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\\n参与者提出的问题: 偷的人信神吗\\n回答: 不是\\n\\n谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\\n谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\\n参与者提出的问题: 村庄里的人喜欢南瓜嘛\\n回答: 不重要\\n\\n谜面: 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\\n谜底: 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\\n参与者提出的问题: 是村里的人偷的么\\n回答: 是\\n\\n谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\\n谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\\n参与者提出的问题: 挖地道\\n回答: 问法错误\\n\\n谜面: 在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗?\\n谜底: 原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!”\\n参与者提出的问题: 鸟觅食时发现甄家大院有西瓜,飞入大院一颗一颗把西瓜带走\\n回答: 回答正确\\n\\n谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 有人身亡吗?\\n回答: 不是\\n\\n谜面: 
在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 有人跟甄大勇有仇吗\\n回答: 不重要\\n\\n谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 他仅仅是在修钟楼吗\\n回答: 是\\n\\n谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 是自然意外还是人为意外\\n回答: 问法错误\\n\\n谜面: 在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。\\n谜底: 真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。\\n参与者提出的问题: 因为甄在钟楼里维修然后昏迷了导致钟楼停止报时\\n回答: 回答正确\\n\\n\\n谜面: 在远离城市喧嚣的海边小屋,一天清晨,邻居发现甄加索僵卧在沙滩上,已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么?\\n谜底: 甄加索是一位热爱自然的画家,他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天,他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上,他骑着自行车外出,打算在海边观赏夜景。然而,他在沙滩上意外发现了一只搁浅的海豚,为了救助这只海豚,他耗费了极大的体力,最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了,由于他患有严重的心脏病,却未告知旁人,在寒冷的海风中,他的心脏停止了跳动。因此,警方在现场只发现了车轮痕迹和未完成的画作,而没有发现任何他杀的迹象。\\n参与者提出的问题: 甄加索是自杀吗\\n回答: \\n'}]}\n"]}],"source":["from llm_toolkit.llm_utils import print_row_details\n","\n","print_row_details(df_batch)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"data":{"text/plain":["FileObject(id='file-dFg7ljzKlmPTy6C0KTs6fYxz', bytes=57935228, created_at=1726210402, filename='gpt-4o-mini.jsonl', object='file', purpose='batch', status='processed', status_details=None)"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["from openai import OpenAI\n","\n","client = 
OpenAI()\n","\n","batch_input_file = client.files.create(\n"," file=open(\"datasets/mgtv/gpt-4o-mini.jsonl\", \"rb\"), purpose=\"batch\"\n",")\n","batch_input_file"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/plain":["Batch(id='batch_xVbfEAMd9Z3TOH2gkCxoe6GF', completion_window='24h', created_at=1726210433, endpoint='/v1/chat/completions', input_file_id='file-dFg7ljzKlmPTy6C0KTs6fYxz', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1726296833, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'nightly eval job - gpt-4o-mini'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["batch_input_file_id = batch_input_file.id\n","\n","client.batches.create(\n"," input_file_id=batch_input_file_id,\n"," endpoint=\"/v1/chat/completions\",\n"," completion_window=\"24h\",\n"," metadata={\n"," \"description\": \"nightly eval job - gpt-4o-mini\",\n"," },\n",")"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/plain":["Batch(id='batch_xVbfEAMd9Z3TOH2gkCxoe6GF', completion_window='24h', created_at=1726210433, endpoint='/v1/chat/completions', input_file_id='file-dFg7ljzKlmPTy6C0KTs6fYxz', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1726211968, error_file_id=None, errors=None, expired_at=None, expires_at=1726296833, failed_at=None, finalizing_at=1726211455, in_progress_at=1726210437, metadata={'description': 'nightly eval job - gpt-4o-mini'}, output_file_id='file-4DyXwheO1GIPHYII08guYDfp', request_counts=BatchRequestCounts(completed=3000, failed=0, total=3000))"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["from openai import OpenAI\n","\n","client = 
OpenAI()\n","\n","client.batches.retrieve(\"batch_xVbfEAMd9Z3TOH2gkCxoe6GF\")"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
|