In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell runs, so edits to local .py files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

# NOTE: the name `workding_dir` (sic) is kept as-is because later cells refer to it.
try:
    # On Google Colab: mount Drive and use the project folder stored there.
    from google.colab import drive

    drive.mount("/content/drive")
    workding_dir = "/content/drive/MyDrive/logical-reasoning/"
except ModuleNotFoundError:
    # Not on Colab: the project root is taken to be the parent of the directory
    # the kernel was started in.
    # A later cell chdir()s into workding_dir, so deriving the value from the
    # cwd a second time would climb one more level up on every re-run of this
    # cell. Compute it only once per kernel session.
    if "workding_dir" not in globals():
        workding_dir = str(Path.cwd().parent)

In [3]:
import os
import sys
from pathlib import Path

# Make the project root both the current directory and importable.
os.chdir(workding_dir)
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("workding dir:", workding_dir)

workding dir: /Users/inflaton/code/engd/projects/logical-reasoning


In [4]:
# Switch for the environment-setup cell that follows: set to True to install
# the requirements and clone/install LLaMA-Factory; leave False to skip it.
need_to_setup_env = False
# Echo the flag so the chosen value is visible in the cell output.
need_to_setup_env

False

In [5]:
if need_to_setup_env:
    %pip install -r requirements.txt
    %cd /content/
    %rm -rf LLaMA-Factory
    !git clone https://github.com/hiyouga/LLaMA-Factory.git
    %cd LLaMA-Factory
    %ls
    %pip install -e .[torch,bitsandbytes]
    
    os.chdir(workding_dir)
    sys.path.append(workding_dir)
    print("workding dir:", workding_dir)

In [6]:
from dotenv import find_dotenv, load_dotenv

# Prefer ".env"; fall back to ".env.example" when the first lookup comes back
# empty (find_dotenv returns an empty string when no file is found).
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
# override=True lets values from the file replace variables that are already
# set in the process environment. The call's return value is the cell output.
load_dotenv(found_dotenv, override=True)

loading env vars from: /Users/inflaton/code/engd/projects/logical-reasoning/.env


True

In [7]:
import os

def env_flag(name):
    """Return True when environment variable `name` is set to "true".

    The comparison ignores letter case and surrounding whitespace, so "True",
    "TRUE" and " true " are accepted as well. An unset variable or any other
    value yields False.
    """
    return (os.getenv(name) or "").strip().lower() == "true"


# Run configuration, all taken from the environment (populated from the dotenv
# file loaded earlier). String settings are None when the variable is unset.
model_name = os.getenv("MODEL_NAME")
adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
load_in_4bit = env_flag("LOAD_IN_4BIT")
data_path = os.getenv("LOGICAL_REASONING_DATA_PATH")
results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")
use_english_datasets = env_flag("USE_ENGLISH_DATASETS")

print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)

THUDM/glm-4-9b-chat-1m None False datasets/mgtv results/mgtv-results_m3.csv


In [8]:
%%time
!python --version
!pip show flash-attn

Python 3.11.9
[0mCPU times: user 3.48 ms, sys: 6.05 ms, total: 9.54 ms
Wall time: 634 ms


In [9]:
# NOTE(review): these wildcard imports are the only visible source of several
# names used by later cells (check_gpu, load_model,
# load_logical_reasoning_dataset, print_row_details, eval_model, save_results,
# calc_metrics and, presumably, torch). Consider importing them explicitly so
# the dependencies are traceable.
from llm_toolkit.llm_utils import *
from llm_toolkit.logical_reasoning_utils import *

# Device handle reused by the generation and evaluation cells further down.
device = check_gpu()

loading /Users/inflaton/code/engd/projects/logical-reasoning/llm_toolkit/logical_reasoning_utils.py
MPS is available


In [10]:
%%time

model, tokenizer = load_model(model_name, adapter_name_or_path=adapter_name_or_path, using_llama_factory=False)

loading model: THUDM/glm-4-9b-chat-1m with adapter: None


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

CPU times: user 1.07 s, sys: 1.28 s, total: 2.35 s
Wall time: 7.08 s


In [11]:
# Load the train/test splits from data_path. Chinese prompts are requested
# unless USE_ENGLISH_DATASETS was set.
# NOTE(review): the tokenizer is passed in, presumably so the loader can render
# model-specific prompt text. Confirm in load_logical_reasoning_dataset.
datasets = load_logical_reasoning_dataset(
    data_path,
    tokenizer=tokenizer,
    chinese_prompt=not use_english_datasets,
    using_p1=False,
)

loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],
        num_rows: 3000
    })
})


In [12]:
# Visual sanity check: convert the test split to a pandas DataFrame and print
# its fields one by one.
print_row_details(datasets["test"].to_pandas())

--------------------------------------------------
text: 甄加索是自杀吗
--------------------------------------------------
label: 不是
--------------------------------------------------
answer: nan
--------------------------------------------------
title: 海岸之谜
--------------------------------------------------
puzzle: 在远离城市喧嚣的海边小屋，一天清晨，邻居发现甄加索僵卧在沙滩上，已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么？
--------------------------------------------------
truth: 甄加索是一位热爱自然的画家，他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天，他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上，他骑着自行车外出，打算在海边观赏夜景。然而，他在沙滩上意外发现了一只搁浅的海豚，为了救助这只海豚，他耗费了极大的体力，最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了，由于他患有严重的心脏病，却未告知旁人，在寒冷的海风中，他的心脏停止了跳动。因此，警方在现场只发现了车轮痕迹和未完成的画作，而没有发现任何他杀的迹象。
--------------------------------------------------
train_text: [gMASK]<sop><|system|>
You are an expert in logical reasoning.<|user|>
你是一个情景猜谜游戏的主持人。游戏规则如下：

1. 参与者会得到一个谜面，谜面会描述一个简单又难以理解的事件。
2. 主持人知道谜底，谜底是谜面的答案。
3. 参与者可以询问任何封闭式问题来找寻事件的真相。
4. 对于每个问题，主持人将根据实际情况回答以下五个选项之一：是、不是、不重要、回答正确、问法错误。各回答的判断标准如下：
   - 若谜面和谜底能找到问题的答案，回答：是或者不是
 

In [13]:
def evaluate_model(model, tokenizer, model_name, dataset, batch_size=1):
    """Run inference over `dataset`, save the predictions and report metrics.

    Relies on the notebook-level globals `device` (where inference runs) and
    `results_path` (passed to `save_results`); both must be defined before
    this function is called.

    Args:
        model: Model object handed to `eval_model`.
        tokenizer: Tokenizer handed to `eval_model`.
        model_name: Name printed in the progress banner and passed to
            `save_results` as its first argument.
        dataset: Dataset to evaluate; its "label" column is compared against
            the predictions.
        batch_size: Forwarded to `eval_model`.

    Returns:
        The value produced by `calc_metrics` for the labels/predictions pair.
        It is also printed, as before; previously it was discarded afterwards.
    """
    print(f"Evaluating model: {model_name} on {device}")
    predictions = eval_model(
        model, tokenizer, dataset, device=device, batch_size=batch_size
    )

    # Persist the raw predictions before scoring, so they survive even if
    # metric computation fails.
    save_results(
        model_name,
        results_path,
        dataset,
        predictions,
        debug=False,
    )

    metrics = calc_metrics(dataset["label"], predictions, debug=False)
    print(metrics)
    return metrics

In [14]:
%%time

prompt1 = datasets["test"]["prompt"][1000]

gen_kwargs = {"max_length": 4096, "do_sample": True, "top_k": 1}
with torch.no_grad():
    inputs = tokenizer(
        [prompt1],
        return_tensors="pt",
    ).to(device)
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

  if eos_token_id is not None and torch.isin(elements=eos_token_id, test_elements=pad_token_id).any():



是
CPU times: user 888 ms, sys: 312 ms, total: 1.2 s
Wall time: 2.29 s


: 

In [15]:
%%time

evaluate_model(model, tokenizer, f"{model_name}{'_' + adapter_name_or_path if adapter_name_or_path else ''}", datasets["test"], batch_size=16)

Evaluating model: THUDM/glm-4-9b-chat-1m on mps


  1%|          | 1/188 [01:42<5:20:47, 102.93s/it]

--------
step 1: 
不是
--------
step 2: 
不是
--------
step 3: 
不是
--------
step 4: 不是
--------
step 5: 不是


 23%|██▎       | 43/188 [3:53:29<17:57:22, 445.81s/it]