Spaces:

inflaton-ai
/

logical-reasoning

Build error

App Files Files Community

dh-mc committed on Jul 22, 2024

Commit

1ec2139

1 Parent(s): df5f0a3

completed comparison of evaluation results using different GPU and precision

Browse files

Files changed (18) hide show

competition/00a_InternLM2.5_Llama3_GLM4_Results.ipynb +0 -0
competition/00b_InternLM_2.5_Perf_vs_RPP.ipynb +0 -0
competition/00c_InternLM_2.5_Perf_vs_RPP_4bit.ipynb +0 -0
competition/09c_InternLM_bf16_p2_analysis.ipynb +0 -0
competition/10b_InternLM_bf16_p2_r2_analysis.ipynb +0 -0
competition/10c_InternLM_M3_eval.ipynb +1 -1
competition/10f_InternLM_best_analysis.ipynb +0 -0
competition/11_Llama-3_8b_analysis.ipynb +0 -0
competition/11_Llama-3_8b_p1_analysis.ipynb +0 -0
competition/11a_Llama-3_8b_p2_analysis.ipynb +0 -0
competition/12b_InternLM_Push_LoRA_to_hub.ipynb +0 -0
results/mgtv-llama3_p1_full_metrics.csv +2 -2
results/mgtv-llama3_p2_full_metrics.csv +2 -2
results/mgtv-results_colab_p2.csv +2 -2
results/mgtv-results_internlm_4bit_metrics.csv +17 -0
results/mgtv-results_internlm_best.csv +0 -0
results/mgtv-results_m3.csv +0 -0
results/mgtv-results_p2_r2_full_metrics.csv +6 -0

competition/00a_InternLM2.5_Llama3_GLM4_Results.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

competition/00b_InternLM_2.5_Perf_vs_RPP.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

competition/00c_InternLM_2.5_Perf_vs_RPP_4bit.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

competition/09c_InternLM_bf16_p2_analysis.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

competition/10b_InternLM_bf16_p2_r2_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

competition/10c_InternLM_M3_eval.ipynb CHANGED Viewed

@@ -1 +1 @@

- {"cells":[{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":476,"status":"ok","timestamp":1720679526275,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"uWKRSV6eZsCn"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"eb33b19f-1206-41ee-84e2-e6258a12eef7","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2534,"status":"ok","timestamp":1720679529344,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"xwFh14uiZBrI","outputId":"d767799c-34c2-46a5-f052-378146a55321"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n","\n"," drive.mount(\"/content/drive\")\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"6d394937-6c99-4a7c-9d32-7600a280032f","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"G5pNu3zgZBrL","outputId":"160a554f-fb08-4aa0-bc00-0422fb7c1fac"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/code/engd/projects/logical-reasoning\n"]}],"source":["import os\n","import sys\n","from pathlib import Path\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", 
workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"ac667aba-076e-4de6-9984-8f6a67cb09cd","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"0dVRAabNZBrL","outputId":"b977e116-df16-47cd-9160-a24f611da687"},"outputs":[{"data":{"text/plain":["False"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["need_to_setup_env = False\n","need_to_setup_env"]},{"cell_type":"code","execution_count":5,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"72f9cf79-7b0d-4d9e-90a0-1fa5251b947f","showTitle":false,"title":""},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"hKUOfP2HZBrL"},"outputs":[],"source":["if need_to_setup_env:\n"," %pip install -r requirements.txt\n"," %cd /content/\n"," %rm -rf LLaMA-Factory\n"," !git clone https://github.com/hiyouga/LLaMA-Factory.git\n"," %cd LLaMA-Factory\n"," %ls\n"," %pip install -e .[torch,bitsandbytes]\n"," \n"," os.chdir(workding_dir)\n"," sys.path.append(workding_dir)\n"," print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":6,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO 
_","userId":"00977795705617022768"},"user_tz":-480},"id":"hPCC-6m7ZBrM","outputId":"c7aa2c96-5e99-440a-c148-201d79465ff9"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/code/engd/projects/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":7,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"f1597656-8042-4878-9d3b-9ebfb8dd86dc","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"1M3IraVtZBrM","outputId":"29ab35f6-2970-4ade-d85d-3174acf8cda0"},"outputs":[{"name":"stdout","output_type":"stream","text":["internlm/internlm2_5-7b-chat-1m llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full/checkpoint-88 False datasets/mgtv results/mgtv-results_m3.csv\n"]}],"source":["import os\n","\n","model_name = os.getenv(\"MODEL_NAME\")\n","adapter_name_or_path = os.getenv(\"ADAPTER_NAME_OR_PATH\")\n","load_in_4bit = os.getenv(\"LOAD_IN_4BIT\") == \"true\"\n","data_path = os.getenv(\"LOGICAL_REASONING_DATA_PATH\")\n","results_path = os.getenv(\"LOGICAL_REASONING_RESULTS_PATH\")\n","use_english_datasets = os.getenv(\"USE_ENGLISH_DATASETS\") == \"true\"\n","\n","print(model_name, adapter_name_or_path, load_in_4bit, data_path, 
results_path)"]},{"cell_type":"code","execution_count":8,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"b2a43943-9324-4839-9a47-cfa72de2244b","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":564,"status":"ok","timestamp":1720679529907,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"UgMvt6dIZBrM","outputId":"ce37581c-fd26-46c2-ad87-d933d99f68f7"},"outputs":[{"name":"stdout","output_type":"stream","text":["Python 3.11.9\n","\u001b[33mWARNING: Package(s) not found: flash-attn\u001b[0m\u001b[33m\n","\u001b[0mCPU times: user 38.9 ms, sys: 26.7 ms, total: 65.7 ms\n","Wall time: 2.97 s\n"]}],"source":["%%time\n","!python --version\n","!pip show flash-attn"]},{"cell_type":"code","execution_count":9,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1685,"status":"ok","timestamp":1720679531591,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"ZuS_FsLyZBrN","outputId":"2cba0105-c505-4395-afbd-2f2fee6581d0"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/code/engd/projects/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n","MPS is available\n"]}],"source":["from llm_toolkit.llm_utils import *\n","from llm_toolkit.logical_reasoning_utils import *\n","\n","device = check_gpu()"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading model: internlm/internlm2_5-7b-chat-1m with adapter: llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full/checkpoint-88\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"443302f393fe45e3a5150cb5e1f35a11","version_major":2,"version_minor":0},"text/plain":["Loading checkpoint shards: 0%| | 0/8 [00:00<?, 
?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["CPU times: user 3.13 s, sys: 6.52 s, total: 9.65 s\n","Wall time: 30.7 s\n"]}],"source":["%%time\n","\n","model, tokenizer = load_model(model_name, adapter_name_or_path=adapter_name_or_path, using_llama_factory=False)"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n"]}],"source":["datasets = load_logical_reasoning_dataset(\n"," data_path,\n"," tokenizer=tokenizer,\n"," chinese_prompt=not use_english_datasets,\n"," using_p1=False,\n",")"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["--------------------------------------------------\n","text: 甄加索是自杀吗\n","--------------------------------------------------\n","label: 不是\n","--------------------------------------------------\n","answer: nan\n","--------------------------------------------------\n","title: 海岸之谜\n","--------------------------------------------------\n","puzzle: 在远离城市喧嚣的海边小屋，一天清晨，邻居发现甄加索僵卧在沙滩上，已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么？\n","--------------------------------------------------\n","truth: 甄加索是一位热爱自然的画家，他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天，他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上，他骑着自行车外出，打算在海边观赏夜景。然而，他在沙滩上意外发现了一只搁浅的海豚，为了救助这只海豚，他耗费了极大的体力，最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了，由于他患有严重的心脏病，却未告知旁人，在寒冷的海风中，他的心脏停止了跳动。因此，警方在现场只发现了车轮痕迹和未完成的画作，而没有发现任何他杀的迹象。\n","--------------------------------------------------\n","train_text: <s><|im_start|>system\n","You are an expert in logical reasoning.<|im_end|>\n","<|im_start|>user\n","你是一个情景猜谜游戏的主持人。游戏规则如下：\n","\n","1. 
参与者会得到一个谜面，谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底，谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 对于每个问题，主持人将根据实际情况回答以下五个选项之一：是、不是、不重要、回答正确、问法错误。各回答的判断标准如下：\n"," - 若谜面和谜底能找到问题的答案，回答：是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案，回答：不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解，回答：问法错误\n"," - 若参与者提问基本还原了谜底真相，回答：回答正确\n","5. 回答中不能添加任何其它信息，也不能省略选项中的任何一个字。例如，不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","**谜面:** 在远离城市喧嚣的海边小屋，一天清晨，邻居发现甄加索僵卧在沙滩上，已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么？\n","\n","**谜底:** 甄加索是一位热爱自然的画家，他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天，他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上，他骑着自行车外出，打算在海边观赏夜景。然而，他在沙滩上意外发现了一只搁浅的海豚，为了救助这只海豚，他耗费了极大的体力，最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了，由于他患有严重的心脏病，却未告知旁人，在寒冷的海风中，他的心脏停止了跳动。因此，警方在现场只发现了车轮痕迹和未完成的画作，而没有发现任何他杀的迹象。\n","\n","**参与者提出的问题:** 甄加索是自杀吗\n","<|im_end|>\n","<|im_start|>assistant\n","不是</s>\n","--------------------------------------------------\n","prompt: <s><|im_start|>system\n","You are an expert in logical reasoning.<|im_end|>\n","<|im_start|>user\n","你是一个情景猜谜游戏的主持人。游戏规则如下：\n","\n","1. 参与者会得到一个谜面，谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底，谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 对于每个问题，主持人将根据实际情况回答以下五个选项之一：是、不是、不重要、回答正确、问法错误。各回答的判断标准如下：\n"," - 若谜面和谜底能找到问题的答案，回答：是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案，回答：不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解，回答：问法错误\n"," - 若参与者提问基本还原了谜底真相，回答：回答正确\n","5. 
回答中不能添加任何其它信息，也不能省略选项中的任何一个字。例如，不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","**谜面:** 在远离城市喧嚣的海边小屋，一天清晨，邻居发现甄加索僵卧在沙滩上，已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么？\n","\n","**谜底:** 甄加索是一位热爱自然的画家，他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天，他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上，他骑着自行车外出，打算在海边观赏夜景。然而，他在沙滩上意外发现了一只搁浅的海豚，为了救助这只海豚，他耗费了极大的体力，最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了，由于他患有严重的心脏病，却未告知旁人，在寒冷的海风中，他的心脏停止了跳动。因此，警方在现场只发现了车轮痕迹和未完成的画作，而没有发现任何他杀的迹象。\n","\n","**参与者提出的问题:** 甄加索是自杀吗\n","<|im_end|>\n","<|im_start|>assistant\n","\n"]}],"source":["print_row_details(datasets[\"test\"].to_pandas())"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[],"source":["def evaluate_model(model, tokenizer, model_name, dataset, batch_size=1):\n"," print(f\"Evaluating model: {model_name} on {device}\")\n"," predictions = eval_model(\n"," model, tokenizer, dataset, device=device, batch_size=batch_size\n"," )\n","\n"," save_results(\n"," model_name,\n"," results_path,\n"," dataset,\n"," predictions,\n"," debug=False,\n"," )\n","\n"," metrics = calc_metrics(dataset[\"label\"], predictions, debug=False)\n"," print(metrics)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Evaluating model: internlm/internlm2_5-7b-chat-1m_llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full/checkpoint-88 on mps\n"]},{"name":"stderr","output_type":"stream","text":[" 0%| | 1/3000 [00:03<2:33:23, 3.07s/it]"]},{"name":"stdout","output_type":"stream","text":["--------\n","step 1: 不是</s>\n","--------\n","step 2: 不是\n","--------\n","step 3: 不是\n","--------\n","step 4: 不是\n","--------\n","step 5: 不是\n"]},{"name":"stderr","output_type":"stream","text":["100%|██████████| 3000/3000 [10:20:50<00:00, 12.42s/it] "]},{"name":"stdout","output_type":"stream","text":["{'accuracy': 0.7836666666666666}\n","CPU times: user 12min 26s, sys: 11min 38s, total: 24min 4s\n","Wall time: 10h 20min 
50s\n"]},{"name":"stderr","output_type":"stream","text":["\n"]}],"source":["%%time\n","\n","evaluate_model(model, tokenizer, f\"{model_name}_{adapter_name_or_path}\", datasets[\"test\"])"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"mostRecentlyExecutedCommandWithImplicitDF":{"commandId":-1,"dataframes":["_sqldf"]},"pythonIndentUnit":4},"notebookName":"10_eval-lf-medium-py3.11","widgets":{}},"colab":{"gpuType":"L4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}

+ {"cells":[{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":476,"status":"ok","timestamp":1720679526275,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"uWKRSV6eZsCn"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Python 3.11.9\n","Name: transformers\n","Version: 4.41.2\n","Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow\n","Home-page: https://github.com/huggingface/transformers\n","Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)\n","Author-email: transformers@huggingface.co\n","License: Apache 2.0 License\n","Location: /Users/inflaton/anaconda3/envs/logical-reasoning/lib/python3.11/site-packages\n","Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm\n","Required-by: peft\n","---\n","Name: torch\n","Version: 2.5.0.dev20240720\n","Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration\n","Home-page: https://pytorch.org/\n","Author: PyTorch Team\n","Author-email: packages@pytorch.org\n","License: BSD-3\n","Location: /Users/inflaton/anaconda3/envs/logical-reasoning/lib/python3.11/site-packages\n","Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions\n","Required-by: accelerate, peft, torchaudio, torchvision\n","---\n","Name: torchvision\n","Version: 0.20.0.dev20240721\n","Summary: image and video datasets and models for torch deep learning\n","Home-page: https://github.com/pytorch/vision\n","Author: PyTorch Core Team\n","Author-email: soumith@pytorch.org\n","License: BSD\n","Location: /Users/inflaton/anaconda3/envs/logical-reasoning/lib/python3.11/site-packages\n","Requires: numpy, pillow, torch\n","Required-by: 
\n","---\n","Name: torchaudio\n","Version: 2.4.0.dev20240721\n","Summary: An audio package for PyTorch\n","Home-page: https://github.com/pytorch/audio\n","Author: Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang\n","Author-email: soumith@pytorch.org\n","License: \n","Location: /Users/inflaton/anaconda3/envs/logical-reasoning/lib/python3.11/site-packages\n","Requires: torch\n","Required-by: \n","CPU times: user 8.21 ms, sys: 9.66 ms, total: 17.9 ms\n","Wall time: 2.72 s\n"]}],"source":["%%time\n","!python --version\n","!pip show transformers torch torchvision torchaudio"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"eb33b19f-1206-41ee-84e2-e6258a12eef7","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2534,"status":"ok","timestamp":1720679529344,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"xwFh14uiZBrI","outputId":"d767799c-34c2-46a5-f052-378146a55321"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n","\n"," drive.mount(\"/content/drive\")\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":5,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"6d394937-6c99-4a7c-9d32-7600a280032f","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO 
_","userId":"00977795705617022768"},"user_tz":-480},"id":"G5pNu3zgZBrL","outputId":"160a554f-fb08-4aa0-bc00-0422fb7c1fac"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/code/engd/projects/logical-reasoning\n"]}],"source":["import os\n","import sys\n","from pathlib import Path\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":6,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"hPCC-6m7ZBrM","outputId":"c7aa2c96-5e99-440a-c148-201d79465ff9"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/code/engd/projects/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":7,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"f1597656-8042-4878-9d3b-9ebfb8dd86dc","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO 
_","userId":"00977795705617022768"},"user_tz":-480},"id":"1M3IraVtZBrM","outputId":"29ab35f6-2970-4ade-d85d-3174acf8cda0"},"outputs":[{"name":"stdout","output_type":"stream","text":["internlm/internlm2_5-7b-chat-1m llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175 False datasets/mgtv results/mgtv-results_m3.csv\n"]}],"source":["import os\n","\n","model_name = os.getenv(\"MODEL_NAME\")\n","adapter_name_or_path = os.getenv(\"ADAPTER_NAME_OR_PATH\")\n","load_in_4bit = os.getenv(\"LOAD_IN_4BIT\") == \"true\"\n","data_path = os.getenv(\"LOGICAL_REASONING_DATA_PATH\")\n","results_path = os.getenv(\"LOGICAL_REASONING_RESULTS_PATH\")\n","use_english_datasets = os.getenv(\"USE_ENGLISH_DATASETS\") == \"true\"\n","\n","print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)"]},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1685,"status":"ok","timestamp":1720679531591,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"ZuS_FsLyZBrN","outputId":"2cba0105-c505-4395-afbd-2f2fee6581d0"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/code/engd/projects/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n","MPS is available\n"]}],"source":["from llm_toolkit.llm_utils import *\n","from llm_toolkit.logical_reasoning_utils import *\n","\n","device = check_gpu()"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading model: internlm/internlm2_5-7b-chat-1m with adapter: llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"a505972d0ae14d4aab281798aca28c44","version_major":2,"version_minor":0},"text/plain":["Loading checkpoint shards: 0%| | 0/8 [00:00<?, 
?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["CPU times: user 3.33 s, sys: 2.24 s, total: 5.57 s\n","Wall time: 7.48 s\n"]},{"data":{"text/plain":["(torch.bfloat16, 15513174016)"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["%%time\n","\n","model, tokenizer = load_model(model_name, dtype=torch.bfloat16, adapter_name_or_path=adapter_name_or_path, using_llama_factory=False)\n","model.dtype, model.get_memory_footprint()"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n"]}],"source":["datasets = load_logical_reasoning_dataset(\n"," data_path,\n"," tokenizer=tokenizer,\n"," chinese_prompt=not use_english_datasets,\n"," using_p1=False,\n",")"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["--------------------------------------------------\n","text: 哭泣和村庄有关系吗\n","--------------------------------------------------\n","label: 是\n","--------------------------------------------------\n","answer: nan\n","--------------------------------------------------\n","title: 甄庄哭声\n","--------------------------------------------------\n","puzzle: 在一个安静的夜晚，小村庄的湖边突然传来了阵阵哭泣声。第二天早晨，村长甄锐发现湖边的石头上放着一顶破旧的帽子，但没有人知道这顶帽子是从哪里来的，哭泣声又是为何。请还原故事真相。\n","--------------------------------------------------\n","truth: 
原来，这顶破旧的帽子属于一个小男孩，他小时候与爷爷在湖边生活。爷爷教他钓鱼、游泳，还告诉他湖中的海龟是他们的朋友。后来，小男孩随父母去了城市生活，但每年夏天都会回到村子探望爷爷。然而，去年夏天，爷爷因病去世，小男孩伤心欲绝。今年夏天，他回到村子，来到湖边，想起和爷爷的美好回忆，忍不住哭泣。他将爷爷的帽子放在湖边的石头上，希望能让爷爷的在天之灵得到安慰。那晚的哭泣声正是小男孩在祭莫他亲爱的爷爷。\n","--------------------------------------------------\n","train_text: <s><|im_start|>system\n","You are an expert in logical reasoning.<|im_end|>\n","<|im_start|>user\n","你是一个情景猜谜游戏的主持人。游戏规则如下：\n","\n","1. 参与者会得到一个谜面，谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底，谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 对于每个问题，主持人将根据实际情况回答以下五个选项之一：是、不是、不重要、回答正确、问法错误。各回答的判断标准如下：\n"," - 若谜面和谜底能找到问题的答案，回答：是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案，回答：不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解，回答：问法错误\n"," - 若参与者提问基本还原了谜底真相，回答：回答正确\n","5. 回答中不能添加任何其它信息，也不能省略选项中的任何一个字。例如，不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","**谜面:** 在一个安静的夜晚，小村庄的湖边突然传来了阵阵哭泣声。第二天早晨，村长甄锐发现湖边的石头上放着一顶破旧的帽子，但没有人知道这顶帽子是从哪里来的，哭泣声又是为何。请还原故事真相。\n","\n","**谜底:** 原来，这顶破旧的帽子属于一个小男孩，他小时候与爷爷在湖边生活。爷爷教他钓鱼、游泳，还告诉他湖中的海龟是他们的朋友。后来，小男孩随父母去了城市生活，但每年夏天都会回到村子探望爷爷。然而，去年夏天，爷爷因病去世，小男孩伤心欲绝。今年夏天，他回到村子，来到湖边，想起和爷爷的美好回忆，忍不住哭泣。他将爷爷的帽子放在湖边的石头上，希望能让爷爷的在天之灵得到安慰。那晚的哭泣声正是小男孩在祭莫他亲爱的爷爷。\n","\n","**参与者提出的问题:** 哭泣和村庄有关系吗\n","<|im_end|>\n","<|im_start|>assistant\n","是</s>\n","--------------------------------------------------\n","prompt: <s><|im_start|>system\n","You are an expert in logical reasoning.<|im_end|>\n","<|im_start|>user\n","你是一个情景猜谜游戏的主持人。游戏规则如下：\n","\n","1. 参与者会得到一个谜面，谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底，谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 对于每个问题，主持人将根据实际情况回答以下五个选项之一：是、不是、不重要、回答正确、问法错误。各回答的判断标准如下：\n"," - 若谜面和谜底能找到问题的答案，回答：是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案，回答：不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解，回答：问法错误\n"," - 若参与者提问基本还原了谜底真相，回答：回答正确\n","5. 
回答中不能添加任何其它信息，也不能省略选项中的任何一个字。例如，不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","**谜面:** 在一个安静的夜晚，小村庄的湖边突然传来了阵阵哭泣声。第二天早晨，村长甄锐发现湖边的石头上放着一顶破旧的帽子，但没有人知道这顶帽子是从哪里来的，哭泣声又是为何。请还原故事真相。\n","\n","**谜底:** 原来，这顶破旧的帽子属于一个小男孩，他小时候与爷爷在湖边生活。爷爷教他钓鱼、游泳，还告诉他湖中的海龟是他们的朋友。后来，小男孩随父母去了城市生活，但每年夏天都会回到村子探望爷爷。然而，去年夏天，爷爷因病去世，小男孩伤心欲绝。今年夏天，他回到村子，来到湖边，想起和爷爷的美好回忆，忍不住哭泣。他将爷爷的帽子放在湖边的石头上，希望能让爷爷的在天之灵得到安慰。那晚的哭泣声正是小男孩在祭莫他亲爱的爷爷。\n","\n","**参与者提出的问题:** 哭泣和村庄有关系吗\n","<|im_end|>\n","<|im_start|>assistant\n","\n"]}],"source":["print_row_details(datasets[\"test\"].to_pandas(), [1000])"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["是\n","CPU times: user 619 ms, sys: 392 ms, total: 1.01 s\n","Wall time: 1.9 s\n"]}],"source":["%%time\n","\n","prompt1 = datasets[\"test\"][\"prompt\"][1000]\n","\n","gen_kwargs = {\"max_length\": 4096, \"do_sample\": True, \"top_k\": 1}\n","with torch.no_grad():\n"," inputs = tokenizer(\n"," [prompt1],\n"," return_tensors=\"pt\",\n"," ).to(device)\n"," outputs = model.generate(**inputs, **gen_kwargs)\n"," outputs = outputs[:, inputs['input_ids'].shape[1]:]\n"," print(tokenizer.decode(outputs[0], skip_special_tokens=True))"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[],"source":["def evaluate_model(model, tokenizer, dataset, batch_size=8):\n"," save_model_name = f\"{model_name}_{adapter_name_or_path}_m3_{model.dtype}\"\n"," print(f\"Evaluating model: {save_model_name} on {device}\")\n"," predictions = eval_model(\n"," model, tokenizer, dataset, device=device, batch_size=batch_size\n"," )\n","\n"," save_results(\n"," save_model_name,\n"," results_path,\n"," dataset,\n"," predictions,\n"," debug=False,\n"," )\n","\n"," metrics = calc_metrics(dataset[\"label\"], predictions, debug=False)\n"," print(metrics)"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Evaluating model: 
internlm/internlm2_5-7b-chat-1m_llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175_m3_torch.bfloat16 on mps\n"]},{"name":"stderr","output_type":"stream","text":[" 0%| | 1/375 [00:15<1:35:45, 15.36s/it]"]},{"name":"stdout","output_type":"stream","text":["Batch output: ['不是', '是', '是', '是', '不是', '是', '是', '不是']\n"]},{"name":"stderr","output_type":"stream","text":["100%|██████████| 375/375 [2:42:25<00:00, 25.99s/it] \n"]},{"name":"stdout","output_type":"stream","text":["{'accuracy': 0.807}\n","CPU times: user 5min 2s, sys: 5min 33s, total: 10min 35s\n","Wall time: 2h 42min 25s\n"]}],"source":["%%time\n","\n","evaluate_model(model, tokenizer, datasets[\"test\"])"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading model: internlm/internlm2_5-7b-chat-1m with adapter: llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"e3301a4dc82449938e9bedb9d8ec5754","version_major":2,"version_minor":0},"text/plain":["Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["CPU times: user 23.1 s, sys: 10.9 s, total: 34 s\n","Wall time: 16.1 s\n"]},{"data":{"text/plain":["(torch.float16, 15513174016)"]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["%%time\n","\n","del model, tokenizer\n","\n","model, tokenizer = load_model(model_name, dtype=torch.float16, adapter_name_or_path=adapter_name_or_path, using_llama_factory=False)\n","model.dtype, model.get_memory_footprint()"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Evaluating model: internlm/internlm2_5-7b-chat-1m_llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175_m3_torch.float16 on 
mps\n"]},{"name":"stderr","output_type":"stream","text":[" 0%| | 1/375 [00:19<1:59:08, 19.11s/it]"]},{"name":"stdout","output_type":"stream","text":["Batch output: ['不是', '是', '是', '是', '不是', '是', '是', '不是']\n"]},{"name":"stderr","output_type":"stream","text":["100%|██████████| 375/375 [2:19:06<00:00, 22.26s/it] "]},{"name":"stdout","output_type":"stream","text":["{'accuracy': 0.8023333333333333}\n","CPU times: user 3min 14s, sys: 4min 37s, total: 7min 52s\n","Wall time: 2h 19min 6s\n"]},{"name":"stderr","output_type":"stream","text":["\n"]}],"source":["%%time\n","\n","evaluate_model(model, tokenizer, datasets[\"test\"])"]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading model: internlm/internlm2_5-7b-chat-1m with adapter: llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"ceb78f7f50244e1bb703d4cb1b880cd6","version_major":2,"version_minor":0},"text/plain":["Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["CPU times: user 34.8 s, sys: 13 s, total: 47.8 s\n","Wall time: 10.5 s\n"]},{"data":{"text/plain":["(torch.float32, 31026339840)"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["%%time\n","\n","del model, tokenizer\n","\n","model, tokenizer = load_model(model_name, dtype=torch.float32, adapter_name_or_path=adapter_name_or_path, using_llama_factory=False)\n","model.dtype, model.get_memory_footprint()"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Evaluating model: internlm/internlm2_5-7b-chat-1m_llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175_m3_torch.float32 on mps\n"]},{"name":"stderr","output_type":"stream","text":[" 0%| | 1/375 [00:20<2:07:18, 
20.42s/it]"]},{"name":"stdout","output_type":"stream","text":["Batch output: ['不是', '是', '是', '是', '不是', '是', '是', '不是']\n"]},{"name":"stderr","output_type":"stream","text":["100%|██████████| 375/375 [5:23:51<00:00, 51.82s/it] \n"]},{"name":"stdout","output_type":"stream","text":["{'accuracy': 0.8016666666666666}\n","CPU times: user 4min 1s, sys: 10min 4s, total: 14min 6s\n","Wall time: 5h 23min 51s\n"]}],"source":["%%time\n","\n","evaluate_model(model, tokenizer, datasets[\"test\"])"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"mostRecentlyExecutedCommandWithImplicitDF":{"commandId":-1,"dataframes":["_sqldf"]},"pythonIndentUnit":4},"notebookName":"10_eval-lf-medium-py3.11","widgets":{}},"colab":{"gpuType":"L4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}

competition/10f_InternLM_best_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

competition/11_Llama-3_8b_analysis.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

competition/11_Llama-3_8b_p1_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

competition/11a_Llama-3_8b_p2_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

competition/12b_InternLM_Push_LoRA_to_hub.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

results/mgtv-llama3_p1_full_metrics.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5b4210453d59ad49c11feef76cf64cee627fe0dfecdf399cd444e8c99b6647cc
-size 986

 version https://git-lfs.github.com/spec/v1
+oid sha256:85bfa6552ba4190f23563740e1c6adcec0b0d5502f9379e61a1cc1122792546c
+size 1342

results/mgtv-llama3_p2_full_metrics.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fbc58da2045e8e4c69e0c9d61d4f16cddb9eca8bfa84861823e008e2dc80cb52
-size 905

 version https://git-lfs.github.com/spec/v1
+oid sha256:7bf8e3243fb2b4cdd6ec8e2d2e9c9b3c6ceb629b323cedd5b4c47d4375c91d41
+size 1369

results/mgtv-results_colab_p2.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0142b2e9ef5e3247e7735476a3fd2eb2e71f5ba712b280da176eebac5dcf48d4
-size 2816898

 version https://git-lfs.github.com/spec/v1
+oid sha256:a1f4e839725283ad10eee3c90d4570c94b6531ce7f65edb15c4153f1eea4caf3
+size 2871990

results/mgtv-results_internlm_4bit_metrics.csv ADDED Viewed

	@@ -0,0 +1,17 @@

+repetition penalty,model,accuracy,precision,recall,f1
+1.00,internlm/internlm2_5-7b-chat-1m_4bit_rp1.0,0.783,0.8089894370996324,0.783,0.7918676604452453
+1.01,internlm/internlm2_5-7b-chat-1m_4bit_rp1.01,0.783,0.8089894370996324,0.783,0.7918676604452453
+1.02,internlm/internlm2_5-7b-chat-1m_4bit_rp1.02,0.783,0.8089894370996324,0.783,0.7918676604452453
+1.03,internlm/internlm2_5-7b-chat-1m_4bit_rp1.03,0.783,0.8089894370996324,0.783,0.7918676604452453
+1.04,internlm/internlm2_5-7b-chat-1m_4bit_rp1.04,0.783,0.8089894370996324,0.783,0.7918676604452453
+1.05,internlm/internlm2_5-7b-chat-1m_4bit_rp1.05,0.783,0.8089894370996324,0.783,0.7918676604452453
+1.06,internlm/internlm2_5-7b-chat-1m_4bit_rp1.06,0.783,0.8089894370996324,0.783,0.7918676604452453
+1.07,internlm/internlm2_5-7b-chat-1m_4bit_rp1.07,0.783,0.8089894370996324,0.783,0.7918676604452453
+1.08,internlm/internlm2_5-7b-chat-1m_4bit_rp1.08,0.7796666666666666,0.8092240536393919,0.7796666666666666,0.790029078222227
+1.09,internlm/internlm2_5-7b-chat-1m_4bit_rp1.09,0.694,0.8069396574279459,0.694,0.7320540780553862
+1.10,internlm/internlm2_5-7b-chat-1m_4bit_rp1.1,0.532,0.7907326858338991,0.532,0.5650084362647749
+1.11,internlm/internlm2_5-7b-chat-1m_4bit_rp1.11,0.4703333333333333,0.8029879177982981,0.4703333333333333,0.4846135307750191
+1.12,internlm/internlm2_5-7b-chat-1m_4bit_rp1.12,0.438,0.854597269027913,0.438,0.46295579268753667
+1.13,internlm/internlm2_5-7b-chat-1m_4bit_rp1.1300000000000001,0.4076666666666667,0.8777389419821567,0.4076666666666667,0.4483520542323376
+1.14,internlm/internlm2_5-7b-chat-1m_4bit_rp1.1400000000000001,0.35933333333333334,0.8851092633830993,0.35933333333333334,0.4204411983471381
+1.15,internlm/internlm2_5-7b-chat-1m_4bit_rp1.1500000000000001,0.301,0.8493737317963316,0.301,0.3786855007398624

results/mgtv-results_internlm_best.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

results/mgtv-results_m3.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

results/mgtv-results_p2_r2_full_metrics.csv ADDED Viewed

	@@ -0,0 +1,6 @@

+epoch,model,accuracy,precision,recall,f1
+0,internlm/internlm2_5-7b-chat-1m,0.766,0.7479690198649127,0.7875257025359835,0.7649220492304646
+1,internlm/internlm2_5-7b-chat-1m_checkpoint-175,0.812,0.8122861942516547,0.812,0.8102342544894316
+2,internlm/internlm2_5-7b-chat-1m_checkpoint-350,0.7653333333333333,0.8068892149662973,0.7653333333333333,0.7799982606366916
+3,internlm/internlm2_5-7b-chat-1m_checkpoint-525,0.7476666666666667,0.8120325497709814,0.7476666666666667,0.7731222076608317
+4,internlm/internlm2_5-7b-chat-1m_checkpoint-700,0.717,0.8046420022590015,0.717,0.7510339687376877