diff --git "a/competition/12b_InternLM_Push_LoRA_to_hub.ipynb" "b/competition/12b_InternLM_Push_LoRA_to_hub.ipynb"
--- "a/competition/12b_InternLM_Push_LoRA_to_hub.ipynb"
+++ "b/competition/12b_InternLM_Push_LoRA_to_hub.ipynb"
@@ -1 +1 @@
-{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":476,"status":"ok","timestamp":1720679526275,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"uWKRSV6eZsCn"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"eb33b19f-1206-41ee-84e2-e6258a12eef7","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2534,"status":"ok","timestamp":1720679529344,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"xwFh14uiZBrI","outputId":"d767799c-34c2-46a5-f052-378146a55321"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n","\n"," drive.mount(\"/content/drive\")\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"6d394937-6c99-4a7c-9d32-7600a280032f","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"G5pNu3zgZBrL","outputId":"160a554f-fb08-4aa0-bc00-0422fb7c1fac"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/code/engd/projects/logical-reasoning\n"]}],"source":["import os\n","import sys\n","from pathlib import Path\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"hPCC-6m7ZBrM","outputId":"c7aa2c96-5e99-440a-c148-201d79465ff9"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/code/engd/projects/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"f1597656-8042-4878-9d3b-9ebfb8dd86dc","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO 
_","userId":"00977795705617022768"},"user_tz":-480},"id":"1M3IraVtZBrM","outputId":"29ab35f6-2970-4ade-d85d-3174acf8cda0"},"outputs":[{"name":"stdout","output_type":"stream","text":["internlm/internlm2_5-7b-chat-1m llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full/checkpoint-88 False datasets/mgtv results/mgtv-results_m3.csv\n"]}],"source":["import os\n","\n","model_name = os.getenv(\"MODEL_NAME\")\n","adapter_name_or_path = os.getenv(\"ADAPTER_NAME_OR_PATH\")\n","load_in_4bit = os.getenv(\"LOAD_IN_4BIT\") == \"true\"\n","data_path = os.getenv(\"LOGICAL_REASONING_DATA_PATH\")\n","results_path = os.getenv(\"LOGICAL_REASONING_RESULTS_PATH\")\n","use_english_datasets = os.getenv(\"USE_ENGLISH_DATASETS\") == \"true\"\n","\n","print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)"]},{"cell_type":"code","execution_count":6,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"b2a43943-9324-4839-9a47-cfa72de2244b","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":564,"status":"ok","timestamp":1720679529907,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"UgMvt6dIZBrM","outputId":"ce37581c-fd26-46c2-ad87-d933d99f68f7"},"outputs":[{"name":"stdout","output_type":"stream","text":["Python 3.11.9\n","\u001b[33mWARNING: Package(s) not found: flash-attn\u001b[0m\u001b[33m\n","\u001b[0mCPU times: user 4.51 ms, sys: 7.82 ms, total: 12.3 ms\n","Wall time: 652 ms\n"]}],"source":["%%time\n","!python --version\n","!pip show flash-attn"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1685,"status":"ok","timestamp":1720679531591,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"ZuS_FsLyZBrN","outputId":"2cba0105-c505-4395-afbd-2f2fee6581d0"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/code/engd/projects/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n","MPS is available\n"]}],"source":["from llm_toolkit.llm_utils import *\n","from llm_toolkit.logical_reasoning_utils import *\n","\n","device = check_gpu()"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading model: llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full/checkpoint-88 with adapter: None\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"59cbac92c15441b8b96eeb2079d80a9f","version_major":2,"version_minor":0},"text/plain":["Loading checkpoint shards: 0%| | 0/8 [00:00, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["CPU times: user 2.49 s, sys: 5.85 s, total: 8.34 s\n","Wall time: 27.5 s\n"]}],"source":["%%time\n","\n","# model, tokenizer = load_model(model_name, adapter_name_or_path=adapter_name_or_path, using_llama_factory=False)\n","model, tokenizer = load_model(adapter_name_or_path, using_llama_factory=False)"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/inflaton/anaconda3/envs/logical-reasoning/lib/python3.11/site-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," 
warnings.warn(\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6a97d694bb2c45288969b8f4806e9db3","version_major":2,"version_minor":0},"text/plain":["adapter_model.safetensors: 0%| | 0.00/37.8M [00:00, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"text/plain":["CommitInfo(commit_url='https://huggingface.co/inflaton-ai/InternLM_2_5-7b_LoRA-Adapter/commit/822b89e129372ed5bc8372489ff6dcfef7d19cd7', commit_message='Upload InternLM2ForCausalLM', commit_description='', oid='822b89e129372ed5bc8372489ff6dcfef7d19cd7', pr_url=None, pr_revision=None, pr_num=None)"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["model.push_to_hub(\"inflaton-ai/InternLM_2_5-7b_LoRA-Adapter\")"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["del model, tokenizer"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading model: internlm/internlm2_5-7b-chat-1m with adapter: inflaton-ai/InternLM_2_5-7b_LoRA-Adapter\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"8e5a9c51fcb44a2e97ebcf0594a68938","version_major":2,"version_minor":0},"text/plain":["Loading checkpoint shards: 0%| | 0/8 [00:00, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"105a5fdab3534197b34cf5e4d9ed47b3","version_major":2,"version_minor":0},"text/plain":["adapter_config.json: 0%| | 0.00/679 [00:00, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"55bcfc7ea08e4b43aab7796ba33a32bd","version_major":2,"version_minor":0},"text/plain":["adapter_model.safetensors: 0%| | 0.00/37.8M [00:00, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["CPU times: user 3.06 s, sys: 5.04 s, total: 8.11 s\n","Wall time: 31.6 s\n"]}],"source":["%%time\n","\n","model, tokenizer = load_model(model_name, adapter_name_or_path=\"inflaton-ai/InternLM_2_5-7b_LoRA-Adapter\", using_llama_factory=False)\n","# model, tokenizer = load_model(\"inflaton-ai/InternLM_2_5-7b_LoRA-Adapter\", using_llama_factory=False)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"c01335f1a61e4b589b7c33978db1e8c6","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"1e1c242ae44b4a65b7683e8918174989","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? 
examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n"]}],"source":["datasets = load_logical_reasoning_dataset(\n"," data_path,\n"," tokenizer=tokenizer,\n"," chinese_prompt=not use_english_datasets,\n"," using_p1=False,\n",")"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/inflaton/anaconda3/envs/logical-reasoning/lib/python3.11/site-packages/transformers/generation/utils.py:1513: UserWarning: The operator 'aten::isin.Tensor_Tensor_out' is not currently supported on the MPS backend and will fall back to run on the CPU. This may have performance implications. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/mps/MPSFallback.mm:13.)\n"," if eos_token_id is not None and torch.isin(elements=eos_token_id, test_elements=pad_token_id).any():\n"]},{"name":"stdout","output_type":"stream","text":["是\n","CPU times: user 579 ms, sys: 330 ms, total: 909 ms\n","Wall time: 2.86 s\n"]}],"source":["%%time\n","\n","prompt1 = datasets[\"test\"][\"prompt\"][1000]\n","\n","gen_kwargs = {\"max_length\": 4096, \"do_sample\": True, \"top_k\": 1}\n","with torch.no_grad():\n"," inputs = tokenizer(\n"," [prompt1],\n"," return_tensors=\"pt\",\n"," ).to(device)\n"," outputs = model.generate(**inputs, **gen_kwargs)\n"," outputs = outputs[:, inputs['input_ids'].shape[1]:]\n"," print(tokenizer.decode(outputs[0], skip_special_tokens=True))"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"mostRecentlyExecutedCommandWithImplicitDF":{"commandId":-1,"dataframes":["_sqldf"]},"pythonIndentUnit":4},"notebookName":"10_eval-lf-medium-py3.11","widgets":{}},"colab":{"gpuType":"L4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
+{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":476,"status":"ok","timestamp":1720679526275,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"uWKRSV6eZsCn"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"eb33b19f-1206-41ee-84e2-e6258a12eef7","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2534,"status":"ok","timestamp":1720679529344,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"xwFh14uiZBrI","outputId":"d767799c-34c2-46a5-f052-378146a55321"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n","\n"," drive.mount(\"/content/drive\")\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"6d394937-6c99-4a7c-9d32-7600a280032f","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"G5pNu3zgZBrL","outputId":"160a554f-fb08-4aa0-bc00-0422fb7c1fac"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /home/inflaton/code/projects/courses/logical-reasoning\n"]}],"source":["import os\n","import sys\n","from pathlib import Path\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"hPCC-6m7ZBrM","outputId":"c7aa2c96-5e99-440a-c148-201d79465ff9"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /home/inflaton/code/projects/courses/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"f1597656-8042-4878-9d3b-9ebfb8dd86dc","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO 
_","userId":"00977795705617022768"},"user_tz":-480},"id":"1M3IraVtZBrM","outputId":"29ab35f6-2970-4ade-d85d-3174acf8cda0"},"outputs":[{"name":"stdout","output_type":"stream","text":["internlm/internlm2_5-7b-chat-1m llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175 False datasets/mgtv results/mgtv-results_internlm_best.csv\n"]}],"source":["import os\n","\n","model_name = os.getenv(\"MODEL_NAME\")\n","adapter_name_or_path = os.getenv(\"ADAPTER_NAME_OR_PATH\")\n","load_in_4bit = os.getenv(\"LOAD_IN_4BIT\") == \"true\"\n","data_path = os.getenv(\"LOGICAL_REASONING_DATA_PATH\")\n","results_path = os.getenv(\"LOGICAL_REASONING_RESULTS_PATH\")\n","use_english_datasets = os.getenv(\"USE_ENGLISH_DATASETS\") == \"true\"\n","\n","print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)"]},{"cell_type":"code","execution_count":6,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"b2a43943-9324-4839-9a47-cfa72de2244b","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":564,"status":"ok","timestamp":1720679529907,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"UgMvt6dIZBrM","outputId":"ce37581c-fd26-46c2-ad87-d933d99f68f7"},"outputs":[{"name":"stdout","output_type":"stream","text":["Python 3.11.9\n","\u001b[33mWARNING: Package(s) not found: flash-attn\u001b[0m\u001b[33m\n","\u001b[0mName: transformers\n","Version: 4.41.2\n","Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow\n","Home-page: https://github.com/huggingface/transformers\n","Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)\n","Author-email: transformers@huggingface.co\n","License: Apache 2.0 License\n","Location: /home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages\n","Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm\n","Required-by: llamafactory, peft, trl, vllm\n","---\n","Name: torch\n","Version: 2.3.0\n","Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration\n","Home-page: https://pytorch.org/\n","Author: PyTorch Team\n","Author-email: packages@pytorch.org\n","License: BSD-3\n","Location: /home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages\n","Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions\n","Required-by: accelerate, bitsandbytes, peft, torchvision, trl, vllm, vllm-flash-attn, xformers\n","CPU times: user 73.4 ms, sys: 23.9 ms, total: 97.3 ms\n","Wall time: 4.31 s\n"]}],"source":["%%time\n","!python --version\n","!pip show flash-attn transformers torch"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1685,"status":"ok","timestamp":1720679531591,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"ZuS_FsLyZBrN","outputId":"2cba0105-c505-4395-afbd-2f2fee6581d0"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading 
/home/inflaton/code/projects/courses/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n","GPU is available\n"]}],"source":["from llm_toolkit.llm_utils import *\n","from llm_toolkit.logical_reasoning_utils import *\n","\n","device = check_gpu()"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"data":{"text/plain":["'llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175'"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["adapter_name_or_path"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading model: llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full_r2/checkpoint-175 with adapter: None\n"]},{"name":"stderr","output_type":"stream","text":["/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n"," warnings.warn(\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"97207088e8ac445998f38e27cbec5590","version_major":2,"version_minor":0},"text/plain":["Loading checkpoint shards: 0%| | 0/8 [00:00, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["Some parameters are on the meta device device because they were offloaded to the cpu.\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.20.attention.wqkv.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","[... identical UserWarning pairs for the remaining offloaded LoRA weights (wqkv, wo, w1, w2, w3, lora_A/lora_B across layers 20-28) elided ...]\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.28.attention.wqkv.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. 
(Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.28.attention.wo.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.28.attention.wo.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.28.feed_forward.w1.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.28.feed_forward.w1.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.28.feed_forward.w3.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.28.feed_forward.w3.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. 
(Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.28.feed_forward.w2.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.28.feed_forward.w2.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.attention.wqkv.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.attention.wqkv.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.attention.wo.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.attention.wo.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. 
(Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.feed_forward.w1.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.feed_forward.w1.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.feed_forward.w3.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.feed_forward.w3.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.feed_forward.w2.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.29.feed_forward.w2.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. 
(Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.attention.wqkv.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.attention.wqkv.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.attention.wo.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.attention.wo.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.feed_forward.w1.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.feed_forward.w1.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. 
(Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.feed_forward.w3.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.feed_forward.w3.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.feed_forward.w2.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.30.feed_forward.w2.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.attention.wqkv.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.attention.wqkv.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. 
(Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.attention.wo.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.attention.wo.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.feed_forward.w1.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.feed_forward.w1.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.feed_forward.w3.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.feed_forward.w3.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. 
(Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.feed_forward.w2.lora_A.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:2047: UserWarning: for model.layers.31.feed_forward.w2.lora_B.default.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)\n"," warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '\n","Some parameters are on the meta device device because they were offloaded to the cpu.\n"]},{"name":"stdout","output_type":"stream","text":["CPU times: user 32 s, sys: 1min 18s, total: 1min 50s\n","Wall time: 12min\n"]}],"source":["%%time\n","\n","model, tokenizer = load_model(adapter_name_or_path, using_llama_factory=False)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n"]}],"source":["datasets = load_logical_reasoning_dataset(\n"," data_path,\n"," tokenizer=tokenizer,\n"," chinese_prompt=not use_english_datasets,\n"," using_p1=False,\n",")"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["--------------------------------------------------\n","text: 哭泣和村庄有关系吗\n","--------------------------------------------------\n","label: 是\n","--------------------------------------------------\n","answer: nan\n","--------------------------------------------------\n","title: 甄庄哭声\n","--------------------------------------------------\n","puzzle: 在一个安静的夜晚,小村庄的湖边突然传来了阵阵哭泣声。第二天早晨,村长甄锐发现湖边的石头上放着一顶破旧的帽子,但没有人知道这顶帽子是从哪里来的,哭泣声又是为何。请还原故事真相。\n","--------------------------------------------------\n","truth: 原来,这顶破旧的帽子属于一个小男孩,他小时候与爷爷在湖边生活。爷爷教他钓鱼、游泳,还告诉他湖中的海龟是他们的朋友。后来,小男孩随父母去了城市生活,但每年夏天都会回到村子探望爷爷。然而,去年夏天,爷爷因病去世,小男孩伤心欲绝。今年夏天,他回到村子,来到湖边,想起和爷爷的美好回忆,忍不住哭泣。他将爷爷的帽子放在湖边的石头上,希望能让爷爷的在天之灵得到安慰。那晚的哭泣声正是小男孩在祭莫他亲爱的爷爷。\n","--------------------------------------------------\n","train_text: <|im_start|>system\n","You are an expert in logical reasoning.<|im_end|>\n","<|im_start|>user\n","你是一个情景猜谜游戏的主持人。游戏规则如下:\n","\n","1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底,谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 
对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n"," - 若谜面和谜底能找到问题的答案,回答:是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n"," - 若参与者提问基本还原了谜底真相,回答:回答正确\n","5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","**谜面:** 在一个安静的夜晚,小村庄的湖边突然传来了阵阵哭泣声。第二天早晨,村长甄锐发现湖边的石头上放着一顶破旧的帽子,但没有人知道这顶帽子是从哪里来的,哭泣声又是为何。请还原故事真相。\n","\n","**谜底:** 原来,这顶破旧的帽子属于一个小男孩,他小时候与爷爷在湖边生活。爷爷教他钓鱼、游泳,还告诉他湖中的海龟是他们的朋友。后来,小男孩随父母去了城市生活,但每年夏天都会回到村子探望爷爷。然而,去年夏天,爷爷因病去世,小男孩伤心欲绝。今年夏天,他回到村子,来到湖边,想起和爷爷的美好回忆,忍不住哭泣。他将爷爷的帽子放在湖边的石头上,希望能让爷爷的在天之灵得到安慰。那晚的哭泣声正是小男孩在祭莫他亲爱的爷爷。\n","\n","**参与者提出的问题:** 哭泣和村庄有关系吗\n","<|im_end|>\n","<|im_start|>assistant\n","是\n","--------------------------------------------------\n","prompt: <|im_start|>system\n","You are an expert in logical reasoning.<|im_end|>\n","<|im_start|>user\n","你是一个情景猜谜游戏的主持人。游戏规则如下:\n","\n","1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底,谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n"," - 若谜面和谜底能找到问题的答案,回答:是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n"," - 若参与者提问基本还原了谜底真相,回答:回答正确\n","5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","**谜面:** 在一个安静的夜晚,小村庄的湖边突然传来了阵阵哭泣声。第二天早晨,村长甄锐发现湖边的石头上放着一顶破旧的帽子,但没有人知道这顶帽子是从哪里来的,哭泣声又是为何。请还原故事真相。\n","\n","**谜底:** 原来,这顶破旧的帽子属于一个小男孩,他小时候与爷爷在湖边生活。爷爷教他钓鱼、游泳,还告诉他湖中的海龟是他们的朋友。后来,小男孩随父母去了城市生活,但每年夏天都会回到村子探望爷爷。然而,去年夏天,爷爷因病去世,小男孩伤心欲绝。今年夏天,他回到村子,来到湖边,想起和爷爷的美好回忆,忍不住哭泣。他将爷爷的帽子放在湖边的石头上,希望能让爷爷的在天之灵得到安慰。那晚的哭泣声正是小男孩在祭莫他亲爱的爷爷。\n","\n","**参与者提出的问题:** 哭泣和村庄有关系吗\n","<|im_end|>\n","<|im_start|>assistant\n","\n"]}],"source":["print_row_details(datasets[\"test\"].to_pandas(), [1000])"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"ename":"NotImplementedError","evalue":"Cannot copy out of meta tensor; no data!","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)","File \u001b[0;32m:9\u001b[0m\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/generation/utils.py:1758\u001b[0m, in \u001b[0;36mGenerationMixin.generate\u001b[0;34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)\u001b[0m\n\u001b[1;32m 1750\u001b[0m input_ids, model_kwargs \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_expand_inputs_for_generation(\n\u001b[1;32m 1751\u001b[0m input_ids\u001b[38;5;241m=\u001b[39minput_ids,\n\u001b[1;32m 1752\u001b[0m expand_size\u001b[38;5;241m=\u001b[39mgeneration_config\u001b[38;5;241m.\u001b[39mnum_return_sequences,\n\u001b[1;32m 1753\u001b[0m is_encoder_decoder\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mis_encoder_decoder,\n\u001b[1;32m 1754\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs,\n\u001b[1;32m 1755\u001b[0m )\n\u001b[1;32m 1757\u001b[0m \u001b[38;5;66;03m# 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)\u001b[39;00m\n\u001b[0;32m-> 1758\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sample\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1759\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1760\u001b[0m \u001b[43m \u001b[49m\u001b[43mlogits_processor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_logits_processor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1761\u001b[0m \u001b[43m \u001b[49m\u001b[43mlogits_warper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_logits_warper\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1762\u001b[0m \u001b[43m \u001b[49m\u001b[43mstopping_criteria\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_stopping_criteria\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1763\u001b[0m \u001b[43m \u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgeneration_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1764\u001b[0m \u001b[43m \u001b[49m\u001b[43msynced_gpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msynced_gpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1765\u001b[0m \u001b[43m \u001b[49m\u001b[43mstreamer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstreamer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1766\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1767\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1769\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m generation_mode \u001b[38;5;129;01min\u001b[39;00m (GenerationMode\u001b[38;5;241m.\u001b[39mBEAM_SAMPLE, GenerationMode\u001b[38;5;241m.\u001b[39mBEAM_SEARCH):\n\u001b[1;32m 1770\u001b[0m \u001b[38;5;66;03m# 11. 
prepare logits warper\u001b[39;00m\n\u001b[1;32m 1771\u001b[0m prepared_logits_warper \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 1772\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_logits_warper(generation_config) \u001b[38;5;28;01mif\u001b[39;00m generation_config\u001b[38;5;241m.\u001b[39mdo_sample \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1773\u001b[0m )\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/generation/utils.py:2397\u001b[0m, in \u001b[0;36mGenerationMixin._sample\u001b[0;34m(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, logits_warper, **model_kwargs)\u001b[0m\n\u001b[1;32m 2394\u001b[0m model_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprepare_inputs_for_generation(input_ids, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs)\n\u001b[1;32m 2396\u001b[0m \u001b[38;5;66;03m# forward pass to get next token\u001b[39;00m\n\u001b[0;32m-> 2397\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2398\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2399\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 2400\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2401\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2402\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2404\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m synced_gpus \u001b[38;5;129;01mand\u001b[39;00m this_peer_finished:\n\u001b[1;32m 2405\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m \u001b[38;5;66;03m# don't waste resources running the code we don't need\u001b[39;00m\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic 
in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/hooks.py:169\u001b[0m, in \u001b[0;36madd_hook_to_module..new_forward\u001b[0;34m(module, *args, **kwargs)\u001b[0m\n\u001b[1;32m 167\u001b[0m output \u001b[38;5;241m=\u001b[39m module\u001b[38;5;241m.\u001b[39m_old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 168\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 169\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_old_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n","File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/internlm/internlm2_5-7b-chat-1m/8d1a709a04d71440ef3df6ebbe204672f411c8b6/modeling_internlm2.py:1204\u001b[0m, in \u001b[0;36mInternLM2ForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)\u001b[0m\n\u001b[1;32m 1201\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m 1203\u001b[0m \u001b[38;5;66;03m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m-> 1204\u001b[0m outputs \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1205\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1206\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1207\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1208\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1209\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1210\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1211\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1212\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1213\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1214\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1215\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1217\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1218\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mpretraining_tp \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m 
\u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n","File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/internlm/internlm2_5-7b-chat-1m/8d1a709a04d71440ef3df6ebbe204672f411c8b6/modeling_internlm2.py:1004\u001b[0m, in \u001b[0;36mInternLM2Model.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)\u001b[0m\n\u001b[1;32m 993\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 994\u001b[0m decoder_layer\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 995\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1001\u001b[0m cache_position,\n\u001b[1;32m 1002\u001b[0m )\n\u001b[1;32m 1003\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1004\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mdecoder_layer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1005\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1006\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcausal_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1008\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1009\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1010\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1011\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1012\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1014\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1016\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_cache:\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/hooks.py:169\u001b[0m, in \u001b[0;36madd_hook_to_module..new_forward\u001b[0;34m(module, *args, **kwargs)\u001b[0m\n\u001b[1;32m 167\u001b[0m output \u001b[38;5;241m=\u001b[39m module\u001b[38;5;241m.\u001b[39m_old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 168\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 169\u001b[0m output \u001b[38;5;241m=\u001b[39m 
\u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_old_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n","File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/internlm/internlm2_5-7b-chat-1m/8d1a709a04d71440ef3df6ebbe204672f411c8b6/modeling_internlm2.py:735\u001b[0m, in \u001b[0;36mInternLM2DecoderLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position)\u001b[0m\n\u001b[1;32m 719\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 720\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[1;32m 721\u001b[0m \u001b[38;5;124;03m hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 731\u001b[0m \u001b[38;5;124;03m past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states\u001b[39;00m\n\u001b[1;32m 732\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 733\u001b[0m residual \u001b[38;5;241m=\u001b[39m hidden_states\n\u001b[0;32m--> 735\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention_norm\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 737\u001b[0m \u001b[38;5;66;03m# Self Attention\u001b[39;00m\n\u001b[1;32m 738\u001b[0m hidden_states, self_attn_weights, present_key_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattention(\n\u001b[1;32m 739\u001b[0m hidden_states\u001b[38;5;241m=\u001b[39mhidden_states,\n\u001b[1;32m 740\u001b[0m attention_mask\u001b[38;5;241m=\u001b[39mattention_mask,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 745\u001b[0m cache_position\u001b[38;5;241m=\u001b[39mcache_position,\n\u001b[1;32m 746\u001b[0m )\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip 
the rest of the logic in this function, and just call forward. (call stack condensed; ANSI colour codes stripped for readability)\n","--> 1541     return forward_call(*args, **kwargs)\n","File ~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/hooks.py:164, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)\n","--> 164     args, kwargs = module._hf_hook.pre_forward(module, *args, **kwargs)\n","File ~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/hooks.py:109, in SequentialHook.pre_forward(self, module, *args, **kwargs)\n","--> 109     args, kwargs = hook.pre_forward(module, *args, **kwargs)\n","File ~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/hooks.py:354, in AlignDevicesHook.pre_forward(self, module, *args, **kwargs)\n","--> 354     set_module_tensor_to_device(module, name, self.execution_device, value=value, fp16_statistics=fp16_statistics, tied_params_map=self.tied_params_map)\n","File ~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/utils/modeling.py:404, in set_module_tensor_to_device(module, tensor_name, device, value, dtype, fp16_statistics, tied_params_map)\n","--> 404     new_value = value.to(device)\n","\u001b[0;31mNotImplementedError\u001b[0m: Cannot copy out of meta tensor; no data!"]}],"source":["%%time\n","\n","prompt1 = datasets[\"test\"][\"prompt\"][1000]\n","\n","gen_kwargs = {\"max_length\": 4096, \"do_sample\": True, \"top_k\": 1}\n","with torch.no_grad():\n","    inputs = tokenizer(\n","        [prompt1],\n","        return_tensors=\"pt\",\n","    ).to(device)\n","    outputs = model.generate(**inputs, **gen_kwargs)\n","    # Keep only the newly generated tokens, dropping the echoed prompt.\n","    outputs = outputs[:, inputs[\"input_ids\"].shape[1]:]\n","    print(tokenizer.decode(outputs[0], skip_special_tokens=True))"]}
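,{"cell_type":"markdown","metadata":{},"source":["**Why the cell above fails:** `Cannot copy out of meta tensor; no data!` is raised when accelerate's `AlignDevicesHook` tries to move a parameter that exists only on the `meta` device - a shape-only placeholder, typically left behind when `device_map=\"auto\"` offloads modules - so there are no actual weights to copy. One way around it is to reload the model with an explicit device map so nothing stays on `meta`. A minimal sketch, assuming a single GPU with enough memory (`model_name` is the variable used elsewhere in this notebook):\n","\n","```python\n","from transformers import AutoModelForCausalLM\n","\n","# Reload with every module pinned to GPU 0 instead of being offloaded.\n","model = AutoModelForCausalLM.from_pretrained(\n","    model_name,\n","    torch_dtype=\"auto\",\n","    device_map={\"\": 0},\n","    trust_remote_code=True,\n",")\n","```"]}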
,{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n","  warnings.warn(\n","/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n","  warnings.warn(\n"]},{"ename":"RuntimeError","evalue":"The weights trying to be saved contained shared tensors [{'base_model.model.model.layers.14.attention.wo.lora_A.weight', 'base_model.model.model.layers.26.attention.wo.lora_B.weight', ...}, {'base_model.model.model.layers.25.attention.wqkv.lora_B.weight', ...}, {'base_model.model.model.layers.0.feed_forward.w2.lora_A.weight', ...}] (three tensor groups truncated for readability; together they name the LoRA lora_A/lora_B matrices of the attention.wqkv, attention.wo and feed_forward.w1/w2/w3 projections across layers 0-31) that are mismatching the transformers base configuration. Try saving using `safe_serialization=False` or remove this tensor sharing.","output_type":"error","traceback":["---------------------------------------------------------------------------\n","RuntimeError                              Traceback (most recent call last)\n","Cell In[12], line 3\n","      1 adapter_name_or_path = \"inflaton-ai/InternLM_2_5-7b_LoRA-r2\"\n","----> 3 model.push_to_hub(adapter_name_or_path)\n","File ~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/modeling_utils.py:2663, in PreTrainedModel.push_to_hub(self, *args, **kwargs)\n","File ~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/utils/hub.py:891, in PushToHubMixin.push_to_hub(self, repo_id, ...)\n","File ~/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/modeling_utils.py:2574, in PreTrainedModel.save_pretrained(self, save_directory, ...)\n","RuntimeError: The weights trying to be saved contained shared tensors [...] that are mismatching the transformers base configuration. Try saving using `safe_serialization=False` or remove this tensor sharing. (ANSI colour codes stripped; the duplicated tensor list is omitted here - see the error value above)"]}],"source":["adapter_name_or_path = \"inflaton-ai/InternLM_2_5-7b_LoRA-r2\"\n","\n","model.push_to_hub(adapter_name_or_path)\n","tokenizer.push_to_hub(adapter_name_or_path)"]}
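,{"cell_type":"markdown","metadata":{},"source":["**Why `push_to_hub` fails:** safetensors refuses to serialize tensors that share storage, and here the LoRA `lora_A`/`lora_B` matrices of the PEFT-wrapped model report shared data pointers (most likely fallout from the meta-tensor state above), so `save_pretrained` aborts. The workaround suggested by the error message itself is to fall back to PyTorch's pickle-based format; a sketch, not necessarily how the adapter was ultimately uploaded:\n","\n","```python\n","# Pickle-based serialization tolerates shared tensors; the repo will then\n","# contain pytorch_model.bin instead of *.safetensors files.\n","model.push_to_hub(adapter_name_or_path, safe_serialization=False)\n","tokenizer.push_to_hub(adapter_name_or_path)\n","```\n","\n","Alternatively, saving just the adapter through PEFT (calling `save_pretrained` on the `PeftModel`) sidesteps serializing the tied base weights altogether."]}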
,{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["del model, tokenizer"]}
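,{"cell_type":"markdown","metadata":{},"source":["After freeing the broken in-memory copy, the next cell reloads the base model together with the adapter previously pushed to the hub. `load_model` is this project's own helper; a generic PEFT equivalent would look roughly like the sketch below (assuming the helper performs a plain base-plus-adapter load):\n","\n","```python\n","from peft import PeftModel\n","from transformers import AutoModelForCausalLM, AutoTokenizer\n","\n","base = AutoModelForCausalLM.from_pretrained(\n","    \"internlm/internlm2_5-7b-chat-1m\", torch_dtype=\"auto\", trust_remote_code=True\n",")\n","model = PeftModel.from_pretrained(base, \"inflaton-ai/InternLM_2_5-7b_LoRA-Adapter\")\n","tokenizer = AutoTokenizer.from_pretrained(\n","    \"internlm/internlm2_5-7b-chat-1m\", trust_remote_code=True\n",")\n","```"]}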
,{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading model: internlm/internlm2_5-7b-chat-1m with adapter: inflaton-ai/InternLM_2_5-7b_LoRA-Adapter\n"]},{"data":{"text/plain":["Loading checkpoint shards:   0%|          | 0/8 [00:00, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"text/plain":["adapter_config.json:   0%|          | 0.00/679 [00:00, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"text/plain":["adapter_model.safetensors:   0%|          | 0.00/37.8M [00:00, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["CPU times: user 3.06 s, sys: 5.04 s, total: 8.11 s\n","Wall time: 31.6 s\n"]}],"source":["%%time\n","\n","model, tokenizer = load_model(model_name, adapter_name_or_path=adapter_name_or_path, using_llama_factory=False)\n","# model, tokenizer = load_model(\"inflaton-ai/InternLM_2_5-7b_LoRA-Adapter\", using_llama_factory=False)"]}
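,{"cell_type":"markdown","metadata":{},"source":["The `UserWarning` emitted below is benign: on Apple's MPS backend the `aten::isin` operator falls back to the CPU, which only costs speed. `evaluate_model` chains the project's `eval_model`, `save_results` and `calc_metrics` helpers; assuming `calc_metrics` boils down to exact-match accuracy over label/prediction pairs (a hypothetical stand-in, not the project code), the core computation is simply:\n","\n","```python\n","def simple_accuracy(labels, predictions):\n","    \"\"\"Fraction of predictions that exactly match their gold labels.\"\"\"\n","    return sum(l == p for l, p in zip(labels, predictions)) / len(labels)\n","```"]}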
,{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/inflaton/anaconda3/envs/logical-reasoning/lib/python3.11/site-packages/transformers/generation/utils.py:1513: UserWarning: The operator 'aten::isin.Tensor_Tensor_out' is not currently supported on the MPS backend and will fall back to run on the CPU. This may have performance implications. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/mps/MPSFallback.mm:13.)\n","  if eos_token_id is not None and torch.isin(elements=eos_token_id, test_elements=pad_token_id).any():\n"]},{"name":"stdout","output_type":"stream","text":["是\n","CPU times: user 579 ms, sys: 330 ms, total: 909 ms\n","Wall time: 2.86 s\n"]}],"source":["def evaluate_model(model, tokenizer, model_name, dataset):\n","    print(f\"Evaluating model: {model_name} on {device}\")\n","    # Run inference over the full dataset on the current device.\n","    predictions = eval_model(model, tokenizer, dataset, device=device)\n","\n","    # Persist per-sample predictions next to earlier runs for comparison.\n","    save_results(\n","        model_name,\n","        results_path,\n","        dataset,\n","        predictions,\n","        debug=False,\n","    )\n","\n","    # Score predictions against gold labels and print the summary metrics.\n","    metrics = calc_metrics(dataset[\"label\"], predictions, debug=False)\n","    print(metrics)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["%%time\n","\n","evaluate_model(model, tokenizer, f\"{model_name}_{adapter_name_or_path}_nv4080\", datasets[\"test\"])"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"mostRecentlyExecutedCommandWithImplicitDF":{"commandId":-1,"dataframes":["_sqldf"]},"pythonIndentUnit":4},"notebookName":"10_eval-lf-medium-py3.11","widgets":{}},"colab":{"gpuType":"L4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}