{ "cells": [
 { "cell_type": "code", "execution_count": 11, "metadata": { "executionInfo": { "elapsed": 476, "status": "ok", "timestamp": 1720679526275, "user": { "displayName": "HUANG DONGHAO _", "userId": "00977795705617022768" }, "user_tz": -480 }, "id": "uWKRSV6eZsCn" },
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ],
  "source": [
   "# Mode 2 re-imports changed modules before each cell runs, so edits to\n",
   "# project code are picked up without restarting the kernel.\n",
   "%load_ext autoreload\n",
   "%autoreload 2"
  ] },
 { "cell_type": "code", "execution_count": 12, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "6d394937-6c99-4a7c-9d32-7600a280032f", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 5, "status": "ok", "timestamp": 1720679529345, "user": { "displayName": "HUANG DONGHAO _", "userId": "00977795705617022768" }, "user_tz": -480 }, "id": "G5pNu3zgZBrL", "outputId": "160a554f-fb08-4aa0-bc00-0422fb7c1fac" },
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /Users/inflaton/code/engd/papers/rapget-translation\n" ] } ],
  "source": [
   "import os\n",
   "import sys\n",
   "from pathlib import Path\n",
   "\n",
   "# Keep a `workding_dir` that already exists in the namespace (for example\n",
   "# from an earlier run of this cell); otherwise default to the parent of\n",
   "# the current directory.\n",
   "if \"workding_dir\" not in locals():\n",
   "    workding_dir = str(Path.cwd().parent)\n",
   "\n",
   "os.chdir(workding_dir)\n",
   "# Re-running this cell must not pile duplicate entries onto sys.path.\n",
   "if workding_dir not in sys.path:\n",
   "    sys.path.append(workding_dir)\n",
   "print(\"workding dir:\", workding_dir)"
  ] },
 { "cell_type": "code", "execution_count": 13, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "9f67ec60-2f24-411c-84eb-0dd664b44775", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 3, "status": "ok", "timestamp": 1720679529345, "user": { "displayName": "HUANG DONGHAO _", "userId":
"00977795705617022768" }, "user_tz": -480 }, "id": "hPCC-6m7ZBrM", "outputId": "c7aa2c96-5e99-440a-c148-201d79465ff9" },
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading env vars from: /Users/inflaton/code/engd/papers/rapget-translation/.env\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ],
  "source": [
   "from dotenv import find_dotenv, load_dotenv\n",
   "\n",
   "# Prefer `.env`; fall back to `.env.example` when find_dotenv returns an\n",
   "# empty path for it.\n",
   "found_dotenv = find_dotenv(\".env\") or find_dotenv(\".env.example\")\n",
   "print(f\"loading env vars from: {found_dotenv}\")\n",
   "# override=True: values from the file replace variables already set in\n",
   "# the process environment.\n",
   "load_dotenv(found_dotenv, override=True)"
  ] },
 { "cell_type": "code", "execution_count": 14, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "f1597656-8042-4878-9d3b-9ebfb8dd86dc", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 3, "status": "ok", "timestamp": 1720679529345, "user": { "displayName": "HUANG DONGHAO _", "userId": "00977795705617022768" }, "user_tz": -480 }, "id": "1M3IraVtZBrM", "outputId": "29ab35f6-2970-4ade-d85d-3174acf8cda0" },
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Qwen/Qwen2-7B-Instruct None False datasets/mac/mac.tsv results/mac-results_rpp_with_mnt_2048.csv False 2048\n" ] } ],
  "source": [
   "import os\n",
   "\n",
   "\n",
   "def _env_flag(name):\n",
   "    \"\"\"Return True when env var `name` equals 'true', ignoring case and surrounding whitespace.\n",
   "\n",
   "    An unset or empty variable yields False.\n",
   "    \"\"\"\n",
   "    return (os.getenv(name) or \"\").strip().lower() == \"true\"\n",
   "\n",
   "\n",
   "model_name = os.getenv(\"MODEL_NAME\")\n",
   "adapter_name_or_path = os.getenv(\"ADAPTER_NAME_OR_PATH\")\n",
   "load_in_4bit = _env_flag(\"LOAD_IN_4BIT\")\n",
   "data_path = os.getenv(\"DATA_PATH\")\n",
   "results_path = os.getenv(\"RESULTS_PATH\")\n",
   "use_english_datasets = _env_flag(\"USE_ENGLISH_DATASETS\")\n",
   "# `or` also covers MAX_NEW_TOKENS being set but empty, which int() cannot parse.\n",
   "max_new_tokens = int(os.getenv(\"MAX_NEW_TOKENS\") or 2048)\n",
   "\n",
   "# Echo the effective configuration so it is recorded in the cell output.\n",
   "print(\n",
   "    model_name,\n",
   "    adapter_name_or_path,\n",
   "    load_in_4bit,\n",
   "    data_path,\n",
   "    results_path,\n",
   "    use_english_datasets,\n",
   "    max_new_tokens,\n",
   ")"
  ] },
 { "cell_type": "code", "execution_count": 15, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "b2a43943-9324-4839-9a47-cfa72de2244b", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 564, "status": "ok", "timestamp": 1720679529907, "user": { "displayName": "HUANG DONGHAO _", "userId": "00977795705617022768" }, "user_tz": -480 }, "id": "UgMvt6dIZBrM", "outputId": "ce37581c-fd26-46c2-ad87-d933d99f68f7" },
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python 3.11.9\n", "Name: torch\n", "Version: 2.4.0\n", "Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration\n", "Home-page: https://pytorch.org/\n", "Author: PyTorch Team\n", "Author-email: packages@pytorch.org\n", "License: BSD-3\n", "Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages\n", "Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions\n", "Required-by: accelerate, peft, torchaudio, torchvision, trl\n", "---\n", "Name: transformers\n", "Version: 4.43.3\n", "Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow\n", "Home-page: https://github.com/huggingface/transformers\n", "Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)\n", "Author-email: transformers@huggingface.co\n", "License: Apache 2.0 License\n", "Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages\n", "Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm\n", "Required-by: llamafactory, peft, trl\n", "CPU times: user 7.4 ms, sys: 21.3 ms, total: 28.7 ms\n", "Wall time: 1.83 s\n" ] } ],
  "source": [ "%%time\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"true\"\n", "\n", "!python --version\n", "!pip show torch transformers" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 1685, "status": "ok", "timestamp": 1720679531591, "user": { "displayName": "HUANG DONGHAO _", "userId": "00977795705617022768" }, "user_tz": -480 }, "id": "ZuS_FsLyZBrN", "outputId": "2cba0105-c505-4395-afbd-2f2fee6581d0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MPS is available\n" ] } ], "source": [ "from llm_toolkit.llm_utils import *\n", "from llm_toolkit.translation_utils import *\n", "\n", "device = check_gpu()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading existing data from: logs/openai-training-sample.jsonl\n" ] }, { "data": { "text/html": [ "
\n", " | messages | \n", "
---|---|
0 | \n", "[{'role': 'system', 'content': 'Marv is a fact... | \n", "
1 | \n", "[{'role': 'system', 'content': 'Marv is a fact... | \n", "
2 | \n", "[{'role': 'system', 'content': 'Marv is a fact... | \n", "