Spaces:
Build error
Build error
File size: 13,627 Bytes
c755e09 |
1 2 |
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/Library/CloudStorage/GoogleDrive-dh.huang.2023@smu.edu.sg/My Drive/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/Library/CloudStorage/GoogleDrive-dh.huang.2023@smu.edu.sg/My Drive/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"data":{"text/plain":["(5, 5, 5)"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","\n","\n","df_dev = pd.read_csv(\"datasets/mgtv/dev.csv\")\n","len(df_dev[\"title\"].value_counts()), len(df_dev[\"puzzle\"].value_counts()), len(\n"," df_dev[\"truth\"].value_counts()\n",")"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/Library/CloudStorage/GoogleDrive-dh.huang.2023@smu.edu.sg/My Drive/logical-reasoning/llm_toolkit/translation_utils.py\n"]},{"name":"stderr","output_type":"stream","text":["[nltk_data] Downloading package wordnet to\n","[nltk_data] /Users/inflaton/nltk_data...\n","[nltk_data] Package wordnet is already up-to-date!\n","[nltk_data] Downloading package punkt to /Users/inflaton/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n","[nltk_data] Downloading package omw-1.4 to\n","[nltk_data] /Users/inflaton/nltk_data...\n","[nltk_data] Package omw-1.4 is already up-to-date!\n"]}],"source":["from llm_toolkit.translation_utils import translate\n","import pandas as pd\n","\n","\n","def translate_df(df, cache_path=None):\n"," if cache_path and os.path.exists(cache_path):\n"," cache_df = pd.read_csv(cache_path)\n"," else:\n"," cache_df = pd.DataFrame(columns=[\"chinese\", \"english\"])\n","\n"," cache_dict = {k: v for k, v in zip(cache_df[\"chinese\"], cache_df[\"english\"])}\n","\n"," df[\"text\"] = df[\"text\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"title\"] = df[\"title\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"label\"] = df[\"label\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"puzzle\"] = df[\"puzzle\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"truth\"] = df[\"truth\"].apply(lambda x: translate(x, cache_dict))\n","\n"," if cache_path:\n"," for k in cache_df[\"chinese\"]:\n"," if k in cache_dict:\n"," del cache_dict[k]\n","\n"," if k in cache_dict:\n"," new_data = {\"chinese\": k, \"english\": cache_dict[k]}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n","\n"," cache_df.to_csv(cache_path, index=False)\n","\n"," return df"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["df_dev = translate_df(df_dev, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["df_dev.to_csv(\"datasets/mgtv/dev_en.csv\", index=False)"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["import pandas as pd\n","\n","df = pd.read_csv(\"datasets/mgtv/train.csv\")"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["df = translate_df(df, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["df.to_csv(\"datasets/mgtv/train_en.csv\", index=False)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"data":{"text/plain":["label\n","No 11783\n","Yes 6591\n","Unimportant 5076\n","Incorrect questioning 921\n","Correct answer 629\n","Name: count, dtype: int64"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["df[\"label\"].value_counts()"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>text</th>\n"," <th>label</th>\n"," <th>answer</th>\n"," <th>title</th>\n"," <th>puzzle</th>\n"," <th>truth</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Did the thief believe in the gods?</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Did they steal the pumpkins to ensure a bounti...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>The villagers like pumpkins too.</td>\n"," <td>Unimportant</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>People in the village need to use pumpkins as ...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Were they stolen from the village?</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" text label answer \\\n","0 Did the thief believe in the gods? No NaN \n","1 Did they steal the pumpkins to ensure a bounti... No NaN \n","2 The villagers like pumpkins too. Unimportant NaN \n","3 People in the village need to use pumpkins as ... No NaN \n","4 Were they stolen from the village? Yes NaN \n","\n"," title \\\n","0 The Mystery of the Vanishing Pumpkins \n","1 The Mystery of the Vanishing Pumpkins \n","2 The Mystery of the Vanishing Pumpkins \n","3 The Mystery of the Vanishing Pumpkins \n","4 The Mystery of the Vanishing Pumpkins \n","\n"," puzzle \\\n","0 In the village of Zhen, there is a legend that... \n","1 In the village of Zhen, there is a legend that... \n","2 In the village of Zhen, there is a legend that... \n","3 In the village of Zhen, there is a legend that... \n","4 In the village of Zhen, there is a legend that... \n","\n"," truth \n","0 The truth turned out to be related to an old f... \n","1 The truth turned out to be related to an old f... \n","2 The truth turned out to be related to an old f... \n","3 The truth turned out to be related to an old f... \n","4 The truth turned out to be related to an old f... "]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["df_cn = pd.read_csv(\"datasets/mgtv/train.csv\")\n","df_cache = pd.read_csv(\"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/plain":["Index(['text', 'label', 'answer', 'title', 'puzzle', 'truth'], dtype='object')"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["df_cn.columns"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/plain":["0"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["count = 0\n","for col in [\"text\", \"title\", \"puzzle\", \"truth\"]:\n"," for c in df_cn[col].unique():\n"," if c not in df_cache[\"chinese\"].values:\n"," # print(c)\n"," loc = df_cn.loc[df_cn[col] == c, col]\n"," first_occurrence_index = loc.index[\n"," 0\n"," ] # Get the index of the first occurrence\n"," # print(f\"First occurrence at index: {first_occurrence_index}\")\n"," row_cn = df_cn.iloc[first_occurrence_index][col]\n"," row_en = df.iloc[first_occurrence_index][col]\n"," new_data = {\"chinese\": row_cn, \"english\": row_en}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n"," count += 1\n","\n","count"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>chinese</th>\n"," <th>english</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":["Empty DataFrame\n","Columns: [chinese, english]\n","Index: []"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["import re\n","\n","# Function to check if an English translation contains Chinese characters\n","def contains_chinese(text):\n"," return bool(re.search(r\"[\\u4e00-\\u9fff]\", str(text)))\n","\n","\n","# Apply the function to the English column to find rows with partial Chinese text\n","partial_translations = df_cache[df_cache[\"english\"].apply(contains_chinese)]\n","\n","partial_translations.head()"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/plain":["(0, 2)"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["partial_translations.shape"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
|