{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/Library/CloudStorage/GoogleDrive-dh.huang.2023@smu.edu.sg/My Drive/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/Library/CloudStorage/GoogleDrive-dh.huang.2023@smu.edu.sg/My Drive/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"data":{"text/plain":["(5, 5, 5)"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","\n","\n","df_dev = pd.read_csv(\"datasets/mgtv/dev.csv\")\n","len(df_dev[\"title\"].value_counts()), len(df_dev[\"puzzle\"].value_counts()), len(\n"," df_dev[\"truth\"].value_counts()\n",")"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/Library/CloudStorage/GoogleDrive-dh.huang.2023@smu.edu.sg/My Drive/logical-reasoning/llm_toolkit/translation_utils.py\n"]},{"name":"stderr","output_type":"stream","text":["[nltk_data] Downloading package wordnet to\n","[nltk_data] /Users/inflaton/nltk_data...\n","[nltk_data] Package wordnet is already up-to-date!\n","[nltk_data] Downloading package punkt to /Users/inflaton/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n","[nltk_data] Downloading package omw-1.4 to\n","[nltk_data] /Users/inflaton/nltk_data...\n","[nltk_data] Package omw-1.4 is already up-to-date!\n"]}],"source":["from llm_toolkit.translation_utils import translate\n","import pandas as pd\n","\n","\n","def translate_df(df, cache_path=None):\n"," if cache_path and os.path.exists(cache_path):\n"," cache_df = pd.read_csv(cache_path)\n"," else:\n"," cache_df = pd.DataFrame(columns=[\"chinese\", \"english\"])\n","\n"," cache_dict = {k: v for k, v in zip(cache_df[\"chinese\"], cache_df[\"english\"])}\n","\n"," df[\"text\"] = df[\"text\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"title\"] = df[\"title\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"label\"] = df[\"label\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"puzzle\"] = df[\"puzzle\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"truth\"] = df[\"truth\"].apply(lambda x: translate(x, cache_dict))\n","\n"," if cache_path:\n"," for k in cache_df[\"chinese\"]:\n"," if k in cache_dict:\n"," del cache_dict[k]\n","\n"," if k in cache_dict:\n"," new_data = {\"chinese\": k, \"english\": cache_dict[k]}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n","\n"," cache_df.to_csv(cache_path, index=False)\n","\n"," return df"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["df_dev = translate_df(df_dev, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["df_dev.to_csv(\"datasets/mgtv/dev_en.csv\", index=False)"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["import pandas as pd\n","\n","df = pd.read_csv(\"datasets/mgtv/train.csv\")"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["df = translate_df(df, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["df.to_csv(\"datasets/mgtv/train_en.csv\", index=False)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"data":{"text/plain":["label\n","No 11783\n","Yes 6591\n","Unimportant 5076\n","Incorrect questioning 921\n","Correct answer 629\n","Name: count, dtype: int64"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["df[\"label\"].value_counts()"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
textlabelanswertitlepuzzletruth
0Did the thief believe in the gods?NoNaNThe Mystery of the Vanishing PumpkinsIn the village of Zhen, there is a legend that...The truth turned out to be related to an old f...
1Did they steal the pumpkins to ensure a bounti...NoNaNThe Mystery of the Vanishing PumpkinsIn the village of Zhen, there is a legend that...The truth turned out to be related to an old f...
2The villagers like pumpkins too.UnimportantNaNThe Mystery of the Vanishing PumpkinsIn the village of Zhen, there is a legend that...The truth turned out to be related to an old f...
3People in the village need to use pumpkins as ...NoNaNThe Mystery of the Vanishing PumpkinsIn the village of Zhen, there is a legend that...The truth turned out to be related to an old f...
4Were they stolen from the village?YesNaNThe Mystery of the Vanishing PumpkinsIn the village of Zhen, there is a legend that...The truth turned out to be related to an old f...
\n","
"],"text/plain":[" text label answer \\\n","0 Did the thief believe in the gods? No NaN \n","1 Did they steal the pumpkins to ensure a bounti... No NaN \n","2 The villagers like pumpkins too. Unimportant NaN \n","3 People in the village need to use pumpkins as ... No NaN \n","4 Were they stolen from the village? Yes NaN \n","\n"," title \\\n","0 The Mystery of the Vanishing Pumpkins \n","1 The Mystery of the Vanishing Pumpkins \n","2 The Mystery of the Vanishing Pumpkins \n","3 The Mystery of the Vanishing Pumpkins \n","4 The Mystery of the Vanishing Pumpkins \n","\n"," puzzle \\\n","0 In the village of Zhen, there is a legend that... \n","1 In the village of Zhen, there is a legend that... \n","2 In the village of Zhen, there is a legend that... \n","3 In the village of Zhen, there is a legend that... \n","4 In the village of Zhen, there is a legend that... \n","\n"," truth \n","0 The truth turned out to be related to an old f... \n","1 The truth turned out to be related to an old f... \n","2 The truth turned out to be related to an old f... \n","3 The truth turned out to be related to an old f... \n","4 The truth turned out to be related to an old f... "]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["df_cn = pd.read_csv(\"datasets/mgtv/train.csv\")\n","df_cache = pd.read_csv(\"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/plain":["Index(['text', 'label', 'answer', 'title', 'puzzle', 'truth'], dtype='object')"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["df_cn.columns"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/plain":["0"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["count = 0\n","for col in [\"text\", \"title\", \"puzzle\", \"truth\"]:\n"," for c in df_cn[col].unique():\n"," if c not in df_cache[\"chinese\"].values:\n"," # print(c)\n"," loc = df_cn.loc[df_cn[col] == c, col]\n"," first_occurrence_index = loc.index[\n"," 0\n"," ] # Get the index of the first occurrence\n"," # print(f\"First occurrence at index: {first_occurrence_index}\")\n"," row_cn = df_cn.iloc[first_occurrence_index][col]\n"," row_en = df.iloc[first_occurrence_index][col]\n"," new_data = {\"chinese\": row_cn, \"english\": row_en}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n"," count += 1\n","\n","count"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
chineseenglish
\n","
"],"text/plain":["Empty DataFrame\n","Columns: [chinese, english]\n","Index: []"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["import re\n","\n","# Function to check if an English translation contains Chinese characters\n","def contains_chinese(text):\n"," return bool(re.search(r\"[\\u4e00-\\u9fff]\", str(text)))\n","\n","\n","# Apply the function to the English column to find rows with partial Chinese text\n","partial_translations = df_cache[df_cache[\"english\"].apply(contains_chinese)]\n","\n","partial_translations.head()"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/plain":["(0, 2)"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["partial_translations.shape"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}