created english dataset: datasets/mgtv/dev_en.csv
- .env.example +8 -5
- competition/02_Translation.ipynb +1 -0
- datasets/mgtv/dev_en.csv +0 -0
- datasets/mgtv/unique_translations.csv +0 -0
- llm_toolkit/chat_mac.py +88 -0
- llm_toolkit/eval_mac.py +67 -0
- llm_toolkit/translation_engine.py +14 -0
- llm_toolkit/translation_utils.py +41 -0
- novel-translation/08_eval-lf-py3.11.ipynb +74 -13
.env.example
CHANGED
@@ -1,12 +1,15 @@
+OPENAI_API_KEY=
 HF_TOKEN=
-
+
+MODEL_NAME=Qwen2/Qwen2-1.5B-Instruct
+
 LOAD_IN_4BIT=true
 NUM_TRAIN_EPOCHS=10
 
 DATA_PATH=datasets/mac/mac.tsv
-RESULTS_PATH=results/mac-
+RESULTS_PATH=results/mac-results_lf.csv
 
 EVAL_BASE_MODEL=true
-
-
-
+EVAL_FINE_TUNED=true
+SAVE_FINE_TUNED=true
+DO_FINE_TUNING=true
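For reference, a minimal sketch of how the toolkit scripts consume these variables (the values set here are for illustration only): booleans are stored as the literal string `true`, so the scripts compare strings rather than casting.

```python
import os

# Illustration only: simulate the .env values from the diff above.
os.environ["LOAD_IN_4BIT"] = "true"
os.environ["EVAL_BASE_MODEL"] = "false"
os.environ["NUM_TRAIN_EPOCHS"] = "10"

# os.getenv returns strings; bool("false") would be truthy,
# so the scripts compare against the literal "true".
load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
eval_base_model = os.getenv("EVAL_BASE_MODEL") == "true"
# `or 0` guards against the variable being unset (None) or empty.
num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS") or 0)

print(load_in_4bit, eval_base_model, num_train_epochs)  # -> True False 10
```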
competition/02_Translation.ipynb
ADDED
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /home/inflaton/code/projects/courses/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /home/inflaton/code/projects/courses/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, 
load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{"id":"4hQO8gkFzi_K"},"outputs":[],"source":["import pandas as pd\n","\n","df = pd.read_csv(\"datasets/mgtv/train.csv\")"]},{"cell_type":"code","execution_count":6,"metadata":{"id":"W2QyVreqhOGM","outputId":"68b9590e-1ac6-4c6f-e0c4-e273ec816419"},"outputs":[{"name":"stdout","output_type":"stream","text":["<class 'pandas.core.frame.DataFrame'>\n","RangeIndex: 25000 entries, 0 to 24999\n","Data columns (total 6 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 text 25000 non-null object \n"," 1 label 25000 non-null object \n"," 2 answer 0 non-null float64\n"," 3 title 25000 non-null object \n"," 4 puzzle 25000 non-null object \n"," 5 truth 25000 non-null object \n","dtypes: float64(1), object(5)\n","memory usage: 1.1+ MB\n"]}],"source":["df.info()"]},{"cell_type":"code","execution_count":7,"metadata":{"id":"8mOMrIurhOGN","outputId":"1870d855-7c18-4850-eb88-302acad05719"},"outputs":[{"data":{"text/plain":["label\n","不是 11783\n","是 6591\n","不重要 5076\n","问法错误 921\n","回答正确 629\n","Name: count, dtype: int64"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["df[\"label\"].value_counts()"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"data":{"text/plain":["puzzle\n","在一个寂静的夜晚,甄贾贾匆匆忙忙地从小巷子里跑了出来,脸上满是惊恐。在他身后,一只空荡荡的垃圾桶翻倒在地。甄贾贾并非小巷里的居民,也没有与人结怨。究竟是什么原因让他如此惊慌失措? 10058\n","在一栋老旧居民楼的地下室。一天,警方接到报案,有人在地下室中发现了甄好奇的遗体。现场没有打斗的痕迹,也没有明显的自杀工具。死者被发现时坐在一把椅子上,表情平静,似乎在死前并未经历痛苦。地下室除了一台电视和一张桌子外,没有其他物品。桌上有一杯水,电视还开着。究竟是什么原因导致了甄好奇的死亡? 
9345\n","一个晴朗的周末,公园的长椅上发生了一件奇怪的事情。一位老人每天下午都会来到这个公园的同一张长椅上看书。但今天,当他像往常一样来到公园时,却发现他的椅子不见了。更奇怪的是,公园里的其他椅子都完好无损,唯独他那张椅子不见了踪影。 1719\n","在一片宁静的沙滩上,甄德帅突然惊慌失措地跑来跑去,似乎在寻找着什么。他的举动引起了其他沙滩游客的注意,但没有人能理解他到底在做什么。 1401\n","在一个寂静的夜晚,考古学家甄历史被发现死在了他刚刚发掘的古埃及墓穴中。他的身体趴在一张石桌上,周围散落着各种神秘的古埃及文物。令人不解的是,墓穴内并没有打斗的痕迹,甄历史的表情安详,似乎是在某种不可思议的平静中结束了自己的生命。请还原真相。 866\n","在神秘的森林深处,有一个小木屋。某天,一只狐狸突然冲进小木屋,紧接着又冲了出来,然后又冲了进去……如此反复十几次。这是怎么回事? 539\n","在一个安静的夜晚,小镇上的钟楼突然停止了报时。第二天早晨,人们发现钟楼的管理员甄大勇失踪了,而钟楼的门紧闭,从外面看起来一切正常。小镇上的人们议论纷纷,不知道发生了什么事情。 510\n","在一个封闭房间内,发现了甄木匠的遗体。现场没有打斗痕迹,死者脸上带有惊恐的表情。房间内除了一张床和一把椅子外,别无他物。请问,甄木匠是如何在这样一个看似安全的房间内死亡的? 131\n","在神秘的大森林里,有一个小木屋。一天,一只熊走进了这个木屋,但它并没有寻找食物,反而开始哭泣。它为什么会哭泣呢? 106\n","在一个寂静的夜晚,古宅里传来了一阵惊恐的尖叫。早晨,人们发现著名收藏家甄先生的遗体躺在他的书房里,身边散落着一些珍贵的古董。令人不解的是,书房的门和窗户都从内部紧闭,没有任何强行闯入的痕迹。究竟是什么原因导致了甄先生的死亡? 86\n","在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。 61\n","在公司午休时间,所有人都注意到甄认真的桌上多了一碗热气腾腾的汤。然而,甄认真却坚决不让大家尝一口,甚至不愿透露汤的真正内容。同事们议论纷纷,都想知道这碗汤隐藏着什么秘密。 54\n","在一个炎热的夏日,乡村的甄家大院的西瓜突然全部不翼而飞。据了解,甄家大院周围并没有其他人家,而且门窗都完好无损,没有任何被撬的痕迹。村民们议论纷纷,猜测这批西瓜究竟去了哪里。你知道西瓜去了哪里吗? 49\n","在深不见底的森林中,发现一具男子的遗体。死者甄灿身上没有任何明显的外伤,周围也没有打斗的痕迹。唯一奇怪的是,甄灿手中紧握着一片绿叶。警方调查后发现,他在死前曾拼命寻找某种东西。你知道甄灿究竟在寻找什么吗? 32\n","在深山老林里,发现了甄青年的遗体,现场没有打斗痕迹,遗体旁边有一盏熄灭的登山灯。据了解,甄青年是独自一人的登山爱好者,前一天晚上他还与朋友通了电话,表示第二天要征服附近的一座高峰。警方调查后,排除了他杀的可能性。那么,甄青年究竟是如何在山上遇难的? 
23\n","在一个安静的夜晚,一个图书馆的阅览室内,只有甄读者和一位图书管理员。突然,甄读者站起来,神情紧张地走向管理员,询问能否借用一下电话。管理员点头答应,但发现甄读者并没有拨打电话,而是静静地站在电话旁边片刻后,便匆匆离开了。 20\n","Name: count, dtype: int64"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["df[\"puzzle\"].value_counts()"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"data":{"text/plain":["truth\n","原来,甄贾贾是一名正在潜逃的小偷。当晚,他企图在巷子里寻找可以盗窃的目标,却不料垃圾桶里突然窜出一只野猫,将他吓了一跳。他以为是被警方发现了,于是惊慌失措地逃跑了。而那只垃圾桶,正是他翻找东西时不小心弄翻的。 10058\n","甄好奇其实是一位科幻小说作家,患有严重的幽闭恐惧症。他为了寻找写作的灵感,常常独自一人来到这个地下室观看科幻电影。不幸的是,有一天他在观看电影时突然遭遇停电,地下室一片漆黑。他的幽闭恐惧症发作,极度恐慌中��以为自己被困在一个未知的宇宙空间。在这种极端恐惧的状态下,他突发心脏病,平静地离开了人世。由于心脏病发作时没有痛苦,所以他的表情看起来很平静。而桌上的水和开着的电视,只是他日常在地下室写作时的习惯。 9345\n","真相是,公园里的园艺工人前一天晚上在修剪树枝时不小心弄坏了一盏路灯。他们为了修理路灯,临时把那张长椅搬过去当作梯子使用。修理完毕后,他们忘了把长椅放回原位,导致老人找不到自己的椅子。而那张长椅,此刻正无辜地躺在公园的角落里,上面还写着:“我不是椅子,我是临时梯子。” 1719\n","原来,甄德帅是一位环保志愿者,他在沙滩上发现了一只被渔网缠住的幼海龟。幼海龟的生命危在旦夕,甄德帅急于寻找工具来解救它。然而,由于紧张和焦急,他无法用言语清楚地表达自己的意图,导致其他游客误以为他是在寻找失物。 1401\n","甄历史在研究古埃及文物时,意外地发现了一枚罕见的魔法符文石。在深入的研究中,他偶然激活了符文石上的诅咒。这个诅咒能让触碰它的人在一种幻觉中看到自己最渴望的事情,并由此进入一种假死状态。实际上,甄历史是在幻觉中体验了回到古埃及黄金时代的场景,心满意足地结束了自己的生命。而这一切,外人都无法理解,只能看到他死亡的表象。 866\n","原来,这只狐狸误食了一颗具有神奇效果的果实,这颗果实让狐狸具备了短时间内预见未来的能力。但是,这个能力有一个副作用,就是每次预见未来都会让狐狸忘记之前预见的内容。在反复进出木屋的过程中,狐狸其实是在试图回忆起自己刚刚预见的未来片段,而这个片段恰好是:它的晚餐会在木屋的某个角落里找到。这个真相让森林里的其他动物忍俊不禁,原来狐狸的奇异行为竟然只是为了寻找晚餐。 539\n","真相是,钟楼的管理员甄大勇在夜晚进行例行的钟楼维护时,不慎从钟楼的顶部摔落,但并未死亡,只是昏迷。由于他跌落时砸到了控制时钟报时的机械装置,导致钟声停止。他躺在钟楼底部,但由于门从内部反锁,外面的人无法进入。甄大勇在第二天中午苏醒后,自己打开了门,这才知道自己引发了小镇上的恐慌。 510\n","原来,甄木匠患有严重的心脏病,但他一直对外隐瞒。在事发当天,他独自一人在工作室加工木材,不慎将一根木刺刺入了脚掌。由于他患有心脏病,这根微不足道的木刺引发的感染竟导致心脏骤停,最终离世。由于工作室门窗紧闭,且现场并无其他人在场,这个秘密被隐藏了起来。直到真相大白,村民们才恍然大悟。 131\n","原来,这只熊是一个超级剧迷,它一直在追一部关于熊的连续剧。那天,它刚好看到剧中的熊主角失去了最心爱的玩具,情不自禁地感同身受,流下了眼泪。而这间小木屋,恰好是它看剧的秘密基地。森林里的其他动物都以为熊是因为找不到食物而哭泣,其实它只是为了一个虚构的剧情而感动。 106\n","真相是,甄先生在收藏市场上偶然发现了一枚传说中的毒戒指。传说中,这枚戒指曾属于一位中世纪的巫师,戒指上镶嵌的宝石含有剧毒。甄先生对这枚毒戒指非常感兴趣,于是将其买下。然而,他在把玩毒戒指的时候,不慎将其掉落在地,导致戒指上的宝石破裂,释放出致命的毒气。由于书房门窗紧闭,毒气在室内循环,甄先生无法逃脱,最终窒息而死。至于古董散落一地,是因为甄先生在毒气侵袭下,痛苦挣扎时造成的。 
86\n","真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。 61\n","原来,甄认真最近在参加一个健康饮食的挑战,而这碗汤是他自己制作的低卡路里养生汤。他之所以不愿意分享,是因为这是他第一次尝试烹饪,担心同事们不喜欢,也害���大家取笑他的烹饪技巧。此外,他还在汤中加入了一种据说能提神醒脑的草药,想在下午的工作中保持最佳状态,这也成为了他不愿分享的秘密。 54\n","原来,这批西瓜是被一只巨大的乌鸦偷走了。这只乌鸦为了给自己的孩子们准备食物,它趁着夜色,竟然将甄家大院的西瓜一颗颗地带回了巢穴。第二天,村民们发现了乌鸦的巢穴,里面堆满了西瓜,而这个意外的真相让所有人都忍俊不禁。甄家老爷也感慨地说:“真是世界大了,什么奇事都有!” 49\n","甄灿是一位植物学家,他深知森林中某种罕见草药的药用价值。他的妻子患有重病,急需这种草药救治。为了救妻子,甄灿冒险进入森林寻找这种草药。然而,他在森林中误食了一种有毒的植物,导致心脏麻痹而死。甄灿在临死前意识到自己的错误,因此紧握着那种有毒绿叶,希望以此警示后来者不要重蹈覆辙。而他寻找的那种草药,其实就在他遗体附近不远处。 32\n","真相是甄青年患有梦游症,当天晚上在深山中的帐篷里梦游时,不慎摔下了山崖。由于梦游时意识不清,他没有能及时反应过来危险,而且由于登山灯在梦游中并未打开,导致他无法看清路况,最终导致了悲剧的发生。第二天早上,他的朋友由于联系不上他,报警进行了搜救,最终发现了这一不幸的事实。 23\n","甄读者其实是一位正在躲避追杀的侦探。他在图书馆查阅资料时,发现追杀他的人可能已经接近。为了确认自己的推断,他向管理员借用电话是为了测试电话是否被监听。他之所以没有拨号,而是静静地站在电话旁边,是因为他在检查电话听筒是否有异常的微小声响,这是他作为侦探的专业技能。当他确认电话安全后,他离开了图书馆,并悄悄地用手机联系了他的同事,安排下一步的行动和庇护所。 20\n","Name: count, dtype: int64"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["df[\"truth\"].value_counts()"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"data":{"text/plain":["(16, 16, 16)"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["len(df[\"title\"].value_counts()), len(df[\"puzzle\"].value_counts()), len(df[\"truth\"].value_counts())"]},{"cell_type":"code","execution_count":47,"metadata":{},"outputs":[{"data":{"text/plain":["(3, 3, 3)"]},"execution_count":47,"metadata":{},"output_type":"execute_result"}],"source":["df_test_a = pd.read_csv(\"datasets/mgtv/test_a.csv\")\n","len(df_test_a[\"title\"].value_counts()), len(df_test_a[\"puzzle\"].value_counts()), len(\n"," df_test_a[\"truth\"].value_counts()\n",")"]},{"cell_type":"code","execution_count":64,"metadata":{},"outputs":[{"data":{"text/plain":["(5, 5, 5)"]},"execution_count":64,"metadata":{},"output_type":"execute_result"}],"source":["df_dev = 
pd.read_csv(\"datasets/mgtv/dev.csv\")\n","len(df_dev[\"title\"].value_counts()), len(df_dev[\"puzzle\"].value_counts()), len(\n"," df_dev[\"truth\"].value_counts()\n",")"]},{"cell_type":"code","execution_count":65,"metadata":{},"outputs":[],"source":["from llm_toolkit.translation_utils import translate\n","import pandas as pd\n","\n","\n","def translate_df(df, cache_path=None):\n"," if cache_path and os.path.exists(cache_path):\n"," cache_df = pd.read_csv(cache_path)\n"," else:\n"," cache_df = pd.DataFrame(columns=[\"chinese\", \"english\"])\n","\n"," cache_dict = {k: v for k, v in zip(cache_df[\"chinese\"], cache_df[\"english\"])}\n","\n"," df[\"text\"] = df[\"text\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"title\"] = df[\"title\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"label\"] = df[\"label\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"puzzle\"] = df[\"puzzle\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"truth\"] = df[\"truth\"].apply(lambda x: translate(x, cache_dict))\n","\n"," if cache_path:\n"," for k in cache_df[\"chinese\"]:\n"," if k in cache_dict:\n"," del cache_dict[k]\n","\n"," if k in cache_dict:\n"," new_row = {'chinese': k, 'english': cache_dict[k]}\n"," cache_df = cache_df.append(new_row, ignore_index=True)\n"," \n"," cache_df.to_csv(cache_path, index=False)\n","\n","\n"," return df"]},{"cell_type":"code","execution_count":66,"metadata":{},"outputs":[],"source":["df_dev = translate_df(df_dev, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":67,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>text</th>\n"," 
<th>label</th>\n"," <th>answer</th>\n"," <th>title</th>\n"," <th>puzzle</th>\n"," <th>truth</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Was Zhen Zhesuo suicide?</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Coast</td>\n"," <td>In the quiet seaside cottage of a neighbor, a ...</td>\n"," <td>Zhen Zhesao was a nature-loving painter who ca...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Was Zhen Zhesuo sickly?</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Coast</td>\n"," <td>In the quiet seaside cottage of a neighbor, a ...</td>\n"," <td>Zhen Zhesao was a nature-loving painter who ca...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>The painting was by Zhen.</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Coast</td>\n"," <td>In the quiet seaside cottage of a neighbor, a ...</td>\n"," <td>Zhen Zhesao was a nature-loving painter who ca...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Was Zhen with a heart condition?</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Coast</td>\n"," <td>In the quiet seaside cottage of a neighbor, a ...</td>\n"," <td>Zhen Zhesao was a nature-loving painter who ca...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>The wheel was the murderer's weapon.</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Coast</td>\n"," <td>In the quiet seaside cottage of a neighbor, a ...</td>\n"," <td>Zhen Zhesao was a nature-loving painter who ca...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>2995</th>\n"," <td>Did the weeping person have to make a sacrific...</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>Zhen Zhuo's wails</td>\n"," <td>One night, in a quiet village, a weeping sound...</td>\n"," <td>It turned out that the old hat belonged to a l...</td>\n"," </tr>\n"," 
<tr>\n"," <th>2996</th>\n"," <td>Was the body in the lake?</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>Zhen Zhuo's wails</td>\n"," <td>One night, in a quiet village, a weeping sound...</td>\n"," <td>It turned out that the old hat belonged to a l...</td>\n"," </tr>\n"," <tr>\n"," <th>2997</th>\n"," <td>Do mourners have a special relationship with t...</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>Zhen Zhuo's wails</td>\n"," <td>One night, in a quiet village, a weeping sound...</td>\n"," <td>It turned out that the old hat belonged to a l...</td>\n"," </tr>\n"," <tr>\n"," <th>2998</th>\n"," <td>Was the owner of the hat dead?</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>Zhen Zhuo's wails</td>\n"," <td>One night, in a quiet village, a weeping sound...</td>\n"," <td>It turned out that the old hat belonged to a l...</td>\n"," </tr>\n"," <tr>\n"," <th>2999</th>\n"," <td>Was the dead person wounded?</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>Zhen Zhuo's wails</td>\n"," <td>One night, in a quiet village, a weeping sound...</td>\n"," <td>It turned out that the old hat belonged to a l...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>3000 rows × 6 columns</p>\n","</div>"],"text/plain":[" text label answer \\\n","0 Was Zhen Zhesuo suicide? No NaN \n","1 Was Zhen Zhesuo sickly? Yes NaN \n","2 The painting was by Zhen. Yes NaN \n","3 Was Zhen with a heart condition? Yes NaN \n","4 The wheel was the murderer's weapon. No NaN \n","... ... ... ... \n","2995 Did the weeping person have to make a sacrific... Yes NaN \n","2996 Was the body in the lake? No NaN \n","2997 Do mourners have a special relationship with t... Yes NaN \n","2998 Was the owner of the hat dead? No NaN \n","2999 Was the dead person wounded? No NaN \n","\n"," title \\\n","0 The Mystery of the Coast \n","1 The Mystery of the Coast \n","2 The Mystery of the Coast \n","3 The Mystery of the Coast \n","4 The Mystery of the Coast \n","... ... 
\n","2995 Zhen Zhuo's wails \n","2996 Zhen Zhuo's wails \n","2997 Zhen Zhuo's wails \n","2998 Zhen Zhuo's wails \n","2999 Zhen Zhuo's wails \n","\n"," puzzle \\\n","0 In the quiet seaside cottage of a neighbor, a ... \n","1 In the quiet seaside cottage of a neighbor, a ... \n","2 In the quiet seaside cottage of a neighbor, a ... \n","3 In the quiet seaside cottage of a neighbor, a ... \n","4 In the quiet seaside cottage of a neighbor, a ... \n","... ... \n","2995 One night, in a quiet village, a weeping sound... \n","2996 One night, in a quiet village, a weeping sound... \n","2997 One night, in a quiet village, a weeping sound... \n","2998 One night, in a quiet village, a weeping sound... \n","2999 One night, in a quiet village, a weeping sound... \n","\n"," truth \n","0 Zhen Zhesao was a nature-loving painter who ca... \n","1 Zhen Zhesao was a nature-loving painter who ca... \n","2 Zhen Zhesao was a nature-loving painter who ca... \n","3 Zhen Zhesao was a nature-loving painter who ca... \n","4 Zhen Zhesao was a nature-loving painter who ca... \n","... ... \n","2995 It turned out that the old hat belonged to a l... \n","2996 It turned out that the old hat belonged to a l... \n","2997 It turned out that the old hat belonged to a l... \n","2998 It turned out that the old hat belonged to a l... \n","2999 It turned out that the old hat belonged to a l... 
\n","\n","[3000 rows x 6 columns]"]},"execution_count":67,"metadata":{},"output_type":"execute_result"}],"source":["df_dev"]},{"cell_type":"code","execution_count":68,"metadata":{},"outputs":[],"source":["df_dev.to_csv(\"datasets/mgtv/dev_en.csv\", index=False)"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
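Two details in the notebook's `translate_df` are worth flagging: `cache_df.append(...)` was deprecated in pandas 1.4 and removed in 2.0, and the cache-update loop deletes each already-known key from `cache_dict` and then immediately re-checks `if k in cache_dict`, so newly translated rows are never written to the cache file. A corrected sketch of the same cache-then-persist idea (the column names and the `translate(x, cache_dict)` contract come from the notebook; the rest is an assumed reimplementation, with the translator injected only to keep the sketch self-contained):

```python
import os

import pandas as pd


def translate_df(df, translate, cache_path=None):
    """Translate the text columns of df, memoising through a two-column CSV cache."""
    if cache_path and os.path.exists(cache_path):
        cache_df = pd.read_csv(cache_path)
    else:
        cache_df = pd.DataFrame(columns=["chinese", "english"])

    cache_dict = dict(zip(cache_df["chinese"], cache_df["english"]))

    for col in ["text", "title", "label", "puzzle", "truth"]:
        df[col] = df[col].apply(lambda x: translate(x, cache_dict))

    if cache_path:
        # Persist only the entries that were not in the cache file yet.
        known = set(cache_df["chinese"])
        new_rows = [
            {"chinese": k, "english": v}
            for k, v in cache_dict.items()
            if k not in known
        ]
        if new_rows:
            # DataFrame.append was removed in pandas 2.0; concat replaces it.
            cache_df = pd.concat([cache_df, pd.DataFrame(new_rows)], ignore_index=True)
        cache_df.to_csv(cache_path, index=False)

    return df
```

In the notebook, `translate` is imported from `llm_toolkit.translation_utils` rather than passed as a parameter.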
datasets/mgtv/dev_en.csv
ADDED
The diff for this file is too large to render.

datasets/mgtv/unique_translations.csv
ADDED
The diff for this file is too large to render.
llm_toolkit/chat_mac.py
ADDED
@@ -0,0 +1,88 @@
import os
import sys
from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc

from dotenv import find_dotenv, load_dotenv

found_dotenv = find_dotenv(".env")

if len(found_dotenv) == 0:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=False)

path = os.path.dirname(found_dotenv)
print(f"Adding {path} to sys.path")
sys.path.append(path)

from llm_toolkit.translation_engine import *
from llm_toolkit.translation_utils import *

model_name = os.getenv("MODEL_NAME")
load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
eval_base_model = os.getenv("EVAL_BASE_MODEL") == "true"
eval_fine_tuned = os.getenv("EVAL_FINE_TUNED") == "true"
save_fine_tuned_model = os.getenv("SAVE_FINE_TUNED") == "true"
num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS") or 0)
data_path = os.getenv("DATA_PATH")
results_path = os.getenv("RESULTS_PATH")

max_seq_length = 2048  # choose any; RoPE scaling is supported internally
dtype = (
    None  # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
)

print(
    model_name,
    load_in_4bit,
    max_seq_length,
    num_train_epochs,
    dtype,
    data_path,
    results_path,
    eval_base_model,
    eval_fine_tuned,
    save_fine_tuned_model,
)

adapter_name_or_path = (
    sys.argv[1]
    if len(sys.argv) > 1
    else "llama-factory/saves/qwen2-0.5b/lora/sft/checkpoint-560"
)

args = dict(
    model_name_or_path=model_name,
    adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
    template="chatml",  # same as the one used in training
    finetuning_type="lora",  # same as the one used in training
    quantization_bit=4,  # load the 4-bit quantized model
)
chat_model = ChatModel(args)

messages = []
print(
    "Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application."
)
while True:
    query = input("\nUser: ")
    if query.strip() == "exit":
        break
    if query.strip() == "clear":
        messages = []
        torch_gc()
        print("History has been removed.")
        continue

    messages.append({"role": "user", "content": query})
    print("Assistant: ", end="", flush=True)

    response = ""
    for new_text in chat_model.stream_chat(messages):
        print(new_text, end="", flush=True)
        response += new_text
    print()
    messages.append({"role": "assistant", "content": response})

torch_gc()
llm_toolkit/eval_mac.py
ADDED
@@ -0,0 +1,67 @@
import os
import sys
import torch
from dotenv import find_dotenv, load_dotenv

found_dotenv = find_dotenv(".env")

if len(found_dotenv) == 0:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=False)

path = os.path.dirname(found_dotenv)
print(f"Adding {path} to sys.path")
sys.path.append(path)

from llm_toolkit.translation_engine import *
from llm_toolkit.translation_utils import *

model_name = os.getenv("MODEL_NAME")
adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
data_path = os.getenv("DATA_PATH")
results_path = os.getenv("RESULTS_PATH")

print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

model, tokenizer = load_model(
    model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
)

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

datasets = load_translation_dataset(data_path, tokenizer)

print("Evaluating model: " + model_name)
predictions = eval_model(model, tokenizer, datasets["test"])

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

if adapter_name_or_path is not None:
    model_name += "_" + adapter_name_or_path.split("/")[-1]

save_results(
    model_name,
    results_path,
    datasets["test"],
    predictions,
    debug=True,
)

metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
print(metrics)
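`eval_mac.py` repeats the same byte-to-GiB report at three stages. The arithmetic can be factored into a small helper; a sketch (the `gpu_report` helper and its parameters are invented for illustration; in the script the numbers come from `torch.cuda.get_device_properties(0)` and `torch.cuda.max_memory_reserved()`):

```python
def to_gib(num_bytes: int) -> float:
    """Bytes -> GiB, rounded to 3 decimals, matching eval_mac.py's arithmetic."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


def gpu_report(stage: int, name: str, total_bytes: int, reserved_bytes: int) -> str:
    # eval_mac.py prints this same pair at stages (1), (2) and (3);
    # factoring it out avoids repeating the conversion three times.
    return (
        f"({stage}) GPU = {name}. Max memory = {to_gib(total_bytes)} GB.\n"
        f"{to_gib(reserved_bytes)} GB of memory reserved."
    )


print(gpu_report(1, "NVIDIA GeForce RTX 4090", 24 * 1024**3, 6 * 1024**3))
```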
llm_toolkit/translation_engine.py
CHANGED
@@ -7,6 +7,7 @@ from trl import SFTTrainer
 from transformers import TrainingArguments, TextStreamer
 from tqdm import tqdm
 from llm_toolkit.translation_utils import *
+from llamafactory.chat import ChatModel
 
 print(f"loading {__file__}")
 
@@ -30,9 +31,22 @@ def load_model(
     max_seq_length=2048,
     dtype=None,
     load_in_4bit=False,
+    template="chatml",
+    adapter_name_or_path=None,
 ):
     print(f"loading model: {model_name}")
 
+    if adapter_name_or_path is not None:
+        args = dict(
+            model_name_or_path=model_name,
+            adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
+            template=template,  # same as the one used in training
+            finetuning_type="lora",  # same as the one used in training
+            quantization_bit=4,  # load the 4-bit quantized model
+        )
+        chat_model = ChatModel(args)
+        return chat_model.engine.model, chat_model.engine.tokenizer
+
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=model_name,  # YOUR MODEL YOU USED FOR TRAINING
         max_seq_length=max_seq_length,
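The change above turns `load_model` into a two-way dispatcher: when an adapter path is given it routes through LLaMA-Factory's `ChatModel`, otherwise it falls back to Unsloth's `FastLanguageModel`. The control flow can be checked in isolation with stub loaders (both loader callables below are stand-ins, not the real APIs):

```python
def load_model(model_name, adapter_name_or_path=None,
               chat_loader=None, base_loader=None):
    """Dispatch sketch: adapter present -> LoRA chat engine, else base model."""
    if adapter_name_or_path is not None:
        return chat_loader(model_name, adapter_name_or_path)
    return base_loader(model_name)


# Stub loaders just record which path was taken.
picked = load_model(
    "Qwen2-1.5B-Instruct",
    adapter_name_or_path="saves/qwen2-1.5b/lora/sft/checkpoint-560",
    chat_loader=lambda m, a: ("chat", m, a),
    base_loader=lambda m: ("base", m),
)
print(picked[0])  # -> chat
```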
llm_toolkit/translation_utils.py
CHANGED
@@ -5,6 +5,9 @@ import evaluate
 import seaborn as sns
 import matplotlib.pyplot as plt
 
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+
 
 bleu = evaluate.load("bleu")
 rouge = evaluate.load("rouge")
@@ -222,3 +225,41 @@ def plot_times(perf_df, ylim=0.421):
 
     fig.tight_layout()
     plt.show()
+
+
+def translate_via_llm(text):
+    base_url = os.getenv("OPENAI_BASE_URL") or "http://localhost:8000/v1"
+    llm = ChatOpenAI(
+        model="gpt-4o",
+        temperature=0,
+        max_tokens=None,
+        timeout=None,
+        max_retries=2,
+        base_url=base_url,
+    )
+
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "human",
+                "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{input}",
+            ),
+        ]
+    )
+
+    chain = prompt | llm
+    response = chain.invoke(
+        {
+            "input": text,
+        }
+    )
+    return response.content
+
+
+def translate(text, cache_dict):
+    if text in cache_dict:
+        return cache_dict[text]
+    else:
+        translated_text = translate_via_llm(text)
+        cache_dict[text] = translated_text
+        return translated_text
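`translate` memoises through a plain dict, so repeated strings (the dataset has only 16 unique puzzles across 25,000 rows) hit the LLM once. The caching contract can be verified without an API key by injecting a stub in place of `translate_via_llm` (the injection parameter and the stub are hypothetical, added only to make the sketch testable):

```python
def translate_cached(text, cache_dict, translate_via_llm):
    # Same logic as translate() above, with the LLM call injected for testing.
    if text in cache_dict:
        return cache_dict[text]
    translated_text = translate_via_llm(text)
    cache_dict[text] = translated_text
    return translated_text


calls = []


def stub_llm(text):
    calls.append(text)
    return f"<en:{text}>"


cache = {}
translate_cached("你好", cache, stub_llm)
translate_cached("你好", cache, stub_llm)
print(len(calls))  # -> 1, the second call is served from the cache
```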
novel-translation/08_eval-lf-py3.11.ipynb
CHANGED
@@ -52,7 +52,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count":
+    "execution_count": 6,
     "metadata": {
      "application/vnd.databricks.v1+cell": {
       "cellMetadata": {},
@@ -76,7 +76,7 @@
        "True"
       ]
      },
-     "execution_count":
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1360,7 +1360,7 @@
     "    print(f\"Epoch {i}\")\n",
     "    adapter_path = f\"llama-factory/saves/qwen2-0.5b/lora/sft/checkpoint-{560 * i}\"\n",
     "    os.environ[\"ADAPTER_NAME_OR_PATH\"] = adapter_path\n",
-    "    !python llm_toolkit/
+    "    !python llm_toolkit/eval_mac.py "
    ]
   },
   {
@@ -2517,23 +2517,84 @@
     "    print(f\"Epoch {i}\")\n",
     "    adapter_path = f\"llama-factory/saves/qwen2-1.5b/lora/sft/checkpoint-{560 * i}\"\n",
     "    os.environ[\"ADAPTER_NAME_OR_PATH\"] = adapter_path\n",
-    "    !python llm_toolkit/
+    "    !python llm_toolkit/eval_mac.py "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package wordnet to /home/inflaton/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n",
+      "[nltk_data] Downloading package punkt to /home/inflaton/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package omw-1.4 to /home/inflaton/nltk_data...\n",
+      "[nltk_data]   Package omw-1.4 is already up-to-date!\n"
+     ]
+    }
+   ],
+   "source": [
+    "from llm_toolkit.translation_utils import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'According to his logic, this red, stiff, one-foot-long thing on my body was the embodiment of sin.'"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "translate_via_llm(\n",
+    "    \"按他的逻辑,我身上这个通红通红,直不愣登,长约一尺的东西就是罪恶的化身。\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
-    "
-
-
-
-
-
-
+    "cache_dict = {}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'按他的逻辑,我身上这个通红通红,直不愣登,长约一尺的东西就是罪恶的化身。': 'According to his logic, this red, stiff, one-foot-long thing on my body was the embodiment of sin.'}"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "translate(\n",
+    "    \"按他的逻辑,我身上这个通红通红,直不愣登,长约一尺的东西就是罪恶的化身。\",\n",
+    "    cache_dict=cache_dict,\n",
+    ")\n",
+    "cache_dict"
    ]
   }
  ],