init project
- .gitignore +91 -0
- LICENSE +10 -0
- notebooks/0.0-create-sliding-window.ipynb +381 -0
- notebooks/0.1-find-dirty-data.ipynb +2071 -0
- notebooks/0.2-create-stage1-ranking.ipynb +0 -0
- notebooks/0.3-create-stage2-ranking.ipynb +1 -0
- notebooks/0.4-find-redirects.ipynb +0 -0
- notebooks/1.0-train-bm25-stage1.ipynb +1 -0
- notebooks/1.1-train-bm25-stage2.ipynb +1 -0
- notebooks/1.2-train-pairwise-stage1.ipynb +1 -0
- notebooks/1.3-train-pairwise-stage2.ipynb +0 -0
- notebooks/1.4-robust-qa-model.ipynb +0 -0
- requirements.txt +7 -0
- src/app.py +26 -0
- src/features/graph_utils.py +110 -0
- src/features/text_utils.py +82 -0
- src/models/bm25_utils.py +40 -0
- src/models/pairwise_model.py +140 -0
- src/models/predict_model.py +75 -0
- src/models/qa_model.py +52 -0
- submission/answer.json +0 -0
- submission/submission.json +0 -0
- submission/test.py +10 -0
.gitignore
ADDED
@@ -0,0 +1,91 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

/models/

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# DotEnv configuration
.env

# Database
*.db
*.rdb

# Pycharm
.idea

# VS Code
.vscode/

# Spyder
.spyproject/

# Jupyter NB Checkpoints
.ipynb_checkpoints/

# exclude data from source control by default
/data/

# Mac OS-specific storage files
.DS_Store

# vim
*.swp
*.swo

# Mypy cache
.mypy_cache/
LICENSE
ADDED
@@ -0,0 +1,10 @@
The MIT License (MIT)
Copyright (c) 2023, foxxy

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
notebooks/0.0-create-sliding-window.ipynb
ADDED
@@ -0,0 +1,381 @@
In [1]:
# !pip install pandas
# !pip install tqdm
# !pip install pandarallel

In [2]:
import re
import string
import math
import json
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
_WORD_SPLIT = re.compile("([.,!?\"/':;)(])")
_DIGIT_RE = re.compile(br"\d")
STOP_WORDS = "\" \' [ ] . , ! : ; ?".split(" ")

def basic_tokenizer(sentence):
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_fragment))
    return [w.lower() for w in words if w != '' and w != ' ' and w not in string.punctuation]

def remove_appending_title(text, title):
    return text.replace(f"{title}\n\n{title}", f"{title} ")

def create_sliding_window(text, size=256, overlap=32):
    actual_size = size - overlap
    windows = []
    n_windows = math.ceil(len(text)/actual_size)
    for i in range(n_windows):
        windows.append(" ".join(text[i*actual_size:i*actual_size + size]))
    return windows

In [4]:
all_titles = []
all_texts = []
all_bm25_texts = []
with open("../data/raw/wikipedia_20220620_cleaned/wikipedia_20220620_cleaned.jsonl", encoding="utf-8") as f:
    for i, line in tqdm(enumerate(f)):
        x = json.loads(line)
        text = remove_appending_title(x["text"], x["title"])
        text = text.split(" ")
        sliding_windows = create_sliding_window(text)
        all_texts.extend(sliding_windows)
        # all_bm25_texts.extend(bm25_windows)
        all_titles.extend([x['title'],]*len(sliding_windows))

Out[4]:
0it [00:00, ?it/s]  (tqdm progress-bar widget)

In [5]:
df = pd.DataFrame()
df["title"] = all_titles
df["text"] = all_texts
df

Out[5]:
                    title                                               text
0             Trang Chính  Trang Chính\n\n<templatestyles src="Wiki2021/s...
1        Internet Society   Internet Society hay ISOC là một tổ chức quốc...
2              Tiếng Việt  Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...
3              Tiếng Việt  hệ thống thanh điệu phát triển cao hơn, hệ thố...
4              Tiếng Việt  tiếp xúc Hán – Việt thành 2 giai đoạn chính: \...
...                   ...                                                ...
1944401              VNOI  chuyên viên của các tập đoàn công nghệ lớn như...
1944402          Vòng hoa   Vòng hoa (; IPA: ) là một loại hoa, lá, quả, ...
1944403          Vòng hoa  vật trong thần thoại La Mã và Hy Lạp được miêu...
1944404          Vòng hoa  một vai trò nổi bật, tương tự như ở Anh.\n\n==...
1944405          Vòng hoa  trên giá đỡ vòng hoa thắp sáng những ngọn nến ...

[1944406 rows x 2 columns]

In [ ]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, use_memory_fs=False, nb_workers=6)

def apply_tokenizer(df, num_chunks):
    chunk_sr = pd.Series()
    i = 0
    end = df.shape[0]
    chunk_size = math.floor(end/num_chunks)
    while i != end:
        if end - i < chunk_size:
            chunk_size = end - i
        # print((i, i+chunk_size))
        temp_df = df[i:i+chunk_size].copy()
        chunk_sr = pd.concat([chunk_sr, temp_df["text"].parallel_apply(lambda x: " ".join(basic_tokenizer(x)))], ignore_index=True)
        i += chunk_size
    return chunk_sr

df["bm25_text"] = apply_tokenizer(df, 5)

In [7]:
df.head()

Out[7]:
              title                                               text  \
0       Trang Chính  Trang Chính\n\n<templatestyles src="Wiki2021/s...
1  Internet Society   Internet Society hay ISOC là một tổ chức quốc...
2        Tiếng Việt  Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...
3        Tiếng Việt  hệ thống thanh điệu phát triển cao hơn, hệ thố...
4        Tiếng Việt  tiếp xúc Hán – Việt thành 2 giai đoạn chính: \...

                                           bm25_text
0  trang chính <templatestyles src= wiki2021 styl...
1  internet society hay isoc là một tổ chức quốc ...
2  tiếng việt cũng gọi là tiếng việt nam hay việt...
3  hệ thống thanh điệu phát triển cao hơn hệ thốn...
4  tiếp xúc hán – việt thành 2 giai đoạn chính bu...

In [10]:
df.shape

Out[10]:
(1944406, 3)

In [9]:
df.to_csv("../data/processed/wikipedia_20220620_cleaned_v2.csv", index=False)

(kernel: Python 3 (ipykernel), Python 3.8.10; nbformat 4)
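create_sliding_window above receives the article text already split on spaces, so size=256 and overlap=32 are counted in words: each window holds up to 256 words and starts 224 words after the previous one, leaving 32 shared words between consecutive windows. A minimal sketch of the same logic on a toy token list, shown here as an illustration and not part of the commit:

# Toy run of the create_sliding_window logic from the notebook above (illustration only).
import math

def create_sliding_window(text, size=256, overlap=32):
    actual_size = size - overlap          # step between window starts
    windows = []
    n_windows = math.ceil(len(text) / actual_size)
    for i in range(n_windows):
        windows.append(" ".join(text[i * actual_size:i * actual_size + size]))
    return windows

tokens = [f"w{i}" for i in range(10)]     # a 10-"word" document
print(create_sliding_window(tokens, size=4, overlap=1))
# -> ['w0 w1 w2 w3', 'w3 w4 w5 w6', 'w6 w7 w8 w9', 'w9']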
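The bm25_text column is built by basic_tokenizer, which splits off punctuation, lower-cases every token, and drops empty strings and single punctuation characters before BM25 indexing. A minimal sketch of that behaviour, again illustration only:

# Toy run of the basic_tokenizer logic from the notebook above (illustration only).
import re
import string

_WORD_SPLIT = re.compile("([.,!?\"/':;)(])")

def basic_tokenizer(sentence):
    words = []
    for fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(fragment))   # keep delimiters as separate tokens
    # drop empty strings and single punctuation characters, lower-case the rest
    return [w.lower() for w in words if w != '' and w != ' ' and w not in string.punctuation]

print(basic_tokenizer('Internet Society, hay ISOC: là "một" tổ chức quốc tế!'))
# -> ['internet', 'society', 'hay', 'isoc', 'là', 'một', 'tổ', 'chức', 'quốc', 'tế']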
notebooks/0.1-find-dirty-data.ipynb
ADDED
@@ -0,0 +1,2071 @@
In [1]:
import json
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import numpy as np
from glob import glob
import re
from nltk import word_tokenize as lib_tokenizer
import math

In [2]:
df = pd.read_csv("../data/processed/wikipedia_20220620_cleaned_v2.csv")
train = json.load(open("../data/raw/e2eqa-train+public_test-v1/zac2022_train_merged_final.json"))

In [3]:
for x in train['data']:
    x['dirty_text'] = None

In [4]:
(source not captured; the page cuts off inside this cell's output)

Out[4]:
{'Trang Chính': True,
 'Internet Society': True,
 'Tiếng Việt': True,
 'Ohio': True,
 'California': True,
 'Thụy Điển': True,
 'Thành phố Hồ Chí Minh': True,
 'Lào Cai': True,
 'W3C': True,
 'Bộ Kế hoạch và Đầu tư (Việt Nam)': True,
 'Lào': True,
 'Hoa Kỳ': True,
 'Hà Giang': True,
 'Cao Bằng': True,
 'Iraq': True,
 'Hà Nội': True,
 'Campuchia': True,
 'VIQR': True,
 'Việt Nam Cộng hòa': True,
 'Sacramento, California': True,
 'Los Angeles': True,
 'San Francisco': True,
 'San Diego': True,
 'Người Mỹ gốc Việt': True,
 'Giấy phép Tài liệu Tự do GNU': True,
 'Lý Thường Kiệt': True,
 'Quang Trung': True,
 'Hồ Biểu Chánh': True,
 'Bắc Kạn': True,
 'Lạng Sơn': True,
 ...}

(The dictionary continues with several hundred more Wikipedia article titles mapped to True; the capture ends mid-output at 'Ngữ chi Rôman': True.)
|
909 |
+
" 'Ngữ tộc Balt': True,\n",
|
910 |
+
" '17 tháng 4': True,\n",
|
911 |
+
" '18 tháng 4': True,\n",
|
912 |
+
" '19 tháng 4': True,\n",
|
913 |
+
" 'Sóng': True,\n",
|
914 |
+
" 'Tần số góc': True,\n",
|
915 |
+
" 'Thánh địa Mỹ Sơn': True,\n",
|
916 |
+
" 'Quần đảo Cát Bà': True,\n",
|
917 |
+
" 'Đồ Sơn': True,\n",
|
918 |
+
" 'Ba Bể (định hướng)': True,\n",
|
919 |
+
" 'Vịnh Hạ Long': True,\n",
|
920 |
+
" 'Hồ Ba Bể': True,\n",
|
921 |
+
" 'Sa Pa (phường)': True,\n",
|
922 |
+
" 'Đà Lạt': True,\n",
|
923 |
+
" 'Phú Quốc': True,\n",
|
924 |
+
" 'Côn Đảo': True,\n",
|
925 |
+
" 'Phố cổ Hội An': True,\n",
|
926 |
+
" 'Ao Bà Om': True,\n",
|
927 |
+
" 'Ngủ đông': True,\n",
|
928 |
+
" 'Chu trình tiềm tan': True,\n",
|
929 |
+
" 'Tiềm sinh': True,\n",
|
930 |
+
" 'Electrum': True,\n",
|
931 |
+
" 'Tia X': True,\n",
|
932 |
+
" 'KTS': True,\n",
|
933 |
+
" 'Quang tuyến': True,\n",
|
934 |
+
" 'Thanh Xuân': True,\n",
|
935 |
+
" 'Di sản thế giới': True,\n",
|
936 |
+
" 'Hạ Long (định hướng)': True,\n",
|
937 |
+
" '24 tháng 4': True,\n",
|
938 |
+
" 'Hóa sinh': True,\n",
|
939 |
+
" '25 tháng 4': True,\n",
|
940 |
+
" 'Tôn giáo': True,\n",
|
941 |
+
" 'Danh sách Thủ tướng Việt Nam': True,\n",
|
942 |
+
" 'KCS': True,\n",
|
943 |
+
" 'KCN': True,\n",
|
944 |
+
" 'Muối ăn': True,\n",
|
945 |
+
" 'Đại lượng vật lý': True,\n",
|
946 |
+
" 'Sự kiện 30 tháng 4 năm 1975': True,\n",
|
947 |
+
" 'Quy hoạch đô thị': True,\n",
|
948 |
+
" 'Danh sách nhà toán học': True,\n",
|
949 |
+
" 'Phật giáo': True,\n",
|
950 |
+
" 'Tắc đường': True,\n",
|
951 |
+
" 'Khoa học máy tính': True,\n",
|
952 |
+
" 'Hưng Yên': True,\n",
|
953 |
+
" 'Bắc Ninh': True,\n",
|
954 |
+
" 'Hà Tây (tỉnh)': True,\n",
|
955 |
+
" 'Khí quyển Sao Hỏa': True,\n",
|
956 |
+
" 'Thượng Hải': True,\n",
|
957 |
+
" 'Công nghệ thông tin': True,\n",
|
958 |
+
" 'Hồ Chí Minh (định hướng)': True,\n",
|
959 |
+
" 'Khu công nghiệp': True,\n",
|
960 |
+
" 'Tam giáo': True,\n",
|
961 |
+
" 'Thái Bình': True,\n",
|
962 |
+
" 'Nam Định': True,\n",
|
963 |
+
" 'Nhà': True,\n",
|
964 |
+
" 'Ninh Bình': True,\n",
|
965 |
+
" 'Chiếu dời đô': True,\n",
|
966 |
+
" 'Ngọc lục bảo': True,\n",
|
967 |
+
" 'Tin học': True,\n",
|
968 |
+
" 'Tự do hóa': True,\n",
|
969 |
+
" 'CSS': True,\n",
|
970 |
+
" 'Hiệp ước chung về thuế quan và mậu dịch': True,\n",
|
971 |
+
" 'Lai tạp hóa': True,\n",
|
972 |
+
" 'Mỹ hóa': True,\n",
|
973 |
+
" 'Tổ chức sở hữu trí tuệ': True,\n",
|
974 |
+
" 'Khoa học thông tin': True,\n",
|
975 |
+
" 'Thuật toán': True,\n",
|
976 |
+
" 'WEB': True,\n",
|
977 |
+
" '1 tháng 4': True,\n",
|
978 |
+
" 'Tổ chức phi chính phủ': True,\n",
|
979 |
+
" 'Đầu tư trực tiếp nước ngoài': True,\n",
|
980 |
+
" 'NGO': True,\n",
|
981 |
+
" 'Phần cứng': True,\n",
|
982 |
+
" 'Châu Âu': True,\n",
|
983 |
+
" 'Thái Lan': True,\n",
|
984 |
+
" 'Virus (máy tính)': True,\n",
|
985 |
+
" 'Phần mềm ác ý': True,\n",
|
986 |
+
" 'Linux From Scratch': True,\n",
|
987 |
+
" 'LFS': True,\n",
|
988 |
+
" 'Hà Nội (định hướng)': True,\n",
|
989 |
+
" 'Ngọc lam': True,\n",
|
990 |
+
" '2 tháng 4': True,\n",
|
991 |
+
" 'Lipid': True,\n",
|
992 |
+
" 'Toán học tổ hợp': True,\n",
|
993 |
+
" 'Toán học rời rạc': True,\n",
|
994 |
+
" 'Nara (thành phố)': True,\n",
|
995 |
+
" 'Phan Bội Châu': True,\n",
|
996 |
+
" 'Sắc ký': True,\n",
|
997 |
+
" 'Hồng ngọc': True,\n",
|
998 |
+
" 'Ribosome': True,\n",
|
999 |
+
" 'Trung tử': True,\n",
|
1000 |
+
" 'Lysosome': True,\n",
|
1001 |
+
" 'Mạng lưới nội chất': True,\n",
|
1002 |
+
" 'Peroxisome': True,\n",
|
1003 |
+
" 'Bộ máy Golgi': True,\n",
|
1004 |
+
" 'Điện tử học': True,\n",
|
1005 |
+
" 'Ty thể': True,\n",
|
1006 |
+
" 'Công nghệ nano': True,\n",
|
1007 |
+
" 'Quan Âm': True,\n",
|
1008 |
+
" 'Tháp Hà Nội (định hướng)': True,\n",
|
1009 |
+
" 'Nhà Nguyễn': True,\n",
|
1010 |
+
" 'Danh sách quốc gia có chủ quyền': True,\n",
|
1011 |
+
" 'Sông Cầu (định hướng)': True,\n",
|
1012 |
+
" 'Thiên đỉnh': True,\n",
|
1013 |
+
" 'Thiên để': True,\n",
|
1014 |
+
" 'Tạ (định hướng)': True,\n",
|
1015 |
+
" 'Tấn (định hướng)': True,\n",
|
1016 |
+
" 'Câu lệnh (khoa học máy tính)': True,\n",
|
1017 |
+
" 'Chỉ thị (máy tính)': True,\n",
|
1018 |
+
" 'Frank Sherwood Rowland': True,\n",
|
1019 |
+
" 'Cú pháp câu lệnh': True,\n",
|
1020 |
+
" 'Nhóm chức': True,\n",
|
1021 |
+
" 'Quy tắc đặt dấu thanh trong chữ quốc ngữ': True,\n",
|
1022 |
+
" 'Tán xạ Rayleigh': True,\n",
|
1023 |
+
" 'Đúng': True,\n",
|
1024 |
+
" 'Đồng': True,\n",
|
1025 |
+
" 'Đậu tương': True,\n",
|
1026 |
+
" 'Cộng đồng Wikipedia': True,\n",
|
1027 |
+
" 'Đồng tính luyến ái': True,\n",
|
1028 |
+
" 'Tam quốc diễn nghĩa': True,\n",
|
1029 |
+
" 'Vết đen Mặt Trời': True,\n",
|
1030 |
+
" 'Mô hình Markov ẩn': True,\n",
|
1031 |
+
" 'Tin sinh học': True,\n",
|
1032 |
+
" 'Vết đen': True,\n",
|
1033 |
+
" 'Tứ đại danh tác': True,\n",
|
1034 |
+
" 'Sách đỏ IUCN': True,\n",
|
1035 |
+
" 'Hệ điều hành': True,\n",
|
1036 |
+
" 'Bắt cặp trình tự': True,\n",
|
1037 |
+
" 'Tam quốc diễn nghĩa (định hướng)': True,\n",
|
1038 |
+
" 'Trình tự motif': True,\n",
|
1039 |
+
" 'Thành phố Trung Quốc': True,\n",
|
1040 |
+
" 'Thụy Sĩ': True,\n",
|
1041 |
+
" 'Kẽm': True,\n",
|
1042 |
+
" 'Dãy số thực': True,\n",
|
1043 |
+
" 'Bánh chưng': True,\n",
|
1044 |
+
" 'Lá dong': True,\n",
|
1045 |
+
" 'Bánh giầy': True,\n",
|
1046 |
+
" 'Bạc': True,\n",
|
1047 |
+
" 'Boson W': True,\n",
|
1048 |
+
" ...}"
|
1049 |
+
]
|
1050 |
+
},
|
1051 |
+
"execution_count": 4,
|
1052 |
+
"metadata": {},
|
1053 |
+
"output_type": "execute_result"
|
1054 |
+
}
|
1055 |
+
],
|
1056 |
+
"source": [
|
1057 |
+
"all_titles = dict([(x.strip(),True) for x in open(\"../data/raw/wikipedia_20220620_cleaned/wikipedia_20220620_all_titles.txt\").readlines()])\n",
|
1058 |
+
"all_titles"
|
1059 |
+
]
|
1060 |
+
},
|
1061 |
+
{
|
1062 |
+
"cell_type": "code",
|
1063 |
+
"execution_count": 5,
|
1064 |
+
"metadata": {},
|
1065 |
+
"outputs": [],
|
1066 |
+
"source": [
|
1067 |
+
"dict_map = dict({}) \n",
|
1068 |
+
" \n",
|
1069 |
+
"def word_tokenize(text): \n",
|
1070 |
+
" global dict_map \n",
|
1071 |
+
" words = text.split() \n",
|
1072 |
+
" words_norm = [] \n",
|
1073 |
+
" for w in words: \n",
|
1074 |
+
" if dict_map.get(w, None) is None: \n",
|
1075 |
+
" dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n",
|
1076 |
+
" words_norm.append(dict_map[w]) \n",
|
1077 |
+
" return words_norm \n",
|
1078 |
+
" \n",
|
1079 |
+
"def strip_answer_string(text): \n",
|
1080 |
+
" text = text.strip() \n",
|
1081 |
+
" while text[-1] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n",
|
1082 |
+
" if text[0] != '(' and text[-1] == ')' and '(' in text: \n",
|
1083 |
+
" break \n",
|
1084 |
+
" if text[-1] == '\"' and text[0] != '\"' and text.count('\"') > 1: \n",
|
1085 |
+
" break \n",
|
1086 |
+
" text = text[:-1].strip() \n",
|
1087 |
+
" while text[0] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n",
|
1088 |
+
" if text[0] == '\"' and text[-1] != '\"' and text.count('\"') > 1: \n",
|
1089 |
+
" break \n",
|
1090 |
+
" text = text[1:].strip() \n",
|
1091 |
+
" text = text.strip() \n",
|
1092 |
+
" return text \n",
|
1093 |
+
" \n",
|
1094 |
+
"def strip_context(text): \n",
|
1095 |
+
" text = text.replace('\\n', ' ') \n",
|
1096 |
+
" text = re.sub(r'\\s+', ' ', text) \n",
|
1097 |
+
" text = text.strip() \n",
|
1098 |
+
" return text"
|
1099 |
+
]
|
1100 |
+
},
|
1101 |
+
{
|
1102 |
+
"cell_type": "code",
|
1103 |
+
"execution_count": 6,
|
1104 |
+
"metadata": {},
|
1105 |
+
"outputs": [
|
1106 |
+
{
|
1107 |
+
"name": "stdout",
|
1108 |
+
"output_type": "stream",
|
1109 |
+
"text": [
|
1110 |
+
"INFO: Pandarallel will run on 6 workers.\n",
|
1111 |
+
"INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n"
|
1112 |
+
]
|
1113 |
+
}
|
1114 |
+
],
|
1115 |
+
"source": [
|
1116 |
+
"from pandarallel import pandarallel\n",
|
1117 |
+
"\n",
|
1118 |
+
"pandarallel.initialize(progress_bar=True, use_memory_fs=False, nb_workers=6)"
|
1119 |
+
]
|
1120 |
+
},
|
1121 |
+
{
|
1122 |
+
"cell_type": "code",
|
1123 |
+
"execution_count": 7,
|
1124 |
+
"metadata": {},
|
1125 |
+
"outputs": [
|
1126 |
+
{
|
1127 |
+
"data": {
|
1128 |
+
"application/vnd.jupyter.widget-view+json": {
|
1129 |
+
"model_id": "9bacd2c6720d48209399c9220f4e3054",
|
1130 |
+
"version_major": 2,
|
1131 |
+
"version_minor": 0
|
1132 |
+
},
|
1133 |
+
"text/plain": [
|
1134 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36008), Label(value='0 / 36008')))…"
|
1135 |
+
]
|
1136 |
+
},
|
1137 |
+
"metadata": {},
|
1138 |
+
"output_type": "display_data"
|
1139 |
+
},
|
1140 |
+
{
|
1141 |
+
"data": {
|
1142 |
+
"application/vnd.jupyter.widget-view+json": {
|
1143 |
+
"model_id": "140698a55ee644c79d2a54941c4beb6b",
|
1144 |
+
"version_major": 2,
|
1145 |
+
"version_minor": 0
|
1146 |
+
},
|
1147 |
+
"text/plain": [
|
1148 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36008), Label(value='0 / 36008')))…"
|
1149 |
+
]
|
1150 |
+
},
|
1151 |
+
"metadata": {},
|
1152 |
+
"output_type": "display_data"
|
1153 |
+
},
|
1154 |
+
{
|
1155 |
+
"data": {
|
1156 |
+
"application/vnd.jupyter.widget-view+json": {
|
1157 |
+
"model_id": "8e9ae778df744c568ac4e990dd356f19",
|
1158 |
+
"version_major": 2,
|
1159 |
+
"version_minor": 0
|
1160 |
+
},
|
1161 |
+
"text/plain": [
|
1162 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36008), Label(value='0 / 36008')))…"
|
1163 |
+
]
|
1164 |
+
},
|
1165 |
+
"metadata": {},
|
1166 |
+
"output_type": "display_data"
|
1167 |
+
},
|
1168 |
+
{
|
1169 |
+
"data": {
|
1170 |
+
"application/vnd.jupyter.widget-view+json": {
|
1171 |
+
"model_id": "b3be62a6ab8a4190974f50bc3e6eb74b",
|
1172 |
+
"version_major": 2,
|
1173 |
+
"version_minor": 0
|
1174 |
+
},
|
1175 |
+
"text/plain": [
|
1176 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36008), Label(value='0 / 36008')))…"
|
1177 |
+
]
|
1178 |
+
},
|
1179 |
+
"metadata": {},
|
1180 |
+
"output_type": "display_data"
|
1181 |
+
},
|
1182 |
+
{
|
1183 |
+
"data": {
|
1184 |
+
"application/vnd.jupyter.widget-view+json": {
|
1185 |
+
"model_id": "c98f91a6efa94966af32ceec70eea55c",
|
1186 |
+
"version_major": 2,
|
1187 |
+
"version_minor": 0
|
1188 |
+
},
|
1189 |
+
"text/plain": [
|
1190 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36008), Label(value='0 / 36008')))…"
|
1191 |
+
]
|
1192 |
+
},
|
1193 |
+
"metadata": {},
|
1194 |
+
"output_type": "display_data"
|
1195 |
+
},
|
1196 |
+
{
|
1197 |
+
"data": {
|
1198 |
+
"application/vnd.jupyter.widget-view+json": {
|
1199 |
+
"model_id": "ed68d16693ef4b7dadafd098e3d6bc88",
|
1200 |
+
"version_major": 2,
|
1201 |
+
"version_minor": 0
|
1202 |
+
},
|
1203 |
+
"text/plain": [
|
1204 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36008), Label(value='0 / 36008')))…"
|
1205 |
+
]
|
1206 |
+
},
|
1207 |
+
"metadata": {},
|
1208 |
+
"output_type": "display_data"
|
1209 |
+
},
|
1210 |
+
{
|
1211 |
+
"data": {
|
1212 |
+
"application/vnd.jupyter.widget-view+json": {
|
1213 |
+
"model_id": "07f08ef339ea431ab76a84ff6b8c2e57",
|
1214 |
+
"version_major": 2,
|
1215 |
+
"version_minor": 0
|
1216 |
+
},
|
1217 |
+
"text/plain": [
|
1218 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36008), Label(value='0 / 36008')))…"
|
1219 |
+
]
|
1220 |
+
},
|
1221 |
+
"metadata": {},
|
1222 |
+
"output_type": "display_data"
|
1223 |
+
},
|
1224 |
+
{
|
1225 |
+
"data": {
|
1226 |
+
"application/vnd.jupyter.widget-view+json": {
|
1227 |
+
"model_id": "82630b19f0164af7915195256ec63a00",
|
1228 |
+
"version_major": 2,
|
1229 |
+
"version_minor": 0
|
1230 |
+
},
|
1231 |
+
"text/plain": [
|
1232 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36008), Label(value='0 / 36008')))…"
|
1233 |
+
]
|
1234 |
+
},
|
1235 |
+
"metadata": {},
|
1236 |
+
"output_type": "display_data"
|
1237 |
+
},
|
1238 |
+
{
|
1239 |
+
"data": {
|
1240 |
+
"application/vnd.jupyter.widget-view+json": {
|
1241 |
+
"model_id": "365cd1db011a48de884862248ecdfc84",
|
1242 |
+
"version_major": 2,
|
1243 |
+
"version_minor": 0
|
1244 |
+
},
|
1245 |
+
"text/plain": [
|
1246 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36008), Label(value='0 / 36008')))…"
|
1247 |
+
]
|
1248 |
+
},
|
1249 |
+
"metadata": {},
|
1250 |
+
"output_type": "display_data"
|
1251 |
+
},
|
1252 |
+
{
|
1253 |
+
"data": {
|
1254 |
+
"application/vnd.jupyter.widget-view+json": {
|
1255 |
+
"model_id": "c1bb81622d9648308170dbeda422b7a5",
|
1256 |
+
"version_major": 2,
|
1257 |
+
"version_minor": 0
|
1258 |
+
},
|
1259 |
+
"text/plain": [
|
1260 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))),))"
|
1261 |
+
]
|
1262 |
+
},
|
1263 |
+
"metadata": {},
|
1264 |
+
"output_type": "display_data"
|
1265 |
+
},
|
1266 |
+
{
|
1267 |
+
"data": {
|
1268 |
+
"text/plain": [
|
1269 |
+
"0 Trang Chính < templatestyles src= \" Wiki2021/s...\n",
|
1270 |
+
"1 Internet Society hay ISOC là một tổ chức quốc ...\n",
|
1271 |
+
"2 Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...\n",
|
1272 |
+
"3 hệ thống thanh điệu phát triển cao hơn , hệ th...\n",
|
1273 |
+
"4 tiếp xúc Hán – Việt thành 2 giai đoạn chính : ...\n",
|
1274 |
+
"Name: cleaned_text, dtype: object"
|
1275 |
+
]
|
1276 |
+
},
|
1277 |
+
"execution_count": 7,
|
1278 |
+
"metadata": {},
|
1279 |
+
"output_type": "execute_result"
|
1280 |
+
}
|
1281 |
+
],
|
1282 |
+
"source": [
|
1283 |
+
"def apply_tokenizer(df, num_chunks):\n",
|
1284 |
+
" chunk_sr = pd.Series()\n",
|
1285 |
+
" i = 0\n",
|
1286 |
+
" end = df.shape[0]\n",
|
1287 |
+
" chunk_size = math.floor(end/num_chunks)\n",
|
1288 |
+
" while i != end:\n",
|
1289 |
+
" if end - i < chunk_size:\n",
|
1290 |
+
" chunk_size = end - i\n",
|
1291 |
+
" temp_df = df[i:i+chunk_size].copy()\n",
|
1292 |
+
" chunk_sr = pd.concat([chunk_sr, temp_df[\"text\"].parallel_apply(lambda x: \" \".join(word_tokenize(strip_context(x))))], ignore_index=True)\n",
|
1293 |
+
" i+=chunk_size\n",
|
1294 |
+
" return chunk_sr\n",
|
1295 |
+
"\n",
|
1296 |
+
"df[\"cleaned_text\"] = apply_tokenizer(df, 9)\n",
|
1297 |
+
"df[\"cleaned_text\"].head()"
|
1298 |
+
]
|
1299 |
+
},
|
1300 |
+
{
|
1301 |
+
"cell_type": "code",
|
1302 |
+
"execution_count": 8,
|
1303 |
+
"metadata": {},
|
1304 |
+
"outputs": [],
|
1305 |
+
"source": [
|
1306 |
+
"train_titles = set([x['title'].strip() for x in train['data'] if len(x['title']) > 0])"
|
1307 |
+
]
|
1308 |
+
},
|
1309 |
+
{
|
1310 |
+
"cell_type": "code",
|
1311 |
+
"execution_count": 9,
|
1312 |
+
"metadata": {},
|
1313 |
+
"outputs": [
|
1314 |
+
{
|
1315 |
+
"data": {
|
1316 |
+
"application/vnd.jupyter.widget-view+json": {
|
1317 |
+
"model_id": "8492dc037ce24b2eb849ba28a7a64eb2",
|
1318 |
+
"version_major": 2,
|
1319 |
+
"version_minor": 0
|
1320 |
+
},
|
1321 |
+
"text/plain": [
|
1322 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=324068), Label(value='0 / 324068')…"
|
1323 |
+
]
|
1324 |
+
},
|
1325 |
+
"metadata": {},
|
1326 |
+
"output_type": "display_data"
|
1327 |
+
}
|
1328 |
+
],
|
1329 |
+
"source": [
|
1330 |
+
"df[\"valid\"] = df['title'].parallel_apply(lambda x: str(x).strip() in train_titles)"
|
1331 |
+
]
|
1332 |
+
},
|
1333 |
+
{
|
1334 |
+
"cell_type": "code",
|
1335 |
+
"execution_count": 10,
|
1336 |
+
"metadata": {},
|
1337 |
+
"outputs": [],
|
1338 |
+
"source": [
|
1339 |
+
"df = df[df['valid']]\n",
|
1340 |
+
"df.reset_index(drop=True, inplace=True)"
|
1341 |
+
]
|
1342 |
+
},
|
1343 |
+
{
|
1344 |
+
"cell_type": "code",
|
1345 |
+
"execution_count": 11,
|
1346 |
+
"metadata": {},
|
1347 |
+
"outputs": [
|
1348 |
+
{
|
1349 |
+
"data": {
|
1350 |
+
"text/plain": [
|
1351 |
+
"(8782,)"
|
1352 |
+
]
|
1353 |
+
},
|
1354 |
+
"execution_count": 11,
|
1355 |
+
"metadata": {},
|
1356 |
+
"output_type": "execute_result"
|
1357 |
+
}
|
1358 |
+
],
|
1359 |
+
"source": [
|
1360 |
+
"df.title.unique().shape"
|
1361 |
+
]
|
1362 |
+
},
|
1363 |
+
{
|
1364 |
+
"cell_type": "code",
|
1365 |
+
"execution_count": 12,
|
1366 |
+
"metadata": {},
|
1367 |
+
"outputs": [
|
1368 |
+
{
|
1369 |
+
"data": {
|
1370 |
+
"text/html": [
|
1371 |
+
"<div>\n",
|
1372 |
+
"<style scoped>\n",
|
1373 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1374 |
+
" vertical-align: middle;\n",
|
1375 |
+
" }\n",
|
1376 |
+
"\n",
|
1377 |
+
" .dataframe tbody tr th {\n",
|
1378 |
+
" vertical-align: top;\n",
|
1379 |
+
" }\n",
|
1380 |
+
"\n",
|
1381 |
+
" .dataframe thead th {\n",
|
1382 |
+
" text-align: right;\n",
|
1383 |
+
" }\n",
|
1384 |
+
"</style>\n",
|
1385 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1386 |
+
" <thead>\n",
|
1387 |
+
" <tr style=\"text-align: right;\">\n",
|
1388 |
+
" <th></th>\n",
|
1389 |
+
" <th>title</th>\n",
|
1390 |
+
" <th>text</th>\n",
|
1391 |
+
" <th>bm25_text</th>\n",
|
1392 |
+
" <th>cleaned_text</th>\n",
|
1393 |
+
" <th>valid</th>\n",
|
1394 |
+
" </tr>\n",
|
1395 |
+
" </thead>\n",
|
1396 |
+
" <tbody>\n",
|
1397 |
+
" <tr>\n",
|
1398 |
+
" <th>0</th>\n",
|
1399 |
+
" <td>Tiếng Việt</td>\n",
|
1400 |
+
" <td>Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...</td>\n",
|
1401 |
+
" <td>tiếng việt cũng gọi là tiếng việt nam hay việt...</td>\n",
|
1402 |
+
" <td>Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...</td>\n",
|
1403 |
+
" <td>True</td>\n",
|
1404 |
+
" </tr>\n",
|
1405 |
+
" <tr>\n",
|
1406 |
+
" <th>1</th>\n",
|
1407 |
+
" <td>Tiếng Việt</td>\n",
|
1408 |
+
" <td>hệ thống thanh điệu phát triển cao hơn, hệ thố...</td>\n",
|
1409 |
+
" <td>hệ thống thanh điệu phát triển cao hơn hệ thốn...</td>\n",
|
1410 |
+
" <td>hệ thống thanh điệu phát triển cao hơn , hệ th...</td>\n",
|
1411 |
+
" <td>True</td>\n",
|
1412 |
+
" </tr>\n",
|
1413 |
+
" <tr>\n",
|
1414 |
+
" <th>2</th>\n",
|
1415 |
+
" <td>Tiếng Việt</td>\n",
|
1416 |
+
" <td>tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\...</td>\n",
|
1417 |
+
" <td>tiếp xúc hán – việt thành 2 giai đoạn chính bu...</td>\n",
|
1418 |
+
" <td>tiếp xúc Hán – Việt thành 2 giai đoạn chính : ...</td>\n",
|
1419 |
+
" <td>True</td>\n",
|
1420 |
+
" </tr>\n",
|
1421 |
+
" <tr>\n",
|
1422 |
+
" <th>3</th>\n",
|
1423 |
+
" <td>Tiếng Việt</td>\n",
|
1424 |
+
" <td>thêm hàng loạt các yếu tố Hán–Việt. Như là \"ch...</td>\n",
|
1425 |
+
" <td>thêm hàng loạt các yếu tố hán–việt như là chủ ...</td>\n",
|
1426 |
+
" <td>thêm hàng loạt các yếu tố Hán–Việt . Như là \" ...</td>\n",
|
1427 |
+
" <td>True</td>\n",
|
1428 |
+
" </tr>\n",
|
1429 |
+
" <tr>\n",
|
1430 |
+
" <th>4</th>\n",
|
1431 |
+
" <td>Tiếng Việt</td>\n",
|
1432 |
+
" <td>tiếng Hán vẫn có ai đó chấp nhận và sử dụng tr...</td>\n",
|
1433 |
+
" <td>tiếng hán vẫn có ai đó chấp nhận và sử dụng tr...</td>\n",
|
1434 |
+
" <td>tiếng Hán vẫn có ai đó chấp nhận và sử dụng tr...</td>\n",
|
1435 |
+
" <td>True</td>\n",
|
1436 |
+
" </tr>\n",
|
1437 |
+
" </tbody>\n",
|
1438 |
+
"</table>\n",
|
1439 |
+
"</div>"
|
1440 |
+
],
|
1441 |
+
"text/plain": [
|
1442 |
+
" title text \\\n",
|
1443 |
+
"0 Tiếng Việt Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi... \n",
|
1444 |
+
"1 Tiếng Việt hệ thống thanh điệu phát triển cao hơn, hệ thố... \n",
|
1445 |
+
"2 Tiếng Việt tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\... \n",
|
1446 |
+
"3 Tiếng Việt thêm hàng loạt các yếu tố Hán–Việt. Như là \"ch... \n",
|
1447 |
+
"4 Tiếng Việt tiếng Hán vẫn có ai đó chấp nhận và sử dụng tr... \n",
|
1448 |
+
"\n",
|
1449 |
+
" bm25_text \\\n",
|
1450 |
+
"0 tiếng việt cũng gọi là tiếng việt nam hay việt... \n",
|
1451 |
+
"1 hệ thống thanh điệu phát triển cao hơn hệ thốn... \n",
|
1452 |
+
"2 tiếp xúc hán – việt thành 2 giai đoạn chính bu... \n",
|
1453 |
+
"3 thêm hàng loạt các yếu tố hán–việt như là chủ ... \n",
|
1454 |
+
"4 tiếng hán vẫn có ai đó chấp nhận và sử dụng tr... \n",
|
1455 |
+
"\n",
|
1456 |
+
" cleaned_text valid \n",
|
1457 |
+
"0 Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi... True \n",
|
1458 |
+
"1 hệ thống thanh điệu phát triển cao hơn , hệ th... True \n",
|
1459 |
+
"2 tiếp xúc Hán – Việt thành 2 giai đoạn chính : ... True \n",
|
1460 |
+
"3 thêm hàng loạt các yếu tố Hán–Việt . Như là \" ... True \n",
|
1461 |
+
"4 tiếng Hán vẫn có ai đó chấp nhận và sử dụng tr... True "
|
1462 |
+
]
|
1463 |
+
},
|
1464 |
+
"execution_count": 12,
|
1465 |
+
"metadata": {},
|
1466 |
+
"output_type": "execute_result"
|
1467 |
+
}
|
1468 |
+
],
|
1469 |
+
"source": [
|
1470 |
+
"df.head()"
|
1471 |
+
]
|
1472 |
+
},
|
1473 |
+
{
|
1474 |
+
"cell_type": "code",
|
1475 |
+
"execution_count": 13,
|
1476 |
+
"metadata": {},
|
1477 |
+
"outputs": [],
|
1478 |
+
"source": [
|
1479 |
+
"train_df = pd.DataFrame()\n",
|
1480 |
+
"train_df[\"id\"] = [x['id'] for x in train['data']]\n",
|
1481 |
+
"train_df[\"title\"] = [x['title'] for x in train['data']]\n",
|
1482 |
+
"train_df[\"text\"] = [x['text'] for x in train['data']]"
|
1483 |
+
]
|
1484 |
+
},
|
1485 |
+
{
|
1486 |
+
"cell_type": "code",
|
1487 |
+
"execution_count": 16,
|
1488 |
+
"metadata": {},
|
1489 |
+
"outputs": [
|
1490 |
+
{
|
1491 |
+
"data": {
|
1492 |
+
"text/html": [
|
1493 |
+
"<div>\n",
|
1494 |
+
"<style scoped>\n",
|
1495 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1496 |
+
" vertical-align: middle;\n",
|
1497 |
+
" }\n",
|
1498 |
+
"\n",
|
1499 |
+
" .dataframe tbody tr th {\n",
|
1500 |
+
" vertical-align: top;\n",
|
1501 |
+
" }\n",
|
1502 |
+
"\n",
|
1503 |
+
" .dataframe thead th {\n",
|
1504 |
+
" text-align: right;\n",
|
1505 |
+
" }\n",
|
1506 |
+
"</style>\n",
|
1507 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1508 |
+
" <thead>\n",
|
1509 |
+
" <tr style=\"text-align: right;\">\n",
|
1510 |
+
" <th></th>\n",
|
1511 |
+
" <th>id</th>\n",
|
1512 |
+
" <th>title</th>\n",
|
1513 |
+
" <th>text</th>\n",
|
1514 |
+
" </tr>\n",
|
1515 |
+
" </thead>\n",
|
1516 |
+
" <tbody>\n",
|
1517 |
+
" <tr>\n",
|
1518 |
+
" <th>0</th>\n",
|
1519 |
+
" <td>718d41cd997b2b44b0685ac54aa55bd8</td>\n",
|
1520 |
+
" <td>Trung Quốc</td>\n",
|
1521 |
+
" <td>Thủ tướng Trung Quốc là nhân vật lãnh đạo chín...</td>\n",
|
1522 |
+
" </tr>\n",
|
1523 |
+
" <tr>\n",
|
1524 |
+
" <th>1</th>\n",
|
1525 |
+
" <td>c926e7b0717202618a10dd907d4b4c39</td>\n",
|
1526 |
+
" <td></td>\n",
|
1527 |
+
" <td>có 23 quốc gia không có lực lượng quân đội, ba...</td>\n",
|
1528 |
+
" </tr>\n",
|
1529 |
+
" <tr>\n",
|
1530 |
+
" <th>2</th>\n",
|
1531 |
+
" <td>d38ef5bf1fb82b410026ed82c8a44cae</td>\n",
|
1532 |
+
" <td>Raymondienne</td>\n",
|
1533 |
+
" <td>Raymondienne (hay Raymonde Dien) sinh ngày 13 ...</td>\n",
|
1534 |
+
" </tr>\n",
|
1535 |
+
" <tr>\n",
|
1536 |
+
" <th>3</th>\n",
|
1537 |
+
" <td>b6b5589a98fdccd208dc752bac853993</td>\n",
|
1538 |
+
" <td>Cúp cờ vua thế giới</td>\n",
|
1539 |
+
" <td>Cúp cờ vua thế giới là tên gọi một số giải đấu...</td>\n",
|
1540 |
+
" </tr>\n",
|
1541 |
+
" <tr>\n",
|
1542 |
+
" <th>4</th>\n",
|
1543 |
+
" <td>82396a18fa9812bfec4d3ecb7ae60905</td>\n",
|
1544 |
+
" <td>Shkhara</td>\n",
|
1545 |
+
" <td>Đỉnh núi nằm ở phần trung tâm của dãy núi Đại ...</td>\n",
|
1546 |
+
" </tr>\n",
|
1547 |
+
" <tr>\n",
|
1548 |
+
" <th>...</th>\n",
|
1549 |
+
" <td>...</td>\n",
|
1550 |
+
" <td>...</td>\n",
|
1551 |
+
" <td>...</td>\n",
|
1552 |
+
" </tr>\n",
|
1553 |
+
" <tr>\n",
|
1554 |
+
" <th>20852</th>\n",
|
1555 |
+
" <td>508022f540c39fe31511f594748759bc</td>\n",
|
1556 |
+
" <td>Eros</td>\n",
|
1557 |
+
" <td>Trong thần thoại Hy Lạp , \" Eros \" là vị thần ...</td>\n",
|
1558 |
+
" </tr>\n",
|
1559 |
+
" <tr>\n",
|
1560 |
+
" <th>20853</th>\n",
|
1561 |
+
" <td>93c746695c50932ac45ac498a192a3e5</td>\n",
|
1562 |
+
" <td>Lịch sử hành chính Hà Nội</td>\n",
|
1563 |
+
" <td>Vào thời điểm hiện tại ( 2017 ) , về mặt hành ...</td>\n",
|
1564 |
+
" </tr>\n",
|
1565 |
+
" <tr>\n",
|
1566 |
+
" <th>20854</th>\n",
|
1567 |
+
" <td>c477d4b40045ee4251cf9b2a0482cfc4</td>\n",
|
1568 |
+
" <td>Nhật ký trong tù</td>\n",
|
1569 |
+
" <td>“ Nhật ký trong tù ” là một cuốn sổ tay nhỏ , ...</td>\n",
|
1570 |
+
" </tr>\n",
|
1571 |
+
" <tr>\n",
|
1572 |
+
" <th>20855</th>\n",
|
1573 |
+
" <td>278ad127825c085a54fa22116c281f92</td>\n",
|
1574 |
+
" <td>Google</td>\n",
|
1575 |
+
" <td>Tên miền www.google.com được đăng ký ngày 15 t...</td>\n",
|
1576 |
+
" </tr>\n",
|
1577 |
+
" <tr>\n",
|
1578 |
+
" <th>20856</th>\n",
|
1579 |
+
" <td>09ee53a835ea4ed2234aee8161b16d87</td>\n",
|
1580 |
+
" <td>Dãy núi Hoàng Liên Sơn</td>\n",
|
1581 |
+
" <td>Dãy núi Hoàng Liên Sơn rộng 30 km, chạy dài 18...</td>\n",
|
1582 |
+
" </tr>\n",
|
1583 |
+
" </tbody>\n",
|
1584 |
+
"</table>\n",
|
1585 |
+
"<p>20857 rows × 3 columns</p>\n",
|
1586 |
+
"</div>"
|
1587 |
+
],
|
1588 |
+
"text/plain": [
|
1589 |
+
" id title \\\n",
|
1590 |
+
"0 718d41cd997b2b44b0685ac54aa55bd8 Trung Quốc \n",
|
1591 |
+
"1 c926e7b0717202618a10dd907d4b4c39 \n",
|
1592 |
+
"2 d38ef5bf1fb82b410026ed82c8a44cae Raymondienne \n",
|
1593 |
+
"3 b6b5589a98fdccd208dc752bac853993 Cúp cờ vua thế giới \n",
|
1594 |
+
"4 82396a18fa9812bfec4d3ecb7ae60905 Shkhara \n",
|
1595 |
+
"... ... ... \n",
|
1596 |
+
"20852 508022f540c39fe31511f594748759bc Eros \n",
|
1597 |
+
"20853 93c746695c50932ac45ac498a192a3e5 Lịch sử hành chính Hà Nội \n",
|
1598 |
+
"20854 c477d4b40045ee4251cf9b2a0482cfc4 Nhật ký trong tù \n",
|
1599 |
+
"20855 278ad127825c085a54fa22116c281f92 Google \n",
|
1600 |
+
"20856 09ee53a835ea4ed2234aee8161b16d87 Dãy núi Hoàng Liên Sơn \n",
|
1601 |
+
"\n",
|
1602 |
+
" text \n",
|
1603 |
+
"0 Thủ tướng Trung Quốc là nhân vật lãnh đạo chín... \n",
|
1604 |
+
"1 có 23 quốc gia không có lực lượng quân đội, ba... \n",
|
1605 |
+
"2 Raymondienne (hay Raymonde Dien) sinh ngày 13 ... \n",
|
1606 |
+
"3 Cúp cờ vua thế giới là tên gọi một số giải đấu... \n",
|
1607 |
+
"4 Đỉnh núi nằm ở phần trung tâm của dãy núi Đại ... \n",
|
1608 |
+
"... ... \n",
|
1609 |
+
"20852 Trong thần thoại Hy Lạp , \" Eros \" là vị thần ... \n",
|
1610 |
+
"20853 Vào thời điểm hiện tại ( 2017 ) , về mặt hành ... \n",
|
1611 |
+
"20854 “ Nhật ký trong tù ” là một cuốn sổ tay nhỏ , ... \n",
|
1612 |
+
"20855 Tên miền www.google.com được đăng ký ngày 15 t... \n",
|
1613 |
+
"20856 Dãy núi Hoàng Liên Sơn rộng 30 km, chạy dài 18... \n",
|
1614 |
+
"\n",
|
1615 |
+
"[20857 rows x 3 columns]"
|
1616 |
+
]
|
1617 |
+
},
|
1618 |
+
"execution_count": 16,
|
1619 |
+
"metadata": {},
|
1620 |
+
"output_type": "execute_result"
|
1621 |
+
}
|
1622 |
+
],
|
1623 |
+
"source": [
|
1624 |
+
"train_df"
|
1625 |
+
]
|
1626 |
+
},
|
1627 |
+
{
|
1628 |
+
"cell_type": "code",
|
1629 |
+
"execution_count": 50,
|
1630 |
+
"metadata": {},
|
1631 |
+
"outputs": [
|
1632 |
+
{
|
1633 |
+
"data": {
|
1634 |
+
"text/plain": [
|
1635 |
+
"True"
|
1636 |
+
]
|
1637 |
+
},
|
1638 |
+
"execution_count": 50,
|
1639 |
+
"metadata": {},
|
1640 |
+
"output_type": "execute_result"
|
1641 |
+
}
|
1642 |
+
],
|
1643 |
+
"source": [
|
1644 |
+
"len(train_titles)\n",
|
1645 |
+
"'Bánh gai Tứ Trụ' in list(train_titles - set(df['title'].unique())) "
|
1646 |
+
]
|
1647 |
+
},
|
1648 |
+
{
|
1649 |
+
"cell_type": "markdown",
|
1650 |
+
"metadata": {},
|
1651 |
+
"source": [
|
1652 |
+
"Nếu `title` của `train_df` không nằm trong `df` thì đó là cleaned `title` (nghĩa là chỉ có một kết quả), ngược lại nếu nằm trong `df` thì là dirty (có nhiều ứng viên cho kết quả dựa vào thuật toán ranked bm25). "
|
1653 |
+
]
|
1654 |
+
},
|
1655 |
+
{
|
1656 |
+
"cell_type": "code",
|
1657 |
+
"execution_count": 51,
|
1658 |
+
"metadata": {},
|
1659 |
+
"outputs": [],
|
1660 |
+
"source": [
|
1661 |
+
"def find_dirty_text(title, text):\n",
|
1662 |
+
" text= \" \".join(word_tokenize(strip_context(text)))\n",
|
1663 |
+
" if text[-1] == \".\":\n",
|
1664 |
+
" text = text[:-1].strip()\n",
|
1665 |
+
" tmp = df[df.title == title].reset_index(drop=True)\n",
|
1666 |
+
" if len(tmp) == 0:\n",
|
1667 |
+
"# print(title)\n",
|
1668 |
+
" return None\n",
|
1669 |
+
" for candidate in tmp.cleaned_text:\n",
|
1670 |
+
" if text in candidate:\n",
|
1671 |
+
"# print(\"Found dirty\")\n",
|
1672 |
+
" return candidate\n",
|
1673 |
+
" return text"
|
1674 |
+
]
|
1675 |
+
},
|
1676 |
+
{
|
1677 |
+
"cell_type": "code",
|
1678 |
+
"execution_count": 52,
|
1679 |
+
"metadata": {},
|
1680 |
+
"outputs": [
|
1681 |
+
{
|
1682 |
+
"data": {
|
1683 |
+
"application/vnd.jupyter.widget-view+json": {
|
1684 |
+
"model_id": "cad2c798e15e47f09d4238f0280e35c9",
|
1685 |
+
"version_major": 2,
|
1686 |
+
"version_minor": 0
|
1687 |
+
},
|
1688 |
+
"text/plain": [
|
1689 |
+
"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3477), Label(value='0 / 3477'))), …"
|
1690 |
+
]
|
1691 |
+
},
|
1692 |
+
"metadata": {},
|
1693 |
+
"output_type": "display_data"
|
1694 |
+
}
|
1695 |
+
],
|
1696 |
+
"source": [
|
1697 |
+
"train_df[\"dirty_text\"] = train_df.parallel_apply(lambda row: find_dirty_text(row.title, row.text),axis=1)"
|
1698 |
+
]
|
1699 |
+
},
|
1700 |
+
{
|
1701 |
+
"cell_type": "code",
|
1702 |
+
"execution_count": 53,
|
1703 |
+
"metadata": {},
|
1704 |
+
"outputs": [
|
1705 |
+
{
|
1706 |
+
"data": {
|
1707 |
+
"text/html": [
|
1708 |
+
"<div>\n",
|
1709 |
+
"<style scoped>\n",
|
1710 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1711 |
+
" vertical-align: middle;\n",
|
1712 |
+
" }\n",
|
1713 |
+
"\n",
|
1714 |
+
" .dataframe tbody tr th {\n",
|
1715 |
+
" vertical-align: top;\n",
|
1716 |
+
" }\n",
|
1717 |
+
"\n",
|
1718 |
+
" .dataframe thead th {\n",
|
1719 |
+
" text-align: right;\n",
|
1720 |
+
" }\n",
|
1721 |
+
"</style>\n",
|
1722 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1723 |
+
" <thead>\n",
|
1724 |
+
" <tr style=\"text-align: right;\">\n",
|
1725 |
+
" <th></th>\n",
|
1726 |
+
" <th>id</th>\n",
|
1727 |
+
" <th>title</th>\n",
|
1728 |
+
" <th>text</th>\n",
|
1729 |
+
" <th>dirty_text</th>\n",
|
1730 |
+
" </tr>\n",
|
1731 |
+
" </thead>\n",
|
1732 |
+
" <tbody>\n",
|
1733 |
+
" <tr>\n",
|
1734 |
+
" <th>0</th>\n",
|
1735 |
+
" <td>718d41cd997b2b44b0685ac54aa55bd8</td>\n",
|
1736 |
+
" <td>Trung Quốc</td>\n",
|
1737 |
+
" <td>Thủ tướng Trung Quốc là nhân vật lãnh đạo chín...</td>\n",
|
1738 |
+
" <td>Thủ tướng Trung Quốc là nhân vật lãnh đạo chín...</td>\n",
|
1739 |
+
" </tr>\n",
|
1740 |
+
" <tr>\n",
|
1741 |
+
" <th>1</th>\n",
|
1742 |
+
" <td>c926e7b0717202618a10dd907d4b4c39</td>\n",
|
1743 |
+
" <td></td>\n",
|
1744 |
+
" <td>có 23 quốc gia không có lực lượng quân đội, ba...</td>\n",
|
1745 |
+
" <td>None</td>\n",
|
1746 |
+
" </tr>\n",
|
1747 |
+
" <tr>\n",
|
1748 |
+
" <th>2</th>\n",
|
1749 |
+
" <td>d38ef5bf1fb82b410026ed82c8a44cae</td>\n",
|
1750 |
+
" <td>Raymondienne</td>\n",
|
1751 |
+
" <td>Raymondienne (hay Raymonde Dien) sinh ngày 13 ...</td>\n",
|
1752 |
+
" <td>None</td>\n",
|
1753 |
+
" </tr>\n",
|
1754 |
+
" <tr>\n",
|
1755 |
+
" <th>3</th>\n",
|
1756 |
+
" <td>b6b5589a98fdccd208dc752bac853993</td>\n",
|
1757 |
+
" <td>Cúp cờ vua thế giới</td>\n",
|
1758 |
+
" <td>Cúp cờ vua thế giới là tên gọi một số giải đấu...</td>\n",
|
1759 |
+
" <td>Cúp cờ vua thế giới là tên gọi một số giải đấu...</td>\n",
|
1760 |
+
" </tr>\n",
|
1761 |
+
" <tr>\n",
|
1762 |
+
" <th>4</th>\n",
|
1763 |
+
" <td>82396a18fa9812bfec4d3ecb7ae60905</td>\n",
|
1764 |
+
" <td>Shkhara</td>\n",
|
1765 |
+
" <td>Đỉnh núi nằm ở phần trung tâm của dãy núi Đại ...</td>\n",
|
1766 |
+
" <td>Shkhara ( ) là núi cao thứ ba trong dãy núi Ka...</td>\n",
|
1767 |
+
" </tr>\n",
|
1768 |
+
" <tr>\n",
|
1769 |
+
" <th>...</th>\n",
|
1770 |
+
" <td>...</td>\n",
|
1771 |
+
" <td>...</td>\n",
|
1772 |
+
" <td>...</td>\n",
|
1773 |
+
" <td>...</td>\n",
|
1774 |
+
" </tr>\n",
|
1775 |
+
" <tr>\n",
|
1776 |
+
" <th>20852</th>\n",
|
1777 |
+
" <td>508022f540c39fe31511f594748759bc</td>\n",
|
1778 |
+
" <td>Eros</td>\n",
|
1779 |
+
" <td>Trong thần thoại Hy Lạp , \" Eros \" là vị thần ...</td>\n",
|
1780 |
+
" <td>Eros Trong thần thoại Hy Lạp , \" Eros \" là vị ...</td>\n",
|
1781 |
+
" </tr>\n",
|
1782 |
+
" <tr>\n",
|
1783 |
+
" <th>20853</th>\n",
|
1784 |
+
" <td>93c746695c50932ac45ac498a192a3e5</td>\n",
|
1785 |
+
" <td>Lịch sử hành chính Hà Nội</td>\n",
|
1786 |
+
" <td>Vào thời điểm hiện tại ( 2017 ) , về mặt hành ...</td>\n",
|
1787 |
+
" <td>Vào thời điểm hiện tại ( 2017 ) , về mặt hành ...</td>\n",
|
1788 |
+
" </tr>\n",
|
1789 |
+
" <tr>\n",
|
1790 |
+
" <th>20854</th>\n",
|
1791 |
+
" <td>c477d4b40045ee4251cf9b2a0482cfc4</td>\n",
|
1792 |
+
" <td>Nhật ký trong tù</td>\n",
|
1793 |
+
" <td>“ Nhật ký trong tù ” là một cuốn sổ tay nhỏ , ...</td>\n",
|
1794 |
+
" <td>“ Nhật ký trong tù ” là một cuốn sổ tay nhỏ , ...</td>\n",
|
1795 |
+
" </tr>\n",
|
1796 |
+
" <tr>\n",
|
1797 |
+
" <th>20855</th>\n",
|
1798 |
+
" <td>278ad127825c085a54fa22116c281f92</td>\n",
|
1799 |
+
" <td>Google</td>\n",
|
1800 |
+
" <td>Tên miền www.google.com được đăng ký ngày 15 t...</td>\n",
|
1801 |
+
" <td>Tên miền www.google.com được đăng ký ngày 15 t...</td>\n",
|
1802 |
+
" </tr>\n",
|
1803 |
+
" <tr>\n",
|
1804 |
+
" <th>20856</th>\n",
|
1805 |
+
" <td>09ee53a835ea4ed2234aee8161b16d87</td>\n",
|
1806 |
+
" <td>Dãy núi Hoàng Liên Sơn</td>\n",
|
1807 |
+
" <td>Dãy núi Hoàng Liên Sơn rộng 30 km, chạy dài 18...</td>\n",
|
1808 |
+
" <td>Dãy núi Hoàng Liên Sơn rộng 30 km , chạy dài 1...</td>\n",
|
1809 |
+
" </tr>\n",
|
1810 |
+
" </tbody>\n",
|
1811 |
+
"</table>\n",
|
1812 |
+
"<p>20857 rows × 4 columns</p>\n",
|
1813 |
+
"</div>"
|
1814 |
+
],
|
1815 |
+
"text/plain": [
|
1816 |
+
" id title \\\n",
|
1817 |
+
"0 718d41cd997b2b44b0685ac54aa55bd8 Trung Quốc \n",
|
1818 |
+
"1 c926e7b0717202618a10dd907d4b4c39 \n",
|
1819 |
+
"2 d38ef5bf1fb82b410026ed82c8a44cae Raymondienne \n",
|
1820 |
+
"3 b6b5589a98fdccd208dc752bac853993 Cúp cờ vua thế giới \n",
|
1821 |
+
"4 82396a18fa9812bfec4d3ecb7ae60905 Shkhara \n",
|
1822 |
+
"... ... ... \n",
|
1823 |
+
"20852 508022f540c39fe31511f594748759bc Eros \n",
|
1824 |
+
"20853 93c746695c50932ac45ac498a192a3e5 Lịch sử hành chính Hà Nội \n",
|
1825 |
+
"20854 c477d4b40045ee4251cf9b2a0482cfc4 Nhật ký trong tù \n",
|
1826 |
+
"20855 278ad127825c085a54fa22116c281f92 Google \n",
|
1827 |
+
"20856 09ee53a835ea4ed2234aee8161b16d87 Dãy núi Hoàng Liên Sơn \n",
|
1828 |
+
"\n",
|
1829 |
+
" text \\\n",
|
1830 |
+
"0 Thủ tướng Trung Quốc là nhân vật lãnh đạo chín... \n",
|
1831 |
+
"1 có 23 quốc gia không có lực lượng quân đội, ba... \n",
|
1832 |
+
"2 Raymondienne (hay Raymonde Dien) sinh ngày 13 ... \n",
|
1833 |
+
"3 Cúp cờ vua thế giới là tên gọi một số giải đấu... \n",
|
1834 |
+
"4 Đỉnh núi nằm ở phần trung tâm của dãy núi Đại ... \n",
|
1835 |
+
"... ... \n",
|
1836 |
+
"20852 Trong thần thoại Hy Lạp , \" Eros \" là vị thần ... \n",
|
1837 |
+
"20853 Vào thời điểm hiện tại ( 2017 ) , về mặt hành ... \n",
|
1838 |
+
"20854 “ Nhật ký trong tù ” là một cuốn sổ tay nhỏ , ... \n",
|
1839 |
+
"20855 Tên miền www.google.com được đăng ký ngày 15 t... \n",
|
1840 |
+
"20856 Dãy núi Hoàng Liên Sơn rộng 30 km, chạy dài 18... \n",
|
1841 |
+
"\n",
|
1842 |
+
" dirty_text \n",
|
1843 |
+
"0 Thủ tướng Trung Quốc là nhân vật lãnh đạo chín... \n",
|
1844 |
+
"1 None \n",
|
1845 |
+
"2 None \n",
|
1846 |
+
"3 Cúp cờ vua thế giới là tên gọi một số giải đấu... \n",
|
1847 |
+
"4 Shkhara ( ) là núi cao thứ ba trong dãy núi Ka... \n",
|
1848 |
+
"... ... \n",
|
1849 |
+
"20852 Eros Trong thần thoại Hy Lạp , \" Eros \" là vị ... \n",
|
1850 |
+
"20853 Vào thời điểm hiện tại ( 2017 ) , về mặt hành ... \n",
|
1851 |
+
"20854 “ Nhật ký trong tù ” là một cuốn sổ tay nhỏ , ... \n",
|
1852 |
+
"20855 Tên miền www.google.com được đăng ký ngày 15 t... \n",
|
1853 |
+
"20856 Dãy núi Hoàng Liên Sơn rộng 30 km , chạy dài 1... \n",
|
1854 |
+
"\n",
|
1855 |
+
"[20857 rows x 4 columns]"
|
1856 |
+
]
|
1857 |
+
},
|
1858 |
+
"execution_count": 53,
|
1859 |
+
"metadata": {},
|
1860 |
+
"output_type": "execute_result"
|
1861 |
+
}
|
1862 |
+
],
|
1863 |
+
"source": [
|
1864 |
+
"train_df"
|
1865 |
+
]
|
1866 |
+
},
|
1867 |
+
{
|
1868 |
+
"cell_type": "code",
|
1869 |
+
"execution_count": 54,
|
1870 |
+
"metadata": {},
|
1871 |
+
"outputs": [
|
1872 |
+
{
|
1873 |
+
"data": {
|
1874 |
+
"text/html": [
|
1875 |
+
"<div>\n",
|
1876 |
+
"<style scoped>\n",
|
1877 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1878 |
+
" vertical-align: middle;\n",
|
1879 |
+
" }\n",
|
1880 |
+
"\n",
|
1881 |
+
" .dataframe tbody tr th {\n",
|
1882 |
+
" vertical-align: top;\n",
|
1883 |
+
" }\n",
|
1884 |
+
"\n",
|
1885 |
+
" .dataframe thead th {\n",
|
1886 |
+
" text-align: right;\n",
|
1887 |
+
" }\n",
|
1888 |
+
"</style>\n",
|
1889 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1890 |
+
" <thead>\n",
|
1891 |
+
" <tr style=\"text-align: right;\">\n",
|
1892 |
+
" <th></th>\n",
|
1893 |
+
" <th>id</th>\n",
|
1894 |
+
" <th>title</th>\n",
|
1895 |
+
" <th>text</th>\n",
|
1896 |
+
" <th>dirty_text</th>\n",
|
1897 |
+
" </tr>\n",
|
1898 |
+
" </thead>\n",
|
1899 |
+
" <tbody>\n",
|
1900 |
+
" <tr>\n",
|
1901 |
+
" <th>1</th>\n",
|
1902 |
+
" <td>c926e7b0717202618a10dd907d4b4c39</td>\n",
|
1903 |
+
" <td></td>\n",
|
1904 |
+
" <td>có 23 quốc gia không có lực lượng quân đội, ba...</td>\n",
|
1905 |
+
" <td>None</td>\n",
|
1906 |
+
" </tr>\n",
|
1907 |
+
" <tr>\n",
|
1908 |
+
" <th>2</th>\n",
|
1909 |
+
" <td>d38ef5bf1fb82b410026ed82c8a44cae</td>\n",
|
1910 |
+
" <td>Raymondienne</td>\n",
|
1911 |
+
" <td>Raymondienne (hay Raymonde Dien) sinh ngày 13 ...</td>\n",
|
1912 |
+
" <td>None</td>\n",
|
1913 |
+
" </tr>\n",
|
1914 |
+
" <tr>\n",
|
1915 |
+
" <th>14</th>\n",
|
1916 |
+
" <td>361a30769c1a5dca2a7b8f5c7f601982</td>\n",
|
1917 |
+
" <td>Bánh gai Tứ Trụ</td>\n",
|
1918 |
+
" <td>Làng Mía thuộc hữu ngạn sông Chu, cách thị trấ...</td>\n",
|
1919 |
+
" <td>None</td>\n",
|
1920 |
+
" </tr>\n",
|
1921 |
+
" <tr>\n",
|
1922 |
+
" <th>18</th>\n",
|
1923 |
+
" <td>5d7d3b0d5cd7b2917a2081c3b4d7c8a8</td>\n",
|
1924 |
+
" <td>Nhà nước Hồi giáo ( chính thể )</td>\n",
|
1925 |
+
" <td>Khái niệm về nhà nước Hồi giáo hiện đại đã đượ...</td>\n",
|
1926 |
+
" <td>None</td>\n",
|
1927 |
+
" </tr>\n",
|
1928 |
+
" <tr>\n",
|
1929 |
+
" <th>20</th>\n",
|
1930 |
+
" <td>d7000d411d3f04dee7fa327ef11e3db0</td>\n",
|
1931 |
+
" <td>Quan hệ Israel – Liban</td>\n",
|
1932 |
+
" <td>Thủ tướng Liban Fouad Siniora cho biết vào thá...</td>\n",
|
1933 |
+
" <td>None</td>\n",
|
1934 |
+
" </tr>\n",
|
1935 |
+
" <tr>\n",
|
1936 |
+
" <th>...</th>\n",
|
1937 |
+
" <td>...</td>\n",
|
1938 |
+
" <td>...</td>\n",
|
1939 |
+
" <td>...</td>\n",
|
1940 |
+
" <td>...</td>\n",
|
1941 |
+
" </tr>\n",
|
1942 |
+
" <tr>\n",
|
1943 |
+
" <th>20804</th>\n",
|
1944 |
+
" <td>cb4d3b162a56b52a4927ab29b739c7a2</td>\n",
|
1945 |
+
" <td>Thủy điện An Khê - Kanak</td>\n",
|
1946 |
+
" <td>Nhà máy được khởi công xây dựng vào ngày 14 th...</td>\n",
|
1947 |
+
" <td>None</td>\n",
|
1948 |
+
" </tr>\n",
|
1949 |
+
" <tr>\n",
|
1950 |
+
" <th>20834</th>\n",
|
1951 |
+
" <td>d701dfcba59420604c66e803c9556754</td>\n",
|
1952 |
+
" <td>Sao Hoả</td>\n",
|
1953 |
+
" <td>Sao Hoả còn gọi là : Hoả Tinh , ( Tiếng Anh : ...</td>\n",
|
1954 |
+
" <td>None</td>\n",
|
1955 |
+
" </tr>\n",
|
1956 |
+
" <tr>\n",
|
1957 |
+
" <th>20836</th>\n",
|
1958 |
+
" <td>5769cee0487a9674306d271854244b39</td>\n",
|
1959 |
+
" <td></td>\n",
|
1960 |
+
" <td>Máy quay phim: Phát minh kỳ diệu của anh em Lu...</td>\n",
|
1961 |
+
" <td>None</td>\n",
|
1962 |
+
" </tr>\n",
|
1963 |
+
" <tr>\n",
|
1964 |
+
" <th>20843</th>\n",
|
1965 |
+
" <td>5f420b6d68436b684f33dcded61d5a3c</td>\n",
|
1966 |
+
" <td>Egil Kapstad</td>\n",
|
1967 |
+
" <td>Ông sinh ra tại Oslo, thủ đô của Na Uy và thàn...</td>\n",
|
1968 |
+
" <td>None</td>\n",
|
1969 |
+
" </tr>\n",
|
1970 |
+
" <tr>\n",
|
1971 |
+
" <th>20846</th>\n",
|
1972 |
+
" <td>323218bcf129df929716884a1a89e2e3</td>\n",
|
1973 |
+
" <td></td>\n",
|
1974 |
+
" <td>Vật lý học (tiếng Anh: Physics, từ tiếng Hy Lạ...</td>\n",
|
1975 |
+
" <td>None</td>\n",
|
1976 |
+
" </tr>\n",
|
1977 |
+
" </tbody>\n",
|
1978 |
+
"</table>\n",
|
1979 |
+
"<p>3498 rows × 4 columns</p>\n",
|
1980 |
+
"</div>"
|
1981 |
+
],
|
1982 |
+
"text/plain": [
|
1983 |
+
" id title \\\n",
|
1984 |
+
"1 c926e7b0717202618a10dd907d4b4c39 \n",
|
1985 |
+
"2 d38ef5bf1fb82b410026ed82c8a44cae Raymondienne \n",
|
1986 |
+
"14 361a30769c1a5dca2a7b8f5c7f601982 Bánh gai Tứ Trụ \n",
|
1987 |
+
"18 5d7d3b0d5cd7b2917a2081c3b4d7c8a8 Nhà nước Hồi giáo ( chính thể ) \n",
|
1988 |
+
"20 d7000d411d3f04dee7fa327ef11e3db0 Quan hệ Israel – Liban \n",
|
1989 |
+
"... ... ... \n",
|
1990 |
+
"20804 cb4d3b162a56b52a4927ab29b739c7a2 Thủy điện An Khê - Kanak \n",
|
1991 |
+
"20834 d701dfcba59420604c66e803c9556754 Sao Hoả \n",
|
1992 |
+
"20836 5769cee0487a9674306d271854244b39 \n",
|
1993 |
+
"20843 5f420b6d68436b684f33dcded61d5a3c Egil Kapstad \n",
|
1994 |
+
"20846 323218bcf129df929716884a1a89e2e3 \n",
|
1995 |
+
"\n",
|
1996 |
+
" text dirty_text \n",
|
1997 |
+
"1 có 23 quốc gia không có lực lượng quân đội, ba... None \n",
|
1998 |
+
"2 Raymondienne (hay Raymonde Dien) sinh ngày 13 ... None \n",
|
1999 |
+
"14 Làng Mía thuộc hữu ngạn sông Chu, cách thị trấ... None \n",
|
2000 |
+
"18 Khái niệm về nhà nước Hồi giáo hiện đại đã đượ... None \n",
|
2001 |
+
"20 Thủ tướng Liban Fouad Siniora cho biết vào thá... None \n",
|
2002 |
+
"... ... ... \n",
|
2003 |
+
"20804 Nhà máy được khởi công xây dựng vào ngày 14 th... None \n",
|
2004 |
+
"20834 Sao Hoả còn gọi là : Hoả Tinh , ( Tiếng Anh : ... None \n",
|
2005 |
+
"20836 Máy quay phim: Phát minh kỳ diệu của anh em Lu... None \n",
|
2006 |
+
"20843 Ông sinh ra tại Oslo, thủ đô của Na Uy và thàn... None \n",
|
2007 |
+
"20846 Vật lý học (tiếng Anh: Physics, từ tiếng Hy Lạ... None \n",
|
2008 |
+
"\n",
|
2009 |
+
"[3498 rows x 4 columns]"
|
2010 |
+
]
|
2011 |
+
},
|
2012 |
+
"execution_count": 54,
|
2013 |
+
"metadata": {},
|
2014 |
+
"output_type": "execute_result"
|
2015 |
+
}
|
2016 |
+
],
|
2017 |
+
"source": [
|
2018 |
+
"train_df[train_df.dirty_text.isna()]"
|
2019 |
+
]
|
2020 |
+
},
|
2021 |
+
{
|
2022 |
+
"cell_type": "code",
|
2023 |
+
"execution_count": 55,
|
2024 |
+
"metadata": {},
|
2025 |
+
"outputs": [],
|
2026 |
+
"source": [
|
2027 |
+
"dirty_text_dict = dict([x,y] for x,y in zip(train_df.id, train_df.dirty_text))"
|
2028 |
+
]
|
2029 |
+
},
|
2030 |
+
{
|
2031 |
+
"cell_type": "code",
|
2032 |
+
"execution_count": 56,
|
2033 |
+
"metadata": {},
|
2034 |
+
"outputs": [],
|
2035 |
+
"source": [
|
2036 |
+
"for x in train['data']:\n",
|
2037 |
+
" x['dirty_text'] = dirty_text_dict.get(x['id'],x['text'])"
|
2038 |
+
]
|
2039 |
+
},
|
2040 |
+
{
|
2041 |
+
"cell_type": "code",
|
2042 |
+
"execution_count": 57,
|
2043 |
+
"metadata": {},
|
2044 |
+
"outputs": [],
|
2045 |
+
"source": [
|
2046 |
+
"json.dump(train,open(\"../data/processed/zac2022_train_merged_final.json\",\"wt\"))"
|
2047 |
+
]
|
2048 |
+
}
|
2049 |
+
],
|
2050 |
+
"metadata": {
|
2051 |
+
"kernelspec": {
|
2052 |
+
"display_name": "Python 3 (ipykernel)",
|
2053 |
+
"language": "python",
|
2054 |
+
"name": "python3"
|
2055 |
+
},
|
2056 |
+
"language_info": {
|
2057 |
+
"codemirror_mode": {
|
2058 |
+
"name": "ipython",
|
2059 |
+
"version": 3
|
2060 |
+
},
|
2061 |
+
"file_extension": ".py",
|
2062 |
+
"mimetype": "text/x-python",
|
2063 |
+
"name": "python",
|
2064 |
+
"nbconvert_exporter": "python",
|
2065 |
+
"pygments_lexer": "ipython3",
|
2066 |
+
"version": "3.8.10"
|
2067 |
+
}
|
2068 |
+
},
|
2069 |
+
"nbformat": 4,
|
2070 |
+
"nbformat_minor": 2
|
2071 |
+
}
|
notebooks/0.2-create-stage1-ranking.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/0.3-create-stage2-ranking.ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"cells":[{"attachments":{},"cell_type":"markdown","id":"770bfcdb","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/code/noobhocai/0-3-create-stage2-ranking"]},{"cell_type":"code","execution_count":2,"id":"646a64db","metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:45:51.815046Z","iopub.status.busy":"2023-06-19T04:45:51.814713Z","iopub.status.idle":"2023-06-19T04:47:17.791641Z","shell.execute_reply":"2023-06-19T04:47:17.790706Z","shell.execute_reply.started":"2023-06-19T04:45:51.815020Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting pyspark\n"," Downloading pyspark-3.4.0.tar.gz (310.8 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.8/310.8 MB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n","\u001b[?25hRequirement already satisfied: py4j==0.10.9.7 in /opt/conda/lib/python3.10/site-packages (from pyspark) (0.10.9.7)\n","Building wheels for collected packages: pyspark\n"," Building wheel for pyspark (setup.py) ... \u001b[?25ldone\n","\u001b[?25h Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317146 sha256=932384efffaa19a196f9bb66e53de64eb278df10f2ddf3186116516b1fff67b5\n"," Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327\n","Successfully built pyspark\n","Installing collected packages: pyspark\n","Successfully installed pyspark-3.4.0\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mRequirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (9.0.0)\n","Requirement already satisfied: numpy>=1.16.6 in /opt/conda/lib/python3.10/site-packages (from pyarrow) (1.23.5)\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mRequirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.2.4)\n","Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from nltk) (1.16.0)\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mCollecting tdqm\n"," Downloading tdqm-0.0.1.tar.gz (1.4 kB)\n"," Preparing metadata (setup.py) ... \u001b[?25ldone\n","\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from tdqm) (4.64.1)\n","Building wheels for collected packages: tdqm\n"," Building wheel for tdqm (setup.py) ... 
\u001b[?25ldone\n","\u001b[?25h Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1322 sha256=98dae54af0bf0a85639c88412f77b6a49d485bbc930a83a31edfb2dbc709a9ff\n"," Stored in directory: /root/.cache/pip/wheels/37/31/b8/7b711038035720ba0df14376af06e5e76b9bd61759c861ad92\n","Successfully built tdqm\n","Installing collected packages: tdqm\n","Successfully installed tdqm-0.0.1\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mCollecting rank_bm25\n"," Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)\n","Requirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from rank_bm25) (1.23.5)\n","Installing collected packages: rank_bm25\n","Successfully installed rank_bm25-0.2.2\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mCollecting pyvi\n"," Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.5/8.5 MB\u001b[0m \u001b[31m57.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25hRequirement already satisfied: scikit-learn in /opt/conda/lib/python3.10/site-packages (from pyvi) (1.2.2)\n","Collecting sklearn-crfsuite (from pyvi)\n"," Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)\n","Requirement already satisfied: numpy>=1.17.3 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (1.23.5)\n","Requirement already satisfied: scipy>=1.3.2 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (1.10.1)\n","Requirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (1.2.0)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (3.1.0)\n","Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->pyvi)\n"," Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m993.5/993.5 kB\u001b[0m \u001b[31m55.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from sklearn-crfsuite->pyvi) (1.16.0)\n","Requirement already satisfied: tabulate in /opt/conda/lib/python3.10/site-packages (from sklearn-crfsuite->pyvi) (0.9.0)\n","Requirement already satisfied: tqdm>=2.0 in /opt/conda/lib/python3.10/site-packages (from sklearn-crfsuite->pyvi) (4.64.1)\n","Installing collected packages: python-crfsuite, sklearn-crfsuite, pyvi\n","Successfully installed python-crfsuite-0.9.9 pyvi-0.1.1 sklearn-crfsuite-0.3.6\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install pyspark\n","!pip install pyarrow\n","!pip install nltk\n","!pip install tdqm\n","!pip install rank_bm25\n","# !pip install pandarallel\n","!pip install pyvi"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:17.794768Z","iopub.status.busy":"2023-06-19T04:47:17.794437Z","iopub.status.idle":"2023-06-19T04:47:17.869550Z","shell.execute_reply":"2023-06-19T04:47:17.868392Z","shell.execute_reply.started":"2023-06-19T04:47:17.794729Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["3.4.0\n"]}],"source":["import pyspark\n","print(pyspark.__version__)"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:17.873676Z","iopub.status.busy":"2023-06-19T04:47:17.873324Z","iopub.status.idle":"2023-06-19T04:47:25.790613Z","shell.execute_reply":"2023-06-19T04:47:25.789712Z","shell.execute_reply.started":"2023-06-19T04:47:17.873647Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n"," warnings.warn(\n"]},{"name":"stdout","output_type":"stream","text":["[nltk_data] Downloading package punkt to /usr/share/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n"]},{"name":"stderr","output_type":"stream","text":["Setting default log level to \"WARN\".\n","To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n","23/06/19 04:47:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n"]},{"data":{"text/html":["\n"," <div>\n"," <p><b>SparkSession - in-memory</b></p>\n"," \n"," <div>\n"," <p><b>SparkContext</b></p>\n","\n"," <p><a href=\"http://e65557ad95ad:4040\">Spark UI</a></p>\n","\n"," <dl>\n"," <dt>Version</dt>\n"," <dd><code>v3.4.0</code></dd>\n"," <dt>Master</dt>\n"," <dd><code>local[*]</code></dd>\n"," <dt>AppName</dt>\n"," <dd><code>Pyspark</code></dd>\n"," </dl>\n"," </div>\n"," \n"," </div>\n"," "],"text/plain":["<pyspark.sql.session.SparkSession at 0x7e77e1486620>"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["import json\n","import numpy as np\n","import pandas as pd\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","import pyspark.pandas as ps\n","import re\n","import nltk\n","nltk.download('punkt')\n","from nltk import word_tokenize as lib_tokenizer \n","from pyvi.ViTokenizer import tokenize\n","import string\n","from rank_bm25 import BM25Okapi\n","# from pandarallel import pandarallel\n","\n","import os\n","os.environ[\"PYARROW_IGNORE_TIMEZONE\"] = \"1\"\n","import pyspark as pyspark\n","from pyspark.sql import SparkSession\n","from pyspark.sql.functions import udf, col, lit, lower\n","from pyspark.sql.types import ArrayType, StringType, FloatType, IntegerType\n","MAX_MEMORY = '15G'\n","# Initialize a spark session.\n","# spark.stop()\n","conf = pyspark.SparkConf().setMaster(\"local[*]\") \\\n"," .set('spark.executor.heartbeatInterval', 10000) \\\n"," .set('spark.network.timeout', 10000) \\\n"," .set(\"spark.core.connection.ack.wait.timeout\", \"3600\") \\\n"," .set(\"spark.executor.memory\", MAX_MEMORY) \\\n"," .set(\"spark.driver.memory\", MAX_MEMORY) \\\n"," .set(\"spark.driver.maxResultSize\", \"4g\") \\\n"," .set(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n","spark = SparkSession.builder.appName(\"Pyspark\").config(conf=conf).getOrCreate()\n","spark"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:25.792027Z","iopub.status.busy":"2023-06-19T04:47:25.791731Z","iopub.status.idle":"2023-06-19T04:47:25.799813Z","shell.execute_reply":"2023-06-19T04:47:25.796610Z","shell.execute_reply.started":"2023-06-19T04:47:25.791998Z"},"trusted":true},"outputs":[],"source":["# pandarallel.initialize(progress_bar=True, use_memory_fs=False)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:25.803740Z","iopub.status.busy":"2023-06-19T04:47:25.802887Z","iopub.status.idle":"2023-06-19T04:47:26.433056Z","shell.execute_reply":"2023-06-19T04:47:26.431337Z","shell.execute_reply.started":"2023-06-19T04:47:25.803711Z"},"trusted":true},"outputs":[],"source":["data = json.load(open(\"/kaggle/input/e2eqa-wiki-zalo-ai/e2eqa-trainpublic_test-v1/e2eqa-train+public_test-v1/zac2022_train_merged_final.json\"))\n","data = [x for x in data['data'] if x[\"category\"] == \"FULL_ANNOTATION\" and \"wiki/\" in x[\"answer\"]]"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:26.435414Z","iopub.status.busy":"2023-06-19T04:47:26.435040Z","iopub.status.idle":"2023-06-19T04:47:39.021537Z","shell.execute_reply":"2023-06-19T04:47:39.020732Z","shell.execute_reply.started":"2023-06-19T04:47:26.435385Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":[" \r"]},{"data":{"text/plain":["pyspark.sql.dataframe.DataFrame"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["df_wiki = 
spark.read.json(path = \"/kaggle/input/e2eqa-wiki-zalo-ai/wikipedia_20220620_cleaned/wikipedia_20220620_cleaned.jsonl\")\n","# df_wiki = df_wiki.cache().pandas_api()\n","type(df_wiki)"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:39.022723Z","iopub.status.busy":"2023-06-19T04:47:39.022452Z","iopub.status.idle":"2023-06-19T04:47:39.454049Z","shell.execute_reply":"2023-06-19T04:47:39.453277Z","shell.execute_reply.started":"2023-06-19T04:47:39.022697Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["+---+--------+--------------------+--------------------+----------------+--------------------+\n","| id| revid| text| timestamp| title| url|\n","+---+--------+--------------------+--------------------+----------------+--------------------+\n","| 2|68591979|Trang Chính\\n\\n<t...|2022-05-12T12:46:53Z| Trang Chính|https://vi.wikipe...|\n","| 4|67988747|Internet Society\\...|2022-01-20T07:59:10Z|Internet Society|https://vi.wikipe...|\n","| 13|68660631|Tiếng Việt\\n\\nTiế...|2022-05-29T03:42:42Z| Tiếng Việt|https://vi.wikipe...|\n","| 24|68482118|Ohio\\n\\nOhio (viế...|2022-04-17T08:15:22Z| Ohio|https://vi.wikipe...|\n","| 26|68738039|California\\n\\nCal...|2022-06-16T15:27:07Z| California|https://vi.wikipe...|\n","+---+--------+--------------------+--------------------+----------------+--------------------+\n","only showing top 5 rows\n","\n"]}],"source":["df_wiki.show(5)"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:39.455216Z","iopub.status.busy":"2023-06-19T04:47:39.454950Z","iopub.status.idle":"2023-06-19T04:47:40.096642Z","shell.execute_reply":"2023-06-19T04:47:40.095084Z","shell.execute_reply.started":"2023-06-19T04:47:39.455192Z"},"trusted":true},"outputs":[{"data":{"text/plain":["[Table(name='df_wiki_temp', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["df_wiki.createOrReplaceTempView('df_wiki_temp')\n","spark.catalog.listTables()"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:40.097915Z","iopub.status.busy":"2023-06-19T04:47:40.097624Z","iopub.status.idle":"2023-06-19T04:47:40.130812Z","shell.execute_reply":"2023-06-19T04:47:40.129860Z","shell.execute_reply.started":"2023-06-19T04:47:40.097890Z"},"trusted":true},"outputs":[],"source":["def get_topk(query, topk=100):\n"," tokenized_query = query.split()\n"," tfidf_query = dictionary.doc2bow(tokenized_query)\n"," scores = bm25_index[tfidf_query]\n"," top_n = np.argsort(scores)[::-1][:topk]\n"," titles = [df_wiki.filter(col(\"id\") == i).select(\"title\").collect()[0][0] for i in top_n]\n"," texts = [df_wiki.filter(col(\"id\") == i).select(\"text\").collect()[0][0] for i in top_n]\n"," scores = [scores[i] for i in top_n]\n"," result = \"\\n\".join([f\"{title}\\t{text}\\t{score}\" for title, text, score in zip(titles, texts, scores)])\n"," return result\n","\n","get_topk_udf = udf(get_topk, StringType())\n","\n","dict_map = {}\n","\n","def word_tokenize(text): \n"," global dict_map\n"," words = text.split() \n"," words_norm = [] \n"," for w in words: \n"," if w not in dict_map: \n"," dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n"," words_norm.append(dict_map[w]) \n"," return words_norm \n","\n","word_tokenize_udf = udf(word_tokenize, ArrayType(StringType()))\n","\n","def 
strip_answer_string(text):\n"," text = text.strip() \n"," while text[-1] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] != '(' and text[-1] == ')' and '(' in text: \n"," break \n"," if text[-1] == '\"' and text[0] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[:-1].strip() \n"," while text[0] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] == '\"' and text[-1] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[1:].strip() \n"," text = text.strip() \n"," return text \n","\n","strip_answer_string_udf = udf(strip_answer_string, StringType())\n","\n","def strip_context(text):\n"," text = text.replace('\\n', ' ') # thay kí tự kết thúc bằng khoảng trắng\n"," text = re.sub(r'\\s+', ' ', text) # thay khoảng trắng lớn hơn 1 kí tự thành khoảng trắng 1 kí tự\n"," text = text.strip() # xoá khoảng trắng ở đầu và cuối \n"," return text \n","\n","strip_context_udf = udf(strip_context, StringType())\n","\n","def post_process(x):\n"," x = x.lower()\n"," x = \" \".join(word_tokenize(strip_context(x))).strip()\n"," x = x.replace(\"\\n\",\" \")\n"," x = \"\".join([i for i in x if i not in string.punctuation])\n"," x = \" \".join(x.split()[:128])\n"," return x \n","\n","post_process_udf = udf(post_process, StringType())"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:40.137361Z","iopub.status.busy":"2023-06-19T04:47:40.134954Z","iopub.status.idle":"2023-06-19T04:47:42.638078Z","shell.execute_reply":"2023-06-19T04:47:42.637255Z","shell.execute_reply.started":"2023-06-19T04:47:40.137321Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["[Stage 4:> (0 + 1) / 1]\r"]},{"name":"stdout","output_type":"stream","text":["+---+--------+--------------------+--------------------+----------------+--------------------+----------------+--------------------+\n","| id| revid| text| timestamp| title| url| title_lower| text_lower|\n","+---+--------+--------------------+--------------------+----------------+--------------------+----------------+--------------------+\n","| 2|68591979|Trang Chính\\n\\n<t...|2022-05-12T12:46:53Z| Trang Chính|https://vi.wikipe...| trang chính|trang chính templ...|\n","| 4|67988747|Internet Society\\...|2022-01-20T07:59:10Z|Internet Society|https://vi.wikipe...|internet society|internet society ...|\n","| 13|68660631|Tiếng Việt\\n\\nTiế...|2022-05-29T03:42:42Z| Tiếng Việt|https://vi.wikipe...| tiếng việt|tiếng việt tiếng ...|\n","| 24|68482118|Ohio\\n\\nOhio (viế...|2022-04-17T08:15:22Z| Ohio|https://vi.wikipe...| ohio|ohio ohio viết tắ...|\n","| 26|68738039|California\\n\\nCal...|2022-06-16T15:27:07Z| California|https://vi.wikipe...| california|california califo...|\n","+---+--------+--------------------+--------------------+----------------+--------------------+----------------+--------------------+\n","only showing top 5 rows\n","\n"]},{"name":"stderr","output_type":"stream","text":[" \r"]}],"source":["df_wiki = df_wiki.withColumn(\"title_lower\", post_process_udf(col(\"title\")))\n","df_wiki = df_wiki.withColumn(\"text_lower\", 
post_process_udf(col(\"text\")))\n","df_wiki.show(5)"]},{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:42.639237Z","iopub.status.busy":"2023-06-19T04:47:42.638940Z","iopub.status.idle":"2023-06-19T04:47:42.674851Z","shell.execute_reply":"2023-06-19T04:47:42.674065Z","shell.execute_reply.started":"2023-06-19T04:47:42.639213Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["== Physical Plan ==\n","*(1) Project [id#8, revid#9, text#10, timestamp#11, title#12, url#13, pythonUDF0#144 AS title_lower#85, pythonUDF1#145 AS text_lower#94]\n","+- BatchEvalPython [post_process(title#12)#84, post_process(text#10)#93], [pythonUDF0#144, pythonUDF1#145]\n"," +- FileScan json [id#8,revid#9,text#10,timestamp#11,title#12,url#13] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex(1 paths)[file:/kaggle/input/e2eqa-wiki-zalo-ai/wikipedia_20220620_cleaned/wikip..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:string,revid:string,text:string,timestamp:string,title:string,url:string>\n","\n","\n"]}],"source":["df_wiki.explain()"]},{"cell_type":"markdown","metadata":{},"source":["### Đọc file __stopwords__"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:42.675987Z","iopub.status.busy":"2023-06-19T04:47:42.675714Z","iopub.status.idle":"2023-06-19T04:47:42.710651Z","shell.execute_reply":"2023-06-19T04:47:42.709880Z","shell.execute_reply.started":"2023-06-19T04:47:42.675963Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 mà_thôi\n","1 tuy_có\n","2 đâu_như\n","3 quay_bước\n","4 hỏi_lại\n"," ... \n","1937 cái_họ\n","1938 hết\n","1939 lại\n","1940 thế_thế\n","1941 dần_dần\n","Name: stopwords, Length: 1942, dtype: object"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["filename = '/kaggle/input/e2eqa-wiki-zalo-ai/external/stopwords.csv'\n","stop = pd.read_csv(filename, sep=\"\\t\", encoding='utf-8')\n","list_stopwords = stop['stopwords']\n","list_stopwords"]},{"cell_type":"markdown","metadata":{},"source":["---\n","---"]},{"cell_type":"code","execution_count":14,"metadata":{"_kg_hide-output":true,"execution":{"iopub.execute_input":"2023-06-19T04:47:42.712418Z","iopub.status.busy":"2023-06-19T04:47:42.711650Z","iopub.status.idle":"2023-06-19T04:47:44.373278Z","shell.execute_reply":"2023-06-19T04:47:44.372435Z","shell.execute_reply.started":"2023-06-19T04:47:42.712390Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["[Stage 5:> (0 + 1) / 1]\r"]},{"name":"stdout","output_type":"stream","text":["+--------------------+\n","| words|\n","+--------------------+\n","|[trang, chính, te...|\n","|[internet, societ...|\n","|[tiếng, việt, tiế...|\n","|[ohio, ohio, viết...|\n","|[california, cali...|\n","+--------------------+\n","only showing top 5 rows\n","\n"]},{"name":"stderr","output_type":"stream","text":[" \r"]}],"source":["# Tokenize text into words\n","def tokenizer(text):\n"," tokens = [word for word in tokenize(text).split() if word not in list_stopwords]\n"," return tokens\n","\n","tokenize_udf = udf(tokenizer, ArrayType(StringType()))\n","df_wiki = df_wiki.withColumn(\"words\", tokenize_udf(col(\"text_lower\")))\n","df_wiki.select(\"words\").show(5)"]},{"cell_type":"markdown","metadata":{},"source":["### Tần suất các từ trong 
texts"]},{"cell_type":"code","execution_count":15,"metadata":{"_kg_hide-input":false,"_kg_hide-output":false,"execution":{"iopub.execute_input":"2023-06-19T04:47:44.379705Z","iopub.status.busy":"2023-06-19T04:47:44.378439Z","iopub.status.idle":"2023-06-19T05:06:28.712339Z","shell.execute_reply":"2023-06-19T05:06:28.710514Z","shell.execute_reply.started":"2023-06-19T04:47:44.379637Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["[Stage 8:> (0 + 4) / 4]\r"]},{"name":"stdout","output_type":"stream","text":["+--------+-------+\n","| word| count|\n","+--------+-------+\n","| là|1858454|\n","| loài|1385128|\n","| một|1365013|\n","| trong|1046730|\n","| năm|1034325|\n","| được| 923562|\n","| họ| 846094|\n","| này| 726077|\n","| bullet| 671970|\n","| của| 656976|\n","| có| 602435|\n","| và| 578809|\n","|khoa_học| 484960|\n","|đầu_tiên| 381971|\n","| ở| 373949|\n","| hoa| 352879|\n","| thuộc| 339118|\n","| người| 321510|\n","| các| 308467|\n","|thực_vật| 287653|\n","+--------+-------+\n","only showing top 20 rows\n","\n"]},{"name":"stderr","output_type":"stream","text":[" \r"]}],"source":["from pyspark.sql import functions as F\n","# Increase the number of partitions\n","# df_wiki = df_wiki.repartition(100)\n","\n","# Cache the DataFrame in memory\n","df_wiki.cache()\n","\n","# Perform the computation\n","result = (\n"," df_wiki\n"," .select(F.explode(\"words\").alias(\"word\"))\n"," .groupBy(\"word\")\n"," .count()\n"," .orderBy(F.desc(\"count\"))\n",")\n","\n","# Show the results\n","result.show()"]},{"cell_type":"markdown","metadata":{},"source":["---\n","---"]},{"cell_type":"markdown","metadata":{},"source":["#### Map title with answer"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:06:28.718426Z","iopub.status.busy":"2023-06-19T05:06:28.717581Z","iopub.status.idle":"2023-06-19T05:07:05.169160Z","shell.execute_reply":"2023-06-19T05:07:05.168170Z","shell.execute_reply.started":"2023-06-19T05:06:28.718393Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":[" \r"]}],"source":["from pyspark.sql import Window\n","# add index column\n","df_wiki = df_wiki.rdd.zipWithIndex().toDF([\"data\", \"index\"]).select(\"data.*\", \"index\")"]},{"cell_type":"code","execution_count":17,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:07:05.170729Z","iopub.status.busy":"2023-06-19T05:07:05.170405Z","iopub.status.idle":"2023-06-19T05:08:02.748141Z","shell.execute_reply":"2023-06-19T05:08:02.746532Z","shell.execute_reply.started":"2023-06-19T05:07:05.170695Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":[" \r"]},{"name":"stdout","output_type":"stream","text":["Cù Lao Chàm\n","nhà Hán\n","bán đảo Ả Rập\n"]}],"source":["from pyspark.sql.functions import trim, lower, regexp_replace\n","\n","# Create a dictionary mapping titles to indices\n","title2idx = (\n"," df_wiki\n"," .select(trim(\"title\").alias(\"title\"), \"index\")\n"," .groupBy(\"title\")\n"," .agg({\"index\": \"first\"})\n"," .rdd\n"," .collectAsMap()\n",")\n","\n","# Loop over data and check if answer is in the dictionary\n","for x in data:\n"," answer = x['answer'].replace(\"wiki/\", \"\").replace(\"_\", \" \")\n"," # Check if the cleaned up title is in the dictionary\n"," if answer not in title2idx.keys():\n"," print(answer)\n"," for key, val in title2idx.items():\n"," if answer.lower() == key.lower():\n"," x['answer'] = \"wiki/\" + key.replace(\" \", \"_\")\n"," 
"]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:08:02.753058Z","iopub.status.busy":"2023-06-19T05:08:02.752193Z","iopub.status.idle":"2023-06-19T05:08:02.764393Z","shell.execute_reply":"2023-06-19T05:08:02.763347Z","shell.execute_reply.started":"2023-06-19T05:08:02.753016Z"},"trusted":true},"outputs":[{"data":{"text/plain":["(1849, 4608)"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["l = []\n","for x in data:\n"," l.append(x['answer'])\n","len(set(l)), len(l)"]},{"cell_type":"markdown","metadata":{},"source":["#### Lọc các stopwords ra khỏi answer"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:08:02.767039Z","iopub.status.busy":"2023-06-19T05:08:02.766644Z","iopub.status.idle":"2023-06-19T05:08:02.780001Z","shell.execute_reply":"2023-06-19T05:08:02.778564Z","shell.execute_reply.started":"2023-06-19T05:08:02.767009Z"},"trusted":true},"outputs":[],"source":["# from pyspark.ml.feature import StopWordsRemover\n","# # Tokenize and remove stop words\n","# remover = StopWordsRemover(inputCol=\"title_lower\", outputCol=\"tokens\")\n","# tokenized_df = remover.transform(df_wiki.select(\"title_lower\"))\n","\n","# # Convert tokens to a list of strings\n","# to_list = udf(lambda x: x, ArrayType(StringType()))\n","# tokenized_df = tokenized_df.withColumn(\"token_list\", to_list(\"tokens\"))\n","\n","# # Compute TF-IDF vectors\n","# cv = CountVectorizer(inputCol=\"token_list\", outputCol=\"raw_features\")\n","# cv_model = cv.fit(tokenized_df)\n","# featurized_df = cv_model.transform(tokenized_df)\n","# idf = IDF(inputCol=\"raw_features\", outputCol=\"features\")\n","# idf_model = idf.fit(featurized_df)\n","# rescaled_df = idf_model.transform(featurized_df)\n","\n","# # Convert features column to VectorUDT\n","# to_vector = udf(lambda x: x, VectorUDT())\n","# rescaled_df = rescaled_df.withColumn(\"features\", to_vector(\"features\"))\n","\n","# # Compute BM25Okapi score\n","# bm25 = BM25Okapi(rescaled_df.select(\"features\").rdd.flatMap(lambda x: x))\n","# bm25"]},{"cell_type":"code","execution_count":20,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:08:02.781992Z","iopub.status.busy":"2023-06-19T05:08:02.781655Z","iopub.status.idle":"2023-06-19T05:09:00.711662Z","shell.execute_reply":"2023-06-19T05:09:00.710057Z","shell.execute_reply.started":"2023-06-19T05:08:02.781961Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":[" \r"]}],"source":["docs = df_wiki.select('title_lower').toPandas()\n","tokenized_corpus = [[word for word in doc.lower().split() if word not in list_stopwords]\n"," for doc in docs['title_lower']\n"," ] #simple tokenier\n","bm25_title = BM25Okapi(tokenized_corpus)"]},{"cell_type":"code","execution_count":21,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:09:00.713615Z","iopub.status.busy":"2023-06-19T05:09:00.713197Z","iopub.status.idle":"2023-06-19T05:09:01.135969Z","shell.execute_reply":"2023-06-19T05:09:01.135282Z","shell.execute_reply.started":"2023-06-19T05:09:00.713580Z"},"scrolled":true,"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["+--------------------+\n","| query|\n","+--------------------+\n","|[costa, rica, ice...|\n","| [núi, elbrus]|\n","| [alexandria]|\n","| [lê, chân]|\n","| [ý]|\n","+--------------------+\n","only showing top 5 rows\n","\n"]}],"source":["df = pd.DataFrame({\n"," \"query\": 
[post_process(x[\"short_candidate\"]).lower().split() for x in data]\n","})\n","df = spark.createDataFrame(df)\n","df.show(5)"]},{"cell_type":"code","execution_count":22,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:09:01.136929Z","iopub.status.busy":"2023-06-19T05:09:01.136703Z","iopub.status.idle":"2023-06-19T05:09:01.198467Z","shell.execute_reply":"2023-06-19T05:09:01.197404Z","shell.execute_reply.started":"2023-06-19T05:09:01.136901Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["+--------------------+\n","| query|\n","+--------------------+\n","|[costa, rica, ice...|\n","| [núi, elbrus]|\n","| [alexandria]|\n","| [lê, chân]|\n","| [ý]|\n","| [phan, thiết]|\n","| [xã, nhơn, lý]|\n","| [google]|\n","| [tỉnh, gia, lai]|\n","| [tỉnh, quảng, nam]|\n","| [vua, khải, định]|\n","|[theo, thể, lục, ...|\n","| [trần, duy, hưng]|\n","| [suối, tranh]|\n","|[chùa, làng, vũ, ...|\n","|[nguyễn, phú, trọng]|\n","| [pháp]|\n","| [tỉnh, an, giang]|\n","| [tỉnh, bắc, kạn]|\n","| [nguyễn, văn, tý]|\n","+--------------------+\n","only showing top 20 rows\n","\n"]}],"source":["df.select('query').show()"]},{"cell_type":"code","execution_count":23,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:09:01.199811Z","iopub.status.busy":"2023-06-19T05:09:01.199500Z","iopub.status.idle":"2023-06-19T05:09:08.924690Z","shell.execute_reply":"2023-06-19T05:09:08.923664Z","shell.execute_reply.started":"2023-06-19T05:09:01.199782Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["[Stage 15:> (0 + 1) / 1]\r"]},{"name":"stdout","output_type":"stream","text":["+--------------------+--------------------+\n","| query| top_n|\n","+--------------------+--------------------+\n","|[costa, rica, ice...|[6771, 25055, 585...|\n","| [núi, elbrus]|[11092, 1268687, ...|\n","| [alexandria]|[54875, 223778, 5...|\n","| [lê, chân]|[25316, 17081, 39...|\n","| [ý]|[4452, 46524, 114...|\n","+--------------------+--------------------+\n","only showing top 5 rows\n","\n"]},{"name":"stderr","output_type":"stream","text":[" \r"]}],"source":["# Define UDF to compute topk titles for a query\n","def get_topk(query, topk=10):\n"," scores = bm25_title.get_scores(query)\n"," top_titles = list(map(int, np.argsort(scores)[::-1][:topk]))\n"," return top_titles\n","\n","get_topk_udf = udf(get_topk, ArrayType(IntegerType()))\n","\n","# Add topk titles column to DataFrame\n","df = df.withColumn(\"top_n\", get_topk_udf(df[\"query\"]))\n","df.show(5)"]},{"cell_type":"code","execution_count":24,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:09:08.926215Z","iopub.status.busy":"2023-06-19T05:09:08.925833Z","iopub.status.idle":"2023-06-19T05:30:40.685470Z","shell.execute_reply":"2023-06-19T05:30:40.683455Z","shell.execute_reply.started":"2023-06-19T05:09:08.926182Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":[" \r"]}],"source":["candidate_ids = []\n","true_ids = []\n","top_n_df = df.select(\"top_n\").toPandas()\n","for i, x in enumerate(data):\n"," true_title = x[\"answer\"].replace(\"_\",\" \").replace(\"wiki/\",\"\").strip()\n"," top_n = top_n_df.loc[i, \"top_n\"].tolist() \n"," true_idx = title2idx[true_title]\n"," true_ids.append(true_idx)\n"," 
candidate_ids.append(top_n)"]},{"cell_type":"code","execution_count":25,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:30:40.688025Z","iopub.status.busy":"2023-06-19T05:30:40.687599Z","iopub.status.idle":"2023-06-19T05:30:40.696966Z","shell.execute_reply":"2023-06-19T05:30:40.695478Z","shell.execute_reply.started":"2023-06-19T05:30:40.687990Z"},"trusted":true},"outputs":[],"source":["for x, y in zip(candidate_ids, true_ids):\n"," if y not in x:\n"," x[-1] = y"]},{"cell_type":"code","execution_count":26,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:30:40.700316Z","iopub.status.busy":"2023-06-19T05:30:40.699499Z","iopub.status.idle":"2023-06-19T05:32:48.695056Z","shell.execute_reply":"2023-06-19T05:32:48.693738Z","shell.execute_reply.started":"2023-06-19T05:30:40.700278Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":[" \r"]}],"source":["# Change to list of row\n","rows = df_wiki.collect()"]},{"cell_type":"code","execution_count":27,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:32:48.697279Z","iopub.status.busy":"2023-06-19T05:32:48.696520Z","iopub.status.idle":"2023-06-19T05:32:55.538212Z","shell.execute_reply":"2023-06-19T05:32:55.537131Z","shell.execute_reply.started":"2023-06-19T05:32:48.697248Z"},"trusted":true},"outputs":[],"source":["df = pd.DataFrame()\n","questions = []\n","answers = []\n","titles = []\n","candidates = []\n","labels = []\n","groups = []\n","for idx, (sample, true_idx, candidate_idxs) in enumerate(zip(data, true_ids, candidate_ids)):\n"," assert true_idx in candidate_idxs\n"," question = sample['question']\n"," answer = sample['short_candidate']\n"," title = [rows[i]['title'] for i in candidate_idxs]\n"," candidate = [rows[i]['text'] for i in candidate_idxs]\n"," label = [1 if x == true_idx else 0 for x in candidate_idxs]\n"," \n"," questions.extend([question,]*len(candidate_idxs))\n"," answers.extend([answer,]*len(candidate_idxs))\n"," groups.extend([idx,]*len(candidate_idxs))\n"," titles.extend(title)\n"," candidates.extend(candidate)\n"," labels.extend(label)\n"," \n","df[\"question\"] = questions \n","df[\"answer\"] = answers \n","df[\"title\"] = titles \n","df[\"candidate\"] = candidates \n","df[\"label\"] = labels \n","df[\"group\"] = groups \n","df.candidate = df.candidate.apply(lambda x: \" \".join(x.strip().split()))"]},{"cell_type":"code","execution_count":28,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:32:55.539819Z","iopub.status.busy":"2023-06-19T05:32:55.539519Z","iopub.status.idle":"2023-06-19T05:32:55.577719Z","shell.execute_reply":"2023-06-19T05:32:55.576585Z","shell.execute_reply.started":"2023-06-19T05:32:55.539794Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>question</th>\n"," <th>answer</th>\n"," <th>title</th>\n"," <th>candidate</th>\n"," <th>label</th>\n"," <th>group</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Đất nước nào không có quân đội</td>\n"," <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n"," <td>Costa Rica</td>\n"," <td>Costa Rica Costa Rica (Phiên âm: Cô-xta Ri-ca)...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," 
<tr>\n"," <th>1</th>\n"," <td>Đất nước nào không có quân đội</td>\n"," <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n"," <td>Quần đảo Marshall</td>\n"," <td>Quần đảo Marshall Quần đảo Marshall, tên chính...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Đất nước nào không có quân đội</td>\n"," <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n"," <td>Montezuma, Costa Rica</td>\n"," <td>Montezuma, Costa Rica Montezuma là một thị xã ...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Đất nước nào không có quân đội</td>\n"," <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n"," <td>Tamarindo, Costa Rica</td>\n"," <td>Tamarindo, Costa Rica Tamarindo là một thị xã ...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Đất nước nào không có quân đội</td>\n"," <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n"," <td>Micronesia</td>\n"," <td>Micronesia Micronesia (, ), còn gọi là Tiểu Đả...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" question \\\n","0 Đất nước nào không có quân đội \n","1 Đất nước nào không có quân đội \n","2 Đất nước nào không có quân đội \n","3 Đất nước nào không có quân đội \n","4 Đất nước nào không có quân đội \n","\n"," answer title \\\n","0 Costa Rica, Iceland, Panama, Micronesia, Quần ... Costa Rica \n","1 Costa Rica, Iceland, Panama, Micronesia, Quần ... Quần đảo Marshall \n","2 Costa Rica, Iceland, Panama, Micronesia, Quần ... Montezuma, Costa Rica \n","3 Costa Rica, Iceland, Panama, Micronesia, Quần ... Tamarindo, Costa Rica \n","4 Costa Rica, Iceland, Panama, Micronesia, Quần ... Micronesia \n","\n"," candidate label group \n","0 Costa Rica Costa Rica (Phiên âm: Cô-xta Ri-ca)... 0 0 \n","1 Quần đảo Marshall Quần đảo Marshall, tên chính... 0 0 \n","2 Montezuma, Costa Rica Montezuma là một thị xã ... 0 0 \n","3 Tamarindo, Costa Rica Tamarindo là một thị xã ... 0 0 \n","4 Micronesia Micronesia (, ), còn gọi là Tiểu Đả... 
0 0 "]},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":30,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:33:45.138800Z","iopub.status.busy":"2023-06-19T05:33:45.138413Z","iopub.status.idle":"2023-06-19T05:33:51.569429Z","shell.execute_reply":"2023-06-19T05:33:51.568261Z","shell.execute_reply.started":"2023-06-19T05:33:45.138771Z"},"trusted":true},"outputs":[],"source":["df.to_csv(\"/kaggle/working/train_stage2_ranking.csv\",index=False)"]},{"cell_type":"markdown","metadata":{},"source":["<a href=\"/kaggle/working/train_stage2_ranking.csv\"> Download File </a>"]},{"cell_type":"code","execution_count":33,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:38:13.126543Z","iopub.status.busy":"2023-06-19T05:38:13.126055Z","iopub.status.idle":"2023-06-19T05:38:47.848000Z","shell.execute_reply":"2023-06-19T05:38:47.845517Z","shell.execute_reply.started":"2023-06-19T05:38:13.126496Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":[" adding: kaggle/working/train_stage2_ranking.csv (deflated 70%)\n"]}],"source":["!zip -r file.zip /kaggle/working/train_stage2_ranking.csv"]},{"cell_type":"code","execution_count":36,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:41:56.345417Z","iopub.status.busy":"2023-06-19T05:41:56.344395Z","iopub.status.idle":"2023-06-19T05:41:56.821886Z","shell.execute_reply":"2023-06-19T05:41:56.820592Z","shell.execute_reply.started":"2023-06-19T05:41:56.345344Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["total 507M\n","drwxr-xr-x 4 root root 4.0K Jun 19 05:38 .\n","-rw-r--r-- 1 root root 118M Jun 19 05:38 file.zip\n","-rw-r--r-- 1 root root 390M Jun 19 05:33 train_stage2_ranking.csv\n","drwxr-xr-x 2 root root 4.0K Jun 19 04:47 spark-warehouse\n","drwxr-xr-x 2 root root 4.0K Jun 19 04:45 .virtual_documents\n","drwxr-xr-x 5 root root 4.0K Jun 19 04:45 ..\n","---------- 1 root root 263 Jun 19 04:45 __notebook_source__.ipynb\n"]}],"source":["!cd /kaggle && ls -halt working"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"}},"nbformat":4,"nbformat_minor":5}
|
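The notebook above turns each annotated answer into a BM25 query over lowercased article titles, keeps the top-k articles as candidates, force-includes the gold article, and writes labelled (question, answer, title, candidate, label, group) rows for the stage-2 pairwise ranker. A minimal, self-contained sketch of that candidate-labelling step — the toy titles, topk value and helper names are illustrative assumptions, not the notebook's exact code:

```python
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi

# Toy title corpus standing in for the lowercased Wikipedia titles.
titles = ["costa rica", "quần đảo marshall", "micronesia", "việt nam"]
bm25_title = BM25Okapi([t.split() for t in titles])

def get_topk(query_tokens, topk=3):
    # Indices of the top-k highest-scoring titles for a tokenized query.
    scores = bm25_title.get_scores(query_tokens)
    return list(np.argsort(scores)[::-1][:topk])

candidate_ids = get_topk("costa rica".split())
true_id = 0                       # index of the gold article for this question
if true_id not in candidate_ids:  # force a positive example into the candidates
    candidate_ids[-1] = true_id

rows = pd.DataFrame({
    "title": [titles[i] for i in candidate_ids],
    "label": [int(i == true_id) for i in candidate_ids],
    "group": 0,                   # one group id per question, for pairwise training
})
print(rows)
```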
notebooks/0.4-find-redirects.ipynb
ADDED
The diff for this file is too large to render.
|
|
notebooks/1.0-train-bm25-stage1.ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/code/noobhocai/train-stage-1"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:46:04.681825Z","iopub.status.busy":"2023-06-26T15:46:04.681521Z","iopub.status.idle":"2023-06-26T15:46:20.131828Z","shell.execute_reply":"2023-06-26T15:46:20.130853Z","shell.execute_reply.started":"2023-06-26T15:46:04.681800Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install pyvi rank_bm25 pandarallel gensim --q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:46:20.133869Z","iopub.status.busy":"2023-06-26T15:46:20.133505Z","iopub.status.idle":"2023-06-26T15:46:21.705024Z","shell.execute_reply":"2023-06-26T15:46:21.703777Z","shell.execute_reply.started":"2023-06-26T15:46:20.133832Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["INFO: Pandarallel will run on 10 workers.\n","INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n"]}],"source":["import os\n","import re\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","import math\n","import pandas as pd\n","import string\n","from pyvi.ViTokenizer import tokenize\n","import numpy as np\n","import json, pickle\n","from rank_bm25 import BM25Okapi\n","import argparse\n","import gc\n","\n","from glob import glob \n","from nltk import word_tokenize as lib_tokenizer \n","\n","from pandarallel import pandarallel\n","from gensim.corpora import Dictionary\n","from gensim.corpora import MmCorpus\n","from gensim.models import TfidfModel, OkapiBM25Model\n","from gensim.similarities import SparseMatrixSimilarity\n","pandarallel.initialize(progress_bar=True, nb_workers=10)"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.194411Z","iopub.status.busy":"2023-06-26T15:12:52.193733Z","iopub.status.idle":"2023-06-26T15:12:52.208121Z","shell.execute_reply":"2023-06-26T15:12:52.206564Z","shell.execute_reply.started":"2023-06-26T15:12:52.194376Z"},"trusted":true},"outputs":[],"source":["def get_topk(query, topk = 100):\n"," tokenized_query = query.split()\n"," tfidf_query = tfidf_model[dictionary.doc2bow(tokenized_query)]\n"," scores = bm25_index[tfidf_query]\n"," top_n = np.argsort(scores)[::-1][:topk]\n"," titles = [df_wiki.title.values[i] for i in top_n]\n"," texts = [df_wiki.text.values[i] for i in top_n]\n"," # print(titles)\n"," # print(tfidf_query, scores)\n"," return titles, texts, scores[top_n]\n","\n","def post_process(x):\n"," x = \" \".join(word_tokenize(strip_context(x))).strip()\n"," x = x.replace(\"\\n\",\" \")\n"," x = \"\".join([i for i in x if i not in string.punctuation])\n"," return x\n","\n","dict_map = dict({}) \n","def word_tokenize(text): \n"," global dict_map \n"," words = text.split() \n"," words_norm = [] \n"," for w in words: \n"," if dict_map.get(w, None) is None: \n"," dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n"," words_norm.append(dict_map[w]) \n"," return words_norm \n"," \n","def strip_context(text): \n"," text = text.replace('\\n', ' 
') \n"," text = re.sub(r'\\s+', ' ', text) \n"," text = text.strip() \n"," return text"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.210996Z","iopub.status.busy":"2023-06-26T15:12:52.210623Z","iopub.status.idle":"2023-06-26T15:12:52.227767Z","shell.execute_reply":"2023-06-26T15:12:52.226604Z","shell.execute_reply.started":"2023-06-26T15:12:52.210948Z"},"trusted":true},"outputs":[],"source":["wiki_cleaned_path = \"/kaggle/input/e2eqa-wiki-zalo-ai/processed/wikipedia_20220620_cleaned_v2.csv\"\n","test_data_path = \"/kaggle/input/e2eqa-wiki-zalo-ai/e2eqa-trainpublic_test-v1/e2eqa-train+public_test-v1/zac2022_testa_sample_submission.json\"\n","topk = 300"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.230234Z","iopub.status.busy":"2023-06-26T15:12:52.229690Z","iopub.status.idle":"2023-06-26T15:14:32.740956Z","shell.execute_reply":"2023-06-26T15:14:32.739621Z","shell.execute_reply.started":"2023-06-26T15:12:52.230185Z"},"trusted":true},"outputs":[],"source":["df_wiki = pd.read_csv(wiki_cleaned_path)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:32.743587Z","iopub.status.busy":"2023-06-26T15:14:32.743104Z","iopub.status.idle":"2023-06-26T15:14:35.140539Z","shell.execute_reply":"2023-06-26T15:14:35.139511Z","shell.execute_reply.started":"2023-06-26T15:14:32.743544Z"},"trusted":true},"outputs":[],"source":["df_wiki = df_wiki.fillna(\"NaN\")\n","if \"title\" not in df_wiki.columns:\n"," df_wiki[\"title\"] = df_wiki[\"titles=\"].fillna(\"\")"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:35.143448Z","iopub.status.busy":"2023-06-26T15:14:35.142192Z","iopub.status.idle":"2023-06-26T15:14:35.178049Z","shell.execute_reply":"2023-06-26T15:14:35.176844Z","shell.execute_reply.started":"2023-06-26T15:14:35.143403Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>title</th>\n"," <th>text</th>\n"," <th>bm25_text</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Trang Chính</td>\n"," <td>Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s...</td>\n"," <td>trang chính <templatestyles src= wiki2021 styl...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Internet Society</td>\n"," <td>Internet Society hay ISOC là một tổ chức quốc...</td>\n"," <td>internet society hay isoc là một tổ chức quốc ...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Tiếng Việt</td>\n"," <td>Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...</td>\n"," <td>tiếng việt cũng gọi là tiếng việt nam hay việt...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Tiếng Việt</td>\n"," <td>hệ thống thanh điệu phát triển cao hơn, hệ thố...</td>\n"," <td>hệ thống thanh điệu phát triển cao hơn hệ thốn...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Tiếng Việt</td>\n"," <td>tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\...</td>\n"," <td>tiếp xúc hán – việt thành 2 giai đoạn chính bu...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" title text \\\n","0 Trang Chính Trang 
Chính\\n\\n<templatestyles src=\"Wiki2021/s... \n","1 Internet Society Internet Society hay ISOC là một tổ chức quốc... \n","2 Tiếng Việt Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi... \n","3 Tiếng Việt hệ thống thanh điệu phát triển cao hơn, hệ thố... \n","4 Tiếng Việt tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\... \n","\n"," bm25_text \n","0 trang chính <templatestyles src= wiki2021 styl... \n","1 internet society hay isoc là một tổ chức quốc ... \n","2 tiếng việt cũng gọi là tiếng việt nam hay việt... \n","3 hệ thống thanh điệu phát triển cao hơn hệ thốn... \n","4 tiếp xúc hán – việt thành 2 giai đoạn chính bu... "]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["df_wiki.head()"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:35.180010Z","iopub.status.busy":"2023-06-26T15:14:35.179628Z","iopub.status.idle":"2023-06-26T15:21:28.016391Z","shell.execute_reply":"2023-06-26T15:21:28.014113Z","shell.execute_reply.started":"2023-06-26T15:14:35.179950Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"8492baeb724e46bd80570a9450bdc18c","version_major":2,"version_minor":0},"text/plain":["VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=194441), Label(value='0 / 194441')…"]},"metadata":{},"output_type":"display_data"}],"source":["df_wiki['bm25_text'] = df_wiki['bm25_text'].parallel_apply(post_process)\n","# corpus = [x.split() for x in df_wiki['bm25_text'].values]"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:22:23.179076Z","iopub.status.busy":"2023-06-26T15:22:23.178506Z","iopub.status.idle":"2023-06-26T15:31:22.030632Z","shell.execute_reply":"2023-06-26T15:31:22.028624Z","shell.execute_reply.started":"2023-06-26T15:22:23.179027Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"01c5f715c22d4ec682dc0c7b4812ec30","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/1944406 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"}],"source":["# Convert the column to a numpy array \n","texts = df_wiki['bm25_text'].to_numpy()\n","# Create an empty dictionary \n","dictionary = Dictionary()\n","# Update the dictionary with each batch of texts \n","for text in tqdm(texts): \n"," dictionary.add_documents([text.split()])"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T14:30:00.288679Z","iopub.status.busy":"2023-06-26T14:30:00.288263Z"},"trusted":true},"outputs":[],"source":["# dictionary = Dictionary(corpus)"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:35:34.130599Z","iopub.status.busy":"2023-06-26T15:35:34.130050Z","iopub.status.idle":"2023-06-26T15:36:52.503096Z","shell.execute_reply":"2023-06-26T15:36:52.501050Z","shell.execute_reply.started":"2023-06-26T15:35:34.130560Z"},"trusted":true},"outputs":[],"source":["try:\n"," corpus = [text.split() for text in texts]\n","except:\n"," print(\"nope\")"]},{"cell_type":"code","execution_count":17,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:37:34.690339Z","iopub.status.busy":"2023-06-26T15:37:34.689872Z","iopub.status.idle":"2023-06-26T15:37:36.745760Z","shell.execute_reply":"2023-06-26T15:37:36.743959Z","shell.execute_reply.started":"2023-06-26T15:37:34.690307Z"},"trusted":true},"outputs":[],"source":["!mkdir 
/kaggle/working/bm25_stage1"]},{"cell_type":"markdown","metadata":{},"source":["Save dictionary và corpus vào bộ nhớ"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:39:40.089532Z","iopub.status.busy":"2023-06-26T15:39:40.089024Z","iopub.status.idle":"2023-06-26T15:39:55.080856Z","shell.execute_reply":"2023-06-26T15:39:55.079008Z","shell.execute_reply.started":"2023-06-26T15:39:40.089498Z"},"trusted":true},"outputs":[],"source":["# open a file for writing\n","with open('/kaggle/working/bm25_stage1/corpus.txt', 'w') as f:\n"," # use a loop to write each inner list to a line in the file\n"," for text in texts:\n"," line = text + '\\n'\n"," f.write(line)"]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:37:38.871933Z","iopub.status.busy":"2023-06-26T15:37:38.870377Z","iopub.status.idle":"2023-06-26T15:37:48.653312Z","shell.execute_reply":"2023-06-26T15:37:48.651908Z","shell.execute_reply.started":"2023-06-26T15:37:38.871878Z"},"trusted":true},"outputs":[],"source":["tfidf_model = TfidfModel(dictionary=dictionary, smartirs='bnn') # Enforce binary weighting of queries\n","dictionary.save(\"/kaggle/working/bm25_stage1/dict\")\n","tfidf_model.save(\"/kaggle/working/bm25_stage1/tfidf\")"]},{"cell_type":"markdown","metadata":{},"source":["---"]},{"cell_type":"markdown","metadata":{},"source":["Dừng session và tiếp tục chạy"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:51:00.031211Z","iopub.status.busy":"2023-06-26T15:51:00.030816Z","iopub.status.idle":"2023-06-26T15:51:00.036153Z","shell.execute_reply":"2023-06-26T15:51:00.034981Z","shell.execute_reply.started":"2023-06-26T15:51:00.031173Z"},"trusted":true},"outputs":[],"source":["from gensim.models.word2vec import LineSentence"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:51:04.581262Z","iopub.status.busy":"2023-06-26T15:51:04.580512Z","iopub.status.idle":"2023-06-26T15:51:07.005096Z","shell.execute_reply":"2023-06-26T15:51:07.004157Z","shell.execute_reply.started":"2023-06-26T15:51:04.581221Z"},"trusted":true},"outputs":[],"source":["# Create a dictionary and a LineSentence object\n","dictionary = Dictionary.load(\"/kaggle/working/bm25_stage1/dict\")\n","corpus = LineSentence(\"/kaggle/working/bm25_stage1/corpus.txt\")"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:59:04.442336Z","iopub.status.busy":"2023-06-26T15:59:04.441363Z","iopub.status.idle":"2023-06-26T16:06:21.971493Z","shell.execute_reply":"2023-06-26T16:06:21.969936Z","shell.execute_reply.started":"2023-06-26T15:59:04.442296Z"},"trusted":true},"outputs":[],"source":["# # Get an iterator over the corpus\n","# corpus_iterator = corpus.getstream()\n","# Create a BM25 model\n","bm25_model = OkapiBM25Model(dictionary=dictionary)\n","# Create a BM25 corpus by applying doc2bow to each document in the iterator\n","bm25_corpus = bm25_model[list(map(dictionary.doc2bow, corpus))]\n","# Get the number of documents in the corpus from the dictionary or by counting them manually \n","num_docs = dictionary.num_docs\n","# Create a BM25 index\n","bm25_index = SparseMatrixSimilarity(bm25_corpus, num_docs=num_docs,\n"," num_terms=len(dictionary),normalize_queries=False, normalize_documents=False)\n","# Save the BM25 index to a 
file\n","bm25_index.save(\"/kaggle/working/bm25_stage1/bm25_index\")"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"}},"nbformat":4,"nbformat_minor":4}
|
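For quick reference, the gensim indexing pattern this notebook uses (Dictionary → OkapiBM25Model corpus → SparseMatrixSimilarity, with a binary-weighted TfidfModel for queries) is condensed below on a toy corpus. The toy documents and topk value are assumptions; the real notebook builds the same structures over ~1.94M passages and saves them under /kaggle/working/bm25_stage1/.

```python
import numpy as np
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, OkapiBM25Model
from gensim.similarities import SparseMatrixSimilarity

docs = ["tiếng việt là một ngôn ngữ",
        "ohio là một tiểu bang",
        "california là một tiểu bang lớn"]
corpus = [d.split() for d in docs]

dictionary = Dictionary(corpus)
bm25_model = OkapiBM25Model(dictionary=dictionary)               # BM25 weights for documents
bm25_corpus = bm25_model[list(map(dictionary.doc2bow, corpus))]
bm25_index = SparseMatrixSimilarity(
    bm25_corpus, num_docs=len(corpus), num_terms=len(dictionary),
    normalize_queries=False, normalize_documents=False)
tfidf_model = TfidfModel(dictionary=dictionary, smartirs="bnn")  # binary weights for queries

def get_topk(query, topk=2):
    # Score the query against the BM25 index and return the best document ids.
    scores = bm25_index[tfidf_model[dictionary.doc2bow(query.split())]]
    top_n = np.argsort(scores)[::-1][:topk]
    return top_n, scores[top_n]

print(get_topk("tiểu bang california"))
```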
notebooks/1.1-train-bm25-stage2.ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/code/noobhocai/train-stage-2"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:15:26.396463Z","iopub.status.busy":"2023-06-26T16:15:26.396153Z","iopub.status.idle":"2023-06-26T16:15:44.091436Z","shell.execute_reply":"2023-06-26T16:15:44.090085Z","shell.execute_reply.started":"2023-06-26T16:15:26.396437Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install rank_bm25 pandarallel gensim --q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:15:46.131348Z","iopub.status.busy":"2023-06-26T16:15:46.130925Z","iopub.status.idle":"2023-06-26T16:15:47.760579Z","shell.execute_reply":"2023-06-26T16:15:47.759408Z","shell.execute_reply.started":"2023-06-26T16:15:46.131315Z"},"trusted":true},"outputs":[],"source":["import os\n","import json\n","import pandas as pd\n","import numpy as np\n","import json, pickle\n","from rank_bm25 import BM25Okapi\n","import argparse\n","import gc\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","from glob import glob \n","import re \n","from nltk import word_tokenize as lib_tokenizer \n","import string\n","from gensim.corpora import Dictionary\n","from gensim.models import TfidfModel, OkapiBM25Model\n","from gensim.similarities import SparseMatrixSimilarity"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:16:45.111948Z","iopub.status.busy":"2023-06-26T16:16:45.111271Z","iopub.status.idle":"2023-06-26T16:16:45.117881Z","shell.execute_reply":"2023-06-26T16:16:45.116573Z","shell.execute_reply.started":"2023-06-26T16:16:45.111915Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["INFO: Pandarallel will run on 10 workers.\n","INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n"]}],"source":["from pandarallel import pandarallel\n","\n","pandarallel.initialize(progress_bar=True, nb_workers=10)"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:16:47.210933Z","iopub.status.busy":"2023-06-26T16:16:47.210499Z","iopub.status.idle":"2023-06-26T16:17:49.317252Z","shell.execute_reply":"2023-06-26T16:17:49.316132Z","shell.execute_reply.started":"2023-06-26T16:16:47.210900Z"},"trusted":true},"outputs":[],"source":["df_wiki = pd.read_json(\"/kaggle/input/e2eqa-wiki-zalo-ai/wikipedia_20220620_cleaned/wikipedia_20220620_cleaned.jsonl\", lines=True)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:18:42.926014Z","iopub.status.busy":"2023-06-26T16:18:42.925307Z","iopub.status.idle":"2023-06-26T16:18:42.961174Z","shell.execute_reply":"2023-06-26T16:18:42.959896Z","shell.execute_reply.started":"2023-06-26T16:18:42.925974Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," 
}\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>url</th>\n"," <th>title</th>\n"," <th>text</th>\n"," <th>timestamp</th>\n"," <th>revid</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>2</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=2</td>\n"," <td>Trang Chính</td>\n"," <td>Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s...</td>\n"," <td>2022-05-12 12:46:53+00:00</td>\n"," <td>68591979</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>4</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=4</td>\n"," <td>Internet Society</td>\n"," <td>Internet Society\\n\\nInternet Society hay ISOC ...</td>\n"," <td>2022-01-20 07:59:10+00:00</td>\n"," <td>67988747</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>13</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=13</td>\n"," <td>Tiếng Việt</td>\n"," <td>Tiếng Việt\\n\\nTiếng Việt, cũng gọi là tiếng Vi...</td>\n"," <td>2022-05-29 03:42:42+00:00</td>\n"," <td>68660631</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>24</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=24</td>\n"," <td>Ohio</td>\n"," <td>Ohio\\n\\nOhio (viết tắt là OH, viết tắt cũ là O...</td>\n"," <td>2022-04-17 08:15:22+00:00</td>\n"," <td>68482118</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>26</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=26</td>\n"," <td>California</td>\n"," <td>California\\n\\nCalifornia (phát âm như \"Ca-li-p...</td>\n"," <td>2022-06-16 15:27:07+00:00</td>\n"," <td>68738039</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" id url title \\\n","0 2 https://vi.wikipedia.org/wiki?curid=2 Trang Chính \n","1 4 https://vi.wikipedia.org/wiki?curid=4 Internet Society \n","2 13 https://vi.wikipedia.org/wiki?curid=13 Tiếng Việt \n","3 24 https://vi.wikipedia.org/wiki?curid=24 Ohio \n","4 26 https://vi.wikipedia.org/wiki?curid=26 California \n","\n"," text \\\n","0 Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s... \n","1 Internet Society\\n\\nInternet Society hay ISOC ... \n","2 Tiếng Việt\\n\\nTiếng Việt, cũng gọi là tiếng Vi... \n","3 Ohio\\n\\nOhio (viết tắt là OH, viết tắt cũ là O... \n","4 California\\n\\nCalifornia (phát âm như \"Ca-li-p... 
\n","\n"," timestamp revid \n","0 2022-05-12 12:46:53+00:00 68591979 \n","1 2022-01-20 07:59:10+00:00 67988747 \n","2 2022-05-29 03:42:42+00:00 68660631 \n","3 2022-04-17 08:15:22+00:00 68482118 \n","4 2022-06-16 15:27:07+00:00 68738039 "]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["df_wiki.head()"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:20:14.921978Z","iopub.status.busy":"2023-06-26T16:20:14.921498Z","iopub.status.idle":"2023-06-26T16:20:14.943942Z","shell.execute_reply":"2023-06-26T16:20:14.942444Z","shell.execute_reply.started":"2023-06-26T16:20:14.921945Z"},"trusted":true},"outputs":[],"source":["def post_process(x):\n"," x = \" \".join(word_tokenize(strip_context(x))).strip()\n"," x = x.replace(\"\\n\",\" \")\n"," x = \"\".join([i for i in x if i not in string.punctuation])\n"," x = \" \".join(x.split()[:128])\n"," return x\n","\n","dict_map = dict({}) \n","def word_tokenize(text): \n"," global dict_map \n"," words = text.split() \n"," words_norm = [] \n"," for w in words: \n"," if dict_map.get(w, None) is None: \n"," dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n"," words_norm.append(dict_map[w]) \n"," return words_norm \n"," \n","def strip_answer_string(text): \n"," text = text.strip() \n"," while text[-1] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] != '(' and text[-1] == ')' and '(' in text: \n"," break \n"," if text[-1] == '\"' and text[0] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[:-1].strip() \n"," while text[0] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] == '\"' and text[-1] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[1:].strip() \n"," text = text.strip() \n"," return text \n"," \n","def strip_context(text): \n"," text = text.replace('\\n', ' ') \n"," text = re.sub(r'\\s+', ' ', text) \n"," text = text.strip() \n"," return text\n","\n","def check_(x):\n"," x = str(x).lower()\n"," return (x.isnumeric() or \"ngày\" in x or \"tháng\" in x or \"năm\" in x)\n","\n","def find_candidate_ids(x, raw_answer=None, already_added=[], topk=50):\n"," x = str(x)\n"," query = post_process(x).lower().split()\n"," tfidf_query = tfidf_model[dictionary.doc2bow(query)]\n"," scores = bm25_index[tfidf_query]\n"," top_n = list(np.argsort(scores)[::-1][:topk])\n"," top_n = [i for i in top_n if i not in already_added]\n"," # scores = list(scores[top_n])\n"," if raw_answer is not None:\n"," raw_answer = raw_answer.strip()\n"," if raw_answer in entity_dict:\n"," title = entity_dict[raw_answer].replace(\"wiki/\",\"\").replace(\"_\",\" \")\n"," extra_id = title2idx.get(title, -1)\n"," # print((raw_answer,title,extra_id, extra_id not in top_n))\n"," if extra_id != -1 and extra_id not in top_n:\n"," print(f\"Add extra id {extra_id} for {raw_answer}\")\n"," top_n.append(extra_id)\n"," top_n = list(set(top_n))\n"," scores = scores[top_n]\n"," return list(top_n), 
np.array(scores)"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:20:18.394704Z","iopub.status.busy":"2023-06-26T16:20:18.394284Z","iopub.status.idle":"2023-06-26T16:30:31.484998Z","shell.execute_reply":"2023-06-26T16:30:31.483810Z","shell.execute_reply.started":"2023-06-26T16:20:18.394671Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"8cd018dfcf7e4ccc85f93f8bb319f26c","version_major":2,"version_minor":0},"text/plain":["VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=127347), Label(value='0 / 127347')…"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"a13f08c5d7974e1087d598ca8b488840","version_major":2,"version_minor":0},"text/plain":["VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=127347), Label(value='0 / 127347')…"]},"metadata":{},"output_type":"display_data"}],"source":["df_wiki['title_lower'] = df_wiki['title'].apply(lambda x: x.lower()).parallel_apply(post_process)\n","df_wiki['text_lower'] = df_wiki['text'].apply(lambda x: x.lower()).parallel_apply(post_process)"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:33:42.344050Z","iopub.status.busy":"2023-06-26T16:33:42.342811Z","iopub.status.idle":"2023-06-26T16:33:42.362074Z","shell.execute_reply":"2023-06-26T16:33:42.360662Z","shell.execute_reply.started":"2023-06-26T16:33:42.344003Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>url</th>\n"," <th>title</th>\n"," <th>text</th>\n"," <th>timestamp</th>\n"," <th>revid</th>\n"," <th>title_lower</th>\n"," <th>text_lower</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>2</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=2</td>\n"," <td>Trang Chính</td>\n"," <td>Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s...</td>\n"," <td>2022-05-12 12:46:53+00:00</td>\n"," <td>68591979</td>\n"," <td>trang chính</td>\n"," <td>trang chính templatestyles src wiki2021stylesc...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>4</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=4</td>\n"," <td>Internet Society</td>\n"," <td>Internet Society\\n\\nInternet Society hay ISOC ...</td>\n"," <td>2022-01-20 07:59:10+00:00</td>\n"," <td>67988747</td>\n"," <td>internet society</td>\n"," <td>internet society internet society hay isoc là ...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>13</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=13</td>\n"," <td>Tiếng Việt</td>\n"," <td>Tiếng Việt\\n\\nTiếng Việt, cũng gọi là tiếng Vi...</td>\n"," <td>2022-05-29 03:42:42+00:00</td>\n"," <td>68660631</td>\n"," <td>tiếng việt</td>\n"," <td>tiếng việt tiếng việt cũng gọi là tiếng việt n...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>24</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=24</td>\n"," <td>Ohio</td>\n"," <td>Ohio\\n\\nOhio (viết tắt là OH, viết tắt cũ là O...</td>\n"," <td>2022-04-17 08:15:22+00:00</td>\n"," <td>68482118</td>\n"," <td>ohio</td>\n"," <td>ohio ohio viết tắt là oh viết tắt cũ là o là m...</td>\n"," 
</tr>\n"," <tr>\n"," <th>4</th>\n"," <td>26</td>\n"," <td>https://vi.wikipedia.org/wiki?curid=26</td>\n"," <td>California</td>\n"," <td>California\\n\\nCalifornia (phát âm như \"Ca-li-p...</td>\n"," <td>2022-06-16 15:27:07+00:00</td>\n"," <td>68738039</td>\n"," <td>california</td>\n"," <td>california california phát âm như caliphótnia ...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" id url title \\\n","0 2 https://vi.wikipedia.org/wiki?curid=2 Trang Chính \n","1 4 https://vi.wikipedia.org/wiki?curid=4 Internet Society \n","2 13 https://vi.wikipedia.org/wiki?curid=13 Tiếng Việt \n","3 24 https://vi.wikipedia.org/wiki?curid=24 Ohio \n","4 26 https://vi.wikipedia.org/wiki?curid=26 California \n","\n"," text \\\n","0 Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s... \n","1 Internet Society\\n\\nInternet Society hay ISOC ... \n","2 Tiếng Việt\\n\\nTiếng Việt, cũng gọi là tiếng Vi... \n","3 Ohio\\n\\nOhio (viết tắt là OH, viết tắt cũ là O... \n","4 California\\n\\nCalifornia (phát âm như \"Ca-li-p... \n","\n"," timestamp revid title_lower \\\n","0 2022-05-12 12:46:53+00:00 68591979 trang chính \n","1 2022-01-20 07:59:10+00:00 67988747 internet society \n","2 2022-05-29 03:42:42+00:00 68660631 tiếng việt \n","3 2022-04-17 08:15:22+00:00 68482118 ohio \n","4 2022-06-16 15:27:07+00:00 68738039 california \n","\n"," text_lower \n","0 trang chính templatestyles src wiki2021stylesc... \n","1 internet society internet society hay isoc là ... \n","2 tiếng việt tiếng việt cũng gọi là tiếng việt n... \n","3 ohio ohio viết tắt là oh viết tắt cũ là o là m... \n","4 california california phát âm như caliphótnia ... "]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["df_wiki.head()"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:46:35.539444Z","iopub.status.busy":"2023-06-26T16:46:35.538142Z","iopub.status.idle":"2023-06-26T16:46:38.074928Z","shell.execute_reply":"2023-06-26T16:46:38.073705Z","shell.execute_reply.started":"2023-06-26T16:46:35.539390Z"},"trusted":true},"outputs":[],"source":["title2idx = dict([(x.strip(),y) for x,y in zip(df_wiki.title, df_wiki.index.values)])\n","train = json.load(open(\"/kaggle/input/e2eqa-wiki-zalo-ai/processed/zac2022_train_merged_final.json\"))\n","entity_dict = json.load(open(\"/kaggle/input/e2eqa-wiki-zalo-ai/processed/entities.json\"))"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:55:20.687579Z","iopub.status.busy":"2023-06-26T16:55:20.686344Z","iopub.status.idle":"2023-06-26T16:55:25.068585Z","shell.execute_reply":"2023-06-26T16:55:25.067193Z","shell.execute_reply.started":"2023-06-26T16:55:20.687529Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["mkdir: cannot create directory ‘/kaggle/working/bm25_stage2’: File exists\n","mkdir: cannot create directory ‘/kaggle/working/bm25_stage2/full_text’: File exists\n"]}],"source":["!mkdir /kaggle/working/bm25_stage2\n","!mkdir /kaggle/working/bm25_stage2/full_text\n","!mkdir /kaggle/working/bm25_stage2/title"]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:54:49.241998Z","iopub.status.busy":"2023-06-26T16:54:49.240748Z","iopub.status.idle":"2023-06-26T16:54:52.346235Z","shell.execute_reply":"2023-06-26T16:54:52.345380Z","shell.execute_reply.started":"2023-06-26T16:54:49.241950Z"},"trusted":true},"outputs":[],"source":["corpus = [doc.split() for doc in 
df_wiki['text_lower']] #simple tokenier\n","dictionary = Dictionary(corpus)\n","bm25_model = OkapiBM25Model(dictionary=dictionary)\n","bm25_corpus = bm25_model[list(map(dictionary.doc2bow, corpus))]\n","bm25_index = SparseMatrixSimilarity(bm25_corpus, num_docs=len(corpus), num_terms=len(dictionary),normalize_queries=False, normalize_documents=False)\n","tfidf_model = TfidfModel(dictionary=dictionary, smartirs='bnn') # Enforce binary weighting of queries\n","dictionary.save(\"/kaggle/working/bm25_stage2/full_text/dict\")\n","tfidf_model.save(\"/kaggle/working/bm25_stage2/full_text/tfidf\")\n","bm25_index.save(\"/kaggle/working/bm25_stage2/full_text/bm25_index\")"]},{"cell_type":"code","execution_count":22,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:58:58.741331Z","iopub.status.busy":"2023-06-26T16:58:58.740930Z","iopub.status.idle":"2023-06-26T16:58:59.918024Z","shell.execute_reply":"2023-06-26T16:58:59.916804Z","shell.execute_reply.started":"2023-06-26T16:58:58.741301Z"},"trusted":true},"outputs":[],"source":["corpus = [doc.split() for doc in df_wiki['title_lower']] #simple tokenier\n","dictionary = Dictionary(corpus)\n","bm25_model = OkapiBM25Model(dictionary=dictionary)\n","bm25_corpus = bm25_model[list(map(dictionary.doc2bow, corpus))]\n","bm25_index = SparseMatrixSimilarity(bm25_corpus, num_docs=len(corpus), num_terms=len(dictionary),normalize_queries=False, normalize_documents=False)\n","tfidf_model = TfidfModel(dictionary=dictionary, smartirs='bnn') # Enforce binary weighting of queries\n","dictionary.save(\"/kaggle/working/bm25_stage2/title/dict\")\n","tfidf_model.save(\"/kaggle/working/bm25_stage2/title/tfidf\")\n","bm25_index.save(\"/kaggle/working/bm25_stage2/title/bm25_index\")"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"}},"nbformat":4,"nbformat_minor":4}
notebooks/1.2-train-pairwise-stage1.ipynb
ADDED
@@ -0,0 +1 @@
{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/noobhocai/train-pairwise-stage1"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:52:30.450145Z","iopub.status.busy":"2023-06-28T02:52:30.449601Z","iopub.status.idle":"2023-06-28T02:52:48.306606Z","shell.execute_reply":"2023-06-28T02:52:48.305298Z","shell.execute_reply.started":"2023-06-28T02:52:30.450111Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install sentence_transformers pyvi -q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:52:48.310777Z","iopub.status.busy":"2023-06-28T02:52:48.309711Z","iopub.status.idle":"2023-06-28T02:53:01.669646Z","shell.execute_reply":"2023-06-28T02:53:01.668372Z","shell.execute_reply.started":"2023-06-28T02:52:48.310744Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting tensorflow-io==0.32.0\n"," Downloading tensorflow_io-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (28.0 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m28.0/28.0 MB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25hCollecting tensorflow-io-gcs-filesystem==0.32.0 (from tensorflow-io==0.32.0)\n"," Downloading tensorflow_io_gcs_filesystem-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.4 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m58.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n","\u001b[?25hInstalling collected packages: tensorflow-io-gcs-filesystem, tensorflow-io\n"," Attempting uninstall: tensorflow-io-gcs-filesystem\n"," Found existing installation: tensorflow-io-gcs-filesystem 0.31.0\n"," Uninstalling tensorflow-io-gcs-filesystem-0.31.0:\n"," Successfully uninstalled tensorflow-io-gcs-filesystem-0.31.0\n"," Attempting uninstall: tensorflow-io\n"," Found existing installation: tensorflow-io 0.31.0\n"," Uninstalling tensorflow-io-0.31.0:\n"," Successfully uninstalled tensorflow-io-0.31.0\n","Successfully installed tensorflow-io-0.32.0 tensorflow-io-gcs-filesystem-0.32.0\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install --upgrade tensorflow-io==0.32.0"]},{"cell_type":"code","execution_count":3,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2023-06-28T02:53:01.672246Z","iopub.status.busy":"2023-06-28T02:53:01.671610Z","iopub.status.idle":"2023-06-28T02:53:15.771561Z","shell.execute_reply":"2023-06-28T02:53:15.770593Z","shell.execute_reply.started":"2023-06-28T02:53:01.672210Z"},"trusted":true},"outputs":[],"source":["import os\n","import pandas as pd\n","import json\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","from transformers import AutoModel, AutoTokenizer\n","import torch\n","from torch.utils.data import DataLoader\n","import pandas as pd\n","from sklearn.metrics.pairwise import cosine_similarity\n","import numpy as np\n","from sentence_transformers import SentenceTransformer\n","from pyvi.ViTokenizer import tokenize\n","from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup\n","from transformers import DataCollatorWithPadding\n","from scipy.stats import pearsonr, spearmanr\n","import math\n","from sklearn.metrics import *"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:15.775041Z","iopub.status.busy":"2023-06-28T02:53:15.774278Z","iopub.status.idle":"2023-06-28T02:53:15.780313Z","shell.execute_reply":"2023-06-28T02:53:15.778919Z","shell.execute_reply.started":"2023-06-28T02:53:15.774985Z"},"trusted":true},"outputs":[],"source":["AUTH_TOKEN = \"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\""]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:15.782302Z","iopub.status.busy":"2023-06-28T02:53:15.781713Z","iopub.status.idle":"2023-06-28T02:53:18.884084Z","shell.execute_reply":"2023-06-28T02:53:18.882908Z","shell.execute_reply.started":"2023-06-28T02:53:15.782268Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6118765cb5854ac7b8f4c68eb0c654a2","version_major":2,"version_minor":0},"text/plain":["Downloading (…)okenizer_config.json: 0%| | 0.00/398 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"9d4cfe6e1330443eb6775a558657bbfb","version_major":2,"version_minor":0},"text/plain":["Downloading (…)/main/tokenizer.json: 0%| | 0.00/9.08M [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"cf5607811ef54e528e24d6a66eceb758","version_major":2,"version_minor":0},"text/plain":["Downloading (…)cial_tokens_map.json: 0%| | 0.00/239 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["<s> sinh viên đại học khoa học tự nhiên</s>\n"]}],"source":["tokenizer = AutoTokenizer.from_pretrained('nguyenvulebinh/vi-mrc-base', use_auth_token=AUTH_TOKEN)\n","print(tokenizer.decode(tokenizer.encode(\"sinh viên đại học khoa học tự 
nhiên\")))"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:18.886122Z","iopub.status.busy":"2023-06-28T02:53:18.885497Z","iopub.status.idle":"2023-06-28T02:53:18.897285Z","shell.execute_reply":"2023-06-28T02:53:18.896274Z","shell.execute_reply.started":"2023-06-28T02:53:18.886088Z"},"trusted":true},"outputs":[],"source":["import json \n","from glob import glob \n","import re \n","from nltk import word_tokenize as lib_tokenizer \n"," \n","dict_map = dict({}) \n"," \n","def word_tokenize(text): \n"," global dict_map \n"," words = text.split() \n"," words_norm = [] \n"," for w in words: \n"," if dict_map.get(w, None) is None: \n"," dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n"," words_norm.append(dict_map[w]) \n"," return words_norm \n"," \n","def strip_answer_string(text): \n"," text = text.strip() \n"," while text[-1] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] != '(' and text[-1] == ')' and '(' in text: \n"," break \n"," if text[-1] == '\"' and text[0] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[:-1].strip() \n"," while text[0] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] == '\"' and text[-1] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[1:].strip() \n"," text = text.strip() \n"," return text \n"," \n","def strip_context(text): \n"," text = text.replace('\\n', ' ') \n"," text = re.sub(r'\\s+', ' ', text) \n"," text = text.strip() \n"," return text"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:18.899048Z","iopub.status.busy":"2023-06-28T02:53:18.898414Z","iopub.status.idle":"2023-06-28T02:53:28.963083Z","shell.execute_reply":"2023-06-28T02:53:28.962126Z","shell.execute_reply.started":"2023-06-28T02:53:18.899002Z"},"trusted":true},"outputs":[],"source":["train1 = pd.read_csv(\"/kaggle/input/e2eqa-wiki-zalo-ai/processed/train_stage1_ranking.csv\")\n","train1['text'] = train1['text'].apply(lambda x: \" \".join(word_tokenize(strip_context(x))))\n","train1['question'] = train1['question'].apply(lambda x: \" \".join(word_tokenize(strip_context(x))))\n","df = train1"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.965037Z","iopub.status.busy":"2023-06-28T02:53:28.964656Z","iopub.status.idle":"2023-06-28T02:53:28.976243Z","shell.execute_reply":"2023-06-28T02:53:28.974943Z","shell.execute_reply.started":"2023-06-28T02:53:28.964989Z"},"trusted":true},"outputs":[],"source":["import torch.nn as nn\n","from transformers import AutoModel, AutoConfig\n","\n","class MeanPooling(nn.Module):\n"," def __init__(self):\n"," super(MeanPooling, self).__init__()\n","\n"," def forward(self, last_hidden_state, attention_mask):\n"," input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()\n"," sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)\n"," sum_mask = input_mask_expanded.sum(1)\n"," sum_mask = torch.clamp(sum_mask, min=1e-9)\n"," mean_embeddings = sum_embeddings / sum_mask\n"," return mean_embeddings\n","\n","class PairwiseModel(nn.Module):\n"," def __init__(self, model_name):\n"," super(PairwiseModel, self).__init__()\n"," self.model = AutoModel.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)\n"," self.config = AutoConfig.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)\n"," self.drop = nn.Dropout(p=0.2)\n"," self.fc = nn.Linear(768, 1)\n"," \n"," def forward(self, 
ids, masks):\n"," out = self.model(input_ids=ids,\n"," attention_mask=masks,\n"," output_hidden_states=False).last_hidden_state\n"," out = out[:,0]\n"," outputs = self.fc(out)\n"," return outputs"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.978164Z","iopub.status.busy":"2023-06-28T02:53:28.977797Z","iopub.status.idle":"2023-06-28T02:53:28.990859Z","shell.execute_reply":"2023-06-28T02:53:28.989964Z","shell.execute_reply.started":"2023-06-28T02:53:28.978133Z"},"trusted":true},"outputs":[],"source":["from torch.utils.data import Dataset\n","\n","class SiameseDataset(Dataset):\n","\n"," def __init__(self, df, tokenizer, max_length):\n"," self.df = df\n"," self.max_length = max_length\n"," self.tokenizer = tokenizer\n"," self.content1 = tokenizer.batch_encode_plus(list(df.question.apply(lambda x: x.replace(\"_\",\" \")).values), max_length=max_length, truncation=True)[\"input_ids\"]\n"," self.content2 = tokenizer.batch_encode_plus(list(df.text.apply(lambda x: x.replace(\"_\",\" \")).values), max_length=max_length, truncation=True)[\"input_ids\"]\n"," self.targets = self.df.label\n"," \n"," def __len__(self):\n"," return len(self.df)\n","\n"," def __getitem__(self, index):\n"," return {\n"," 'ids1': torch.tensor(self.content1[index], dtype=torch.long),\n"," 'ids2': torch.tensor(self.content2[index][1:], dtype=torch.long),\n"," 'target': torch.tensor(self.targets[index], dtype=torch.float)\n"," }"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.995179Z","iopub.status.busy":"2023-06-28T02:53:28.994892Z","iopub.status.idle":"2023-06-28T02:53:29.004203Z","shell.execute_reply":"2023-06-28T02:53:29.003189Z","shell.execute_reply.started":"2023-06-28T02:53:28.995156Z"},"trusted":true},"outputs":[],"source":["pad_token_id = tokenizer.pad_token_id\n","def collate_fn(batch):\n"," ids = [torch.cat([x[\"ids1\"], x[\"ids2\"]]) for x in batch]\n"," targets = [x[\"target\"] for x in batch]\n"," max_len = np.max([len(x) for x in ids])\n"," masks = []\n"," for i in range(len(ids)):\n"," if len(ids[i]) < max_len:\n"," ids[i]= torch.cat((ids[i], torch.tensor([pad_token_id,]*(max_len - len(ids[i])),dtype=torch.long)))\n"," masks.append(ids[i] != pad_token_id)\n"," # print(tokenizer.decode(ids[0]))\n"," outputs = {\n"," \"ids\": torch.vstack(ids),\n"," \"masks\": torch.vstack(masks),\n"," \"target\": torch.vstack(targets).view(-1)\n"," }\n"," return outputs"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.006085Z","iopub.status.busy":"2023-06-28T02:53:29.005583Z","iopub.status.idle":"2023-06-28T02:53:29.013002Z","shell.execute_reply":"2023-06-28T02:53:29.011919Z","shell.execute_reply.started":"2023-06-28T02:53:29.006052Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import GroupKFold, KFold"]},{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.014999Z","iopub.status.busy":"2023-06-28T02:53:29.014493Z","iopub.status.idle":"2023-06-28T02:53:29.023324Z","shell.execute_reply":"2023-06-28T02:53:29.022211Z","shell.execute_reply.started":"2023-06-28T02:53:29.014969Z"},"trusted":true},"outputs":[],"source":["def optimizer_scheduler(model, num_train_steps):\n"," param_optimizer = list(model.named_parameters())\n"," no_decay = [\"bias\", \"LayerNorm.weight\"]\n"," optimizer_parameters = [\n"," {\n"," \"params\": [p for n, p in param_optimizer if not any(nd 
in n for nd in no_decay)],\n"," \"weight_decay\": 0.001,\n"," },\n"," {\n"," \"params\": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],\n"," \"weight_decay\": 0.0,\n"," },\n"," ]\n","\n"," opt = AdamW(optimizer_parameters, lr=3e-5)\n"," sch = get_linear_schedule_with_warmup(\n"," opt,\n"," num_warmup_steps=int(0.05*num_train_steps),\n"," num_training_steps=num_train_steps,\n"," last_epoch=-1,\n"," )\n"," return opt, sch"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.025531Z","iopub.status.busy":"2023-06-28T02:53:29.025091Z","iopub.status.idle":"2023-06-28T02:53:29.036112Z","shell.execute_reply":"2023-06-28T02:53:29.034921Z","shell.execute_reply.started":"2023-06-28T02:53:29.025496Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import KFold\n","kfold = KFold(n_splits=5, shuffle=True, random_state=42)"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.038723Z","iopub.status.busy":"2023-06-28T02:53:29.037722Z","iopub.status.idle":"2023-06-28T04:54:29.777596Z","shell.execute_reply":"2023-06-28T04:54:29.776341Z","shell.execute_reply.started":"2023-06-28T02:53:29.038691Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["[ 3 6 17 ... 20845 20850 20855]\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"f0f58e6b09914620903bd80dae3eca59","version_major":2,"version_minor":0},"text/plain":["Downloading (…)lve/main/config.json: 0%| | 0.00/688 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"2b7bfe1d5b504e8f8ec03f19cd7bfba5","version_major":2,"version_minor":0},"text/plain":["Downloading pytorch_model.bin: 0%| | 0.00/1.11G [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["Some weights of the model checkpoint at nguyenvulebinh/vi-mrc-base were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']\n","- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n","- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n","Some weights of RobertaModel were not initialized from the model checkpoint at nguyenvulebinh/vi-mrc-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n","You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n","/opt/conda/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n"," warnings.warn(\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3a999992c57e44838063b744088409f8","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9280047017337644\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9665513264129183\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9868613138686132\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9889341875364007\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9926921952645424\n"]}],"source":["from tqdm.auto import tqdm\n","loss_fn = nn.BCEWithLogitsLoss()\n","epochs = 5\n","accumulation_steps = 8\n","scaler = torch.cuda.amp.GradScaler()\n","error_ids = None\n","for fold, (train_index, test_index) in enumerate(kfold.split(df, df.label)):\n"," if fold != 0:\n"," break\n"," print(test_index)\n"," model = PairwiseModel('nguyenvulebinh/vi-mrc-base')\n"," # model.load_state_dict(torch.load(f\"./outputs/pairwise_v2.bin\"))\n"," model.cuda()\n"," train_df = df\n"," # train_df = df.iloc[train_index].reset_index(drop=True)\n"," val_df = df.iloc[test_index].reset_index(drop=True)\n"," \n"," train_dataset = SiameseDataset(train_df, tokenizer, 384)\n"," valid_dataset = SiameseDataset(val_df, tokenizer, 384)\n"," train_loader = DataLoader(train_dataset, batch_size=4, collate_fn=collate_fn,\n"," num_workers=2, shuffle=True, pin_memory=True, drop_last=True)\n"," valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn,\n"," num_workers=2, shuffle=False, pin_memory=True)\n"," \n"," num_train_steps = len(train_loader) * epochs // accumulation_steps\n"," optimizer, scheduler = optimizer_scheduler(model, num_train_steps)\n"," \n"," for epoch in tqdm(range(epochs)):\n"," model.train()\n"," bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)\n"," for step, data in bar:\n"," ids = data[\"ids\"].cuda()\n"," # for x in ids:\n"," # print(tokenizer.decode(x))\n"," masks = data[\"masks\"].cuda()\n"," target = data[\"target\"].cuda()\n"," # with torch.cuda.amp.autocast():\n"," preds = model(ids, masks)\n"," # print(preds.view(-1))\n"," loss = loss_fn(preds.view(-1), target.view(-1))\n"," loss /= accumulation_steps\n"," loss.backward()\n"," if (step + 1) % accumulation_steps == 0:\n"," optimizer.step()\n"," # scaler.update()\n"," optimizer.zero_grad()\n"," scheduler.step()\n"," bar.set_postfix(loss=loss.item())\n","\n"," model.eval()\n"," with torch.no_grad():\n"," bar = tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False)\n"," targets = []\n"," all_preds = []\n"," for step, data in bar:\n"," ids = data[\"ids\"].cuda()\n"," masks = data[\"masks\"].cuda()\n"," target = data[\"target\"].cuda()\n"," preds = torch.sigmoid(model(ids, masks))\n"," all_preds.extend(preds.cpu().view(-1).numpy())\n"," targets.extend(target.cpu().view(-1).numpy())\n"," all_preds = np.array(all_preds)\n"," targets = np.array(targets)\n"," print(f\"F1 {f1_score(targets, all_preds > 0.5)}\")"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T04:54:29.780731Z","iopub.status.busy":"2023-06-28T04:54:29.779924Z","iopub.status.idle":"2023-06-28T04:54:29.791869Z","shell.execute_reply":"2023-06-28T04:54:29.790613Z","shell.execute_reply.started":"2023-06-28T04:54:29.780691Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["F1 0.9953106682297772\n"]}],"source":["print(f\"F1 {recall_score(np.array(targets), np.array(all_preds) > 
0.5)}\")"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T04:54:29.796171Z","iopub.status.busy":"2023-06-28T04:54:29.795752Z","iopub.status.idle":"2023-06-28T04:54:31.811461Z","shell.execute_reply":"2023-06-28T04:54:31.810427Z","shell.execute_reply.started":"2023-06-28T04:54:29.796143Z"},"trusted":true},"outputs":[],"source":["torch.save(model.state_dict(), f\"/kaggle/working/pairwise_v2.bin\")\n"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"}},"nbformat":4,"nbformat_minor":4}
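The collate_fn in the notebook above concatenates each question's token ids with its candidate passage's ids (dropping the passage's leading BOS token), right-pads every pair to the longest sequence in the batch, and derives the attention mask from the non-pad positions. A minimal sketch of that batching step, using toy token ids and a hypothetical pad id of 1 instead of a real tokenizer:

import torch

pad_token_id = 1  # hypothetical pad id; the notebook takes it from the tokenizer
batch = [
    {"ids1": torch.tensor([0, 11, 12, 2]), "ids2": torch.tensor([21, 22, 2]), "target": torch.tensor(1.0)},
    {"ids1": torch.tensor([0, 13, 2]),     "ids2": torch.tensor([23, 2]),     "target": torch.tensor(0.0)},
]
# question ids + passage ids for each pair, as in the notebook's collate_fn
ids = [torch.cat([x["ids1"], x["ids2"]]) for x in batch]
max_len = max(len(x) for x in ids)
# right-pad every sequence to the batch maximum
ids = [torch.cat([x, torch.full((max_len - len(x),), pad_token_id, dtype=torch.long)]) for x in ids]
# attention mask: True on real tokens, False on padding
masks = [x != pad_token_id for x in ids]
print(torch.vstack(ids))    # (2, 7) padded id matrix
print(torch.vstack(masks))  # matching attention masks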
notebooks/1.3-train-pairwise-stage2.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
notebooks/1.4-robust-qa-model.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
requirements.txt
ADDED
@@ -0,0 +1,7 @@
# local package
-e .

# external requirements
streamlit==1.24.0
transformers==4.24.0
git+https://github.com/witiko/gensim.git@feature/bm25
src/app.py
ADDED
@@ -0,0 +1,26 @@
import streamlit as st
from models.predict_model import *

with st.sidebar:
    st.write("# 🤖 Language Models")
    "[](https://github.com/Foxxy-HCMUS/e2eqa)"

st.title("💬 Question-Answering System")
if "messages" not in st.session_state:
    st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]

for msg in st.session_state.messages:
    st.chat_message(msg["role"]).write(msg["content"])

if prompt := st.chat_input():

    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)
    msg = {
        "role": "assistant",
        "content": get_answer_e2e(prompt)
    }
    st.session_state.messages.append(msg)
    st.chat_message("assistant").write(msg["content"])
src/features/graph_utils.py
ADDED
@@ -0,0 +1,110 @@
import networkx as nx
import numpy as np
from cdlib import algorithms


# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))


def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)


def is_date_or_num(answer):
    answer = answer.lower().split()
    for w in answer:
        w = w.strip()
        if w.isnumeric() or w in ["ngày", "tháng", "năm"]:
            return True
    return False


def find_best_cluster(answers, best_answer, thr=0.79):
    if len(answers) == 0:  # or best_answer not in answers:
        return best_answer
    elif len(answers) == 1:
        return answers[0]
    dists = np.zeros((len(answers), len(answers)))
    for i in range(len(answers) - 1):
        for j in range(i + 1, len(answers)):
            a1 = answers[i].lower().strip()
            a2 = answers[j].lower().strip()
            if is_date_or_num(a1) or is_date_or_num(a2):
                # print(a1, a2)
                if a1 == a2 or ("tháng" in a1 and a1 in a2) or ("tháng" in a2 and a2 in a1):
                    dists[i, j] = 1
                    dists[j, i] = 1
                # continue
            elif a1 == a2 or (a1 in a2) or (a2 in a1) or compute_f1(a1.lower(), a2.lower()) >= thr:
                dists[i, j] = 1
                dists[j, i] = 1
    # print(dists)
    try:
        thr = 1
        dups = np.where(dists >= thr)
        dup_strs = []
        edges = []
        for i, j in zip(dups[0], dups[1]):
            if i != j:
                edges.append((i, j))
        G = nx.Graph()
        for i, answer in enumerate(answers):
            G.add_node(i, content=answer)
        G.add_edges_from(edges)
        partition = algorithms.louvain(G)
        max_len_comm = np.max([len(x) for x in partition.communities])
        best_comms = []
        for comm in partition.communities:
            # print([answers[i] for i in comm])
            if len(comm) == max_len_comm:
                best_comms.append([answers[i] for i in comm])
        # if len(best_comms) > 1:
        #     return best_answer
        for comm in best_comms:
            if best_answer in comm:
                return best_answer
        mid = len(best_comms[0]) // 2
        # print(mid, sorted(best_comms[0], key = len))
        return sorted(best_comms[0], key=len)[mid]
    except Exception as e:
        print(e, "Disconnected graph")
        return best_answer
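As a quick illustration of the answer-clustering step: near-duplicate answer spans are linked in a graph, Louvain communities are extracted, and a representative of the largest community is returned. A minimal usage sketch (the Vietnamese answer strings are made up; assumes cdlib and networkx are installed and the script runs from src/ so the features package is importable):

from features.graph_utils import find_best_cluster

# Hypothetical candidate answers collected from several passages for one question.
answers = ["Hà Nội", "hà nội", "thành phố Hà Nội", "Huế"]
best_answer = "Hà Nội"

# The three "Hà Nội" variants link into one community, which contains best_answer,
# so the span predicted by the QA model is kept.
print(find_best_cluster(answers, best_answer))  # expected: "Hà Nội"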
src/features/text_utils.py
ADDED
@@ -0,0 +1,82 @@
import json
from glob import glob
import re
from nltk import word_tokenize as lib_tokenizer
import string


def preprocess(x, max_length=-1, remove_puncts=False):
    x = nltk_tokenize(x)
    x = x.replace("\n", " ")
    if remove_puncts:
        x = "".join([i for i in x if i not in string.punctuation])
    if max_length > 0:
        x = " ".join(x.split()[:max_length])
    return x


def nltk_tokenize(x):
    return " ".join(word_tokenize(strip_context(x))).strip()


def post_process_answer(x, entity_dict):
    if type(x) is not str:
        return x
    try:
        x = strip_answer_string(x)
    except:
        return "NaN"
    x = "".join([c for c in x if c not in string.punctuation])
    x = " ".join(x.split())
    y = x.lower()
    if len(y) > 1 and y.split()[0].isnumeric() and ("tháng" not in x):
        return y.split()[0]
    if not (x.isnumeric() or "ngày" in x or "tháng" in x or "năm" in x):
        if len(x.split()) <= 2:
            return entity_dict.get(x.lower(), x)
        else:
            return x
    else:
        return y


dict_map = dict({})


def word_tokenize(text):
    global dict_map
    words = text.split()
    words_norm = []
    for w in words:
        if dict_map.get(w, None) is None:
            dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '"').replace("''", '"')
        words_norm.append(dict_map[w])
    return words_norm


def strip_answer_string(text):
    text = text.strip()
    while text[-1] in '.,/><;:\'"[]{}+=-_)(*&^!~`':
        if text[0] != '(' and text[-1] == ')' and '(' in text:
            break
        if text[-1] == '"' and text[0] != '"' and text.count('"') > 1:
            break
        text = text[:-1].strip()
    while text[0] in '.,/><;:\'"[]{}+=-_)(*&^!~`':
        if text[0] == '"' and text[-1] != '"' and text.count('"') > 1:
            break
        text = text[1:].strip()
    text = text.strip()
    return text


def strip_context(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


def check_number(x):
    x = str(x).lower()
    return (x.isnumeric() or "ngày" in x or "tháng" in x or "năm" in x)
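A small sketch of how these helpers behave (a minimal example, not part of the module: the entity_dict entry is made up, and NLTK's tokenizer data is assumed to be downloaded):

from features.text_utils import preprocess, post_process_answer, check_number

# Hypothetical entity dictionary mapping a lowercased surface form to a wiki title.
entity_dict = {"việt nam": "Việt Nam"}

# whitespace and newlines are normalised, punctuation is dropped
print(preprocess("Ai  là   chủ tịch\nnước?", max_length=128, remove_puncts=True))  # "Ai là chủ tịch nước"
# trailing punctuation is stripped, then short answers are looked up in entity_dict
print(post_process_answer("Việt Nam.", entity_dict))  # "Việt Nam"
# date/number answers bypass the entity-mapping stage in the pipeline
print(check_number("năm 1945"))  # True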
src/models/bm25_utils.py
ADDED
@@ -0,0 +1,40 @@
import numpy as np
from tqdm.auto import tqdm

tqdm.pandas()
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from features.text_utils import preprocess


class BM25Gensim:
    def __init__(self, checkpoint_path, entity_dict, title2idx):
        # The training notebooks save a Dictionary ("/dict"), a TfidfModel ("/tfidf")
        # and a SparseMatrixSimilarity BM25 index ("/bm25_index"), so each artefact
        # is loaded with its matching class here.
        self.dictionary = Dictionary.load(checkpoint_path + "/dict")
        self.tfidf_model = TfidfModel.load(checkpoint_path + "/tfidf")
        self.bm25_index = SparseMatrixSimilarity.load(checkpoint_path + "/bm25_index")
        self.title2idx = title2idx
        self.entity_dict = entity_dict

    def get_topk_stage1(self, query, topk=100):
        tokenized_query = query.split()
        tfidf_query = self.tfidf_model[self.dictionary.doc2bow(tokenized_query)]
        scores = self.bm25_index[tfidf_query]
        top_n = np.argsort(scores)[::-1][:topk]
        return top_n, scores[top_n]

    def get_topk_stage2(self, x, raw_answer=None, topk=50):
        x = str(x)
        query = preprocess(x, max_length=128).lower().split()
        tfidf_query = self.tfidf_model[self.dictionary.doc2bow(query)]
        scores = self.bm25_index[tfidf_query]
        top_n = list(np.argsort(scores)[::-1][:topk])
        if raw_answer is not None:
            raw_answer = raw_answer.strip()
            if raw_answer in self.entity_dict:
                title = self.entity_dict[raw_answer].replace("wiki/", "").replace("_", " ")
                extra_id = self.title2idx.get(title, -1)
                if extra_id != -1 and extra_id not in top_n:
                    top_n.append(extra_id)
        scores = scores[top_n]
        return np.array(top_n), np.array(scores)
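For reference, a self-contained sketch of the retrieval flow this class wraps, built on a tiny toy corpus instead of the saved Wikipedia indices. It mirrors the index-building cells in the BM25 notebooks and the scoring path of get_topk_stage1, and assumes the feature/bm25 gensim branch pinned in requirements.txt (which provides OkapiBM25Model and the normalize_* arguments):

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, OkapiBM25Model
from gensim.similarities import SparseMatrixSimilarity

# Toy corpus standing in for the preprocessed Wikipedia passages.
corpus = [doc.split() for doc in ["hà nội là thủ đô của việt nam", "paris là thủ đô của pháp"]]
dictionary = Dictionary(corpus)
bm25_model = OkapiBM25Model(dictionary=dictionary)
bm25_index = SparseMatrixSimilarity(bm25_model[list(map(dictionary.doc2bow, corpus))],
                                    num_docs=len(corpus), num_terms=len(dictionary),
                                    normalize_queries=False, normalize_documents=False)
tfidf_model = TfidfModel(dictionary=dictionary, smartirs='bnn')  # binary query weights

# Same scoring path as BM25Gensim.get_topk_stage1: query -> bow -> binary tf-idf -> BM25 scores.
query = "thủ đô việt nam".split()
scores = bm25_index[tfidf_model[dictionary.doc2bow(query)]]
print(np.argsort(scores)[::-1])  # document 0 should rank first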
src/models/pairwise_model.py
ADDED
@@ -0,0 +1,140 @@
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig
from transformers import AutoTokenizer
import pandas as pd

AUTH_TOKEN = "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"

tokenizer = AutoTokenizer.from_pretrained('nguyenvulebinh/vi-mrc-base',
                                          use_auth_token=AUTH_TOKEN)
pad_token_id = tokenizer.pad_token_id


class PairwiseModel(nn.Module):
    def __init__(self, model_name, max_length=384, batch_size=16, device="cuda:0"):
        super(PairwiseModel, self).__init__()
        self.max_length = max_length
        self.batch_size = batch_size
        self.device = device
        self.model = AutoModel.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)
        self.model.to(self.device)
        self.model.eval()
        self.config = AutoConfig.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)
        self.fc = nn.Linear(768, 1).to(self.device)

    def forward(self, ids, masks):
        out = self.model(input_ids=ids,
                         attention_mask=masks,
                         output_hidden_states=False).last_hidden_state
        out = out[:, 0]
        outputs = self.fc(out)
        return outputs

    def stage1_ranking(self, question, texts):
        tmp = pd.DataFrame()
        tmp["text"] = [" ".join(x.split()) for x in texts]
        tmp["question"] = question
        valid_dataset = SiameseDatasetStage1(tmp, tokenizer, self.max_length, is_test=True)
        valid_loader = DataLoader(valid_dataset, batch_size=self.batch_size, collate_fn=collate_fn,
                                  num_workers=0, shuffle=False, pin_memory=True)
        preds = []
        with torch.no_grad():
            bar = enumerate(valid_loader)
            for step, data in bar:
                ids = data["ids"].to(self.device)
                masks = data["masks"].to(self.device)
                preds.append(torch.sigmoid(self(ids, masks)).view(-1))
        preds = torch.concat(preds)
        return preds.cpu().numpy()

    def stage2_ranking(self, question, answer, titles, texts):
        tmp = pd.DataFrame()
        tmp["candidate"] = texts
        tmp["question"] = question
        tmp["answer"] = answer
        tmp["title"] = titles
        valid_dataset = SiameseDatasetStage2(tmp, tokenizer, self.max_length, is_test=True)
        valid_loader = DataLoader(valid_dataset, batch_size=self.batch_size, collate_fn=collate_fn,
                                  num_workers=0, shuffle=False, pin_memory=True)
        preds = []
        with torch.no_grad():
            bar = enumerate(valid_loader)
            for step, data in bar:
                ids = data["ids"].to(self.device)
                masks = data["masks"].to(self.device)
                preds.append(torch.sigmoid(self(ids, masks)).view(-1))
        preds = torch.concat(preds)
        return preds.cpu().numpy()


class SiameseDatasetStage1(Dataset):

    def __init__(self, df, tokenizer, max_length, is_test=False):
        self.df = df
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.content1 = tokenizer.batch_encode_plus(list(df.question.values), max_length=max_length, truncation=True)[
            "input_ids"]
        self.content2 = tokenizer.batch_encode_plus(list(df.text.values), max_length=max_length, truncation=True)[
            "input_ids"]
        if not self.is_test:
            self.targets = self.df.label

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        return {
            'ids1': torch.tensor(self.content1[index], dtype=torch.long),
            'ids2': torch.tensor(self.content2[index][1:], dtype=torch.long),
            'target': torch.tensor(0) if self.is_test else torch.tensor(self.targets[index], dtype=torch.float)
        }


class SiameseDatasetStage2(Dataset):

    def __init__(self, df, tokenizer, max_length, is_test=False):
        self.df = df
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.df["content1"] = self.df.apply(lambda row: row.question + f" {tokenizer.sep_token} " + row.answer, axis=1)
        self.df["content2"] = self.df.apply(lambda row: row.title + f" {tokenizer.sep_token} " + row.candidate, axis=1)
        self.content1 = tokenizer.batch_encode_plus(list(df.content1.values), max_length=max_length, truncation=True)[
            "input_ids"]
        self.content2 = tokenizer.batch_encode_plus(list(df.content2.values), max_length=max_length, truncation=True)[
            "input_ids"]
        if not self.is_test:
            self.targets = self.df.label

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        return {
            'ids1': torch.tensor(self.content1[index], dtype=torch.long),
            'ids2': torch.tensor(self.content2[index][1:], dtype=torch.long),
            'target': torch.tensor(0) if self.is_test else torch.tensor(self.targets[index], dtype=torch.float)
        }


def collate_fn(batch):
    ids = [torch.cat([x["ids1"], x["ids2"]]) for x in batch]
    targets = [x["target"] for x in batch]
    max_len = np.max([len(x) for x in ids])
    masks = []
    for i in range(len(ids)):
        if len(ids[i]) < max_len:
            ids[i] = torch.cat((ids[i], torch.tensor([pad_token_id, ] * (max_len - len(ids[i])), dtype=torch.long)))
        masks.append(ids[i] != pad_token_id)
    # print(tokenizer.decode(ids[0]))
    outputs = {
        "ids": torch.vstack(ids),
        "masks": torch.vstack(masks),
        "target": torch.vstack(targets).view(-1)
    }
    return outputs
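A usage sketch of the stage-1 reranker on its own (not part of the module): the checkpoint path follows what predict_model.py expects, the candidate passages are placeholders, and a CUDA device is assumed because of the default device="cuda:0".

import torch
from models.pairwise_model import PairwiseModel

model = PairwiseModel("nguyenvulebinh/vi-mrc-base")
# Hypothetical fine-tuned checkpoint produced by notebook 1.2-train-pairwise-stage1.
model.load_state_dict(torch.load("../models/pairwise_v2.bin"))

question = "Thủ đô của Việt Nam là gì?"
texts = ["Hà Nội là thủ đô của Việt Nam.", "Paris là thủ đô của Pháp."]
scores = model.stage1_ranking(question, texts)  # one relevance score in [0, 1] per passage
print(scores.argmax())  # index of the most relevant passage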
src/models/predict_model.py
ADDED
@@ -0,0 +1,75 @@
from models.pairwise_model import *
from features.text_utils import *
import regex as re
from models.bm25_utils import BM25Gensim
from models.qa_model import *
from tqdm.auto import tqdm
tqdm.pandas()

df_wiki_windows = pd.read_csv("../data/processed/wikipedia_20220620_cleaned_v2.csv")
df_wiki = pd.read_csv("../data/wikipedia_20220620_short.csv")
df_wiki.title = df_wiki.title.apply(str)

entity_dict = json.load(open("../data/processed/entities.json"))
new_dict = dict()
for key, val in entity_dict.items():
    val = val.replace("wiki/", "").replace("_", " ")
    entity_dict[key] = val
    key = preprocess(key)
    new_dict[key.lower()] = val
entity_dict.update(new_dict)
title2idx = dict([(x.strip(), y) for x, y in zip(df_wiki.title, df_wiki.index.values)])

qa_model = QAEnsembleModel("nguyenvulebinh/vi-mrc-large", ["../models/qa_model_robust.bin"], entity_dict)
pairwise_model_stage1 = PairwiseModel("nguyenvulebinh/vi-mrc-base").half()
pairwise_model_stage1.load_state_dict(torch.load("../models/pairwise_v2.bin"))
pairwise_model_stage1.eval()

pairwise_model_stage2 = PairwiseModel("nguyenvulebinh/vi-mrc-base").half()
pairwise_model_stage2.load_state_dict(torch.load("../models/pairwise_stage2_seed0.bin"))

bm25_model_stage1 = BM25Gensim("../models/bm25_stage1/", entity_dict, title2idx)
bm25_model_stage2_full = BM25Gensim("../models/bm25_stage2/full_text/", entity_dict, title2idx)
bm25_model_stage2_title = BM25Gensim("../models/bm25_stage2/title/", entity_dict, title2idx)


def get_answer_e2e(question):
    # BM25 retrieval for top-200 candidates
    query = preprocess(question).lower()
    top_n, bm25_scores = bm25_model_stage1.get_topk_stage1(query, topk=200)
    titles = [preprocess(df_wiki_windows.title.values[i]) for i in top_n]
    texts = [preprocess(df_wiki_windows.text.values[i]) for i in top_n]

    # Reranking with pairwise model for top-10
    question = preprocess(question)
    ranking_preds = pairwise_model_stage1.stage1_ranking(question, texts)
    ranking_scores = ranking_preds * bm25_scores

    # Question answering
    best_idxs = np.argsort(ranking_scores)[-10:]
    ranking_scores = np.array(ranking_scores)[best_idxs]
    texts = np.array(texts)[best_idxs]
    best_answer = qa_model(question, texts, ranking_scores)
    if best_answer is None:
        return "Chịu"
    bm25_answer = preprocess(str(best_answer).lower(), max_length=128, remove_puncts=True)

    # Entity mapping
    if not check_number(bm25_answer):
        bm25_question = preprocess(str(question).lower(), max_length=128, remove_puncts=True)
        bm25_question_answer = bm25_question + " " + bm25_answer
        candidates, scores = bm25_model_stage2_title.get_topk_stage2(bm25_answer, raw_answer=best_answer)
        titles = [df_wiki.title.values[i] for i in candidates]
        texts = [df_wiki.text.values[i] for i in candidates]
        ranking_preds = pairwise_model_stage2.stage2_ranking(question, best_answer, titles, texts)
        if ranking_preds.max() >= 0.1:
            final_answer = titles[ranking_preds.argmax()]
        else:
            candidates, scores = bm25_model_stage2_full.get_topk_stage2(bm25_question_answer)
            titles = [df_wiki.title.values[i] for i in candidates] + titles
            texts = [df_wiki.text.values[i] for i in candidates] + texts
            ranking_preds = np.concatenate(
                [pairwise_model_stage2.stage2_ranking(question, best_answer, titles, texts), ranking_preds])
        final_answer = "wiki/" + titles[ranking_preds.argmax()].replace(" ", "_")
    else:
        final_answer = bm25_answer.lower()
    return final_answer
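End-to-end usage is a single call (a minimal sketch: it assumes the data/ CSVs and models/ checkpoints referenced above are in place, and that the code runs from src/, since importing the module loads all retrievers and models at import time):

from models.predict_model import get_answer_e2e

# Hypothetical question; the pipeline returns either a "wiki/..." entity title,
# a normalised date/number string, or "Chịu" when no answer clears the QA threshold.
print(get_answer_e2e("Ai là tổng thống đầu tiên của Hoa Kỳ?"))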
src/models/qa_model.py
ADDED
@@ -0,0 +1,52 @@
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForQuestionAnswering, pipeline
from features.text_utils import post_process_answer
from features.graph_utils import find_best_cluster


class QAEnsembleModel(nn.Module):

    def __init__(self, model_name, model_checkpoints, entity_dict,
                 thr=0.1, device="cuda:0"):
        super(QAEnsembleModel, self).__init__()
        self.nlps = []
        for model_checkpoint in model_checkpoints:
            model = AutoModelForQuestionAnswering.from_pretrained(model_name).half()
            model.load_state_dict(torch.load(model_checkpoint), strict=False)
            nlp = pipeline('question-answering', model=model,
                           tokenizer=model_name, device=int(device.split(":")[-1]))
            self.nlps.append(nlp)
        self.entity_dict = entity_dict
        self.thr = thr

    def forward(self, question, texts, ranking_scores=None):
        if ranking_scores is None:
            ranking_scores = np.ones((len(texts),))

        curr_answers = []
        curr_scores = []
        best_score = 0
        for i, nlp in enumerate(self.nlps):
            for text, score in zip(texts, ranking_scores):
                QA_input = {
                    'question': question,
                    'context': text
                }
                res = nlp(QA_input)
                # print(res)
                if res["score"] > self.thr:
                    curr_answers.append(res["answer"])
                    curr_scores.append(res["score"])
                res["score"] = res["score"] * score
                if i == 0:
                    if res["score"] > best_score:
                        answer = res["answer"]
                        best_score = res["score"]
        if len(curr_answers) == 0:
            return None
        curr_answers = [post_process_answer(x, self.entity_dict) for x in curr_answers]
        answer = post_process_answer(answer, self.entity_dict)
        new_best_answer = post_process_answer(find_best_cluster(curr_answers, answer), self.entity_dict)
        return new_best_answer
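A sketch of exercising the extractive ensemble on its own (the checkpoint path is the one predict_model.py expects, the context is a toy string, a GPU is assumed, and an empty entity_dict is enough for a smoke test):

from models.qa_model import QAEnsembleModel

entity_dict = {}  # hypothetical: the real pipeline passes the mapping from entities.json
qa_model = QAEnsembleModel("nguyenvulebinh/vi-mrc-large",
                           ["../models/qa_model_robust.bin"], entity_dict)

question = "Thủ đô của Việt Nam là gì?"
contexts = ["Hà Nội là thủ đô của Việt Nam."]
# Best span after score-weighting and answer clustering, or None if no span clears thr.
print(qa_model(question, contexts))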
submission/answer.json
ADDED
The diff for this file is too large to render.
See raw diff
submission/submission.json
ADDED
The diff for this file is too large to render.
See raw diff
submission/test.py
ADDED
@@ -0,0 +1,10 @@
import json
data = json.load(open("answer.json", encoding="utf8"))
for i in data:
    if i['answer'] == "Khong biet":
        i['answer'] = None

obj = {
    "data": data
}
json.dump(obj, open("submission.json", "w", encoding="utf8"), ensure_ascii=False, indent=4)