fadliaulawi committed on
Commit ca71749
Parent: bd28dd7

Add validation minus API

Files changed (4)
  1. .gitignore +1 -1
  2. app.py +8 -7
  3. process.py +93 -22
  4. resources/experiment.ipynb +776 -103
.gitignore CHANGED
@@ -2,6 +2,6 @@ __pycache__
 .env
 .vscode
 env/
-resources/images/
+resources/testing/
 resources/papers/
 result/
app.py CHANGED
@@ -36,9 +36,9 @@ with col1:

 with col2:
     tokens = (
-        24000,
+        8000,
         16000,
-        8000
+        24000
     )
     chunk_option = st.selectbox(
         'Token amounts per process:', tokens, key='token'
@@ -136,21 +136,22 @@ if uploaded_files:
     dataframe.reset_index(drop=True, inplace=True)

     # Validate Result
-    cleaned_df, cleaned_llm_df = process.validate(dataframe)
+    df, df_no_llm, df_clean = process.validate(dataframe)

     end_time = datetime.now()
     st.write("Success in ", round((end_time.timestamp() - start_time.timestamp()) / 60, 2), "minutes")

-    st.dataframe(cleaned_df)
+    st.dataframe(df)
     with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
-        cleaned_df.to_excel(writer, sheet_name='Result')
-        cleaned_llm_df.to_excel(writer, sheet_name='Validate with LLM')
+        df.to_excel(writer, sheet_name='Result Cleaned API LLM')
+        df_no_llm.to_excel(writer, sheet_name='Result Cleaned API')
+        df_clean.to_excel(writer, sheet_name='Result Cleaned')
         dataframe.to_excel(writer, sheet_name='Original')
     writer.close()

     st.download_button(
         label="Save Result",
         data=buffer,
-        file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}.xlsx",
+        file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx",
         mime='application/vnd.ms-excel'
     )
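Note on the new contract: `process.validate` now returns three progressively cleaned frames, each written to its own sheet. A minimal sketch of that flow, with illustrative frame contents (not from the commit); since `pd.ExcelWriter` is used as a context manager, the retained explicit `writer.close()` is redundant.

```python
# Sketch, assuming pandas + xlsxwriter as in app.py. The sample row is
# hypothetical; only the sheet names and variable names come from this commit.
import io
import pandas as pd

df = pd.DataFrame({'Genes': ['GCK'], 'SNPs': ['rs1799884'], 'Diseases': ['MODY2']})
df_no_llm = df.copy()   # rule-based cleaning (plus API checks, once re-enabled)
df_clean = df.copy()    # rule-based cleaning only

buffer = io.BytesIO()
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Result Cleaned API LLM')
    df_no_llm.to_excel(writer, sheet_name='Result Cleaned API')
    df_clean.to_excel(writer, sheet_name='Result Cleaned')
# the context manager closes the writer; no explicit close() needed
```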
process.py CHANGED
@@ -17,6 +17,8 @@ import json
 import os
 import pandas as pd
 import re
+import requests
+import time
 import torch

 load_dotenv()
@@ -80,7 +82,7 @@ class Process():
         if types != 'summ':
             result = re.findall('(\{[^}]+\})', result)[0]
             return eval(result)
-
+
         return result

     def get_entity_one(self, chunks):
@@ -133,7 +135,7 @@ class Process():
         buffer = io.BytesIO()
         table.save(buffer, format='PNG')
         image = Image(buffer)
-
+
         # Extract to dataframe
         extracted_tables = image.extract_tables(ocr=ocr, implicit_rows=True, borderless_tables=True, min_confidence=0)
@@ -191,17 +193,17 @@ class Process():

         print('OCR table to extract', round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
         print(genes, snps, diseases)
-
+
         return genes, snps, diseases

     def validate(self, df):

         df = df.fillna('')
-        df['Genes'] = df['Genes'].str.upper()
+        df['Genes'] = df['Genes'].str.replace(' ', '').str.upper()
         df['SNPs'] = df['SNPs'].str.lower()

         # Check if there is two gene names
-        sym = ['-', '/', '|']
+        sym = [',', '-', '/', '|']
         for i in df.index:
             gene = df.loc[i, 'Genes']
             for s in sym:
@@ -209,12 +211,14 @@ class Process():
                     genes = gene.split(s)
                     df.loc[i + 0.5] = df.loc[i]
                     df = df.sort_index().reset_index(drop=True)
-                    df.loc[i, 'Genes'], df.loc[i + 1, 'Genes'] = genes[0], genes[1]
+                    df.loc[i, 'Genes'], df.loc[i + 1, 'Genes'] = genes[0], s.join(genes[1:])
+                    break

         # Check if there is SNPs without 'rs'
         for i in df.index:
             safe = True
             snp = df.loc[i, 'SNPs']
+            snp = snp.replace('l', '1')
             if re.fullmatch('rs(\d)+|', snp):
                 pass
             elif re.fullmatch('ts(\d)+', snp):
@@ -226,29 +230,96 @@ class Process():
             else:
                 safe = False
                 df = df.drop(i)
-
+
             if safe:
                 df.loc[i, 'SNPs'] = snp

         df.reset_index(drop=True, inplace=True)
+        df_clean = df.copy()

-        # Validate genes and diseases with LLM
-        json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
-        str_json_table = json.dumps(json.loads(json_table), indent=2)
-
-        result = self.llm_val.invoke(input=prompt_validation.format(str_json_table)).content
-        print('val')
-        print(result)
-
-        result = result[result.find('['):result.rfind(']')+1]
-        try:
-            result = eval(result)
-        except SyntaxError:
-            result = []
-
-        df_val = pd.DataFrame(result)
-        df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
-
-        # TODO: How to validate genes and SNPs with ground truth?
-
-        return df, df_val
+        # # Validate genes and SNPs with APIs
+        # def permutate(word):
+
+        #     if len(word) == 0:
+        #         return ['']
+
+        #     change = []
+        #     res = permutate(word[1:])
+
+        #     if word[0] in mistakes:
+        #         change = [mistakes[word[0]] + r for r in res]
+
+        #     return [word[0] + r for r in res] + change
+
+        # def call(url):
+
+        #     while True:
+        #         try:
+        #             res = requests.get(url)
+        #             time.sleep(1)
+        #             break
+        #         except Exception as e:
+        #             print(e)
+
+        #     return res
+
+        # mistakes = {'I': '1', 'O': '0'} # Common mistakes need to be maintained
+        # dbsnp = {}
+
+        # for i in df.index:
+        #     snp = df.loc[i, 'SNPs']
+        #     gene = df.loc[i, 'Genes']
+
+        #     if snp not in dbsnp:
+        #         res = call(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')
+        #         try:
+        #             res = res.json()
+        #             dbsnp[snp] = [r['gene']['geneName'] for r in res['genomicContexts']]
+        #         except:
+        #             dbsnp[snp] = []
+
+        #         res = call(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]
+        #         if 'error' not in res:
+        #             dbsnp[snp].extend([r['name'] for r in res['genes']])
+
+        #         dbsnp[snp] = list(set(dbsnp[snp]))
+
+        #     if gene not in dbsnp[snp]:
+        #         for other in permutate(gene):
+        #             if other in dbsnp[snp]:
+        #                 df.loc[i, 'Genes'] = other
+        #                 print(f'{gene} corrected to {other}')
+        #                 break
+        #         else:
+        #             df = df.drop(i)
+
+        # df.reset_index(drop=True, inplace=True)
+        df_no_llm = df.copy()
+
+        # Validate genes and diseases with LLM (for each 50 rows)
+        idx = 0
+        results = []
+
+        while True:
+            json_table = df[['Genes', 'SNPs', 'Diseases']][idx:idx+50].to_json(orient='records')
+            str_json_table = json.dumps(json.loads(json_table), indent=2)
+
+            result = self.llm_val.invoke(input=prompt_validation.format(str_json_table)).content
+            print('val', idx)
+            print(result)
+
+            result = result[result.find('['):result.rfind(']')+1]
+            try:
+                result = eval(result)
+            except SyntaxError:
+                result = []

+            results.extend(result)
+            idx += 50
+            if idx not in df.index:
+                break

+        df = pd.DataFrame(results)
+        df = df.merge(df_no_llm.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')

+        return df, df_no_llm, df_clean
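The headline change above is the batched LLM validation: instead of one prompt over the whole table, rows are validated 50 at a time, and the loop's stop condition (`idx not in df.index`) relies on the RangeIndex produced by the preceding `reset_index`. A standalone sketch of the same loop, with `llm_val` and `prompt_validation` as stand-ins for the class attributes, and `json.loads` substituted for `eval` as a safer decode:

```python
# Hedged sketch of the 50-row batched validation, not the committed code.
import json
import pandas as pd

def validate_in_batches(df: pd.DataFrame, llm_val, prompt_validation, batch=50):
    results = []
    for start in range(0, len(df), batch):          # explicit range instead of the
        chunk = df[['Genes', 'SNPs', 'Diseases']].iloc[start:start + batch]
        table = json.dumps(json.loads(chunk.to_json(orient='records')), indent=2)
        raw = llm_val.invoke(input=prompt_validation.format(table)).content
        raw = raw[raw.find('['):raw.rfind(']') + 1]  # keep only the JSON list
        try:
            results.extend(json.loads(raw))          # safer than eval()
        except json.JSONDecodeError:
            pass                                     # skip a malformed batch
    return pd.DataFrame(results)
```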
resources/experiment.ipynb CHANGED
@@ -2056,6 +2056,258 @@
  " print('{:<10} {:<10} Not Match'.format(gene, snp))\n"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 2,
@@ -2179,110 +2431,87 @@
 },
 {
 "cell_type": "code",
- "execution_count": 4,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
- "Here is the Python solution using the `json` module and a dictionary to map known gene names to their correct forms:\n",
 "```python\n",
 "import json\n",
 "\n",
- "# Known gene names and their corrections\n",
- "gene_corrections = {\n",
- "    \"SLC242\": \"SLC2A2\",\n",
- "    \"NEUROD1IBETA2\": \"NEUROD1\",\n",
- "    \"WFSI\": \"WFS1\",\n",
- "    \"GLI53\": \"GLIS3\",\n",
- "    \"FT0\": \"FTO\"\n",
 "}\n",
 "\n",
- "# Function to correct gene names and SNPs\n",
- "def correct_gene_data(data):\n",
- "    corrected_data = []\n",
- "    for entry in data:\n",
- "        genes = entry[\"Genes\"]\n",
- "        snps = entry[\"SNPs\"]\n",
- "        diseases = entry[\"Diseases\"]\n",
- "\n",
- "        # Correct gene names\n",
- "        if genes in gene_corrections:\n",
- "            genes = gene_corrections[genes]\n",
- "        elif \" and \" not in genes:\n",
- "            # Check for combined names\n",
- "            parts = genes.split()\n",
- "            if len(parts) > 1:\n",
- "                genes = \" and \".join(parts)\n",
- "\n",
- "        # Correct SNPs (assuming a dictionary of known SNPs for each gene)\n",
- "        snp_corrections = {\n",
- "            \"GCK\": {\"rs1799884\": \"rs1799884\"},\n",
- "            \"SLC2A2\": {\"rs5393\": \"rs5393\"},\n",
- "            \"NEUROD1\": {\"rs1801262\": \"rs1801262\"},\n",
- "            \"WFS1\": {\"rs6446482\": \"rs6446482\"},\n",
- "            \"GLIS3\": {\"rs7020673\": \"rs7020673\"},\n",
- "            \"FTO\": {\"rs9937290\": \"rs9937290\"}\n",
- "        }\n",
- "        if snps and genes in snp_corrections:\n",
- "            if snps not in snp_corrections[genes]:\n",
- "                snps = \"\"\n",
- "\n",
- "        # Correct diseases (assuming a dictionary of known diseases for each gene)\n",
- "        disease_corrections = {\n",
- "            \"GCK\": {\"GCK-MODY (MODY2), PNDM, CHI\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
- "            \"SLC2A2\": {\"FBS\": \"FBS\"},\n",
- "            \"NEUROD1\": {\"MODY6 and PNDM\": \"MODY6 and PNDM\"},\n",
- "            \"WFS1\": {\"WFS1, sometimes referred to as DIDMOAD\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
- "            \"GLIS3\": {\"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
- "            \"FTO\": {\"Obesity\": \"Obesity\"}\n",
- "        }\n",
- "        if diseases and genes in disease_corrections:\n",
- "            if diseases not in disease_corrections[genes]:\n",
- "                diseases = \"\"\n",
- "\n",
- "        # Add corrected entry to the list\n",
- "        if genes and snps and diseases:\n",
- "            corrected_data.append({\"Genes\": genes, \"SNPs\": snps, \"Diseases\": diseases})\n",
- "\n",
- "    return json.dumps(corrected_data)\n",
 "\n",
- "# Input data\n",
 "data = [\n",
 " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
 " {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
 " {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
 " {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
 " {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
- " {\"Genes\": \"FT0\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
 "]\n",
 "\n",
- "# Correct and output the data\n",
- "print(correct_gene_data(data))\n",
- "```\n",
- "This will output the corrected data in the same format as the input:\n",
- "```\n",
- "[\n",
- " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
- " {\"Genes\": \"SLC2A2\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
- " {\"Genes\": \"NEUROD1\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
- " {\"Genes\": \"WFS1\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
- " {\"Genes\": \"GLIS3\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
- " {\"Genes\": \"FTO\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
- "]\n",
 "```\n",
- "Note that this implementation assumes a dictionary of known gene names, SNPs, and diseases for correction. You may need to expand or modify these dictionaries based on your specific use case.\n"
 ]
 }
 ],
 "source": [
 "from langchain_openai import ChatOpenAI\n",
 "\n",
 "llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
 "\n",
 "prompt = \"\"\"\n",
- "# CONTEXT #\n",
 "In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided in a list of JSON format, with each JSON object representing a single row in a tabular structure. \n",
 "The problem is because the data is extracted using OCR, some gene names and SNPs may have a typo.\n",
 "\n",
@@ -2320,7 +2549,6 @@
 " },\n",
 "]\n",
 "\n",
- "# OBJECTIVE #\n",
 "Given the provided table data, the following tasks need to be completed:\n",
 "\n",
 "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
@@ -2329,7 +2557,6 @@
 "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
 "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
 "\n",
- "# RESPONSE #\n",
 "The output must be STRICTLY ONLY a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
 "[\n",
 " {{\n",
@@ -2348,54 +2575,500 @@
 "cell_type": "code",
 "execution_count": 2,
 "metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py:1429: FutureWarning: The repository for bigbio/euadr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bigbio/euadr\n",
- "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
- "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
- " warnings.warn(\n"
 ]
 },
 {
- "ename": "ConnectionError",
- "evalue": "Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mConnectionError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m<ipython-input-2-8057498175ab>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mdatasets\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"bigbio/euadr\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py\u001b[0m in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[0;32m 2547\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2548\u001b[0m \u001b[1;31m# Download and prepare data\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2549\u001b[1;33m builder_instance.download_and_prepare(\n\u001b[0m\u001b[0;32m 2550\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2551\u001b[0m \u001b[0mdownload_mode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36mdownload_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m 1003\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mnum_proc\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1004\u001b[0m \u001b[0mprepare_split_kwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"num_proc\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnum_proc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1005\u001b[1;33m self._download_and_prepare(\n\u001b[0m\u001b[0;32m 1006\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1007\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m 1765\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1766\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_download_and_prepare\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mprepare_splits_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1767\u001b[1;33m super()._download_and_prepare(\n\u001b[0m\u001b[0;32m 1768\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1769\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m 1076\u001b[0m \u001b[0msplit_dict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSplitDict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataset_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1077\u001b[0m \u001b[0msplit_generators_kwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_split_generators_kwargs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprepare_split_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1078\u001b[1;33m \u001b[0msplit_generators\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_split_generators\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0msplit_generators_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1079\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1080\u001b[0m \u001b[1;31m# Checksums verification\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32m~\\.cache\\huggingface\\modules\\datasets_modules\\datasets\\bigbio--euadr\\38388d88a335f2d91807b0f813bdfd809fec0e9dcbc32e2d9bfea7275d70f75c\\euadr.py\u001b[0m in \u001b[0;36m_split_generators\u001b[1;34m(self, dl_manager)\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_split_generators\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 106\u001b[0m \u001b[0murls\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_URL\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 107\u001b[1;33m \u001b[0mdatapath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload_and_extract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 108\u001b[0m return [\n\u001b[0;32m 109\u001b[0m datasets.SplitGenerator(\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36mdownload_and_extract\u001b[1;34m(self, url_or_urls)\u001b[0m\n\u001b[0;32m 560\u001b[0m \u001b[0mextracted_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mextracted\u001b[0m \u001b[0mpaths\u001b[0m \u001b[0mof\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mURL\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 561\u001b[0m \"\"\"\n\u001b[1;32m--> 562\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_urls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 563\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 564\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mget_recorded_sizes_checksums\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36mdownload\u001b[1;34m(self, url_or_urls)\u001b[0m\n\u001b[0;32m 424\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 425\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 426\u001b[1;33m downloaded_path_or_paths = map_nested(\n\u001b[0m\u001b[0;32m 427\u001b[0m \u001b[0mdownload_func\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 428\u001b[0m \u001b[0murl_or_urls\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\py_utils.py\u001b[0m in \u001b[0;36mmap_nested\u001b[1;34m(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, types, disable_tqdm, desc)\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;31m# Singleton\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 459\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 460\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 461\u001b[0m \u001b[0miterable\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32melse\u001b[0m \u001b[0mdata_struct\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36m_download\u001b[1;34m(self, url_or_filename, download_config)\u001b[0m\n\u001b[0;32m 449\u001b[0m \u001b[1;31m# append the relative path to the base_path\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 450\u001b[0m \u001b[0murl_or_filename\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0murl_or_path_join\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_base_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 451\u001b[1;33m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcached_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 452\u001b[0m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtracked_str\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 453\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_origin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\file_utils.py\u001b[0m in \u001b[0;36mcached_path\u001b[1;34m(url_or_filename, download_config, **download_kwargs)\u001b[0m\n\u001b[0;32m 186\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mis_remote_url\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[1;31m# URL, so get it from the cache (downloading if necessary)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 188\u001b[1;33m output_path = get_from_cache(\n\u001b[0m\u001b[0;32m 189\u001b[0m \u001b[0murl_or_filename\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[0mcache_dir\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\file_utils.py\u001b[0m in \u001b[0;36mget_from_cache\u001b[1;34m(url, cache_dir, force_download, proxies, etag_timeout, resume_download, user_agent, local_files_only, use_etag, max_retries, token, use_auth_token, ignore_url_params, storage_options, download_desc)\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[0m_raise_if_offline_mode_is_enabled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Tried to reach {url}\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 572\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhead_error\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 573\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Couldn't reach {url} ({repr(head_error)})\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 574\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 575\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Couldn't reach {url} (error {response.status_code})\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;31mConnectionError\u001b[0m: Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))"
 ]
 }
 ],
 "source": [
- "from datasets import load_dataset\n",
 "\n",
- "dataset = load_dataset(\"bigbio/euadr\")"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
- "dataset"
 ]
 }
 ],
 "metadata": {
 
  " print('{:<10} {:<10} Not Match'.format(gene, snp))\n"
  ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_excel('../result/monogenic diabetes_8000.xlsx', sheet_name=\"Result\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['STARDIO', 'STARDI0', 'STARD1O', 'STARD10']"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mistakes = {'I': '1', 'O': '0'}\n",
+ "\n",
+ "def permutate(word):\n",
+ "\n",
+ "    if len(word) == 0:\n",
+ "        return ['']\n",
+ "\n",
+ "    change = []\n",
+ "    res = permutate(word[1:])\n",
+ "\n",
+ "    if word[0] in mistakes:\n",
+ "        change = [mistakes[word[0]] + r for r in res]\n",
+ "\n",
+ "    return [word[0] + r for r in res] + change\n",
+ "\n",
+ "permutate('STARDIO')"
+ ]
+ },
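The recursive `permutate` above enumerates every way to undo the OCR confusions listed in `mistakes`, so a word with k ambiguous characters yields 2**k candidate spellings. An equivalent non-recursive sketch using itertools, producing the same candidates in the same order:

```python
# Iterative equivalent of permutate() via a cartesian product over per-character
# options: each character contributes itself, plus its correction if it is a
# known OCR confusion.
from itertools import product

mistakes = {'I': '1', 'O': '0'}

def permutate_iter(word):
    options = [(ch,) if ch not in mistakes else (ch, mistakes[ch]) for ch in word]
    return [''.join(p) for p in product(*options)]

print(permutate_iter('STARDIO'))
# ['STARDIO', 'STARDI0', 'STARD1O', 'STARD10']
```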
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GCK rs1799884 Match\n",
+ "GCK rs4607517 Match\n",
+ "SLC2A2 rs5393 Match\n",
+ "SLC2A2 rs5394 Match\n",
+ "SLC2A2 rs5400 Match\n",
+ "SLC2A2 rs5404 Match\n",
+ "HNF4A rs2144908 Match\n",
+ "HNF4A rs3818247 Match\n",
+ "HNF4A rs884614 Not Match\n",
+ "HNF4A rs4810424 Not Match\n",
+ "HNF4A rs1884613 Not Match\n",
+ "HNF1B rs757210 Match\n",
+ "TCF2 rs757210 Not Match\n",
+ "HNF1B rs4430796 Match\n",
+ "TCF2 rs4430796 Not Match\n",
+ "HNF1B rs7501939 Match\n",
+ "TCF2 rs7501939 Not Match\n",
+ "PAX4 rs10229583 Match\n",
+ "NEUROD1 rs1801262 Match\n",
+ "BETA2 rs1801262 Not Match\n",
+ "WFS1 rs10010131 Match\n",
+ "WFS1 rs6446482 Match\n",
+ "WFS1 rs734312 Match\n",
+ "PPARG rs1801282 Match\n",
+ "PPARG rs4684847 Match\n",
+ "GLIS3 rs7020673 Match\n",
+ "GLIS3 rs7034200 Match\n",
+ "GLIS3 rs7041847 Match\n",
+ "HNF1A rs1801262 Not Match\n",
+ "INS rs1801282 Not Match\n",
+ "PPARG rs780094 Not Match\n"
+ ]
+ }
+ ],
+ "source": [
+ "import requests\n",
+ "\n",
+ "dbsnp = {}\n",
+ "\n",
+ "for i in df.index:\n",
+ "    snp = df.loc[i, 'SNPs']\n",
+ "    gene = df.loc[i, 'Genes']\n",
+ "\n",
+ "    if snp not in dbsnp:\n",
+ "        res = requests.get(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')\n",
+ "\n",
+ "        try:\n",
+ "            res = res.json()\n",
+ "            dbsnp[snp] = [r['gene']['geneName'] for r in res['genomicContexts']]\n",
+ "        except:\n",
+ "            dbsnp[snp] = []\n",
+ "\n",
+ "        res = requests.get(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]\n",
+ "        if 'error' not in res:\n",
+ "            dbsnp[snp].extend([r['name'] for r in res['genes']])\n",
+ "\n",
+ "        dbsnp[snp] = list(set(dbsnp[snp]))\n",
+ "\n",
+ "    if gene in dbsnp[snp]:\n",
+ "        print('{:<10} {:<10} Match'.format(gene, snp))\n",
+ "    else:\n",
+ "        for other in permutate(gene):\n",
+ "            if other in dbsnp[snp]:\n",
+ "                print('{:<10} {:<10} Match (corrected)'.format(other, snp))\n",
+ "                break\n",
+ "        else:\n",
+ "            print('{:<10} {:<10} Not Match'.format(gene, snp))\n"
+ ]
+ },
+ {
2184
+ "cell_type": "code",
2185
+ "execution_count": 2,
2186
+ "metadata": {},
2187
+ "outputs": [
2188
+ {
2189
+ "name": "stdout",
2190
+ "output_type": "stream",
2191
+ "text": [
2192
+ "0\n",
2193
+ "1\n",
2194
+ "2\n",
2195
+ "3\n",
2196
+ "4\n",
2197
+ "5\n",
2198
+ "6\n",
2199
+ "7\n",
2200
+ "8\n",
2201
+ "9\n",
2202
+ "10\n",
2203
+ "11\n",
2204
+ "12\n",
2205
+ "13\n",
2206
+ "14\n",
2207
+ "15\n",
2208
+ "16\n",
2209
+ "17\n",
2210
+ "18\n",
2211
+ "19\n",
2212
+ "20\n",
2213
+ "21\n",
2214
+ "22\n",
2215
+ "23\n",
2216
+ "24\n",
2217
+ "25\n",
2218
+ "26\n",
2219
+ "27\n",
2220
+ "28\n",
2221
+ "29\n",
2222
+ "30\n",
2223
+ "31\n",
2224
+ "32\n",
2225
+ "33\n",
2226
+ "34\n",
2227
+ "35\n",
2228
+ "36\n",
2229
+ "37\n",
2230
+ "38\n",
2231
+ "39\n",
2232
+ "40\n",
2233
+ "41\n",
2234
+ "42\n",
2235
+ "43\n",
2236
+ "44\n",
2237
+ "45\n",
2238
+ "46\n",
2239
+ "47\n",
2240
+ "48\n",
2241
+ "49\n",
2242
+ "50\n",
2243
+ "51\n",
2244
+ "52\n",
2245
+ "53\n",
2246
+ "54\n",
2247
+ "55\n",
2248
+ "56\n",
2249
+ "57\n",
2250
+ "58\n",
2251
+ "59\n",
2252
+ "60\n",
2253
+ "61\n",
2254
+ "62\n",
2255
+ "63\n",
2256
+ "64\n",
2257
+ "65\n",
2258
+ "66\n",
2259
+ "67\n",
2260
+ "68\n",
2261
+ "69\n",
2262
+ "70\n",
2263
+ "71\n",
2264
+ "72\n",
2265
+ "73\n",
2266
+ "74\n",
2267
+ "75\n",
2268
+ "76\n",
2269
+ "77\n",
2270
+ "78\n",
2271
+ "79\n",
2272
+ "80\n",
2273
+ "81\n",
2274
+ "82\n",
2275
+ "83\n",
2276
+ "84\n",
2277
+ "85\n",
2278
+ "86\n",
2279
+ "87\n",
2280
+ "88\n",
2281
+ "89\n",
2282
+ "90\n",
2283
+ "91\n",
2284
+ "92\n",
2285
+ "93\n",
2286
+ "94\n",
2287
+ "95\n",
2288
+ "96\n",
2289
+ "97\n",
2290
+ "98\n",
2291
+ "99\n"
2292
+ ]
2293
+ }
2294
+ ],
2295
+ "source": [
2296
+ "import requests\n",
2297
+ "import time\n",
2298
+ "\n",
2299
+ "snp = 'rs972283'\n",
2300
+ "for i in range(100):\n",
2301
+ " print(i)\n",
2302
+ " while True:\n",
2303
+ " try:\n",
2304
+ " res = requests.get(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')\n",
2305
+ " break\n",
2306
+ " except Exception as e:\n",
2307
+ " print('sleep')\n",
2308
+ " time.sleep(1)\n"
2309
+ ]
2310
+ },
2311
  {
2312
  "cell_type": "code",
2313
  "execution_count": 2,
 
2431
  },
2432
  {
2433
  "cell_type": "code",
2434
+ "execution_count": 3,
2435
  "metadata": {},
2436
  "outputs": [
2437
  {
2438
  "name": "stdout",
2439
  "output_type": "stream",
2440
  "text": [
2441
+ "To accomplish this task, we'll need a reference dataset of correct gene names, SNPs, and diseases. Let's assume we have a dictionary `gene_ref` that maps gene names to their corresponding SNPs and diseases.\n",
2442
+ "\n",
2443
+ "Here's a Python script that should accomplish the tasks:\n",
2444
  "```python\n",
2445
  "import json\n",
2446
  "\n",
2447
+ "# Reference dataset (example)\n",
2448
+ "gene_ref = {\n",
2449
+ " \"GCK\": {\"SNPs\": [\"rs1799884\"], \"Diseases\": [\"GCK-MODY (MODY2)\", \"PNDM\", \"CHI\"]},\n",
2450
+ " \"SLC2A2\": {\"SNPs\": [\"rs5393\"], \"Diseases\": [\"FBS\"]},\n",
2451
+ " \"NEUROD1\": {\"SNPs\": [\"rs1801262\"], \"Diseases\": [\"MODY6\", \"PNDM\"]},\n",
2452
+ " \"WFS1\": {\"SNPs\": [\"rs6446482\"], \"Diseases\": [\"WFS1\", \"DIDMOAD\"]},\n",
2453
+ " \"GLIS3\": {\"SNPs\": [\"rs7020673\"], \"Diseases\": [\"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"]},\n",
2454
+ " \"FTO\": {\"SNPs\": [\"rs9937290\"], \"Diseases\": [\"Obesity\"]},\n",
2455
+ " # Add more gene references as needed\n",
2456
  "}\n",
2457
  "\n",
2458
+ "def correct_gene_name(gene_name):\n",
2459
+ " # Check for combined names\n",
2460
+ " for ref_gene in gene_ref:\n",
2461
+ " if ref_gene in gene_name:\n",
2462
+ " return [ref_gene]\n",
2463
+ " # Check for OCR errors\n",
2464
+ " for ref_gene in gene_ref:\n",
2465
+ " if len(set(gene_name) & set(ref_gene)) > len(ref_gene) / 2:\n",
2466
+ " return [ref_gene]\n",
2467
+ " return []\n",
2468
+ "\n",
2469
+ "def validate_data(data):\n",
2470
+ " validated_data = []\n",
2471
+ " for row in data:\n",
2472
+ " gene_name = row[\"Genes\"]\n",
2473
+ " corrected_genes = correct_gene_name(gene_name)\n",
2474
+ " if not corrected_genes:\n",
2475
+ " continue # Remove row if gene name is invalid\n",
2476
+ " for corrected_gene in corrected_genes:\n",
2477
+ " new_row = row.copy()\n",
2478
+ " new_row[\"Genes\"] = corrected_gene\n",
2479
+ " # Check and correct SNP\n",
2480
+ " if row[\"SNPs\"]:\n",
2481
+ " if row[\"SNPs\"] not in gene_ref[corrected_gene][\"SNPs\"]:\n",
2482
+ " new_row[\"SNPs\"] = gene_ref[corrected_gene][\"SNPs\"][0]\n",
2483
+ " # Check and correct diseases\n",
2484
+ " if row[\"Diseases\"]:\n",
2485
+ " diseases = [disease.strip() for disease in row[\"Diseases\"].split(\",\")]\n",
2486
+ " if not all(disease in gene_ref[corrected_gene][\"Diseases\"] for disease in diseases):\n",
2487
+ " new_row[\"Diseases\"] = \", \".join(gene_ref[corrected_gene][\"Diseases\"])\n",
2488
+ " validated_data.append(new_row)\n",
2489
+ " return json.dumps(validated_data)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2490
  "\n",
 
2491
  "data = [\n",
2492
  " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
2493
  " {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
2494
  " {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
2495
  " {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
2496
  " {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
2497
+ " {\"Genes\": \"FT0\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"},\n",
2498
  "]\n",
2499
  "\n",
2500
+ "print(validate_data(data))\n",
 
 
 
 
 
 
 
 
 
 
 
 
2501
  "```\n",
2502
+ "This script will output a string containing a list of JSON objects with corrected gene names, SNPs, and diseases.\n",
2503
+ "\n",
2504
+ "Note that this implementation assumes a simple reference dataset and may not cover all possible OCR errors or combined gene names. You may need to expand the `gene_ref` dictionary and the `correct_gene_name` function to handle more complex cases.\n"
2505
  ]
2506
  }
2507
  ],
2508
  "source": [
2509
  "from langchain_openai import ChatOpenAI\n",
2510
+ "import os\n",
2511
  "\n",
2512
  "llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
2513
  "\n",
2514
  "prompt = \"\"\"\n",
 
2515
  "In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided in a list of JSON format, with each JSON object representing a single row in a tabular structure. \n",
2516
  "The problem is because the data is extracted using OCR, some gene names and SNPs may have a typo.\n",
2517
  "\n",
 
2549
  " },\n",
2550
  "]\n",
2551
  "\n",
 
2552
  "Given the provided table data, the following tasks need to be completed:\n",
2553
  "\n",
2554
  "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
 
2557
  "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
2558
  "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
2559
  "\n",
 
2560
  "The output must be STRICTLY ONLY a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
2561
  "[\n",
2562
  " {{\n",
 
2575
  "cell_type": "code",
2576
  "execution_count": 2,
2577
  "metadata": {},
2578
+ "outputs": [],
2579
+ "source": [
2580
+ "import nest_asyncio\n",
2581
+ "\n",
2582
+ "nest_asyncio.apply()\n",
2583
+ "\n",
2584
+ "from llama_parse import LlamaParse"
2585
+ ]
2586
+ },
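Why `nest_asyncio` here: Jupyter already runs an asyncio event loop, and llama_parse's synchronous helpers drive async code internally, which would otherwise raise "this event loop is already running". A small illustration of the pattern (the `demo` coroutine is hypothetical):

```python
# nest_asyncio patches the running loop so nested asyncio.run() calls work,
# which is what lets llama_parse's sync-style API run inside a notebook.
import asyncio
import nest_asyncio

nest_asyncio.apply()

async def demo():
    return 'ok'

print(asyncio.run(demo()))  # safe inside a notebook after apply()
```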
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Started parsing the file under job_id cb5d7891-1366-47b7-98e2-d6cfbd5d3b87\n",
+ ".."
+ ]
+ }
+ ],
+ "source": [
+ "from dotenv import load_dotenv\n",
+ "\n",
+ "load_dotenv()\n",
+ "\n",
+ "parser = LlamaParse(\n",
+ "    # api_key=os.environ['LLAMA_'],  # can also be set in your env as LLAMA_CLOUD_API_KEY\n",
+ "    result_type=\"markdown\",  # \"markdown\" and \"text\" are available\n",
+ "    num_workers=4,  # if multiple files passed, split in `num_workers` API calls\n",
+ "    verbose=True,\n",
+ "    language=\"en\",  # Optionally you can define a language, default=en\n",
+ ")\n",
+ "\n",
+ "# sync\n",
+ "objs = parser.get_json_result(\"papers/ukmss-34421.pdf\")"
+ ]
+ },
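As the next cell shows, `get_json_result` returns one dict per input file, each holding a `pages` list with per-page raw `text`. A small helper sketch over that shape, useful for finding candidate table pages before picking one by hand (field names assumed from this run):

```python
# Scan the parsed pages for ones that likely contain SNP tables, using the
# payload shape observed here: objs[0]['pages'][i]['text'].
import re

def pages_with_rsids(objs):
    return [i + 1 for i, page in enumerate(objs[0]['pages'])
            if re.search(r'rs\d{4,}', page['text'])]
```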
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Table 2 Voight et al. Page 22',\n",
+ " 'Expression QTL results for T2D-associated variants in blood and adipose tissue',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " 'e',\n",
+ " 'SNP with strongest correlation with trait',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " 'SNPChr.PositionNearbyRiskGene (transcript)TissueP valueP value',\n",
+ " 'cd2fg',\n",
+ " 'Effect (s.e.m.)P SNP (r)P',\n",
+ " 'B36 (bp) Europe PMC Funders Author Manuscriptsadj adj',\n",
+ " 'ab',\n",
+ " 'geneallele',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " 'Novel loci reported in this study',\n",
+ " 'rs4457053576,460,705ZBED3GPDE8B (NM_003719)Adipose0.302 (0.070)−50.80rs6864250−17−13',\n",
+ " '2.8 × 103.1 × 105.8 × 10',\n",
+ " '(0.18)',\n",
+ " 'ZBED3 (NM_032367)Adipose0.429 (0.068)−90.011rs4704389−16−9',\n",
+ " '1.0 × 103.9 × 106.0 × 10',\n",
+ " '(0.20)',\n",
+ " '−11−12',\n",
+ " 'rs9722837130,117,394KLF14GKLF14 (NM_138693)Adipose−0.387 (0.058)0.058rs7381340.0014',\n",
+ " '8.1 × 102.2 × 10',\n",
+ " '(0.30)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−5−7',\n",
+ " 'rs896854896,029,687TP53INP1TCCNE2 (NM_057749)Blood−0.225 (0.053)0.78rs47353390.0051',\n",
+ " '3.8 × 105.8 × 10',\n",
+ " '(0.61)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−7−24−19',\n",
+ " 'rs15522241172,110,746CENTD2ASTARD10 (NM_006645)Blood0.337 (0.066)0.026rs519790',\n",
+ " '8.6 × 102.7 × 101.6 × 10',\n",
+ " '(0.04)',\n",
+ " 'rs795719712119,945,069HNF1ATACADS (NM_000017)Adipose−0.248 (0.067)−40.29rs9204−53−50',\n",
+ " '3.7 × 101.3 × 105.9 × 10',\n",
+ " '(0.02)',\n",
+ " 'PSMD9 (NM_002813)Blood0.240 (0.065)−40.0088rs3741593−8−6',\n",
+ " '3.9 × 108.3 × 101.7 × 10',\n",
+ " '(0.00)',\n",
+ " '−6−7',\n",
+ " 'OASL (NM_003733)Adipose0.318 (0.068)0.13rs22598830.0018',\n",
+ " '6.4 × 101.1 × 10',\n",
+ " '(0.19)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−6−22−16',\n",
+ " 'OASL (NM_003733)Blood0.319 (0.064)0.37rs4556628',\n",
+ " '1.3 × 104.4 × 101.4 × 10',\n",
+ " '(0.21)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−4−39−35',\n",
+ " 'COQ5 (NM_032314)Blood0.248 (0.065)0.92rs10774561',\n",
+ " '2.1 × 108.7 × 104.9 × 10',\n",
+ " '(0.02)',\n",
+ " 'UNC119B (NM_032661)Blood−0.254 (0.064)−40.048rs11065202−12−9',\n",
+ " '1.4 × 107.8 × 102.3 × 10',\n",
+ " '(0.09)',\n",
+ " 'CAMKK2 (NM_172215)Adipose−0.497 (0.068)−120.18rs11065504−117−98',\n",
+ " '1.2 × 102.7 × 103.8 × 10',\n",
+ " '(0.08)',\n",
+ " '−8−105−94',\n",
+ " 'CAMKK2 (NM_172215)Blood−0.360 (0.063)0.68rs11065504',\n",
+ " '3.4 × 107.0 × 105.7 × 10',\n",
+ " '(0.08)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−6−6−17−17',\n",
+ " 'P2RX4 (NM_175568)Blood0.312 (0.065)rs25644',\n",
+ " '3.4 × 102.0 × 103.4 × 101.9 × 10',\n",
+ " '(0.03)',\n",
+ " '−10−21−12Europe PMC Funders Author Manuscripts',\n",
+ " 'rs80426801589,322,341PRC1AVPS33B (NM_018668)Blood−0.371 (0.057)0.50rs12595616',\n",
+ " '2.9 × 102.3 × 104.5 × 10',\n",
+ " '(0.57)',\n",
+ " 'Previously reported loci',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−5−5',\n",
+ " 'rs75783262226,728,897IRS1AIRS1 (Contig50189_RC)Adipose−0.251 (0.059)0.89rs29436530.69',\n",
+ " '3.7 × 103.4 × 10',\n",
+ " '(0.93)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−8−10',\n",
+ " 'IRS1 (NM_005544)Adipose−0.331 (0.059)0.58rs21760400.0042',\n",
+ " '5.7 × 107.8 × 10',\n",
+ " '(0.74)',\n",
+ " 'rs13081389312,264,800PPARGAIQSEC1 (NM_014869)Adipose−0.630 (0.131)−6−4rs9211−96−94',\n",
+ " '2.9 × 101.4 × 101.1 × 107.4 × 10',\n",
+ " '(0.01)',\n",
+ " 'rs6795735364,680,405ADAMTS9CBC040632 (AK022320)Adipose−0.229 (0.056)−50.28rs4521216−13−10',\n",
+ " '7.6 × 103.0 × 108.7 × 10',\n",
+ " '(0.02)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " ' Nat Genet. Author manuscript; available in PMC 2011 April 21.']"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "page = 22\n",
+ "text_parts = objs[0]['pages'][page - 1]['text'].split('\\n')\n",
+ "text_parts"
+ ]
+ },
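The flattened table text above mixes gene symbols, rsIDs, and statistics into single strings, but rsIDs keep a recognizable shape. A quick regex pass can recover them before any LLM step (a sketch over `text_parts` from the cell above):

```python
# Pull unique rsIDs out of the noisy per-line table text; useful as a sanity
# check against whatever the LLM extraction below returns.
import re

rs_ids = sorted({m.group(0) for part in text_parts
                 for m in re.finditer(r'rs\d+', part)})
print(rs_ids[:5])
```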
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Gene Names and Diseases': [{'Gene Name': 'ZBED3',\n",
+ "   'SNP': 'rs6864250',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'ZBED3',\n",
+ "   'SNP': 'rs4704389',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'KLF14',\n",
+ "   'SNP': 'rs972283',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'TP53INP1',\n",
+ "   'SNP': 'rs896854',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'CENTD2',\n",
+ "   'SNP': 'rs1552224',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'HNF1A',\n",
+ "   'SNP': 'rs7957197',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'PSMD9',\n",
+ "   'SNP': 'rs3741593',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'OASL',\n",
+ "   'SNP': 'rs2259883',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'OASL',\n",
+ "   'SNP': 'rs4556628',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'COQ5',\n",
+ "   'SNP': 'rs10774561',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'UNC119B',\n",
+ "   'SNP': 'rs11065202',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'CAMKK2',\n",
+ "   'SNP': 'rs11065504',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'P2RX4',\n",
+ "   'SNP': 'rs25644',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'PRC1',\n",
+ "   'SNP': 'rs8042680',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'IRS1',\n",
+ "   'SNP': 'rs7578326',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'IRS1',\n",
+ "   'SNP': 'rs2943653',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'IRS1',\n",
+ "   'SNP': 'rs2176040',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'PPARG',\n",
+ "   'SNP': 'rs13081389',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'ADAMTS9',\n",
+ "   'SNP': 'rs6795735',\n",
+ "   'Diseases': 'T2D-associated variants'}]}"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from openai import OpenAI\n",
+ "\n",
+ "client = OpenAI()\n",
+ "response = client.chat.completions.create(\n",
+ "    model=\"gpt-4-0125-preview\",\n",
+ "    response_format={\"type\": \"json_object\"},\n",
+ "    messages=[\n",
+ "        {\"role\": \"system\", \"content\": \"You are a helpful assistant designed to output JSON.\"},\n",
+ "        {\"role\": \"user\", \"content\": f\"Given a text like this: {text_parts}, automatically extract, return multiple Gene Names, potential diseases and their corresponding SNPs in the format like this: {{\\\"Gene Name\\\": \\\"FTO\\\", \\\"SNP\\\": \\\"rs9939609\\\", \\\"Diseases\\\": \\\"Obesity\\\"}}, from table format at text (this is just an example, don't return this)\"}\n",
+ "    ]\n",
+ ")\n",
+ "res = response.choices[0].message.content\n",
+ "eval(res)"
+ ]
+ },
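Because `response_format={"type": "json_object"}` forces the model to emit syntactically valid JSON, `json.loads` is a safer decode here than `eval`, which would execute arbitrary Python expressions. A sketch, reusing `res` from the cell above (the top-level key is whatever the model chose in this run):

```python
# Decode the JSON-mode response without eval().
import json

parsed = json.loads(res)
rows = parsed['Gene Names and Diseases']  # key as returned in this particular run
print(rows[0])
```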
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
+ "\n",
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
+ "\n"
+ ]
+ },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
+ "\n"
 ]
 },
 {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
+ "\n"
 ]
 }
 ],
 "source": [
+ "import sys\n",
+ "sys.path.append('..')\n",
 "\n",
+ "import os\n",
+ "import torch\n",
+ "from pdf2image import convert_from_path\n",
+ "from table_detector import detection_transform, device, model, ocr, outputs_to_objects\n",
+ "import io\n",
+ "from img2table.document import Image"
 ]
 },
2925
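Most of the captured output above is TensorFlow deprecation noise emitted while table_detector loads its model. If the notebook is re-run, the warnings can be silenced before that import; a sketch, assuming TensorFlow is installed in the environment:

    import os
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # hide C++-side INFO/WARNING logs

    import tensorflow as tf
    tf.get_logger().setLevel('ERROR')  # hide Python-side deprecation warnings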
  {
  "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1-s2.0-S0002916523016155-main.pdf\n",
+ "4\n",
+ "1329.pdf\n",
+ "8\n",
+ "41467_2020_Article_15421.pdf\n",
+ "11\n",
+ "berndt2013.pdf\n",
+ "14\n",
+ "BMD.pdf\n",
+ "17\n",
+ "clock and eat timing.pdf\n",
+ "23\n",
+ "COMT breast cancer metaanalysis chinese.pdf\n",
+ "26\n",
+ "dubois2010.pdf\n",
+ "30\n",
+ "EMMM-8-688.pdf\n",
+ "40\n",
+ "EMS120610.pdf\n",
+ "45\n",
+ "file.pdf\n",
+ "51\n",
+ "journal.pbio.3001547.pdf\n",
+ "54\n",
+ "lipid.pdf\n",
+ "60\n",
+ "monogenic diabetes.pdf\n",
+ "62\n",
+ "nihms-1651539.pdf\n",
+ "62\n",
+ "nihms-1792335.pdf\n",
+ "73\n",
+ "nihms-668049.pdf\n",
+ "87\n",
+ "nihms364577.pdf\n",
+ "90\n",
+ "nihms510594.pdf\n",
+ "110\n",
+ "pgen.1009952.pdf\n",
+ "116\n",
+ "PIIS0091674919313661.pdf\n",
+ "121\n",
+ "s12881-019-0830-y.pdf\n",
+ "128\n",
+ "s41576-021-00414-z (1).pdf\n",
+ "132\n",
+ "s41588-018-0047-6.pdf\n",
+ "137\n",
+ "s41588-022-01024-z (1).pdf\n",
+ "150\n",
+ "stroke genetic AHA.pdf\n",
+ "154\n",
+ "surendran2016.pdf\n",
+ "158\n",
+ "teslovich2010.pdf\n",
+ "161\n",
+ "testing\n",
+ "ukmss-34421.pdf\n",
+ "167\n",
+ "wightman2021.pdf\n",
+ "173\n"
+ ]
+ }
+ ],
+ "source": [
+ "tables = []\n",
+ "\n",
+ "for path in os.listdir('papers/'):\n",
+ "    print(path)\n",
+ "\n",
+ "    if path[-3:] != 'pdf':\n",
+ "        continue\n",
+ "\n",
+ "    images = convert_from_path('papers/' + path)\n",
+ "\n",
+ "    # Loop pages\n",
+ "    for image in images:\n",
+ "\n",
+ "        pixel_values = detection_transform(image).unsqueeze(0).to(device)\n",
+ "        with torch.no_grad():\n",
+ "            outputs = model(pixel_values)\n",
+ "\n",
+ "        id2label = model.config.id2label\n",
+ "        id2label[len(model.config.id2label)] = \"no object\"\n",
+ "        detected_tables = outputs_to_objects(outputs, image.size, id2label)\n",
+ "\n",
+ "        # Loop table in page (if any)\n",
+ "        for idx in range(len(detected_tables)):\n",
+ "            cropped_table = image.crop(detected_tables[idx][\"bbox\"])\n",
+ "            if detected_tables[idx][\"label\"] == 'table rotated':\n",
+ "                cropped_table = cropped_table.rotate(270, expand=True)\n",
+ "\n",
+ "            # TODO: what is the perfect threshold?\n",
+ "            if detected_tables[idx]['score'] > 0.9:\n",
+ "                # print(detected_tables[idx])\n",
+ "                tables.append(cropped_table)\n",
+ "\n",
+ "    print(len(tables))\n"
+ ]
+ },
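The TODO about the threshold in the cell above is a fair question: the detector's scores are not calibrated probabilities, so 0.9 is simply a conservative cutoff. One way to tune it, sketched here with hypothetical helper names rather than anything in the repository, is to keep near-threshold detections for manual review instead of discarding them:

    # Hypothetical helper: split detections into confident and borderline groups
    # so different cutoffs can be compared without re-running the model.
    def split_by_score(detections, threshold=0.9, margin=0.1):
        kept = [d for d in detections if d['score'] > threshold]
        borderline = [d for d in detections if threshold - margin < d['score'] <= threshold]
        return kept, borderline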
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tables[0].save(\n",
+ "    'table.pdf', \"PDF\", resolution=100.0, save_all=True, append_images=tables[1:]\n",
+ ")"
+ ]
+ },
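The cell above relies on Pillow's multi-page PDF writer (save_all=True plus append_images), and tables[0] raises an IndexError when nothing was detected. A small guard, as a sketch:

    if tables:
        tables[0].save(
            'table.pdf', "PDF", resolution=100.0, save_all=True, append_images=tables[1:]
        )
    else:
        print('no tables detected, nothing to save')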
+ {
+ {
+ "cell_type": "code",
+ "execution_count": 27,
  "metadata": {},
  "outputs": [],
  "source": [
+ "import fitz\n",
+ "\n",
+ "# Open the PDF file\n",
+ "doc = fitz.open(\"table.pdf\")\n",
+ "\n",
+ "# Keep only the pages with usable tables (listed 1-based, shifted to 0-based below)\n",
+ "pages = [2,3,4,7,8,10,12,13,16,17,28,29,33,34,35,46,47,48,49,56,57,59,60,62,76,77,78,79,80,81,82,84,85,86,87,88,89,90,105,106,107,108,109,110,112,113,118,119,120,123,124,125,130,138,139,154,155,156,159,160,164,166,167,168]\n",
+ "pages = [(x - 1) for x in pages]\n",
+ "doc.select(pages)\n",
+ "\n",
+ "# Save the selected pages to a new PDF\n",
+ "doc.save(\"out_file_name.pdf\")\n",
+ "doc.close()\n"
  ]
+ },
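Here fitz is PyMuPDF: doc.select(pages) keeps exactly the listed 0-based pages, in the given order, which is why the hand-picked 1-based list is shifted by one first. The same selection can be written so the document is closed even if the save fails; a sketch reusing the names from the cell above:

    import fitz  # PyMuPDF

    with fitz.open("table.pdf") as doc:
        doc.select([x - 1 for x in pages])  # fitz page numbers are 0-based
        doc.save("out_file_name.pdf")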
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
  }
  ],
  "metadata": {