{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
      "\n",
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
      "\n",
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
      "\n",
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
      "\n",
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from img2table.document import Image\n",
    "from img2table.ocr import DocTR\n",
    "from itertools import product\n",
    "\n",
    "# Read key/value pairs from a local .env file into the process environment\n",
    "# (presumably the OpenAI API key used by ChatOpenAI further down; confirm).\n",
    "load_dotenv()\n",
    "# Print wide DataFrames on one line per row instead of wrapping column blocks.\n",
    "pd.set_option('expand_frame_repr', False)\n",
    "# OCR backend passed to image.extract_tables() in a later cell.\n",
    "ocr = DocTR()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Page image holding the table to extract; the path is relative to the notebook's working directory.\n",
    "image = Image('../NutriGenMe-Testing/monogenic-1.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>Monogenic Diabetes or</td>\n",
       "      <td>Associated With Common</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Gene Name</td>\n",
       "      <td>Major Function</td>\n",
       "      <td>Syndromes</td>\n",
       "      <td>T1D and/or T2D</td>\n",
       "      <td>Refs.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KCNJ11</td>\n",
       "      <td>Encodes pore-forming inwardly-rectifying</td>\n",
       "      <td>PNDM (most common cause)</td>\n",
       "      <td>E23K</td>\n",
       "      <td>42-46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>None</td>\n",
       "      <td>potassium channel subunits (Kir6.2)</td>\n",
       "      <td>and TNDM, CHI, MODY</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ABCC8</td>\n",
       "      <td>Encodes regulatory SUR1 subunits</td>\n",
       "      <td>PNDM and TNDM, CHI, MODY</td>\n",
       "      <td>A1369S, 1273AGA, R1420H</td>\n",
       "      <td>46,47,52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>GCK</td>\n",
       "      <td>A key glucose-phosphoryating enzyme;</td>\n",
       "      <td>GCK-MODY (MODY2), PNDM,</td>\n",
       "      <td>rs1799884 (G/A), rs4607517 (A/G),</td>\n",
       "      <td>75,78,79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>None</td>\n",
       "      <td>a glucose sensor</td>\n",
       "      <td>CHI</td>\n",
       "      <td>3'UTR SNP, chr7:44184184-G/A</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>SLC2A2</td>\n",
       "      <td>Encodes GLUT2, a high-capacity facilitative</td>\n",
       "      <td>FBS</td>\n",
       "      <td>SNPS rs5393 (AA) and rs5394</td>\n",
       "      <td>93-100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>None</td>\n",
       "      <td>glucose transporter</td>\n",
       "      <td>None</td>\n",
       "      <td>(CC) in the promoter region</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>and SNPS rs5400 (T1101) and</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           0                                            1                         2                                  3         4\n",
       "0       None                                         None     Monogenic Diabetes or             Associated With Common      None\n",
       "1  Gene Name                               Major Function                 Syndromes                     T1D and/or T2D     Refs.\n",
       "2     KCNJ11     Encodes pore-forming inwardly-rectifying  PNDM (most common cause)                               E23K     42-46\n",
       "3       None          potassium channel subunits (Kir6.2)       and TNDM, CHI, MODY                               None      None\n",
       "4      ABCC8             Encodes regulatory SUR1 subunits  PNDM and TNDM, CHI, MODY            A1369S, 1273AGA, R1420H  46,47,52\n",
       "5        GCK         A key glucose-phosphoryating enzyme;   GCK-MODY (MODY2), PNDM,  rs1799884 (G/A), rs4607517 (A/G),  75,78,79\n",
       "6       None                             a glucose sensor                       CHI       3'UTR SNP, chr7:44184184-G/A      None\n",
       "7     SLC2A2  Encodes GLUT2, a high-capacity facilitative                       FBS        SNPS rs5393 (AA) and rs5394    93-100\n",
       "8       None                          glucose transporter                      None        (CC) in the promoter region      None\n",
       "9       None                                         None                      None        and SNPS rs5400 (T1101) and      None"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run table detection + OCR on the image with borderless-table and implicit-row detection enabled.\n",
    "extracted_tables = image.extract_tables(ocr=ocr, implicit_rows=True, borderless_tables=True, min_confidence=0)\n",
    "print(len(extracted_tables))\n",
    "\n",
    "# Stack every detected table with a single concat. When nothing is detected, fall back to an\n",
    "# empty DataFrame (not a list) so `.head()` below and the later cells still work.\n",
    "if extracted_tables:\n",
    "    df = pd.concat([et.df for et in extracted_tables], ignore_index=True)\n",
    "else:\n",
    "    df = pd.DataFrame()\n",
    "\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                0                                                  1                                                  2                                                  3                                4\n",
      "0                                                                                                 Monogenic Diabetes or                             Associated With Common                                 \n",
      "1       Gene Name                                     Major Function                                          Syndromes                                     T1D and/or T2D                            Refs.\n",
      "2          KCNJ11  Encodes pore-forming inwardly-rectifying potas...       PNDM (most common cause) and TNDM, CHI, MODY                                               E23K                            42-46\n",
      "3           ABCC8                   Encodes regulatory SUR1 subunits                           PNDM and TNDM, CHI, MODY                            A1369S, 1273AGA, R1420H                         46,47,52\n",
      "4             GCK  A key glucose-phosphoryating enzyme; a glucose...                        GCK-MODY (MODY2), PNDM, CHI  rs1799884 (G/A), rs4607517 (A/G), 3'UTR SNP, c...                         75,78,79\n",
      "5          SLC2A2  Encodes GLUT2, a high-capacity facilitative gl...                                                FBS  SNPS rs5393 (AA) and rs5394 (CC) in the promot...                           93-100\n",
      "6      HNF1A/TCF1  TF; regulator of pancreatic B-cell differentia...  HNF1A-MODY (MODY3), most common cause of MODY,...                         G319S, C.1522G>A (p.E508K)                    114, 118, 119\n",
      "7           HNF4A                 Key TF for early fetal development                            HNF4A MODY (MODY1), CHI  SNPS rs2144908, rs3818247 and rs884614, rs4810...                     121-124, 274\n",
      "8      HNF1B/TCF2  TF; required for the generation of pancreatic ...      RCAD syndrome, or MODY5; TNDM and PNDM (rare)       SNP rs757210 A, TS4430796 A, and TS7501939 C                         141, 144\n",
      "9            PDX1  TF; required for pancreas development, B-cell ...                                        PNDM, MODY4  C18R, Q59L, D76N, R197H, G212R, P239Q, InsCCG2...                     163-165, 167\n",
      "10           PAX4  Islet TF that functions mainly as a transcript...                                              MODY9                   R121W, R133W, R37W, rs10229583 G                    180, 181, 187\n",
      "11  NEUROD1/BETA2  TF; required for the development of the endocr...                                     MODY6 and PNDM  R111L and 206 + C; A45T variant at rs1801262 (...                          204-208\n",
      "12           WFS1  A transmembrane protein; a negative regulator ...             WFS1, sometimes referred to as DIDMOAD  R456 and H611, SNPS at rs10010131, rs6446482; ...                          223-225\n",
      "13          PPARG  TF; master regulator of adipogenesis, energy b...  Monogenic diabetes   Monogenic Diabetes Genes ...  Pro12Ala variant (rs1801282), SNP at rs4684847...                     240-243, 250\n",
      "14            INS               Predominant glucose-lowering hormone         PNDM (2nd most common cause), TNDM, MODY10  Class I alleles of INS VNTR associated with T1...                273, 274, 276-281\n",
      "15          GLIS3  TF; regulator of islet development, insulin ge...  Neonatal diabetes syndrome associated with con...  rs7020673 G associated with T1D; rs7034200 A a...  78, 214, 289, 291, 292, 295-308\n"
     ]
    }
   ],
   "source": [
    "def merge_continuation_rows(table):\n",
    "    \"\"\"Fold OCR continuation rows into the record they belong to.\n",
    "\n",
    "    A row with no missing cell starts a new record. Every following row that has at\n",
    "    least one missing cell is treated as wrapped text of that record and is appended\n",
    "    column by column, separated by a single space. Rows that precede the first complete\n",
    "    row (e.g. a partially filled header line) form a record of their own.\n",
    "\n",
    "    Args:\n",
    "        table: DataFrame of text cells with None/NaN marking empty cells. It is not modified.\n",
    "\n",
    "    Returns:\n",
    "        A new DataFrame with the same columns, one row per merged record, and\n",
    "        leading/trailing whitespace stripped from every cell.\n",
    "    \"\"\"\n",
    "    starts_record = table.notna().all(axis=1)\n",
    "    # Work on a filled, string-typed copy so the caller's frame keeps its missing-value markers.\n",
    "    cells_by_row = table.fillna('').astype(str).values.tolist()\n",
    "\n",
    "    merged = []\n",
    "    for cells, is_start in zip(cells_by_row, starts_record):\n",
    "        if is_start or not merged:\n",
    "            merged.append(cells)\n",
    "        else:\n",
    "            # Continuation line: glue each cell onto the same column of the open record.\n",
    "            merged[-1] = [f'{head} {tail}' for head, tail in zip(merged[-1], cells)]\n",
    "\n",
    "    stripped = [[cell.strip() for cell in cells] for cells in merged]\n",
    "    # Build the result in one go instead of growing a DataFrame row by row.\n",
    "    return pd.DataFrame(stripped, columns=table.columns)\n",
    "\n",
    "\n",
    "dfc = merge_continuation_rows(df)\n",
    "print(dfc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Simple Filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "118 [('rs5393', 'GLUT2'), ('rs5404', 'SNPS'), ('rs757210', 'HNF1B'), ('rs884614', 'SNPS'), ('rs2144908', 'MODY'), ('rs2144908', 'CHI'), ('rs4684847', 'T1D'), ('rs1884613', 'MODY'), ('rs1884613', 'CHI'), ('rs5393', 'SNPS'), ('rs734312', 'SNPS'), ('rs5394', 'GLUT2'), ('rs757210', 'TS4430796'), ('rs7041847', 'T1D'), ('rs6446482', 'SNPS'), ('rs7020673', 'GLIS3'), ('rs4684847', 'TZDS'), ('rs757210', 'PNDM'), ('rs5400', 'GLUT2'), ('rs7020673', 'T2D'), ('rs3818247', 'HNF4A'), ('rs4810424', 'MODY'), ('rs4810424', 'CHI'), ('rs10229583', 'R133W'), ('rs1801262', 'R111L'), ('rs1801262', 'BETA2'), ('rs10010131', 'SNPS'), ('rs10229583', 'MODY9'), ('rs5400', 'SNPS'), ('rs1801282', 'T1D'), ('rs2144908', 'HNF4A'), ('rs5393', 'FBS'), ('rs757210', 'RCAD'), ('rs10229583', 'R121W'), ('rs1801262', 'INS'), ('rs10010131', 'R456'), ('rs4684847', 'SNP'), ('rs7034200', 'T2D'), ('rs5404', 'T1101'), ('rs4607517', 'MODY'), ('rs1799884', 'MODY'), ('rs1799884', 'CHI'), ('rs4607517', 'PNDM'), ('rs6446482', 'WFS1'), ('rs1799884', 'PNDM'), ('rs5404', 'SLC2A2'), ('rs1801282', 'TZDS'), ('rs5404', 'T198T'), ('rs884614', 'MODY1'), ('rs734312', 'DIDMOAD'), ('rs5394', 'FBS'), ('rs4810424', 'HNF4A'), ('rs7020673', 'T1D'), ('rs757210', 'TCF2'), ('rs5393', 'T1101'), ('rs6446482', 'DIDMOAD'), ('rs1801262', 'A45T'), ('rs5394', 'SNPS'), ('rs5393', 'SLC2A2'), ('rs884614', 'CHI'), ('rs884614', 'MODY'), ('rs5393', 'T198T'), ('rs5400', 'FBS'), ('rs3818247', 'SNPS'), ('rs757210', 'SNP'), ('rs10229583', 'R37W'), ('rs10229583', 'PAX4'), ('rs4684847', 'T2D'), ('rs1801282', 'SNP'), ('rs7034200', 'GLIS3'), ('rs1884613', 'HNF4A'), ('rs4607517', 'GCK'), ('rs757210', 'TS7501939'), ('rs1799884', 'GCK'), ('rs10010131', 'DIDMOAD'), ('rs734312', 'WFS1'), ('rs2144908', 'SNPS'), ('rs5394', 'T198T'), ('rs4684847', 'PPARG'), ('rs734312', 'H611'), ('rs1801262', 'MODY6'), ('rs4607517', 'CHI'), ('rs7041847', 'T2D'), ('rs5404', 'GLUT2'), ('rs5400', 'T1101'), ('rs4607517', 'UTR'), ('rs1799884', 'UTR'), ('rs5400', 'SLC2A2'), 
('rs6446482', 'H611'), ('rs5400', 'T198T'), ('rs1799884', 'SNP'), ('rs884614', 'HNF4A'), ('rs4810424', 'SNPS'), ('rs10010131', 'WFS1'), ('rs1801282', 'T2D'), ('rs10010131', 'H611'), ('rs1801262', 'PNDM'), ('rs4607517', 'SNP'), ('rs5394', 'T1101'), ('rs757210', 'TNDM'), ('rs4810424', 'MODY1'), ('rs1801282', 'PPARG'), ('rs7034200', 'T1D'), ('rs7041847', 'GLIS3'), ('rs4607517', 'MODY2'), ('rs5394', 'SLC2A2'), ('rs3818247', 'MODY1'), ('rs1799884', 'MODY2'), ('rs1884613', 'SNPS'), ('rs757210', 'MODY5'), ('rs734312', 'R456'), ('rs3818247', 'MODY'), ('rs3818247', 'CHI'), ('rs6446482', 'R456'), ('rs5404', 'FBS'), ('rs1801262', 'NEUROD1'), ('rs2144908', 'MODY1'), ('rs1884613', 'MODY1')]\n"
     ]
    }
   ],
   "source": [
    "def filter(row):\n",
    "    \"\"\"Pull rsIDs and gene-like tokens out of one table row.\n",
    "\n",
    "    Note: the name shadows Python's builtin ``filter`` for the rest of the notebook;\n",
    "    it is kept so existing calls keep working.\n",
    "\n",
    "    Args:\n",
    "        row: iterable of cell strings (e.g. one row of the merged table).\n",
    "\n",
    "    Returns:\n",
    "        (snp, gene): two lists of matches in order of appearance. ``snp`` holds every\n",
    "        ``rs<digits>`` token; ``gene`` holds every run of three or more uppercase\n",
    "        letters/digits that starts with a letter, so abbreviations such as MODY or T1D\n",
    "        are captured as well as real gene symbols.\n",
    "    \"\"\"\n",
    "    concat = ' '.join(row)\n",
    "    # Raw strings keep the regex escapes (\\d) away from Python's own escape handling.\n",
    "    snp = re.findall(r'rs\\d+', concat)\n",
    "    gene = re.findall(r'[A-Z][A-Z0-9]{2,}', concat)\n",
    "\n",
    "    return snp, gene\n",
    "\n",
    "# Pair every rsID with every gene-like token found in the same row. The set removes\n",
    "# duplicates; sorting keeps the printed order identical between kernel restarts\n",
    "# (plain set iteration order of strings changes from run to run).\n",
    "pairs = sorted({\n",
    "    pair\n",
    "    for _, row in dfc.iterrows()\n",
    "    for pair in product(*filter(row))\n",
    "})\n",
    "print(len(pairs), pairs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### DataFrame to JSON to LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\n",
      "  {\n",
      "    \"0\": \"\",\n",
      "    \"1\": \"\",\n",
      "    \"2\": \"Monogenic Diabetes or\",\n",
      "    \"3\": \"Associated With Common\",\n",
      "    \"4\": \"\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"Gene Name\",\n",
      "    \"1\": \"Major Function\",\n",
      "    \"2\": \"Syndromes\",\n",
      "    \"3\": \"T1D and/or T2D\",\n",
      "    \"4\": \"Refs.\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"KCNJ11\",\n",
      "    \"1\": \"Encodes pore-forming inwardly-rectifying potassium channel subunits (Kir6.2)\",\n",
      "    \"2\": \"PNDM (most common cause) and TNDM, CHI, MODY\",\n",
      "    \"3\": \"E23K\",\n",
      "    \"4\": \"42-46\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"ABCC8\",\n",
      "    \"1\": \"Encodes regulatory SUR1 subunits\",\n",
      "    \"2\": \"PNDM and TNDM, CHI, MODY\",\n",
      "    \"3\": \"A1369S, 1273AGA, R1420H\",\n",
      "    \"4\": \"46,47,52\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"GCK\",\n",
      "    \"1\": \"A key glucose-phosphoryating enzyme; a glucose sensor\",\n",
      "    \"2\": \"GCK-MODY (MODY2), PNDM, CHI\",\n",
      "    \"3\": \"rs1799884 (G/A), rs4607517 (A/G), 3'UTR SNP, chr7:44184184-G/A\",\n",
      "    \"4\": \"75,78,79\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"SLC2A2\",\n",
      "    \"1\": \"Encodes GLUT2, a high-capacity facilitative glucose transporter\",\n",
      "    \"2\": \"FBS\",\n",
      "    \"3\": \"SNPS rs5393 (AA) and rs5394 (CC) in the promoter region and SNPS rs5400 (T1101) and rs5404 (T198T)\",\n",
      "    \"4\": \"93-100\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"HNF1A/TCF1\",\n",
      "    \"1\": \"TF; regulator of pancreatic B-cell differentiation\",\n",
      "    \"2\": \"HNF1A-MODY (MODY3), most common cause of MODY, CHI\",\n",
      "    \"3\": \"G319S, C.1522G>A (p.E508K)\",\n",
      "    \"4\": \"114, 118, 119\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"HNF4A\",\n",
      "    \"1\": \"Key TF for early fetal development\",\n",
      "    \"2\": \"HNF4A MODY (MODY1), CHI\",\n",
      "    \"3\": \"SNPS rs2144908, rs3818247 and rs884614, rs4810424, rs1884613\",\n",
      "    \"4\": \"121-124, 274\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"HNF1B/TCF2\",\n",
      "    \"1\": \"TF; required for the generation of pancreatic and endocrine progenitors\",\n",
      "    \"2\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\",\n",
      "    \"3\": \"SNP rs757210 A, TS4430796 A, and TS7501939 C\",\n",
      "    \"4\": \"141, 144\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"PDX1\",\n",
      "    \"1\": \"TF; required for pancreas development, B-cell differentiation and the maintenance of mature B-cell function\",\n",
      "    \"2\": \"PNDM, MODY4\",\n",
      "    \"3\": \"C18R, Q59L, D76N, R197H, G212R, P239Q, InsCCG243, p.Gly218Alafs*12\",\n",
      "    \"4\": \"163-165, 167\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"PAX4\",\n",
      "    \"1\": \"Islet TF that functions mainly as a transcription repressor\",\n",
      "    \"2\": \"MODY9\",\n",
      "    \"3\": \"R121W, R133W, R37W, rs10229583 G\",\n",
      "    \"4\": \"180, 181, 187\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"NEUROD1/BETA2\",\n",
      "    \"1\": \"TF; required for the development of the endocrine pancreas; transactivates the INS gene\",\n",
      "    \"2\": \"MODY6 and PNDM\",\n",
      "    \"3\": \"R111L and 206 + C; A45T variant at rs1801262 (inconsistent)\",\n",
      "    \"4\": \"204-208\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"WFS1\",\n",
      "    \"1\": \"A transmembrane protein; a negative regulator of ER stress\",\n",
      "    \"2\": \"WFS1, sometimes referred to as DIDMOAD\",\n",
      "    \"3\": \"R456 and H611, SNPS at rs10010131, rs6446482; variants rs10010131 G, 1801213 G, and rs734312 A\",\n",
      "    \"4\": \"223-225\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"PPARG\",\n",
      "    \"1\": \"TF; master regulator of adipogenesis, energy balance, lipid biosynthesis, and insulin sensitivity; cellular target of TZDS Monogenic Diabetes Genes Associated With Both Common T1D and T2D\",\n",
      "    \"2\": \"Monogenic diabetes   Monogenic Diabetes Genes Associated With Both Common T1D and T2D\",\n",
      "    \"3\": \"Pro12Ala variant (rs1801282), SNP at rs4684847  Monogenic Diabetes Genes Associated With Both Common T1D and T2D\",\n",
      "    \"4\": \"240-243, 250\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"INS\",\n",
      "    \"1\": \"Predominant glucose-lowering hormone\",\n",
      "    \"2\": \"PNDM (2nd most common cause), TNDM, MODY10\",\n",
      "    \"3\": \"Class I alleles of INS VNTR associated with T1D; Class IIl alleles of INS VNTR inconsistently associated with T2D\",\n",
      "    \"4\": \"273, 274, 276-281\"\n",
      "  },\n",
      "  {\n",
      "    \"0\": \"GLIS3\",\n",
      "    \"1\": \"TF; regulator of islet development, insulin gene transcription, and obesity-induced compensatory B-cell proliferation\",\n",
      "    \"2\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\",\n",
      "    \"3\": \"rs7020673 G associated with T1D; rs7034200 A and rs7041847 A associated with T2D\",\n",
      "    \"4\": \"78, 214, 289, 291, 292, 295-308\"\n",
      "  }\n",
      "]\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "# Serialise the merged table as a JSON array with one object per row, then round-trip it\n",
    "# through the json module to get the indented text that is inserted into the LLM prompt.\n",
    "js = dfc.to_json(orient='records')\n",
    "records = json.loads(js)\n",
    "df_str = json.dumps(records, indent=2)\n",
    "\n",
    "print(df_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "# temperature=0 keeps the extraction as repeatable as the API allows, so re-running the\n",
    "# notebook does not reshuffle the extracted genes/SNPs.\n",
    "llm = ChatOpenAI(model=\"gpt-3.5-turbo\", temperature=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\n",
      "    {\n",
      "        \"Genes\": \"KCNJ11\",\n",
      "        \"SNPs\": [\"E23K\"],\n",
      "        \"Diseases\": \"PNDM (most common cause) and TNDM, CHI, MODY\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"ABCC8\",\n",
      "        \"SNPs\": [\"A1369S\", \"1273AGA\", \"R1420H\"],\n",
      "        \"Diseases\": \"PNDM and TNDM, CHI, MODY\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"GCK\",\n",
      "        \"SNPs\": [\"rs1799884 (G/A)\", \"rs4607517 (A/G)\", \"3'UTR SNP\", \"chr7:44184184-G/A\"],\n",
      "        \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"SLC2A2\",\n",
      "        \"SNPs\": [\"rs5393 (AA)\", \"rs5394 (CC)\", \"rs5400 (T1101)\", \"rs5404 (T198T)\"],\n",
      "        \"Diseases\": \"FBS\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"HNF1A/TCF1\",\n",
      "        \"SNPs\": [\"G319S\", \"C.1522G>A (p.E508K)\"],\n",
      "        \"Diseases\": \"HNF1A-MODY (MODY3), most common cause of MODY, CHI\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"HNF4A\",\n",
      "        \"SNPs\": [\"rs2144908\", \"rs3818247\", \"rs884614\", \"rs4810424\", \"rs1884613\"],\n",
      "        \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"HNF1B/TCF2\",\n",
      "        \"SNPs\": [\"rs757210 A\", \"TS4430796 A\", \"TS7501939 C\"],\n",
      "        \"Diseases\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"PDX1\",\n",
      "        \"SNPs\": [\"C18R\", \"Q59L\", \"D76N\", \"R197H\", \"G212R\", \"P239Q\", \"InsCCG243\", \"p.Gly218Alafs*12\"],\n",
      "        \"Diseases\": \"PNDM, MODY4\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"PAX4\",\n",
      "        \"SNPs\": [\"R121W\", \"R133W\", \"R37W\", \"rs10229583 G\"],\n",
      "        \"Diseases\": \"MODY9\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"NEUROD1/BETA2\",\n",
      "        \"SNPs\": [\"R111L\", \"206 + C\", \"A45T variant at rs1801262 (inconsistent)\"],\n",
      "        \"Diseases\": \"MODY6 and PNDM\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"WFS1\",\n",
      "        \"SNPs\": [\"R456\", \"H611\", \"rs10010131\", \"rs6446482\", \"rs10010131 G\", \"1801213 G\", \"rs734312 A\"],\n",
      "        \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"PPARG\",\n",
      "        \"SNPs\": [\"Pro12Ala variant (rs1801282)\", \"SNP at rs4684847\"],\n",
      "        \"Diseases\": \"Monogenic diabetes   Monogenic Diabetes Genes Associated With Both Common T1D and T2D\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"INS\",\n",
      "        \"SNPs\": [\"Class I alleles of INS VNTR associated with T1D\", \"Class IIl alleles of INS VNTR inconsistently associated with T2D\"],\n",
      "        \"Diseases\": \"PNDM (2nd most common cause), TNDM, MODY10\"\n",
      "    },\n",
      "    {\n",
      "        \"Genes\": \"GLIS3\",\n",
      "        \"SNPs\": [\"rs7020673 G\", \"rs7034200 A\", \"rs7041847 A\"],\n",
      "        \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
      "    }\n",
      "]\n"
     ]
    }
   ],
   "source": [
    "# Prompt template: the single {} placeholder receives the JSON table; literal braces in the\n",
    "# example output are doubled so str.format leaves them intact.\n",
    "prompt = \"\"\"\n",
    "# CONTEXT #\n",
    "In my capacity as a genomics specialist, I have table data obtained from a published research paper in the field of genomics. The data is provided in a list of JSONs format, with each JSON object representing a single row in a tabular structure. The first JSON element in the list represents the header row of the table, containing the names of each column.\n",
    "This is the data:\n",
    "{}\n",
    "\n",
    "# OBJECTIVE #\n",
    "Given the provided table data, the following tasks need to be completed:\n",
    "\n",
    "1. Identify all unique gene names present within the table. Each row can contain more than one gene name.\n",
    "2. If present, extract any entries starting with \"rs\" (presumably representing Single Nucleotide Polymorphisms or rsIDs) that correspond to the same row as their associated gene names. Each gene name can correspond with more than one SNP.\n",
    "3. If available, extract any disease information associated with both the gene name and its corresponding SNP/rsID.\n",
    "\n",
    "It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility.\n",
    "If the SNPs or Diseases for a gene are absent from the table, use an empty list for \"SNPs\" and an empty string for \"Diseases\".\n",
    "\n",
    "# RESPONSE #\n",
    "Return only a JSON list of objects (no surrounding text or Markdown code fences), each representing an entry with the following structure:\n",
    "[\n",
    "    {{\n",
    "        \"Genes\": \"A\",\n",
    "        \"SNPs\": [\"rs123\", \"rs456\"],\n",
    "        \"Diseases\": \"A, B, C\"\n",
    "    }}\n",
    "]\n",
    "\"\"\"\n",
    "\n",
    "result = llm.invoke(prompt.format(df_str))\n",
    "print(result.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'Genes': 'KCNJ11',\n",
       "  'SNPs': ['E23K'],\n",
       "  'Diseases': 'PNDM (most common cause) and TNDM, CHI, MODY'},\n",
       " {'Genes': 'ABCC8',\n",
       "  'SNPs': ['A1369S', '1273AGA', 'R1420H'],\n",
       "  'Diseases': 'PNDM and TNDM, CHI, MODY'},\n",
       " {'Genes': 'GCK',\n",
       "  'SNPs': ['rs1799884 (G/A)',\n",
       "   'rs4607517 (A/G)',\n",
       "   \"3'UTR SNP\",\n",
       "   'chr7:44184184-G/A'],\n",
       "  'Diseases': 'GCK-MODY (MODY2), PNDM, CHI'},\n",
       " {'Genes': 'SLC2A2',\n",
       "  'SNPs': ['rs5393 (AA)', 'rs5394 (CC)', 'rs5400 (T1101)', 'rs5404 (T198T)'],\n",
       "  'Diseases': 'FBS'},\n",
       " {'Genes': 'HNF1A/TCF1',\n",
       "  'SNPs': ['G319S', 'C.1522G>A (p.E508K)'],\n",
       "  'Diseases': 'HNF1A-MODY (MODY3), most common cause of MODY, CHI'},\n",
       " {'Genes': 'HNF4A',\n",
       "  'SNPs': ['rs2144908', 'rs3818247', 'rs884614', 'rs4810424', 'rs1884613'],\n",
       "  'Diseases': 'HNF4A MODY (MODY1), CHI'},\n",
       " {'Genes': 'HNF1B/TCF2',\n",
       "  'SNPs': ['rs757210 A', 'TS4430796 A', 'TS7501939 C'],\n",
       "  'Diseases': 'RCAD syndrome, or MODY5; TNDM and PNDM (rare)'},\n",
       " {'Genes': 'PDX1',\n",
       "  'SNPs': ['C18R',\n",
       "   'Q59L',\n",
       "   'D76N',\n",
       "   'R197H',\n",
       "   'G212R',\n",
       "   'P239Q',\n",
       "   'InsCCG243',\n",
       "   'p.Gly218Alafs*12'],\n",
       "  'Diseases': 'PNDM, MODY4'},\n",
       " {'Genes': 'PAX4',\n",
       "  'SNPs': ['R121W', 'R133W', 'R37W', 'rs10229583 G'],\n",
       "  'Diseases': 'MODY9'},\n",
       " {'Genes': 'NEUROD1/BETA2',\n",
       "  'SNPs': ['R111L', '206 + C', 'A45T variant at rs1801262 (inconsistent)'],\n",
       "  'Diseases': 'MODY6 and PNDM'},\n",
       " {'Genes': 'WFS1',\n",
       "  'SNPs': ['R456',\n",
       "   'H611',\n",
       "   'rs10010131',\n",
       "   'rs6446482',\n",
       "   'rs10010131 G',\n",
       "   '1801213 G',\n",
       "   'rs734312 A'],\n",
       "  'Diseases': 'WFS1, sometimes referred to as DIDMOAD'},\n",
       " {'Genes': 'PPARG',\n",
       "  'SNPs': ['Pro12Ala variant (rs1801282)', 'SNP at rs4684847'],\n",
       "  'Diseases': 'Monogenic diabetes   Monogenic Diabetes Genes Associated With Both Common T1D and T2D'},\n",
       " {'Genes': 'INS',\n",
       "  'SNPs': ['Class I alleles of INS VNTR associated with T1D',\n",
       "   'Class IIl alleles of INS VNTR inconsistently associated with T2D'],\n",
       "  'Diseases': 'PNDM (2nd most common cause), TNDM, MODY10'},\n",
       " {'Genes': 'GLIS3',\n",
       "  'SNPs': ['rs7020673 G', 'rs7034200 A', 'rs7041847 A'],\n",
       "  'Diseases': 'Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys'}]"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parse the reply with json.loads rather than eval(): eval would execute whatever text the\n",
    "# model returned and also fails on valid JSON literals such as null/true/false.\n",
    "reply_text = result.content.strip()\n",
    "if reply_text.startswith('```'):\n",
    "    # Drop a Markdown code fence (``` or ```json) if the model wrapped its answer in one.\n",
    "    reply_text = reply_text.strip('`').removeprefix('json').strip()\n",
    "\n",
    "lst_result = json.loads(reply_text)\n",
    "lst_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['PNDM (most common cause) and TNDM, CHI, MODY',\n",
       " 'PNDM and TNDM, CHI, MODY',\n",
       " 'PNDM and TNDM, CHI, MODY',\n",
       " 'PNDM and TNDM, CHI, MODY',\n",
       " 'GCK-MODY (MODY2), PNDM, CHI',\n",
       " 'GCK-MODY (MODY2), PNDM, CHI',\n",
       " 'GCK-MODY (MODY2), PNDM, CHI',\n",
       " 'GCK-MODY (MODY2), PNDM, CHI',\n",
       " 'FBS',\n",
       " 'FBS',\n",
       " 'FBS',\n",
       " 'FBS',\n",
       " 'HNF1A-MODY (MODY3), most common cause of MODY, CHI',\n",
       " 'HNF1A-MODY (MODY3), most common cause of MODY, CHI',\n",
       " 'HNF4A MODY (MODY1), CHI',\n",
       " 'HNF4A MODY (MODY1), CHI',\n",
       " 'HNF4A MODY (MODY1), CHI',\n",
       " 'HNF4A MODY (MODY1), CHI',\n",
       " 'HNF4A MODY (MODY1), CHI',\n",
       " 'RCAD syndrome, or MODY5; TNDM and PNDM (rare)',\n",
       " 'RCAD syndrome, or MODY5; TNDM and PNDM (rare)',\n",
       " 'RCAD syndrome, or MODY5; TNDM and PNDM (rare)',\n",
       " 'PNDM, MODY4',\n",
       " 'PNDM, MODY4',\n",
       " 'PNDM, MODY4',\n",
       " 'PNDM, MODY4',\n",
       " 'PNDM, MODY4',\n",
       " 'PNDM, MODY4',\n",
       " 'PNDM, MODY4',\n",
       " 'PNDM, MODY4',\n",
       " 'MODY9',\n",
       " 'MODY9',\n",
       " 'MODY9',\n",
       " 'MODY9',\n",
       " 'MODY6 and PNDM',\n",
       " 'MODY6 and PNDM',\n",
       " 'MODY6 and PNDM',\n",
       " 'WFS1, sometimes referred to as DIDMOAD',\n",
       " 'WFS1, sometimes referred to as DIDMOAD',\n",
       " 'WFS1, sometimes referred to as DIDMOAD',\n",
       " 'WFS1, sometimes referred to as DIDMOAD',\n",
       " 'WFS1, sometimes referred to as DIDMOAD',\n",
       " 'WFS1, sometimes referred to as DIDMOAD',\n",
       " 'WFS1, sometimes referred to as DIDMOAD',\n",
       " 'Monogenic diabetes   Monogenic Diabetes Genes Associated With Both Common T1D and T2D',\n",
       " 'Monogenic diabetes   Monogenic Diabetes Genes Associated With Both Common T1D and T2D',\n",
       " 'PNDM (2nd most common cause), TNDM, MODY10',\n",
       " 'PNDM (2nd most common cause), TNDM, MODY10',\n",
       " 'Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys',\n",
       " 'Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys',\n",
       " 'Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys']"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_gene = []\n",
    "res_snp = []\n",
    "res_disease = []\n",
    "\n",
    "for res in lst_result:\n",
    "    gene = res['Genes']\n",
    "    snps = res['SNPs']\n",
    "    disease = res['Diseases']\n",
    "\n",
    "    for snp in snps:\n",
    "        res_gene.append(gene)\n",
    "        res_snp.append(snp)\n",
    "        res_disease.append(disease)\n",
    "\n",
    "res_disease\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "st = '```json\\n[\\n    {\\n        \"Genes\": \"BCLIIA\",\\n        \"SNPs\": [\"rs243021\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"ZBED3\",\\n        \"SNPs\": [\"rs4457053\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"KLF14\",\\n        \"SNPs\": [\"rs972283\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"TP53INP1\",\\n        \"SNPs\": [\"rs896854\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"CHCHD9\",\\n        \"SNPs\": [\"rs13292136\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"KCNQI\",\\n        \"SNPs\": [\"rs231362\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"CENTD2\",\\n        \"SNPs\": [\"rs1552224\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"HMGA2\",\\n        \"SNPs\": [\"rs15313432\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"HNFIA\",\\n        \"SNPs\": [\"rs7957197\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"ZFAND6\",\\n        \"SNPs\": [\"rsl1634397\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"PRCI\",\\n        \"SNPs\": [\"rs8042680\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"DUSP9\",\\n        \"SNPs\": [\"rs5945326\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"IRSI\",\\n        \"SNPs\": [\"rs7578326\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"MTNRIB\",\\n        \"SNPs\": [\"rs1387153\"],\\n        \"Diseases\": \"\"\\n    }\\n]\\n```'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'[\\n    {\\n        \"Genes\": \"BCLIIA\",\\n        \"SNPs\": [\"rs243021\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"ZBED3\",\\n        \"SNPs\": [\"rs4457053\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"KLF14\",\\n        \"SNPs\": [\"rs972283\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"TP53INP1\",\\n        \"SNPs\": [\"rs896854\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"CHCHD9\",\\n        \"SNPs\": [\"rs13292136\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"KCNQI\",\\n        \"SNPs\": [\"rs231362\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"CENTD2\",\\n        \"SNPs\": [\"rs1552224\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"HMGA2\",\\n        \"SNPs\": [\"rs15313432\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"HNFIA\",\\n        \"SNPs\": [\"rs7957197\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"ZFAND6\",\\n        \"SNPs\": [\"rsl1634397\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"PRCI\",\\n        \"SNPs\": [\"rs8042680\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"DUSP9\",\\n        \"SNPs\": [\"rs5945326\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"IRSI\",\\n        \"SNPs\": [\"rs7578326\"],\\n        \"Diseases\": \"\"\\n    },\\n    {\\n        \"Genes\": \"MTNRIB\",\\n        \"SNPs\": [\"rs1387153\"],\\n        \"Diseases\": \"\"\\n    }\\n'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "st[st.find('['):st.rfind(']')+1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 1-s2.0-S0002916523016155-main.pdf\n",
      "3 1329.pdf\n",
      "4 41467_2020_Article_15421.pdf\n",
      "4 berndt2013.pdf\n",
      "5 BMD.pdf\n",
      "3 clock and eat timing.pdf\n",
      "2 COMT breast cancer metaanalysis chinese.pdf\n",
      "1 COSTAR CHATGPTPrompt _ Towards Data Science.pdf\n",
      "3 dubois2010.pdf\n",
      "3 EMMM-8-688.pdf\n",
      "6 EMS120610.pdf\n",
      "6 file.pdf\n",
      "3 journal.pbio.3001547.pdf\n",
      "3 lipid.pdf\n",
      "7 monogenic diabetes.pdf\n",
      "3 nihms-1651539.pdf\n",
      "5 nihms-1792335.pdf\n",
      "6 nihms-668049.pdf\n",
      "4 nihms364577.pdf\n",
      "4 nihms510594.pdf\n",
      "4 pgen.1009952.pdf\n",
      "3 PIIS0091674919313661.pdf\n",
      "3 s12881-019-0830-y.pdf\n",
      "4 s41576-021-00414-z (1).pdf\n",
      "3 s41588-018-0047-6.pdf\n",
      "8 s41588-022-01024-z (1).pdf\n",
      "4 stroke genetic AHA.pdf\n",
      "5 surendran2016.pdf\n",
      "3 teslovich2010.pdf\n",
      "1 ukmss-34421-testing.pdf\n",
      "3 ukmss-34421.pdf\n",
      "3 wightman2021.pdf\n"
     ]
    }
   ],
   "source": [
    "from langchain_community.document_loaders.pdf import PyPDFLoader\n",
    "from langchain_core.documents.base import Document\n",
    "from langchain_text_splitters import TokenTextSplitter\n",
    "import os\n",
    "\n",
    "for file in os.listdir('../NutriGenMe-Testing/'):\n",
    "\n",
    "    if file[-4:] != '.pdf':\n",
    "        continue\n",
    "    loader = PyPDFLoader(f\"../NutriGenMe-Testing/{file}\")\n",
    "    pages = loader.load()\n",
    "\n",
    "    docs = [Document('\\n'.join([page.page_content for page in pages]))]\n",
    "    docs[0].metadata = {'source': pages[0].metadata['source']}\n",
    "\n",
    "    text_splitter = TokenTextSplitter.from_tiktoken_encoder(\n",
    "        chunk_size=8000, chunk_overlap=0\n",
    "    )\n",
    "    chunks = text_splitter.split_documents(docs)\n",
    "    print(len(chunks), file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Genes</th>\n",
       "      <th>SNPs</th>\n",
       "      <th>Diseases</th>\n",
       "      <th>Title</th>\n",
       "      <th>Authors</th>\n",
       "      <th>Publisher Name</th>\n",
       "      <th>Publication Year</th>\n",
       "      <th>Population</th>\n",
       "      <th>Sample Size</th>\n",
       "      <th>Study Methodology</th>\n",
       "      <th>Study Level</th>\n",
       "      <th>Conclusion</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>36</td>\n",
       "      <td>PAX4</td>\n",
       "      <td>S1369A</td>\n",
       "      <td>hyperinsulinemic hypoglycemia</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>37</td>\n",
       "      <td>NEUROD1</td>\n",
       "      <td>E23K</td>\n",
       "      <td>Wolfram syndrome</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>38</td>\n",
       "      <td>WFS1</td>\n",
       "      <td>Pro12Ala</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>39</td>\n",
       "      <td>KIR6.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>40</td>\n",
       "      <td>GLUT2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0    Genes      SNPs                       Diseases  \\\n",
       "36          36     PAX4    S1369A  hyperinsulinemic hypoglycemia   \n",
       "37          37  NEUROD1      E23K               Wolfram syndrome   \n",
       "38          38     WFS1  Pro12Ala                            NaN   \n",
       "39          39   KIR6.2       NaN                            NaN   \n",
       "40          40    GLUT2       NaN                            NaN   \n",
       "\n",
       "                                                Title  \\\n",
       "36  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "37  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "38  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "39  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "40  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "\n",
       "                           Authors     Publisher Name  Publication Year  \\\n",
       "36  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "37  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "38  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "39  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "40  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "\n",
       "                                           Population    Sample Size  \\\n",
       "36  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "37  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "38  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "39  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "40  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "\n",
       "                                    Study Methodology    Study Level  \\\n",
       "36  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "37  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "38  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "39  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "40  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "\n",
       "                                           Conclusion  \n",
       "36  The study delves into the genetic intricacies ...  \n",
       "37  The study delves into the genetic intricacies ...  \n",
       "38  The study delves into the genetic intricacies ...  \n",
       "39  The study delves into the genetic intricacies ...  \n",
       "40  The study delves into the genetic intricacies ...  "
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel('monogenic diabetes_16000.xlsx', sheet_name='Original')\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Genes</th>\n",
       "      <th>SNPs</th>\n",
       "      <th>Diseases</th>\n",
       "      <th>Title</th>\n",
       "      <th>Authors</th>\n",
       "      <th>Publisher Name</th>\n",
       "      <th>Publication Year</th>\n",
       "      <th>Population</th>\n",
       "      <th>Sample Size</th>\n",
       "      <th>Study Methodology</th>\n",
       "      <th>Study Level</th>\n",
       "      <th>Conclusion</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>33</td>\n",
       "      <td>HNF4A</td>\n",
       "      <td>rs10229583</td>\n",
       "      <td>neonatal diabetes mellitus</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>34</td>\n",
       "      <td>HNF1B</td>\n",
       "      <td>rs6467136</td>\n",
       "      <td>maturity-onset diabetes of the young</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>35</td>\n",
       "      <td>PDX1</td>\n",
       "      <td>rs1801262</td>\n",
       "      <td>permanent neonatal diabetes</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>39</td>\n",
       "      <td>KIR6.2</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>40</td>\n",
       "      <td>GLUT2</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>whole-exome sequencing, case-control and famil...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study delves into the genetic intricacies ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0   Genes        SNPs                              Diseases  \\\n",
       "33          33   HNF4A  rs10229583            neonatal diabetes mellitus   \n",
       "34          34   HNF1B   rs6467136  maturity-onset diabetes of the young   \n",
       "35          35    PDX1   rs1801262           permanent neonatal diabetes   \n",
       "36          39  KIR6.2                                                     \n",
       "37          40   GLUT2                                                     \n",
       "\n",
       "                                                Title  \\\n",
       "33  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "34  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "35  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "36  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "37  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "\n",
       "                           Authors     Publisher Name  Publication Year  \\\n",
       "33  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "34  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "35  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "36  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "37  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "\n",
       "                                           Population    Sample Size  \\\n",
       "33  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "34  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "35  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "36  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "37  Canadian Oji-Cree, Latino, Finnish, Ashkenazi, UK  Not Specified   \n",
       "\n",
       "                                    Study Methodology    Study Level  \\\n",
       "33  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "34  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "35  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "36  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "37  whole-exome sequencing, case-control and famil...  Not Specified   \n",
       "\n",
       "                                           Conclusion  \n",
       "33  The study delves into the genetic intricacies ...  \n",
       "34  The study delves into the genetic intricacies ...  \n",
       "35  The study delves into the genetic intricacies ...  \n",
       "36  The study delves into the genetic intricacies ...  \n",
       "37  The study delves into the genetic intricacies ...  "
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "df = df.fillna('')\n",
    "for i in df.index:\n",
    "    snp = df.loc[i, 'SNPs'].lower()\n",
    "    flag = True\n",
    "    # print(snp)\n",
    "    if not re.fullmatch('rs(\\d)+|', snp):\n",
    "        if not re.fullmatch('s(\\d)+', snp):\n",
    "            if not re.fullmatch('(\\d)+', snp):\n",
    "                flag = False\n",
    "            else:\n",
    "                snp = 'rs' + snp\n",
    "        else:\n",
    "            snp = 'r' + snp\n",
    "    \n",
    "    if not flag:\n",
    "        df = df.drop(i)\n",
    "    else:\n",
    "        df.loc[i, 'SNPs'] = snp\n",
    "\n",
    "df = df.reset_index(drop=True)\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['135457029', '1029', '1432', '3630', '155030', '1576', '5071', '6934', '3039', '5599', '2308', '3643', '5111', '3172', '6927', '116519', '15376', '3767', '18609', '2645']"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from Bio import Entrez, Seq\n",
    "\n",
    "Entrez.email = \"fadliaulawia@gmail.com\"\n",
    "\n",
    "handle = Entrez.esearch(db=\"gene\", term='GCK [All Fields]')\n",
    "record = Entrez.read(handle)\n",
    "record['IdList']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "YES\n",
      "1\n",
      "GCK rs4607517 NO\n",
      "2\n",
      "YES\n",
      "3\n",
      "YES\n",
      "4\n",
      "YES\n",
      "5\n",
      "YES\n",
      "6\n",
      "YES\n",
      "7\n",
      "YES\n",
      "8\n",
      "HNF4A rs884614 NO\n",
      "9\n",
      "HNF4A rs4810424 NO\n",
      "10\n",
      "HNF4A rs1884613 NO\n",
      "11\n",
      "HNF1B/TCF2 rs757210 NO\n",
      "12\n",
      "HNF1B/TCF2 rs4430796 NO\n",
      "13\n",
      "HNF1B/TCF2 rs7501939 NO\n",
      "14\n",
      "NEUROD1IBETA2 rs1801262 NO\n",
      "15\n",
      "YES\n",
      "16\n",
      "YES\n",
      "17\n",
      "YES\n",
      "18\n",
      "YES\n",
      "19\n",
      "YES\n",
      "20\n",
      "YES\n",
      "21\n",
      "YES\n",
      "22\n",
      "YES\n",
      "23\n",
      "YES\n",
      "24\n",
      "YES\n",
      "25\n",
      "KCNJ11 rs2650000 NO\n",
      "26\n",
      "PPARG rs2144908 NO\n",
      "27\n",
      "INS rs3818247 NO\n",
      "28\n",
      "GLIS3 rs884614 NO\n",
      "29\n",
      "ABCC8 rs4810424 NO\n",
      "30\n",
      "GCK rs1884613 NO\n",
      "31\n",
      "SLC2A2 rs757210 NO\n",
      "32\n",
      "HNF1A rs4430796 NO\n",
      "33\n",
      "HNF4A rs10229583 NO\n",
      "34\n",
      "HNF1B rs6467136 NO\n",
      "35\n",
      "PDX1 rs1801262 NO\n",
      "36\n",
      "KIR6.2  NO\n",
      "37\n",
      "GLUT2  NO\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "\n",
    "data = {}\n",
    "\n",
    "for i in df.index:\n",
    "    gene = df.loc[i, 'Genes']\n",
    "    snp = df.loc[i, 'SNPs']\n",
    "\n",
    "    print(i)\n",
    "\n",
    "    if len(data.get(gene, '')) == 0:\n",
    "        url = f'https://www.ncbi.nlm.nih.gov/research/litvar2-api/variant/search/gene/{gene}'\n",
    "        res = requests.get(url).content\n",
    "        data[gene] = res\n",
    "    \n",
    "    val = data[gene]\n",
    "    if len(val) != 0:\n",
    "        if val.decode().find(f\"'{snp}'\") != -1:\n",
    "            print('YES')\n",
    "            continue\n",
    "    \n",
    "    print(gene, snp, \"NO\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt = \"\"\"\n",
    "# CONTEXT #\n",
    "In my capacity as a genomics specialist, I have table data obtained from a published research paper in the field of genomics. The data is provided in a list of JSONs format, with each JSON object representing a single row in a tabular structure.\n",
    "This is the data:\n",
    "\n",
    "[\n",
    "    {{\n",
    "        \"Genes\": \"A\",\n",
    "        \"SNPs\": \"rs123\",\n",
    "        \"Diseases\": \"A disease\"\n",
    "    }}\n",
    "]\n",
    "\n",
    "# OBJECTIVE #\n",
    "Given the provided table data, the following tasks need to be completed:\n",
    "\n",
    "1. Identify all unique gene names present within the table. Each row can contains more than one gene name.\n",
    "2. If present, extract any entries starting with \"rs\" (presumably representing Single Nucleotide Polymorphisms or rsIDs) that correspond to the same row as their associated gene names. Each gene name can correspond with more than one SNPs.\n",
    "3. If available, extract any disease information associated with both the gene name and its corresponding SNP/rsID.\n",
    "\n",
    "It is crucial to maintain the utmost accuracy in this process, as any false or fabricated information (hallucination) can have severe consequences for academic integrity and research credibility.\n",
    "If an SNPs or Diseases is absent from the table, leave the corresponding field blank with an empty string ('').\n",
    "\n",
    "# RESPONSE #\n",
    "The output should only be a string containing list of JSON objects, each representing an entry with the following structure:\n",
    "[\n",
    "    {{\n",
    "        \"Genes\": \"A\",\n",
    "        \"SNPs\": [\"rs123\", \"rs456\"],\n",
    "        \"Diseases\": \"A disease\"\n",
    "    }}\n",
    "]\n",
    "\n",
    "If there is no specific extracted entities provided from the table, just leave the response with an empty lists ([]).\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "llm = ChatOpenAI(temperature=0, model_name=\"gpt-4-0125-preview\")\n",
    "\n",
    "result = llm.invoke(\"DO something\").content\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Genes</th>\n",
       "      <th>SNPs</th>\n",
       "      <th>Diseases</th>\n",
       "      <th>Title</th>\n",
       "      <th>Authors</th>\n",
       "      <th>Publisher Name</th>\n",
       "      <th>Publication Year</th>\n",
       "      <th>Population</th>\n",
       "      <th>Sample Size</th>\n",
       "      <th>Study Methodology</th>\n",
       "      <th>Study Level</th>\n",
       "      <th>Conclusion</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>GCK</td>\n",
       "      <td>rs1799884</td>\n",
       "      <td>GCK-MODY (MODY2), PNDM, CHI</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>American Indian population, Canadian Oji-Cree ...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>Candidate gene and genome-wide association stu...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study conducted by Yisheng Yang and Lawren...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>GCK</td>\n",
       "      <td>s4607517</td>\n",
       "      <td>GCK-MODY (MODY2), PNDM, CHI</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>American Indian population, Canadian Oji-Cree ...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>Candidate gene and genome-wide association stu...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study conducted by Yisheng Yang and Lawren...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>SLC2A2</td>\n",
       "      <td>rs5393</td>\n",
       "      <td>FBS</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>American Indian population, Canadian Oji-Cree ...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>Candidate gene and genome-wide association stu...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study conducted by Yisheng Yang and Lawren...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>SLC2A2</td>\n",
       "      <td>rs5394</td>\n",
       "      <td>FBS</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>American Indian population, Canadian Oji-Cree ...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>Candidate gene and genome-wide association stu...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study conducted by Yisheng Yang and Lawren...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>SLC2A2</td>\n",
       "      <td>rs5400</td>\n",
       "      <td>FBS</td>\n",
       "      <td>Monogenic Diabetes: What It Teaches Us on the ...</td>\n",
       "      <td>Yisheng Yang and Lawrence Chan</td>\n",
       "      <td>Endocrine Reviews</td>\n",
       "      <td>2016</td>\n",
       "      <td>American Indian population, Canadian Oji-Cree ...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>Candidate gene and genome-wide association stu...</td>\n",
       "      <td>Not Specified</td>\n",
       "      <td>The study conducted by Yisheng Yang and Lawren...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0   Genes       SNPs                     Diseases  \\\n",
       "0           0     GCK  rs1799884  GCK-MODY (MODY2), PNDM, CHI   \n",
       "1           1     GCK   s4607517  GCK-MODY (MODY2), PNDM, CHI   \n",
       "2           2  SLC2A2     rs5393                          FBS   \n",
       "3           3  SLC2A2     rs5394                          FBS   \n",
       "4           4  SLC2A2     rs5400                          FBS   \n",
       "\n",
       "                                               Title  \\\n",
       "0  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "1  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "2  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "3  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "4  Monogenic Diabetes: What It Teaches Us on the ...   \n",
       "\n",
       "                          Authors     Publisher Name  Publication Year  \\\n",
       "0  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "1  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "2  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "3  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "4  Yisheng Yang and Lawrence Chan  Endocrine Reviews              2016   \n",
       "\n",
       "                                          Population    Sample Size  \\\n",
       "0  American Indian population, Canadian Oji-Cree ...  Not Specified   \n",
       "1  American Indian population, Canadian Oji-Cree ...  Not Specified   \n",
       "2  American Indian population, Canadian Oji-Cree ...  Not Specified   \n",
       "3  American Indian population, Canadian Oji-Cree ...  Not Specified   \n",
       "4  American Indian population, Canadian Oji-Cree ...  Not Specified   \n",
       "\n",
       "                                   Study Methodology    Study Level  \\\n",
       "0  Candidate gene and genome-wide association stu...  Not Specified   \n",
       "1  Candidate gene and genome-wide association stu...  Not Specified   \n",
       "2  Candidate gene and genome-wide association stu...  Not Specified   \n",
       "3  Candidate gene and genome-wide association stu...  Not Specified   \n",
       "4  Candidate gene and genome-wide association stu...  Not Specified   \n",
       "\n",
       "                                          Conclusion  \n",
       "0  The study conducted by Yisheng Yang and Lawren...  \n",
       "1  The study conducted by Yisheng Yang and Lawren...  \n",
       "2  The study conducted by Yisheng Yang and Lawren...  \n",
       "3  The study conducted by Yisheng Yang and Lawren...  \n",
       "4  The study conducted by Yisheng Yang and Lawren...  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the 'Original' sheet of the results workbook and preview its first rows.\n",
    "RESULT_WORKBOOK = \"result/monogenic diabetes_8000.xlsx\"\n",
    "df = pd.read_excel(io=RESULT_WORKBOOK, sheet_name=\"Original\")\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\n",
      "  {\n",
      "    \"Genes\": \"GCK\",\n",
      "    \"SNPs\": \"rs1799884\",\n",
      "    \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"GCK\",\n",
      "    \"SNPs\": \"s4607517\",\n",
      "    \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"SLC2A2\",\n",
      "    \"SNPs\": \"rs5393\",\n",
      "    \"Diseases\": \"FBS\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"SLC2A2\",\n",
      "    \"SNPs\": \"rs5394\",\n",
      "    \"Diseases\": \"FBS\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"SLC2A2\",\n",
      "    \"SNPs\": \"rs5400\",\n",
      "    \"Diseases\": \"FBS\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"SLC2A2\",\n",
      "    \"SNPs\": \"rs5404\",\n",
      "    \"Diseases\": \"FBS\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF4A\",\n",
      "    \"SNPs\": \"rs2144908\",\n",
      "    \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF4A\",\n",
      "    \"SNPs\": \"S3818247\",\n",
      "    \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF4A\",\n",
      "    \"SNPs\": \"rs884614\",\n",
      "    \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF4A\",\n",
      "    \"SNPs\": \"rs4810424\",\n",
      "    \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF4A\",\n",
      "    \"SNPs\": \"s1884613\",\n",
      "    \"Diseases\": \"HNF4A MODY (MODY1), CHI\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF1B/TCF2\",\n",
      "    \"SNPs\": \"s757210\",\n",
      "    \"Diseases\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF1B/TCF2\",\n",
      "    \"SNPs\": \"S4430796\",\n",
      "    \"Diseases\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF1B/TCF2\",\n",
      "    \"SNPs\": \"rs7501939\",\n",
      "    \"Diseases\": \"RCAD syndrome, or MODY5; TNDM and PNDM (rare)\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"NEUROD1IBETA2\",\n",
      "    \"SNPs\": \"rs1801262\",\n",
      "    \"Diseases\": \"MODY6 and PNDM\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"WFS1\",\n",
      "    \"SNPs\": \"rs10010131\",\n",
      "    \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"WFS1\",\n",
      "    \"SNPs\": \"rs6446482\",\n",
      "    \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"WFS1\",\n",
      "    \"SNPs\": \"s10010131\",\n",
      "    \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"WFS1\",\n",
      "    \"SNPs\": \"1801213\",\n",
      "    \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"WFS1\",\n",
      "    \"SNPs\": \"rs734312\",\n",
      "    \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"PPARG\",\n",
      "    \"SNPs\": \"rs1801282\",\n",
      "    \"Diseases\": \"Monogenic diabetes   Monogenic Diabetes Genes Associated With Both common T1D and T2D\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"PPARG\",\n",
      "    \"SNPs\": \"rs4684847\",\n",
      "    \"Diseases\": \"Monogenic diabetes   Monogenic Diabetes Genes Associated With Both common T1D and T2D\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"GLIS3\",\n",
      "    \"SNPs\": \"rs7020673\",\n",
      "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"GLIS3\",\n",
      "    \"SNPs\": \"s7034200\",\n",
      "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"GLIS3\",\n",
      "    \"SNPs\": \"s7041847\",\n",
      "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"KCNJ11\",\n",
      "    \"SNPs\": \"E23K\",\n",
      "    \"Diseases\": \"Type 2 Diabetes\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"PPARG\",\n",
      "    \"SNPs\": \"S1369A\",\n",
      "    \"Diseases\": \"Type 1 Diabetes\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"INS\",\n",
      "    \"SNPs\": \"rs1799884\",\n",
      "    \"Diseases\": \"Neonatal Diabetes Mellitus\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"GLIS3\",\n",
      "    \"SNPs\": \"rs5400\",\n",
      "    \"Diseases\": \"Maturity-Onset Diabetes of the Young (MODY)\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"ABCC8\",\n",
      "    \"SNPs\": \"rs2650000\",\n",
      "    \"Diseases\": \"Wolfram syndrome 1\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"GCK\",\n",
      "    \"SNPs\": \"rs2144908\",\n",
      "    \"Diseases\": \"Fanconi-Bickel syndrome\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"SLC2A2\",\n",
      "    \"SNPs\": \"rs3818247\",\n",
      "    \"Diseases\": \"young-onset diabetes\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF1A\",\n",
      "    \"SNPs\": \"rs884614\",\n",
      "    \"Diseases\": \"prostate cancer\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"HNF1B\",\n",
      "    \"SNPs\": \"rs1884613\",\n",
      "    \"Diseases\": \"PNDM\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"PDX1\",\n",
      "    \"SNPs\": \"rs757210\",\n",
      "    \"Diseases\": \"KPD\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"PAX4\",\n",
      "    \"SNPs\": \"rs4430796\",\n",
      "    \"Diseases\": \"TNDM\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"NEUROD1\",\n",
      "    \"SNPs\": \"rs10229583\",\n",
      "    \"Diseases\": \"type 1b diabetes\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"WFS1\",\n",
      "    \"SNPs\": \"rs6467136\",\n",
      "    \"Diseases\": \"\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"MAFA\",\n",
      "    \"SNPs\": \"rs1801262\",\n",
      "    \"Diseases\": \"\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"Ccnd2\",\n",
      "    \"SNPs\": \"rs10010131\",\n",
      "    \"Diseases\": \"\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"NGN3\",\n",
      "    \"SNPs\": \"rs6446482\",\n",
      "    \"Diseases\": \"\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"FOXA2\",\n",
      "    \"SNPs\": \"rs1801282\",\n",
      "    \"Diseases\": \"\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"TCF2\",\n",
      "    \"SNPs\": \"rs780094\",\n",
      "    \"Diseases\": \"\"\n",
      "  }\n",
      "]\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "# Blank out missing cells in place so they serialise as \"\" instead of null.\n",
    "df.fillna(value='', inplace=True)\n",
    "\n",
    "# pandas writes the records; the stdlib then re-indents them for printing.\n",
    "json_table = df.loc[:, ['Genes', 'SNPs', 'Diseases']].to_json(orient='records')\n",
    "str_json_table = json.dumps(obj=json.loads(json_table), indent=2)\n",
    "print(str_json_table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
      "\n",
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from process import validate\n",
    "\n",
    "# Pass the table through the project's validate() and rebind df to the result.\n",
    "df = validate(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GCK        rs1799884  Match\n",
      "GCK        rs4607517  Match\n",
      "SLC2A2     rs5393     Not Available\n",
      "SLC2A2     rs5394     Not Available\n",
      "SLC2A2     rs5400     Match\n",
      "SLC2A2     rs5404     Not Available\n",
      "HNF4A      rs2144908  Not Available\n",
      "HNF4A      rs3818247  Match\n",
      "HNF4A      rs884614   Not Available\n",
      "HNF4A      rs4810424  Not Available\n",
      "HNF4A      rs1884613  Not Available\n",
      "HNF1B      rs757210   Match\n",
      "HNF1B      rs4430796  Match\n",
      "HNF1B      rs7501939  Match\n",
      "NEUROD1IBETA2 rs1801262  Not Available\n",
      "WFS1       rs10010131 Match\n",
      "WFS1       rs6446482  Not Available\n",
      "WFS1       rs10010131 Match\n",
      "WFS1       rs1801213  Not Available\n",
      "WFS1       rs734312   Match\n",
      "PPARG      rs1801282  Match\n",
      "PPARG      rs4684847  Match\n",
      "GLIS3      rs7020673  Match\n",
      "GLIS3      rs7034200  Match\n",
      "GLIS3      rs7041847  Match\n",
      "INS        rs1799884  Not Match\n",
      "GLIS3      rs5400     Not Match\n",
      "ABCC8      rs2650000  Not Match\n",
      "GCK        rs2144908  Not Available\n",
      "SLC2A2     rs3818247  Not Match\n",
      "HNF1A      rs884614   Not Available\n",
      "HNF1B      rs1884613  Not Available\n",
      "PDX1       rs757210   Not Match\n",
      "PAX4       rs4430796  Not Match\n",
      "NEUROD1    rs10229583 Not Match\n",
      "WFS1       rs6467136  Not Match\n",
      "MAFA       rs1801262  Not Available\n",
      "CCND2      rs10010131 Not Match\n",
      "NGN3       rs6446482  Not Available\n",
      "FOXA2      rs1801282  Not Match\n",
      "TCF2       rs780094   Not Match\n",
      "TCF2       rs757210   Not Match\n",
      "TCF2       rs4430796  Not Match\n",
      "TCF2       rs7501939  Not Match\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "\n",
    "GWAS_SNP_URL = 'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{}/'\n",
    "GWAS_TIMEOUT_S = 30  # seconds; without a timeout a stalled request blocks the cell forever\n",
    "\n",
    "# Cross-check each (gene, SNP) row of df against the EBI GWAS REST API and\n",
    "# print one status line per row: Match, Not Match or Not Available.\n",
    "for gene, snp in zip(df['Genes'], df['SNPs']):\n",
    "    response = requests.get(GWAS_SNP_URL.format(snp), timeout=GWAS_TIMEOUT_S)\n",
    "    try:\n",
    "        # An HTTP error status means there is no record to compare against.\n",
    "        payload = response.json() if response.ok else None\n",
    "    except ValueError:\n",
    "        # Body was not JSON. Only decoding errors are caught, so Ctrl-C still stops the loop.\n",
    "        payload = None\n",
    "\n",
    "    if not isinstance(payload, dict) or 'genomicContexts' not in payload:\n",
    "        print('{:<10} {:<10} Not Available'.format(gene, snp))\n",
    "        continue\n",
    "\n",
    "    # The row matches when any genomic context of the SNP names the row's gene.\n",
    "    is_match = any(ctx['gene']['geneName'] == gene for ctx in payload['genomicContexts'])\n",
    "    print('{:<10} {:<10} {}'.format(gene, snp, 'Match' if is_match else 'Not Match'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GCK        rs1799884  Match\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GCK        rs4607517  Not Match\n",
      "SLC2A2     rs5393     Match\n",
      "SLC2A2     rs5394     Match\n",
      "SLC2A2     rs5400     Match\n",
      "SLC2A2     rs5404     Match\n",
      "HNF4A      rs2144908  Match\n",
      "HNF4A      rs3818247  Match\n",
      "HNF4A      rs884614   Not Match\n",
      "HNF4A      rs4810424  Not Match\n",
      "HNF4A      rs1884613  Not Match\n",
      "HNF1B      rs757210   Match\n",
      "HNF1B      rs4430796  Match\n",
      "HNF1B      rs7501939  Match\n",
      "NEUROD1IBETA2 rs1801262  Not Match\n",
      "WFS1       rs10010131 Match\n",
      "WFS1       rs6446482  Match\n",
      "WFS1       rs10010131 Match\n",
      "WFS1       rs1801213  Match\n",
      "WFS1       rs734312   Match\n",
      "PPARG      rs1801282  Match\n",
      "PPARG      rs4684847  Match\n",
      "GLIS3      rs7020673  Match\n",
      "GLIS3      rs7034200  Match\n",
      "GLIS3      rs7041847  Match\n",
      "INS        rs1799884  Not Match\n",
      "GLIS3      rs5400     Not Match\n",
      "ABCC8      rs2650000  Not Match\n",
      "GCK        rs2144908  Not Match\n",
      "SLC2A2     rs3818247  Not Match\n",
      "HNF1A      rs884614   Not Match\n",
      "HNF1B      rs1884613  Not Match\n",
      "PDX1       rs757210   Not Match\n",
      "PAX4       rs4430796  Not Match\n",
      "NEUROD1    rs10229583 Not Match\n",
      "WFS1       rs6467136  Not Match\n",
      "MAFA       rs1801262  Not Match\n",
      "CCND2      rs10010131 Not Match\n",
      "NGN3       rs6446482  Not Match\n",
      "FOXA2      rs1801282  Not Match\n",
      "TCF2       rs780094   Not Match\n",
      "TCF2       rs757210   Not Match\n",
      "TCF2       rs4430796  Not Match\n",
      "TCF2       rs7501939  Not Match\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "import requests\n",
    "\n",
    "ESUMMARY_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'\n",
    "NCBI_TIMEOUT_S = 30  # seconds; without a timeout a stalled request blocks the cell forever\n",
    "\n",
    "# Cross-check each (gene, SNP) row of df against the NCBI E-utilities esummary endpoint\n",
    "# (db=snp) and print one status line per row: Match, Not Match or Not Available.\n",
    "for gene, snp in zip(df['Genes'], df['SNPs']):\n",
    "    # The endpoint is queried by the numeric part of the rs id. Parse it explicitly:\n",
    "    # slicing off two characters would query a wrong id for values such as 'E23K' or '1801213'.\n",
    "    rs_id = re.fullmatch(r'(?:rs)?(\\d+)', str(snp).strip(), flags=re.IGNORECASE)\n",
    "    summary = None\n",
    "    if rs_id:\n",
    "        snp_id = str(int(rs_id.group(1)))  # int() drops leading zeros so the id matches the result key\n",
    "        response = requests.get(\n",
    "            ESUMMARY_URL,\n",
    "            params={'db': 'snp', 'retmode': 'json', 'id': snp_id},\n",
    "            timeout=NCBI_TIMEOUT_S,\n",
    "        )\n",
    "        try:\n",
    "            # Error replies may lack 'result' or the id key; .get() keeps that from raising.\n",
    "            summary = response.json().get('result', {}).get(snp_id)\n",
    "        except ValueError:\n",
    "            summary = None  # body was not JSON\n",
    "\n",
    "    if not isinstance(summary, dict) or 'error' in summary:\n",
    "        print('{:<10} {:<10} Not Available'.format(gene, snp))\n",
    "        continue\n",
    "\n",
    "    # The row matches when any gene listed for the SNP carries the row's gene name.\n",
    "    is_match = any(entry.get('name') == gene for entry in summary.get('genes', []))\n",
    "    print('{:<10} {:<10} {}'.format(gene, snp, 'Match' if is_match else 'Not Match'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Here's the list of JSON objects with corrected gene names, SNPs, and diseases based on the given context:\n",
      "\n",
      "[\n",
      "  {\n",
      "    \"Genes\": \"GCK\",\n",
      "    \"SNPs\": \"rs1799884\",\n",
      "    \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"SLC24A2\",\n",
      "    \"SNPs\": \"rs5393\",\n",
      "    \"Diseases\": \"FBS\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"NEUROD1, INS\",\n",
      "    \"SNPs\": \"rs1801262\",\n",
      "    \"Diseases\": \"MODY6 and PNDM\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"WFS1\",\n",
      "    \"SNPs\": \"rs6446482\",\n",
      "    \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"GLIS3\",\n",
      "    \"SNPs\": \"rs7020673\",\n",
      "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
      "  },\n",
      "  {\n",
      "    \"Genes\": \"FTO\",\n",
      "    \"SNPs\": \"rs9937290\",\n",
      "    \"Diseases\": \"Obesity\"\n",
      "  }\n",
      "]\n",
      "\n",
      "Changes made:\n",
      "1. Corrected \"SLC242\" to \"SLC24A2\"\n",
      "2. Separated \"NEUROD1IBETA2\" into \"NEUROD1, INS\"\n",
      "3. Corrected \"GLI53\" to \"GLIS3\"\n",
      "4. Corrected \"FT0\" to \"FTO\"\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "# ChatOpenAI is pointed at the Perplexity endpoint via base_url; the API key is read from\n",
    "# the environment and the model is configured once on the client.\n",
    "llm = ChatOpenAI(\n",
    "    temperature=0,\n",
    "    model_name=\"mixtral-8x7b-instruct\",\n",
    "    api_key=os.environ['PERPLEXITY_API_KEY'],\n",
    "    base_url=\"https://api.perplexity.ai\",\n",
    ")\n",
    "\n",
    "# Plain (non-formatted) string: braces are sent to the model exactly as written, so the\n",
    "# JSON examples use single braces and the sample list carries no trailing comma.\n",
    "prompt = \"\"\"\n",
    "# CONTEXT #\n",
    "In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided in a list of JSON format, with each JSON object representing a single row in a tabular structure. \n",
    "The problem is because the data is extracted using OCR, some gene names and SNPs may have a typo.\n",
    "\n",
    "This is the data:\n",
    "[\n",
    "  {\n",
    "    \"Genes\": \"GCK\",\n",
    "    \"SNPs\": \"rs1799884\",\n",
    "    \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"SLC242\",\n",
    "    \"SNPs\": \"rs5393\",\n",
    "    \"Diseases\": \"FBS\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"NEUROD1IBETA2\",\n",
    "    \"SNPs\": \"rs1801262\",\n",
    "    \"Diseases\": \"MODY6 and PNDM\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"WFSI\",\n",
    "    \"SNPs\": \"rs6446482\",\n",
    "    \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"GLI53\",\n",
    "    \"SNPs\": \"rs7020673\",\n",
    "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"FT0\",\n",
    "    \"SNPs\": \"rs9937290\",\n",
    "    \"Diseases\": \"Obesity\"\n",
    "  }\n",
    "]\n",
    "\n",
    "# OBJECTIVE #\n",
    "Given the provided table data, the following tasks need to be completed:\n",
    "\n",
    "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
    "    - Combined Names: Two gene names erroneously merged into one. Separate these using \"and\": \"A and B\".\n",
    "    - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
    "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
    "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
    "\n",
    "# RESPONSE #\n",
    "The output must be only a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
    "[\n",
    "    {\n",
    "        \"Genes\": \"A\",\n",
    "        \"SNPs\": \"rs123\",\n",
    "        \"Diseases\": \"A disease\"\n",
    "    }\n",
    "]\n",
    "\"\"\"\n",
    "\n",
    "result = llm.invoke(prompt)\n",
    "print(result.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Here is the Python solution using the `json` module and a dictionary to map known gene names to their correct forms:\n",
      "```python\n",
      "import json\n",
      "\n",
      "# Known gene names and their corrections\n",
      "gene_corrections = {\n",
      "    \"SLC242\": \"SLC2A2\",\n",
      "    \"NEUROD1IBETA2\": \"NEUROD1\",\n",
      "    \"WFSI\": \"WFS1\",\n",
      "    \"GLI53\": \"GLIS3\",\n",
      "    \"FT0\": \"FTO\"\n",
      "}\n",
      "\n",
      "# Function to correct gene names and SNPs\n",
      "def correct_gene_data(data):\n",
      "    corrected_data = []\n",
      "    for entry in data:\n",
      "        genes = entry[\"Genes\"]\n",
      "        snps = entry[\"SNPs\"]\n",
      "        diseases = entry[\"Diseases\"]\n",
      "        \n",
      "        # Correct gene names\n",
      "        if genes in gene_corrections:\n",
      "            genes = gene_corrections[genes]\n",
      "        elif \" and \" not in genes:\n",
      "            # Check for combined names\n",
      "            parts = genes.split()\n",
      "            if len(parts) > 1:\n",
      "                genes = \" and \".join(parts)\n",
      "        \n",
      "        # Correct SNPs (assuming a dictionary of known SNPs for each gene)\n",
      "        snp_corrections = {\n",
      "            \"GCK\": {\"rs1799884\": \"rs1799884\"},\n",
      "            \"SLC2A2\": {\"rs5393\": \"rs5393\"},\n",
      "            \"NEUROD1\": {\"rs1801262\": \"rs1801262\"},\n",
      "            \"WFS1\": {\"rs6446482\": \"rs6446482\"},\n",
      "            \"GLIS3\": {\"rs7020673\": \"rs7020673\"},\n",
      "            \"FTO\": {\"rs9937290\": \"rs9937290\"}\n",
      "        }\n",
      "        if snps and genes in snp_corrections:\n",
      "            if snps not in snp_corrections[genes]:\n",
      "                snps = \"\"\n",
      "        \n",
      "        # Correct diseases (assuming a dictionary of known diseases for each gene)\n",
      "        disease_corrections = {\n",
      "            \"GCK\": {\"GCK-MODY (MODY2), PNDM, CHI\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
      "            \"SLC2A2\": {\"FBS\": \"FBS\"},\n",
      "            \"NEUROD1\": {\"MODY6 and PNDM\": \"MODY6 and PNDM\"},\n",
      "            \"WFS1\": {\"WFS1, sometimes referred to as DIDMOAD\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
      "            \"GLIS3\": {\"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
      "            \"FTO\": {\"Obesity\": \"Obesity\"}\n",
      "        }\n",
      "        if diseases and genes in disease_corrections:\n",
      "            if diseases not in disease_corrections[genes]:\n",
      "                diseases = \"\"\n",
      "        \n",
      "        # Add corrected entry to the list\n",
      "        if genes and snps and diseases:\n",
      "            corrected_data.append({\"Genes\": genes, \"SNPs\": snps, \"Diseases\": diseases})\n",
      "    \n",
      "    return json.dumps(corrected_data)\n",
      "\n",
      "# Input data\n",
      "data = [\n",
      "    {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
      "    {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
      "    {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
      "    {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
      "    {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
      "    {\"Genes\": \"FT0\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
      "]\n",
      "\n",
      "# Correct and output the data\n",
      "print(correct_gene_data(data))\n",
      "```\n",
      "This will output the corrected data in the same format as the input:\n",
      "```\n",
      "[\n",
      "    {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
      "    {\"Genes\": \"SLC2A2\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
      "    {\"Genes\": \"NEUROD1\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
      "    {\"Genes\": \"WFS1\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
      "    {\"Genes\": \"GLIS3\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
      "    {\"Genes\": \"FTO\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
      "]\n",
      "```\n",
      "Note that this implementation assumes a dictionary of known gene names, SNPs, and diseases for correction. You may need to expand or modify these dictionaries based on your specific use case.\n"
     ]
    }
   ],
   "source": [
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
    "\n",
    "prompt = \"\"\"\n",
    "# CONTEXT #\n",
    "In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided in a list of JSON format, with each JSON object representing a single row in a tabular structure. \n",
    "The problem is because the data is extracted using OCR, some gene names and SNPs may have a typo.\n",
    "\n",
    "This is the data:\n",
    "[\n",
    "  {\n",
    "    \"Genes\": \"GCK\",\n",
    "    \"SNPs\": \"rs1799884\",\n",
    "    \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"SLC242\",\n",
    "    \"SNPs\": \"rs5393\",\n",
    "    \"Diseases\": \"FBS\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"NEUROD1IBETA2\",\n",
    "    \"SNPs\": \"rs1801262\",\n",
    "    \"Diseases\": \"MODY6 and PNDM\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"WFSI\",\n",
    "    \"SNPs\": \"rs6446482\",\n",
    "    \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"GLI53\",\n",
    "    \"SNPs\": \"rs7020673\",\n",
    "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
    "  },\n",
    "  {\n",
    "    \"Genes\": \"FT0\",\n",
    "    \"SNPs\": \"rs9937290\",\n",
    "    \"Diseases\": \"Obesity\"\n",
    "  },\n",
    "]\n",
    "\n",
    "# OBJECTIVE #\n",
    "Given the provided table data, the following tasks need to be completed:\n",
    "\n",
    "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
    "    - Combined Names: Two gene names erroneously merged into one. Duplicate this data row so each gene name has its own data.\n",
    "    - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
    "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
    "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
    "\n",
    "# RESPONSE #\n",
    "The output must be STRICTLY ONLY a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
    "[\n",
    "    {{\n",
    "        \"Genes\": \"A\",\n",
    "        \"SNPs\": \"rs123\",\n",
    "        \"Diseases\": \"A disease\"\n",
    "    }}\n",
    "]\n",
    "\"\"\"\n",
    "\n",
    "result = llm.invoke(model='llama-3-70b-instruct', input=prompt)\n",
    "print(result.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py:1429: FutureWarning: The repository for bigbio/euadr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bigbio/euadr\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "ename": "ConnectionError",
     "evalue": "Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mConnectionError\u001b[0m                           Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-2-8057498175ab>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mdatasets\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"bigbio/euadr\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py\u001b[0m in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[0;32m   2547\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2548\u001b[0m     \u001b[1;31m# Download and prepare data\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2549\u001b[1;33m     builder_instance.download_and_prepare(\n\u001b[0m\u001b[0;32m   2550\u001b[0m         \u001b[0mdownload_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2551\u001b[0m         \u001b[0mdownload_mode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36mdownload_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m   1003\u001b[0m                         \u001b[1;32mif\u001b[0m \u001b[0mnum_proc\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1004\u001b[0m                             \u001b[0mprepare_split_kwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"num_proc\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnum_proc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1005\u001b[1;33m                         self._download_and_prepare(\n\u001b[0m\u001b[0;32m   1006\u001b[0m                             \u001b[0mdl_manager\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1007\u001b[0m                             \u001b[0mverification_mode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m   1765\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1766\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_download_and_prepare\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mprepare_splits_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1767\u001b[1;33m         super()._download_and_prepare(\n\u001b[0m\u001b[0;32m   1768\u001b[0m             \u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1769\u001b[0m             \u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m   1076\u001b[0m         \u001b[0msplit_dict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSplitDict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataset_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1077\u001b[0m         \u001b[0msplit_generators_kwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_split_generators_kwargs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprepare_split_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1078\u001b[1;33m         \u001b[0msplit_generators\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_split_generators\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0msplit_generators_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1079\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1080\u001b[0m         \u001b[1;31m# Checksums verification\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\.cache\\huggingface\\modules\\datasets_modules\\datasets\\bigbio--euadr\\38388d88a335f2d91807b0f813bdfd809fec0e9dcbc32e2d9bfea7275d70f75c\\euadr.py\u001b[0m in \u001b[0;36m_split_generators\u001b[1;34m(self, dl_manager)\u001b[0m\n\u001b[0;32m    105\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_split_generators\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    106\u001b[0m         \u001b[0murls\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_URL\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 107\u001b[1;33m         \u001b[0mdatapath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload_and_extract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    108\u001b[0m         return [\n\u001b[0;32m    109\u001b[0m             datasets.SplitGenerator(\n",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36mdownload_and_extract\u001b[1;34m(self, url_or_urls)\u001b[0m\n\u001b[0;32m    560\u001b[0m             \u001b[0mextracted_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mextracted\u001b[0m \u001b[0mpaths\u001b[0m \u001b[0mof\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mURL\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    561\u001b[0m         \"\"\"\n\u001b[1;32m--> 562\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_urls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    563\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    564\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mget_recorded_sizes_checksums\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36mdownload\u001b[1;34m(self, url_or_urls)\u001b[0m\n\u001b[0;32m    424\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    425\u001b[0m         \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 426\u001b[1;33m         downloaded_path_or_paths = map_nested(\n\u001b[0m\u001b[0;32m    427\u001b[0m             \u001b[0mdownload_func\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    428\u001b[0m             \u001b[0murl_or_urls\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\py_utils.py\u001b[0m in \u001b[0;36mmap_nested\u001b[1;34m(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, types, disable_tqdm, desc)\u001b[0m\n\u001b[0;32m    457\u001b[0m     \u001b[1;31m# Singleton\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    458\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 459\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    460\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    461\u001b[0m     \u001b[0miterable\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32melse\u001b[0m \u001b[0mdata_struct\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36m_download\u001b[1;34m(self, url_or_filename, download_config)\u001b[0m\n\u001b[0;32m    449\u001b[0m             \u001b[1;31m# append the relative path to the base_path\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    450\u001b[0m             \u001b[0murl_or_filename\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0murl_or_path_join\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_base_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 451\u001b[1;33m         \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcached_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    452\u001b[0m         \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtracked_str\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    453\u001b[0m         \u001b[0mout\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_origin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\file_utils.py\u001b[0m in \u001b[0;36mcached_path\u001b[1;34m(url_or_filename, download_config, **download_kwargs)\u001b[0m\n\u001b[0;32m    186\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mis_remote_url\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    187\u001b[0m         \u001b[1;31m# URL, so get it from the cache (downloading if necessary)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 188\u001b[1;33m         output_path = get_from_cache(\n\u001b[0m\u001b[0;32m    189\u001b[0m             \u001b[0murl_or_filename\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    190\u001b[0m             \u001b[0mcache_dir\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\file_utils.py\u001b[0m in \u001b[0;36mget_from_cache\u001b[1;34m(url, cache_dir, force_download, proxies, etag_timeout, resume_download, user_agent, local_files_only, use_etag, max_retries, token, use_auth_token, ignore_url_params, storage_options, download_desc)\u001b[0m\n\u001b[0;32m    571\u001b[0m         \u001b[0m_raise_if_offline_mode_is_enabled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Tried to reach {url}\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    572\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mhead_error\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 573\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Couldn't reach {url} ({repr(head_error)})\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    574\u001b[0m         \u001b[1;32melif\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    575\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Couldn't reach {url} (error {response.status_code})\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mConnectionError\u001b[0m: Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset = load_dataset(\"bigbio/euadr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}