Spaces:

KalbeDigitalLab
/

nutrigenme-paper-extractor

Running

App Files Community

fadliaulawi committed on May 8

Commit

3f96f05

•

1 Parent(s): 2ec4f41

Explore Gemini model

Browse files

Files changed (4) hide show

app.py +15 -6
process.py +7 -4
requirements.txt +22 -21
resources/experiment.ipynb +125 -98

app.py CHANGED Viewed

@@ -31,11 +31,21 @@ st.markdown("<div style='text-align: left; color: white; font-size: 16px'>In its
 uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
-chunk_option = st.selectbox(
-    'Tokens amounts per process :',
-    (32000, 16000, 8000), key='table_hv'
-)
-chunk_overlap = 0
 if uploaded_files:
     journals = []
@@ -45,7 +55,6 @@ if uploaded_files:
         with st.status("Extraction in progress ...", expanded=True) as status:
             start_time = datetime.now()
-            csv = pd.DataFrame()
             for uploaded_file in stqdm(uploaded_files):
                 with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
                     pdf.write(uploaded_file.getbuffer())

 uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
+col1, col2 = st.columns(2)
+with col1:
+    chunk_option = st.selectbox(
+        'Token amounts per process:',
+        (24000, 16000, 8000), key='token'
+    )
+    chunk_overlap = 0
+with col2:
+    model = st.selectbox(
+        'Model selection: (UNDER DEVELOPED)',
+        # 128000, 32768, 1048576
+        ('gpt-4-turbo', 'llama-3-sonar-large-32k-chat', 'gemini-1.5-pro-latest'), key='model'
+    )
 if uploaded_files:
     journals = []
         with st.status("Extraction in progress ...", expanded=True) as status:
             start_time = datetime.now()
             for uploaded_file in stqdm(uploaded_files):
                 with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
                     pdf.write(uploaded_file.getbuffer())

process.py CHANGED Viewed

@@ -11,6 +11,7 @@ from pdf2image import convert_from_path
 from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table, prompt_validation
 from table_detector import detection_transform, device, model, ocr, outputs_to_objects
 import io
 import json
 import os
@@ -19,9 +20,11 @@ import re
 import torch
 load_dotenv()
-llm = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview")
-llm_p = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
 prompts = {
     'gsd': [prompt_entity_gsd_chunk, prompt_entity_gsd_combine],
@@ -221,7 +224,7 @@ def validate(df):
     json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
     str_json_table = json.dumps(json.loads(json_table), indent=2)
-    result = llm_p.invoke(model='mistral-7b-instruct', input=prompt_validation.format(str_json_table)).content
     print('val')
     print(result)
@@ -234,6 +237,6 @@ def validate(df):
     df_val = pd.DataFrame(result)
     df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
-    # TODO: How to validate genes and SNPs?
     return df, df_val

 from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table, prompt_validation
 from table_detector import detection_transform, device, model, ocr, outputs_to_objects
+import google.generativeai as genai
 import io
 import json
 import os
 import torch
 load_dotenv()
+genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
+llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo")
+llm_p = ChatOpenAI(temperature=0, model_name="llama-3-sonar-large-32k-chat", api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
+llm_g = genai.GenerativeModel(model_name='gemini-1.5-pro-latest')
 prompts = {
     'gsd': [prompt_entity_gsd_chunk, prompt_entity_gsd_combine],
     json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
     str_json_table = json.dumps(json.loads(json_table), indent=2)
+    result = llm_p.invoke(input=prompt_validation.format(str_json_table)).content
     print('val')
     print(result)
     df_val = pd.DataFrame(result)
     df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
+    # TODO: How to validate genes and SNPs with ground truth?
     return df, df_val

requirements.txt CHANGED Viewed

@@ -1,21 +1,22 @@
-pikepdf
-stqdm
-pdf2image
-nltk
-pandas
-streamlit
-xlsxwriter
-openai
-biopython
-langchain
-pypdf
-tiktoken
-pillow-heif
-torchvision
-transformers
-python-dotenv
-rapidocr-onnxruntime
-langchain-openai
-img2table
-timm
-python-doctr

+pikepdf==8.13.0
+stqdm==0.0.5
+pdf2image==1.17.0
+nltk==3.8.1
+pandas==2.2.2
+streamlit==1.33.0
+xlsxwriter==3.2.0
+openai==1.26.0
+biopython==1.83
+langchain==0.1.13
+pypdf==4.1.0
+tiktoken==0.5.2
+pillow-heif==0.15.0
+torchvision==0.15.2
+transformers==4.38.2
+python-dotenv==1.0.1
+rapidocr-onnxruntime==1.3.15
+langchain-openai==0.1.6
+img2table==1.2.11
+timm==0.9.16
+python-doctr==0.8.1
+google-generativeai==0.5.2

resources/experiment.ipynb CHANGED Viewed

@@ -2058,69 +2058,59 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "To complete the given tasks, I will first provide a list of the correct gene names and their corresponding diseases. Then, I will validate the provided data and correct any discrepancies in the gene names and diseases.\n",
-      "\n",
-      "Correct gene names and their corresponding diseases:\n",
-      "\n",
-      "1. GCK: GCK-MODY (MODY2), PNDM, CHI\n",
-      "2. SLC17A4 (formerly SLC242): FBS\n",
-      "3. NEUROD1: MODY6 and PNDM\n",
-      "4. WFS1: WFS1, sometimes referred to as DIDMOAD\n",
-      "5. GLI3: Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\n",
-      "\n",
-      "Validated JSON objects:\n",
       "\n",
-      "```json\n",
       "[\n",
-      "    {\n",
-      "        \"Genes\": \"GCK\",\n",
-      "        \"SNPs\": \"rs1799884\",\n",
-      "        \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
-      "    },\n",
-      "    {\n",
-      "        \"Genes\": \"SLC17A4\",\n",
-      "        \"SNPs\": \"rs5393\",\n",
-      "        \"Diseases\": \"FBS\"\n",
-      "    },\n",
-      "    {\n",
-      "        \"Genes\": \"NEUROD1\",\n",
-      "        \"SNPs\": \"rs1801262\",\n",
-      "        \"Diseases\": \"MODY6 and PNDM\"\n",
-      "    },\n",
-      "    {\n",
-      "        \"Genes\": \"WFS1\",\n",
-      "        \"SNPs\": \"rs6446482\",\n",
-      "        \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
-      "    },\n",
-      "    {\n",
-      "        \"Genes\": \"GLI3\",\n",
-      "        \"SNPs\": \"rs7020673\",\n",
-      "        \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
-      "    }\n",
       "]\n",
-      "```\n",
       "\n",
-      "Explanation:\n",
-      "\n",
-      "1. Gene GCK is correct.\n",
-      "2. Gene SLC242 is corrected to SLC17A4.\n",
-      "3. Gene NEUROD1IBETA2 is corrected to NEUROD1.\n",
-      "4. Gene WFSI is correct.\n",
-      "5. Gene GLI53 is corrected to GLI3.\n",
-      "\n",
-      "The SNPs and diseases are not corrected since they are not suspected of having typos.\n"
      ]
     }
    ],
    "source": [
     "from langchain_openai import ChatOpenAI\n",
     "\n",
     "llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
     "\n",
@@ -2156,16 +2146,24 @@
     "    \"SNPs\": \"rs7020673\",\n",
     "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
     "  },\n",
     "]\n",
     "\n",
     "# OBJECTIVE #\n",
     "Given the provided table data, the following tasks need to be completed:\n",
     "\n",
-    "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If not, eliminate this row data because the gene name is invalid. \n",
-    "2. If diseases are not empty, check whether the gene name corresponds with the gene names. Fix it with the correct diseases if the original disease is wrong.\n",
     "\n",
     "# RESPONSE #\n",
-    "The output should only be a string containing a list of JSON objects, each representing a validated entry with the following structure:\n",
     "[\n",
     "    {{\n",
     "        \"Genes\": \"A\",\n",
@@ -2181,79 +2179,100 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Failed to batch ingest runs: TypeError('sequence item 0: expected str instance, ReadTimeoutError found')\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "To accomplish this task, we'll need a reference list of correct gene names and their corresponding diseases. Let's assume we have a dictionary `gene_reference` that maps correct gene names to their corresponding diseases.\n",
-      "\n",
-      "Here's a Python solution using the `json` module:\n",
       "```python\n",
       "import json\n",
       "\n",
-      "# Reference list of correct gene names and their corresponding diseases\n",
-      "gene_reference = {\n",
-      "    \"GCK\": \"GCK-MODY (MODY2), PNDM, CHI\",\n",
-      "    \"SLC2A2\": \"FBS\",\n",
-      "    \"NEUROD1\": \"MODY6 and PNDM\",\n",
-      "    \"WFS1\": \"WFS1, sometimes referred to as DIDMOAD\",\n",
-      "    \"GLIS3\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
       "}\n",
       "\n",
-      "def validate_gene_name(gene_name):\n",
-      "    # Simple typo correction using Levenshtein distance (you can use a more advanced method if needed)\n",
-      "    min_distance = float('inf')\n",
-      "    closest_gene = None\n",
-      "    for ref_gene in gene_reference:\n",
-      "        distance = sum(el1 != el2 for el1, el2 in zip(gene_name, ref_gene))\n",
-      "        if distance < min_distance:\n",
-      "            min_distance = distance\n",
-      "            closest_gene = ref_gene\n",
-      "    return closest_gene if min_distance <= 2 else None  # adjust the threshold as needed\n",
-      "\n",
-      "def validate_data(data):\n",
-      "    validated_data = []\n",
       "    for entry in data:\n",
-      "        gene_name = entry[\"Genes\"]\n",
-      "        corrected_gene_name = validate_gene_name(gene_name)\n",
-      "        if corrected_gene_name:\n",
-      "            entry[\"Genes\"] = corrected_gene_name\n",
-      "            if entry[\"Diseases\"]:\n",
-      "                entry[\"Diseases\"] = gene_reference[corrected_gene_name]\n",
-      "            validated_data.append(entry)\n",
-      "    return json.dumps(validated_data)\n",
       "\n",
       "data = [\n",
       "    {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
       "    {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
       "    {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
       "    {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
-      "    {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"}\n",
       "]\n",
       "\n",
-      "print(validate_data(data))\n",
       "```\n",
-      "This script will output:\n",
       "```\n",
       "[\n",
       "    {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
       "    {\"Genes\": \"SLC2A2\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
       "    {\"Genes\": \"NEUROD1\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
       "    {\"Genes\": \"WFS1\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
-      "    {\"Genes\": \"GLIS3\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"}\n",
       "]\n",
       "```\n",
-      "Note that this implementation uses a simple Levenshtein distance-based approach for typo correction, which may not be sufficient for all cases. You may want to consider using more advanced methods, such as fuzzy matching or machine learning-based approaches, depending on the complexity of your data.\n"
      ]
     }
    ],
@@ -2294,16 +2313,24 @@
     "    \"SNPs\": \"rs7020673\",\n",
     "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
     "  },\n",
     "]\n",
     "\n",
     "# OBJECTIVE #\n",
     "Given the provided table data, the following tasks need to be completed:\n",
     "\n",
-    "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If not, eliminate this row data because the gene name is invalid. \n",
-    "2. If diseases are not empty, check whether the gene name corresponds with the gene names. Fix it with the correct diseases if the original disease is wrong.\n",
     "\n",
     "# RESPONSE #\n",
-    "The output should only be a string containing a list of JSON objects, each representing a validated entry with the following structure:\n",
     "[\n",
     "    {{\n",
     "        \"Genes\": \"A\",\n",

   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Here's the list of JSON objects with corrected gene names, SNPs, and diseases based on the given context:\n",
       "\n",
       "[\n",
+      "  {\n",
+      "    \"Genes\": \"GCK\",\n",
+      "    \"SNPs\": \"rs1799884\",\n",
+      "    \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"Genes\": \"SLC24A2\",\n",
+      "    \"SNPs\": \"rs5393\",\n",
+      "    \"Diseases\": \"FBS\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"Genes\": \"NEUROD1, INS\",\n",
+      "    \"SNPs\": \"rs1801262\",\n",
+      "    \"Diseases\": \"MODY6 and PNDM\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"Genes\": \"WFS1\",\n",
+      "    \"SNPs\": \"rs6446482\",\n",
+      "    \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"Genes\": \"GLIS3\",\n",
+      "    \"SNPs\": \"rs7020673\",\n",
+      "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"Genes\": \"FTO\",\n",
+      "    \"SNPs\": \"rs9937290\",\n",
+      "    \"Diseases\": \"Obesity\"\n",
+      "  }\n",
       "]\n",
       "\n",
+      "Changes made:\n",
+      "1. Corrected \"SLC242\" to \"SLC24A2\"\n",
+      "2. Separated \"NEUROD1IBETA2\" into \"NEUROD1, INS\"\n",
+      "3. Corrected \"GLI53\" to \"GLIS3\"\n",
+      "4. Corrected \"FT0\" to \"FTO\"\n"
      ]
     }
    ],
    "source": [
     "from langchain_openai import ChatOpenAI\n",
+    "import os\n",
     "\n",
     "llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
     "\n",
     "    \"SNPs\": \"rs7020673\",\n",
     "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
     "  },\n",
+    "  {\n",
+    "    \"Genes\": \"FT0\",\n",
+    "    \"SNPs\": \"rs9937290\",\n",
+    "    \"Diseases\": \"Obesity\"\n",
+    "  },\n",
     "]\n",
     "\n",
     "# OBJECTIVE #\n",
     "Given the provided table data, the following tasks need to be completed:\n",
     "\n",
+    "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
+    "    - Combined Names: Two gene names erroneously merged into one. Separate these using \"and\": \"A and B\".\n",
+    "    - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
+    "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
+    "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
     "\n",
     "# RESPONSE #\n",
+    "The output must be only a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
     "[\n",
     "    {{\n",
     "        \"Genes\": \"A\",\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Here is the Python solution using the `json` module and a dictionary to map known gene names to their correct forms:\n",
       "```python\n",
       "import json\n",
       "\n",
+      "# Known gene names and their corrections\n",
+      "gene_corrections = {\n",
+      "    \"SLC242\": \"SLC2A2\",\n",
+      "    \"NEUROD1IBETA2\": \"NEUROD1\",\n",
+      "    \"WFSI\": \"WFS1\",\n",
+      "    \"GLI53\": \"GLIS3\",\n",
+      "    \"FT0\": \"FTO\"\n",
       "}\n",
       "\n",
+      "# Function to correct gene names and SNPs\n",
+      "def correct_gene_data(data):\n",
+      "    corrected_data = []\n",
       "    for entry in data:\n",
+      "        genes = entry[\"Genes\"]\n",
+      "        snps = entry[\"SNPs\"]\n",
+      "        diseases = entry[\"Diseases\"]\n",
+      "        \n",
+      "        # Correct gene names\n",
+      "        if genes in gene_corrections:\n",
+      "            genes = gene_corrections[genes]\n",
+      "        elif \" and \" not in genes:\n",
+      "            # Check for combined names\n",
+      "            parts = genes.split()\n",
+      "            if len(parts) > 1:\n",
+      "                genes = \" and \".join(parts)\n",
+      "        \n",
+      "        # Correct SNPs (assuming a dictionary of known SNPs for each gene)\n",
+      "        snp_corrections = {\n",
+      "            \"GCK\": {\"rs1799884\": \"rs1799884\"},\n",
+      "            \"SLC2A2\": {\"rs5393\": \"rs5393\"},\n",
+      "            \"NEUROD1\": {\"rs1801262\": \"rs1801262\"},\n",
+      "            \"WFS1\": {\"rs6446482\": \"rs6446482\"},\n",
+      "            \"GLIS3\": {\"rs7020673\": \"rs7020673\"},\n",
+      "            \"FTO\": {\"rs9937290\": \"rs9937290\"}\n",
+      "        }\n",
+      "        if snps and genes in snp_corrections:\n",
+      "            if snps not in snp_corrections[genes]:\n",
+      "                snps = \"\"\n",
+      "        \n",
+      "        # Correct diseases (assuming a dictionary of known diseases for each gene)\n",
+      "        disease_corrections = {\n",
+      "            \"GCK\": {\"GCK-MODY (MODY2), PNDM, CHI\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
+      "            \"SLC2A2\": {\"FBS\": \"FBS\"},\n",
+      "            \"NEUROD1\": {\"MODY6 and PNDM\": \"MODY6 and PNDM\"},\n",
+      "            \"WFS1\": {\"WFS1, sometimes referred to as DIDMOAD\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
+      "            \"GLIS3\": {\"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
+      "            \"FTO\": {\"Obesity\": \"Obesity\"}\n",
+      "        }\n",
+      "        if diseases and genes in disease_corrections:\n",
+      "            if diseases not in disease_corrections[genes]:\n",
+      "                diseases = \"\"\n",
+      "        \n",
+      "        # Add corrected entry to the list\n",
+      "        if genes and snps and diseases:\n",
+      "            corrected_data.append({\"Genes\": genes, \"SNPs\": snps, \"Diseases\": diseases})\n",
+      "    \n",
+      "    return json.dumps(corrected_data)\n",
       "\n",
+      "# Input data\n",
       "data = [\n",
       "    {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
       "    {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
       "    {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
       "    {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
+      "    {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
+      "    {\"Genes\": \"FT0\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
       "]\n",
       "\n",
+      "# Correct and output the data\n",
+      "print(correct_gene_data(data))\n",
       "```\n",
+      "This will output the corrected data in the same format as the input:\n",
       "```\n",
       "[\n",
       "    {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
       "    {\"Genes\": \"SLC2A2\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
       "    {\"Genes\": \"NEUROD1\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
       "    {\"Genes\": \"WFS1\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
+      "    {\"Genes\": \"GLIS3\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
+      "    {\"Genes\": \"FTO\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
       "]\n",
       "```\n",
+      "Note that this implementation assumes a dictionary of known gene names, SNPs, and diseases for correction. You may need to expand or modify these dictionaries based on your specific use case.\n"
      ]
     }
    ],
     "    \"SNPs\": \"rs7020673\",\n",
     "    \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
     "  },\n",
+    "  {\n",
+    "    \"Genes\": \"FT0\",\n",
+    "    \"SNPs\": \"rs9937290\",\n",
+    "    \"Diseases\": \"Obesity\"\n",
+    "  },\n",
     "]\n",
     "\n",
     "# OBJECTIVE #\n",
     "Given the provided table data, the following tasks need to be completed:\n",
     "\n",
+    "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
+    "    - Combined Names: Two gene names erroneously merged into one. Duplicate this data row so each gene name has its own data.\n",
+    "    - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
+    "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
+    "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
     "\n",
     "# RESPONSE #\n",
+    "The output must be STRICTLY ONLY a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
     "[\n",
     "    {{\n",
     "        \"Genes\": \"A\",\n",