fadliaulawi
committed on
Commit
•
3f96f05
1
Parent(s):
2ec4f41
Explore Gemini model
Browse files
- app.py +15 -6
- process.py +7 -4
- requirements.txt +22 -21
- resources/experiment.ipynb +125 -98
app.py
CHANGED
@@ -31,11 +31,21 @@ st.markdown("<div style='text-align: left; color: white; font-size: 16px'>In its
|
|
31 |
|
32 |
uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
if uploaded_files:
|
41 |
journals = []
|
@@ -45,7 +55,6 @@ if uploaded_files:
|
|
45 |
with st.status("Extraction in progress ...", expanded=True) as status:
|
46 |
start_time = datetime.now()
|
47 |
|
48 |
-
csv = pd.DataFrame()
|
49 |
for uploaded_file in stqdm(uploaded_files):
|
50 |
with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
|
51 |
pdf.write(uploaded_file.getbuffer())
|
|
|
31 |
|
32 |
uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
|
33 |
|
34 |
+
col1, col2 = st.columns(2)
|
35 |
+
|
36 |
+
with col1:
|
37 |
+
chunk_option = st.selectbox(
|
38 |
+
'Token amounts per process:',
|
39 |
+
(24000, 16000, 8000), key='token'
|
40 |
+
)
|
41 |
+
chunk_overlap = 0
|
42 |
+
|
43 |
+
with col2:
|
44 |
+
model = st.selectbox(
|
45 |
+
'Model selection: (UNDER DEVELOPED)',
|
46 |
+
# 128000, 32768, 1048576
|
47 |
+
('gpt-4-turbo', 'llama-3-sonar-large-32k-chat', 'gemini-1.5-pro-latest'), key='model'
|
48 |
+
)
|
49 |
|
50 |
if uploaded_files:
|
51 |
journals = []
|
|
|
55 |
with st.status("Extraction in progress ...", expanded=True) as status:
|
56 |
start_time = datetime.now()
|
57 |
|
|
|
58 |
for uploaded_file in stqdm(uploaded_files):
|
59 |
with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
|
60 |
pdf.write(uploaded_file.getbuffer())
|
process.py
CHANGED
@@ -11,6 +11,7 @@ from pdf2image import convert_from_path
|
|
11 |
from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table, prompt_validation
|
12 |
from table_detector import detection_transform, device, model, ocr, outputs_to_objects
|
13 |
|
|
|
14 |
import io
|
15 |
import json
|
16 |
import os
|
@@ -19,9 +20,11 @@ import re
|
|
19 |
import torch
|
20 |
|
21 |
load_dotenv()
|
|
|
22 |
|
23 |
-
llm = ChatOpenAI(temperature=0, model_name="gpt-4-
|
24 |
-
llm_p = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
|
|
|
25 |
|
26 |
prompts = {
|
27 |
'gsd': [prompt_entity_gsd_chunk, prompt_entity_gsd_combine],
|
@@ -221,7 +224,7 @@ def validate(df):
|
|
221 |
json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
|
222 |
str_json_table = json.dumps(json.loads(json_table), indent=2)
|
223 |
|
224 |
-
result = llm_p.invoke(
|
225 |
print('val')
|
226 |
print(result)
|
227 |
|
@@ -234,6 +237,6 @@ def validate(df):
|
|
234 |
df_val = pd.DataFrame(result)
|
235 |
df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
|
236 |
|
237 |
-
# TODO: How to validate genes and SNPs?
|
238 |
|
239 |
return df, df_val
|
|
|
11 |
from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table, prompt_validation
|
12 |
from table_detector import detection_transform, device, model, ocr, outputs_to_objects
|
13 |
|
14 |
+
import google.generativeai as genai
|
15 |
import io
|
16 |
import json
|
17 |
import os
|
|
|
20 |
import torch
|
21 |
|
22 |
load_dotenv()
|
23 |
+
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
|
24 |
|
25 |
+
llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo")
|
26 |
+
llm_p = ChatOpenAI(temperature=0, model_name="llama-3-sonar-large-32k-chat", api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
|
27 |
+
llm_g = genai.GenerativeModel(model_name='gemini-1.5-pro-latest')
|
28 |
|
29 |
prompts = {
|
30 |
'gsd': [prompt_entity_gsd_chunk, prompt_entity_gsd_combine],
|
|
|
224 |
json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
|
225 |
str_json_table = json.dumps(json.loads(json_table), indent=2)
|
226 |
|
227 |
+
result = llm_p.invoke(input=prompt_validation.format(str_json_table)).content
|
228 |
print('val')
|
229 |
print(result)
|
230 |
|
|
|
237 |
df_val = pd.DataFrame(result)
|
238 |
df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
|
239 |
|
240 |
+
# TODO: How to validate genes and SNPs with ground truth?
|
241 |
|
242 |
return df, df_val
|
requirements.txt
CHANGED
@@ -1,21 +1,22 @@
|
|
1 |
-
pikepdf
|
2 |
-
stqdm
|
3 |
-
pdf2image
|
4 |
-
nltk
|
5 |
-
pandas
|
6 |
-
streamlit
|
7 |
-
xlsxwriter
|
8 |
-
openai
|
9 |
-
biopython
|
10 |
-
langchain
|
11 |
-
pypdf
|
12 |
-
tiktoken
|
13 |
-
pillow-heif
|
14 |
-
torchvision
|
15 |
-
transformers
|
16 |
-
python-dotenv
|
17 |
-
rapidocr-onnxruntime
|
18 |
-
langchain-openai
|
19 |
-
img2table
|
20 |
-
timm
|
21 |
-
python-doctr
|
|
|
|
1 |
+
pikepdf==8.13.0
|
2 |
+
stqdm==0.0.5
|
3 |
+
pdf2image==1.17.0
|
4 |
+
nltk==3.8.1
|
5 |
+
pandas==2.2.2
|
6 |
+
streamlit==1.33.0
|
7 |
+
xlsxwriter==3.2.0
|
8 |
+
openai==1.26.0
|
9 |
+
biopython==1.83
|
10 |
+
langchain==0.1.13
|
11 |
+
pypdf==4.1.0
|
12 |
+
tiktoken==0.5.2
|
13 |
+
pillow-heif==0.15.0
|
14 |
+
torchvision==0.15.2
|
15 |
+
transformers==4.38.2
|
16 |
+
python-dotenv==1.0.1
|
17 |
+
rapidocr-onnxruntime==1.3.15
|
18 |
+
langchain-openai==0.1.6
|
19 |
+
img2table==1.2.11
|
20 |
+
timm==0.9.16
|
21 |
+
python-doctr==0.8.1
|
22 |
+
google-generativeai==0.5.2
|
resources/experiment.ipynb
CHANGED
@@ -2058,69 +2058,59 @@
|
|
2058 |
},
|
2059 |
{
|
2060 |
"cell_type": "code",
|
2061 |
-
"execution_count":
|
2062 |
"metadata": {},
|
2063 |
"outputs": [
|
2064 |
{
|
2065 |
"name": "stdout",
|
2066 |
"output_type": "stream",
|
2067 |
"text": [
|
2068 |
-
"
|
2069 |
-
"\n",
|
2070 |
-
"Correct gene names and their corresponding diseases:\n",
|
2071 |
-
"\n",
|
2072 |
-
"1. GCK: GCK-MODY (MODY2), PNDM, CHI\n",
|
2073 |
-
"2. SLC17A4 (formerly SLC242): FBS\n",
|
2074 |
-
"3. NEUROD1: MODY6 and PNDM\n",
|
2075 |
-
"4. WFS1: WFS1, sometimes referred to as DIDMOAD\n",
|
2076 |
-
"5. GLI3: Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\n",
|
2077 |
-
"\n",
|
2078 |
-
"Validated JSON objects:\n",
|
2079 |
"\n",
|
2080 |
-
"```json\n",
|
2081 |
"[\n",
|
2082 |
-
"
|
2083 |
-
"
|
2084 |
-
"
|
2085 |
-
"
|
2086 |
-
"
|
2087 |
-
"
|
2088 |
-
"
|
2089 |
-
"
|
2090 |
-
"
|
2091 |
-
"
|
2092 |
-
"
|
2093 |
-
"
|
2094 |
-
"
|
2095 |
-
"
|
2096 |
-
"
|
2097 |
-
"
|
2098 |
-
"
|
2099 |
-
"
|
2100 |
-
"
|
2101 |
-
"
|
2102 |
-
"
|
2103 |
-
"
|
2104 |
-
"
|
2105 |
-
"
|
2106 |
-
"
|
|
|
|
|
|
|
|
|
|
|
2107 |
"]\n",
|
2108 |
-
"```\n",
|
2109 |
"\n",
|
2110 |
-
"
|
2111 |
-
"\n",
|
2112 |
-
"
|
2113 |
-
"
|
2114 |
-
"
|
2115 |
-
"4. Gene WFSI is correct.\n",
|
2116 |
-
"5. Gene GLI53 is corrected to GLI3.\n",
|
2117 |
-
"\n",
|
2118 |
-
"The SNPs and diseases are not corrected since they are not suspected of having typos.\n"
|
2119 |
]
|
2120 |
}
|
2121 |
],
|
2122 |
"source": [
|
2123 |
"from langchain_openai import ChatOpenAI\n",
|
|
|
2124 |
"\n",
|
2125 |
"llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
|
2126 |
"\n",
|
@@ -2156,16 +2146,24 @@
|
|
2156 |
" \"SNPs\": \"rs7020673\",\n",
|
2157 |
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
|
2158 |
" },\n",
|
|
|
|
|
|
|
|
|
|
|
2159 |
"]\n",
|
2160 |
"\n",
|
2161 |
"# OBJECTIVE #\n",
|
2162 |
"Given the provided table data, the following tasks need to be completed:\n",
|
2163 |
"\n",
|
2164 |
-
"1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If
|
2165 |
-
"
|
|
|
|
|
|
|
2166 |
"\n",
|
2167 |
"# RESPONSE #\n",
|
2168 |
-
"The output
|
2169 |
"[\n",
|
2170 |
" {{\n",
|
2171 |
" \"Genes\": \"A\",\n",
|
@@ -2181,79 +2179,100 @@
|
|
2181 |
},
|
2182 |
{
|
2183 |
"cell_type": "code",
|
2184 |
-
"execution_count":
|
2185 |
"metadata": {},
|
2186 |
"outputs": [
|
2187 |
-
{
|
2188 |
-
"name": "stderr",
|
2189 |
-
"output_type": "stream",
|
2190 |
-
"text": [
|
2191 |
-
"Failed to batch ingest runs: TypeError('sequence item 0: expected str instance, ReadTimeoutError found')\n"
|
2192 |
-
]
|
2193 |
-
},
|
2194 |
{
|
2195 |
"name": "stdout",
|
2196 |
"output_type": "stream",
|
2197 |
"text": [
|
2198 |
-
"
|
2199 |
-
"\n",
|
2200 |
-
"Here's a Python solution using the `json` module:\n",
|
2201 |
"```python\n",
|
2202 |
"import json\n",
|
2203 |
"\n",
|
2204 |
-
"#
|
2205 |
-
"
|
2206 |
-
" \"
|
2207 |
-
" \"
|
2208 |
-
" \"
|
2209 |
-
" \"
|
2210 |
-
" \"
|
2211 |
"}\n",
|
2212 |
"\n",
|
2213 |
-
"
|
2214 |
-
"
|
2215 |
-
"
|
2216 |
-
" closest_gene = None\n",
|
2217 |
-
" for ref_gene in gene_reference:\n",
|
2218 |
-
" distance = sum(el1 != el2 for el1, el2 in zip(gene_name, ref_gene))\n",
|
2219 |
-
" if distance < min_distance:\n",
|
2220 |
-
" min_distance = distance\n",
|
2221 |
-
" closest_gene = ref_gene\n",
|
2222 |
-
" return closest_gene if min_distance <= 2 else None # adjust the threshold as needed\n",
|
2223 |
-
"\n",
|
2224 |
-
"def validate_data(data):\n",
|
2225 |
-
" validated_data = []\n",
|
2226 |
" for entry in data:\n",
|
2227 |
-
"
|
2228 |
-
"
|
2229 |
-
"
|
2230 |
-
"
|
2231 |
-
"
|
2232 |
-
"
|
2233 |
-
"
|
2234 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2235 |
"\n",
|
|
|
2236 |
"data = [\n",
|
2237 |
" {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
|
2238 |
" {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
|
2239 |
" {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
|
2240 |
" {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
|
2241 |
-
" {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"}
|
|
|
2242 |
"]\n",
|
2243 |
"\n",
|
2244 |
-
"
|
|
|
2245 |
"```\n",
|
2246 |
-
"This
|
2247 |
"```\n",
|
2248 |
"[\n",
|
2249 |
" {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
|
2250 |
" {\"Genes\": \"SLC2A2\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
|
2251 |
" {\"Genes\": \"NEUROD1\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
|
2252 |
" {\"Genes\": \"WFS1\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
|
2253 |
-
" {\"Genes\": \"GLIS3\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"}
|
|
|
2254 |
"]\n",
|
2255 |
"```\n",
|
2256 |
-
"Note that this implementation
|
2257 |
]
|
2258 |
}
|
2259 |
],
|
@@ -2294,16 +2313,24 @@
|
|
2294 |
" \"SNPs\": \"rs7020673\",\n",
|
2295 |
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
|
2296 |
" },\n",
|
|
|
|
|
|
|
|
|
|
|
2297 |
"]\n",
|
2298 |
"\n",
|
2299 |
"# OBJECTIVE #\n",
|
2300 |
"Given the provided table data, the following tasks need to be completed:\n",
|
2301 |
"\n",
|
2302 |
-
"1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If
|
2303 |
-
"
|
|
|
|
|
|
|
2304 |
"\n",
|
2305 |
"# RESPONSE #\n",
|
2306 |
-
"The output
|
2307 |
"[\n",
|
2308 |
" {{\n",
|
2309 |
" \"Genes\": \"A\",\n",
|
|
|
2058 |
},
|
2059 |
{
|
2060 |
"cell_type": "code",
|
2061 |
+
"execution_count": 2,
|
2062 |
"metadata": {},
|
2063 |
"outputs": [
|
2064 |
{
|
2065 |
"name": "stdout",
|
2066 |
"output_type": "stream",
|
2067 |
"text": [
|
2068 |
+
"Here's the list of JSON objects with corrected gene names, SNPs, and diseases based on the given context:\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2069 |
"\n",
|
|
|
2070 |
"[\n",
|
2071 |
+
" {\n",
|
2072 |
+
" \"Genes\": \"GCK\",\n",
|
2073 |
+
" \"SNPs\": \"rs1799884\",\n",
|
2074 |
+
" \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"\n",
|
2075 |
+
" },\n",
|
2076 |
+
" {\n",
|
2077 |
+
" \"Genes\": \"SLC24A2\",\n",
|
2078 |
+
" \"SNPs\": \"rs5393\",\n",
|
2079 |
+
" \"Diseases\": \"FBS\"\n",
|
2080 |
+
" },\n",
|
2081 |
+
" {\n",
|
2082 |
+
" \"Genes\": \"NEUROD1, INS\",\n",
|
2083 |
+
" \"SNPs\": \"rs1801262\",\n",
|
2084 |
+
" \"Diseases\": \"MODY6 and PNDM\"\n",
|
2085 |
+
" },\n",
|
2086 |
+
" {\n",
|
2087 |
+
" \"Genes\": \"WFS1\",\n",
|
2088 |
+
" \"SNPs\": \"rs6446482\",\n",
|
2089 |
+
" \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"\n",
|
2090 |
+
" },\n",
|
2091 |
+
" {\n",
|
2092 |
+
" \"Genes\": \"GLIS3\",\n",
|
2093 |
+
" \"SNPs\": \"rs7020673\",\n",
|
2094 |
+
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
|
2095 |
+
" },\n",
|
2096 |
+
" {\n",
|
2097 |
+
" \"Genes\": \"FTO\",\n",
|
2098 |
+
" \"SNPs\": \"rs9937290\",\n",
|
2099 |
+
" \"Diseases\": \"Obesity\"\n",
|
2100 |
+
" }\n",
|
2101 |
"]\n",
|
|
|
2102 |
"\n",
|
2103 |
+
"Changes made:\n",
|
2104 |
+
"1. Corrected \"SLC242\" to \"SLC24A2\"\n",
|
2105 |
+
"2. Separated \"NEUROD1IBETA2\" into \"NEUROD1, INS\"\n",
|
2106 |
+
"3. Corrected \"GLI53\" to \"GLIS3\"\n",
|
2107 |
+
"4. Corrected \"FT0\" to \"FTO\"\n"
|
|
|
|
|
|
|
|
|
2108 |
]
|
2109 |
}
|
2110 |
],
|
2111 |
"source": [
|
2112 |
"from langchain_openai import ChatOpenAI\n",
|
2113 |
+
"import os\n",
|
2114 |
"\n",
|
2115 |
"llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
|
2116 |
"\n",
|
|
|
2146 |
" \"SNPs\": \"rs7020673\",\n",
|
2147 |
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
|
2148 |
" },\n",
|
2149 |
+
" {\n",
|
2150 |
+
" \"Genes\": \"FT0\",\n",
|
2151 |
+
" \"SNPs\": \"rs9937290\",\n",
|
2152 |
+
" \"Diseases\": \"Obesity\"\n",
|
2153 |
+
" },\n",
|
2154 |
"]\n",
|
2155 |
"\n",
|
2156 |
"# OBJECTIVE #\n",
|
2157 |
"Given the provided table data, the following tasks need to be completed:\n",
|
2158 |
"\n",
|
2159 |
+
"1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
|
2160 |
+
" - Combined Names: Two gene names erroneously merged into one. Separate these using \"and\": \"A and B\".\n",
|
2161 |
+
" - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
|
2162 |
+
"2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
|
2163 |
+
"3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
|
2164 |
"\n",
|
2165 |
"# RESPONSE #\n",
|
2166 |
+
"The output must be only a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
|
2167 |
"[\n",
|
2168 |
" {{\n",
|
2169 |
" \"Genes\": \"A\",\n",
|
|
|
2179 |
},
|
2180 |
{
|
2181 |
"cell_type": "code",
|
2182 |
+
"execution_count": 4,
|
2183 |
"metadata": {},
|
2184 |
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2185 |
{
|
2186 |
"name": "stdout",
|
2187 |
"output_type": "stream",
|
2188 |
"text": [
|
2189 |
+
"Here is the Python solution using the `json` module and a dictionary to map known gene names to their correct forms:\n",
|
|
|
|
|
2190 |
"```python\n",
|
2191 |
"import json\n",
|
2192 |
"\n",
|
2193 |
+
"# Known gene names and their corrections\n",
|
2194 |
+
"gene_corrections = {\n",
|
2195 |
+
" \"SLC242\": \"SLC2A2\",\n",
|
2196 |
+
" \"NEUROD1IBETA2\": \"NEUROD1\",\n",
|
2197 |
+
" \"WFSI\": \"WFS1\",\n",
|
2198 |
+
" \"GLI53\": \"GLIS3\",\n",
|
2199 |
+
" \"FT0\": \"FTO\"\n",
|
2200 |
"}\n",
|
2201 |
"\n",
|
2202 |
+
"# Function to correct gene names and SNPs\n",
|
2203 |
+
"def correct_gene_data(data):\n",
|
2204 |
+
" corrected_data = []\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2205 |
" for entry in data:\n",
|
2206 |
+
" genes = entry[\"Genes\"]\n",
|
2207 |
+
" snps = entry[\"SNPs\"]\n",
|
2208 |
+
" diseases = entry[\"Diseases\"]\n",
|
2209 |
+
" \n",
|
2210 |
+
" # Correct gene names\n",
|
2211 |
+
" if genes in gene_corrections:\n",
|
2212 |
+
" genes = gene_corrections[genes]\n",
|
2213 |
+
" elif \" and \" not in genes:\n",
|
2214 |
+
" # Check for combined names\n",
|
2215 |
+
" parts = genes.split()\n",
|
2216 |
+
" if len(parts) > 1:\n",
|
2217 |
+
" genes = \" and \".join(parts)\n",
|
2218 |
+
" \n",
|
2219 |
+
" # Correct SNPs (assuming a dictionary of known SNPs for each gene)\n",
|
2220 |
+
" snp_corrections = {\n",
|
2221 |
+
" \"GCK\": {\"rs1799884\": \"rs1799884\"},\n",
|
2222 |
+
" \"SLC2A2\": {\"rs5393\": \"rs5393\"},\n",
|
2223 |
+
" \"NEUROD1\": {\"rs1801262\": \"rs1801262\"},\n",
|
2224 |
+
" \"WFS1\": {\"rs6446482\": \"rs6446482\"},\n",
|
2225 |
+
" \"GLIS3\": {\"rs7020673\": \"rs7020673\"},\n",
|
2226 |
+
" \"FTO\": {\"rs9937290\": \"rs9937290\"}\n",
|
2227 |
+
" }\n",
|
2228 |
+
" if snps and genes in snp_corrections:\n",
|
2229 |
+
" if snps not in snp_corrections[genes]:\n",
|
2230 |
+
" snps = \"\"\n",
|
2231 |
+
" \n",
|
2232 |
+
" # Correct diseases (assuming a dictionary of known diseases for each gene)\n",
|
2233 |
+
" disease_corrections = {\n",
|
2234 |
+
" \"GCK\": {\"GCK-MODY (MODY2), PNDM, CHI\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
|
2235 |
+
" \"SLC2A2\": {\"FBS\": \"FBS\"},\n",
|
2236 |
+
" \"NEUROD1\": {\"MODY6 and PNDM\": \"MODY6 and PNDM\"},\n",
|
2237 |
+
" \"WFS1\": {\"WFS1, sometimes referred to as DIDMOAD\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
|
2238 |
+
" \"GLIS3\": {\"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
|
2239 |
+
" \"FTO\": {\"Obesity\": \"Obesity\"}\n",
|
2240 |
+
" }\n",
|
2241 |
+
" if diseases and genes in disease_corrections:\n",
|
2242 |
+
" if diseases not in disease_corrections[genes]:\n",
|
2243 |
+
" diseases = \"\"\n",
|
2244 |
+
" \n",
|
2245 |
+
" # Add corrected entry to the list\n",
|
2246 |
+
" if genes and snps and diseases:\n",
|
2247 |
+
" corrected_data.append({\"Genes\": genes, \"SNPs\": snps, \"Diseases\": diseases})\n",
|
2248 |
+
" \n",
|
2249 |
+
" return json.dumps(corrected_data)\n",
|
2250 |
"\n",
|
2251 |
+
"# Input data\n",
|
2252 |
"data = [\n",
|
2253 |
" {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
|
2254 |
" {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
|
2255 |
" {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
|
2256 |
" {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
|
2257 |
+
" {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
|
2258 |
+
" {\"Genes\": \"FT0\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
|
2259 |
"]\n",
|
2260 |
"\n",
|
2261 |
+
"# Correct and output the data\n",
|
2262 |
+
"print(correct_gene_data(data))\n",
|
2263 |
"```\n",
|
2264 |
+
"This will output the corrected data in the same format as the input:\n",
|
2265 |
"```\n",
|
2266 |
"[\n",
|
2267 |
" {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
|
2268 |
" {\"Genes\": \"SLC2A2\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
|
2269 |
" {\"Genes\": \"NEUROD1\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
|
2270 |
" {\"Genes\": \"WFS1\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
|
2271 |
+
" {\"Genes\": \"GLIS3\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
|
2272 |
+
" {\"Genes\": \"FTO\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
|
2273 |
"]\n",
|
2274 |
"```\n",
|
2275 |
+
"Note that this implementation assumes a dictionary of known gene names, SNPs, and diseases for correction. You may need to expand or modify these dictionaries based on your specific use case.\n"
|
2276 |
]
|
2277 |
}
|
2278 |
],
|
|
|
2313 |
" \"SNPs\": \"rs7020673\",\n",
|
2314 |
" \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"\n",
|
2315 |
" },\n",
|
2316 |
+
" {\n",
|
2317 |
+
" \"Genes\": \"FT0\",\n",
|
2318 |
+
" \"SNPs\": \"rs9937290\",\n",
|
2319 |
+
" \"Diseases\": \"Obesity\"\n",
|
2320 |
+
" },\n",
|
2321 |
"]\n",
|
2322 |
"\n",
|
2323 |
"# OBJECTIVE #\n",
|
2324 |
"Given the provided table data, the following tasks need to be completed:\n",
|
2325 |
"\n",
|
2326 |
+
"1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
|
2327 |
+
" - Combined Names: Two gene names erroneously merged into one. Duplicate this data row so each gene name has its own data.\n",
|
2328 |
+
" - OCR Errors: Similar characters misread by the system. Correct these to the intended form.\n",
|
2329 |
+
"2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
|
2330 |
+
"3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
|
2331 |
"\n",
|
2332 |
"# RESPONSE #\n",
|
2333 |
+
"The output must be STRICTLY ONLY a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
|
2334 |
"[\n",
|
2335 |
" {{\n",
|
2336 |
" \"Genes\": \"A\",\n",
|