fadliaulawi committed on
Commit ca71749
Parent: bd28dd7

Add validation minus API

Files changed (4)
  1. .gitignore +1 -1
  2. app.py +8 -7
  3. process.py +93 -22
  4. resources/experiment.ipynb +776 -103
.gitignore CHANGED
@@ -2,6 +2,6 @@ __pycache__
 .env
 .vscode
 env/
-resources/images/
+resources/testing/
 resources/papers/
 result/
app.py CHANGED
@@ -36,9 +36,9 @@ with col1:

 with col2:
     tokens = (
-        24000,
+        8000,
         16000,
-        8000
+        24000
     )
     chunk_option = st.selectbox(
         'Token amounts per process:', tokens, key='token'
@@ -136,21 +136,22 @@ if uploaded_files:
     dataframe.reset_index(drop=True, inplace=True)

     # Validate Result
-    cleaned_df, cleaned_llm_df = process.validate(dataframe)
+    df, df_no_llm, df_clean = process.validate(dataframe)

     end_time = datetime.now()
     st.write("Success in ", round((end_time.timestamp() - start_time.timestamp()) / 60, 2), "minutes")

-    st.dataframe(cleaned_df)
+    st.dataframe(df)
     with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
-        cleaned_df.to_excel(writer, sheet_name='Result')
-        cleaned_llm_df.to_excel(writer, sheet_name='Validate with LLM')
+        df.to_excel(writer, sheet_name='Result Cleaned API LLM')
+        df_no_llm.to_excel(writer, sheet_name='Result Cleaned API')
+        df_clean.to_excel(writer, sheet_name='Result Cleaned')
         dataframe.to_excel(writer, sheet_name='Original')
     writer.close()

     st.download_button(
         label="Save Result",
         data=buffer,
-        file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}.xlsx",
+        file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx",
         mime='application/vnd.ms-excel'
     )
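Note on the new contract: `process.validate` now returns three progressively cleaned frames, each written to its own sheet. A minimal sketch of that flow, with illustrative frame contents (not from the commit); since `pd.ExcelWriter` is used as a context manager, the retained explicit `writer.close()` is redundant.

```python
# Sketch, assuming pandas + xlsxwriter as in app.py. The sample row is
# hypothetical; only the sheet names and variable names come from this commit.
import io
import pandas as pd

df = pd.DataFrame({'Genes': ['GCK'], 'SNPs': ['rs1799884'], 'Diseases': ['MODY2']})
df_no_llm = df.copy()   # rule-based cleaning (plus API checks, once re-enabled)
df_clean = df.copy()    # rule-based cleaning only

buffer = io.BytesIO()
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Result Cleaned API LLM')
    df_no_llm.to_excel(writer, sheet_name='Result Cleaned API')
    df_clean.to_excel(writer, sheet_name='Result Cleaned')
# the context manager closes the writer; no explicit close() needed
```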
process.py CHANGED
@@ -17,6 +17,8 @@ import json
 import os
 import pandas as pd
 import re
+import requests
+import time
 import torch

 load_dotenv()
@@ -80,7 +82,7 @@ class Process():
         if types != 'summ':
             result = re.findall('(\{[^}]+\})', result)[0]
             return eval(result)
-
+
         return result

     def get_entity_one(self, chunks):
@@ -133,7 +135,7 @@ class Process():
         buffer = io.BytesIO()
         table.save(buffer, format='PNG')
         image = Image(buffer)
-
+
         # Extract to dataframe
         extracted_tables = image.extract_tables(ocr=ocr, implicit_rows=True, borderless_tables=True, min_confidence=0)
@@ -191,17 +193,17 @@ class Process():

         print('OCR table to extract', round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
         print(genes, snps, diseases)
-
+
         return genes, snps, diseases

     def validate(self, df):

         df = df.fillna('')
-        df['Genes'] = df['Genes'].str.upper()
+        df['Genes'] = df['Genes'].str.replace(' ', '').str.upper()
         df['SNPs'] = df['SNPs'].str.lower()

         # Check if there is two gene names
-        sym = ['-', '/', '|']
+        sym = [',', '-', '/', '|']
         for i in df.index:
             gene = df.loc[i, 'Genes']
             for s in sym:
@@ -209,12 +211,14 @@ class Process():
                     genes = gene.split(s)
                     df.loc[i + 0.5] = df.loc[i]
                     df = df.sort_index().reset_index(drop=True)
-                    df.loc[i, 'Genes'], df.loc[i + 1, 'Genes'] = genes[0], genes[1]
+                    df.loc[i, 'Genes'], df.loc[i + 1, 'Genes'] = genes[0], s.join(genes[1:])
+                    break

         # Check if there is SNPs without 'rs'
         for i in df.index:
             safe = True
             snp = df.loc[i, 'SNPs']
+            snp = snp.replace('l', '1')
             if re.fullmatch('rs(\d)+|', snp):
                 pass
             elif re.fullmatch('ts(\d)+', snp):
@@ -226,29 +230,96 @@ class Process():
             else:
                 safe = False
                 df = df.drop(i)
-
+
             if safe:
                 df.loc[i, 'SNPs'] = snp

         df.reset_index(drop=True, inplace=True)
+        df_clean = df.copy()

-        # Validate genes and diseases with LLM
-        json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
-        str_json_table = json.dumps(json.loads(json_table), indent=2)
-
-        result = self.llm_val.invoke(input=prompt_validation.format(str_json_table)).content
-        print('val')
-        print(result)
-
-        result = result[result.find('['):result.rfind(']')+1]
-        try:
-            result = eval(result)
-        except SyntaxError:
-            result = []
-
-        df_val = pd.DataFrame(result)
-        df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
-
-        # TODO: How to validate genes and SNPs with ground truth?
-
-        return df, df_val
+        # # Validate genes and SNPs with APIs
+        # def permutate(word):
+
+        #     if len(word) == 0:
+        #         return ['']
+
+        #     change = []
+        #     res = permutate(word[1:])
+
+        #     if word[0] in mistakes:
+        #         change = [mistakes[word[0]] + r for r in res]
+
+        #     return [word[0] + r for r in res] + change
+
+        # def call(url):
+
+        #     while True:
+        #         try:
+        #             res = requests.get(url)
+        #             time.sleep(1)
+        #             break
+        #         except Exception as e:
+        #             print(e)
+
+        #     return res
+
+        # mistakes = {'I': '1', 'O': '0'} # Common mistakes need to be maintained
+        # dbsnp = {}
+
+        # for i in df.index:
+        #     snp = df.loc[i, 'SNPs']
+        #     gene = df.loc[i, 'Genes']
+
+        #     if snp not in dbsnp:
+        #         res = call(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')
+        #         try:
+        #             res = res.json()
+        #             dbsnp[snp] = [r['gene']['geneName'] for r in res['genomicContexts']]
+        #         except:
+        #             dbsnp[snp] = []
+
+        #         res = call(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]
+        #         if 'error' not in res:
+        #             dbsnp[snp].extend([r['name'] for r in res['genes']])
+
+        #         dbsnp[snp] = list(set(dbsnp[snp]))
+
+        #     if gene not in dbsnp[snp]:
+        #         for other in permutate(gene):
+        #             if other in dbsnp[snp]:
+        #                 df.loc[i, 'Genes'] = other
+        #                 print(f'{gene} corrected to {other}')
+        #                 break
+        #         else:
+        #             df = df.drop(i)
+
+        # df.reset_index(drop=True, inplace=True)
+        df_no_llm = df.copy()
+
+        # Validate genes and diseases with LLM (for each 50 rows)
+        idx = 0
+        results = []
+
+        while True:
+            json_table = df[['Genes', 'SNPs', 'Diseases']][idx:idx+50].to_json(orient='records')
+            str_json_table = json.dumps(json.loads(json_table), indent=2)
+
+            result = self.llm_val.invoke(input=prompt_validation.format(str_json_table)).content
+            print('val', idx)
+            print(result)
+
+            result = result[result.find('['):result.rfind(']')+1]
+            try:
+                result = eval(result)
+            except SyntaxError:
+                result = []

+            results.extend(result)
+            idx += 50
+            if idx not in df.index:
+                break

+        df = pd.DataFrame(results)
+        df = df.merge(df_no_llm.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')

+        return df, df_no_llm, df_clean
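The headline change above is the batched LLM validation: instead of one prompt over the whole table, rows are validated 50 at a time, and the loop's stop condition (`idx not in df.index`) relies on the RangeIndex produced by the preceding `reset_index`. A standalone sketch of the same loop, with `llm_val` and `prompt_validation` as stand-ins for the class attributes, and `json.loads` substituted for `eval` as a safer decode:

```python
# Hedged sketch of the 50-row batched validation, not the committed code.
import json
import pandas as pd

def validate_in_batches(df: pd.DataFrame, llm_val, prompt_validation, batch=50):
    results = []
    for start in range(0, len(df), batch):          # explicit range instead of the
        chunk = df[['Genes', 'SNPs', 'Diseases']].iloc[start:start + batch]
        table = json.dumps(json.loads(chunk.to_json(orient='records')), indent=2)
        raw = llm_val.invoke(input=prompt_validation.format(table)).content
        raw = raw[raw.find('['):raw.rfind(']') + 1]  # keep only the JSON list
        try:
            results.extend(json.loads(raw))          # safer than eval()
        except json.JSONDecodeError:
            pass                                     # skip a malformed batch
    return pd.DataFrame(results)
```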
resources/experiment.ipynb CHANGED
@@ -2056,6 +2056,258 @@
  " print('{:<10} {:<10} Not Match'.format(gene, snp))\n"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 2,
@@ -2179,110 +2431,87 @@
 },
 {
 "cell_type": "code",
- "execution_count": 4,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
- "Here is the Python solution using the `json` module and a dictionary to map known gene names to their correct forms:\n",
 "```python\n",
 "import json\n",
 "\n",
- "# Known gene names and their corrections\n",
- "gene_corrections = {\n",
- "    \"SLC242\": \"SLC2A2\",\n",
- "    \"NEUROD1IBETA2\": \"NEUROD1\",\n",
- "    \"WFSI\": \"WFS1\",\n",
- "    \"GLI53\": \"GLIS3\",\n",
- "    \"FT0\": \"FTO\"\n",
 "}\n",
 "\n",
- "# Function to correct gene names and SNPs\n",
- "def correct_gene_data(data):\n",
- "    corrected_data = []\n",
- "    for entry in data:\n",
- "        genes = entry[\"Genes\"]\n",
- "        snps = entry[\"SNPs\"]\n",
- "        diseases = entry[\"Diseases\"]\n",
- "\n",
- "        # Correct gene names\n",
- "        if genes in gene_corrections:\n",
- "            genes = gene_corrections[genes]\n",
- "        elif \" and \" not in genes:\n",
- "            # Check for combined names\n",
- "            parts = genes.split()\n",
- "            if len(parts) > 1:\n",
- "                genes = \" and \".join(parts)\n",
- "\n",
- "        # Correct SNPs (assuming a dictionary of known SNPs for each gene)\n",
- "        snp_corrections = {\n",
- "            \"GCK\": {\"rs1799884\": \"rs1799884\"},\n",
- "            \"SLC2A2\": {\"rs5393\": \"rs5393\"},\n",
- "            \"NEUROD1\": {\"rs1801262\": \"rs1801262\"},\n",
- "            \"WFS1\": {\"rs6446482\": \"rs6446482\"},\n",
- "            \"GLIS3\": {\"rs7020673\": \"rs7020673\"},\n",
- "            \"FTO\": {\"rs9937290\": \"rs9937290\"}\n",
- "        }\n",
- "        if snps and genes in snp_corrections:\n",
- "            if snps not in snp_corrections[genes]:\n",
- "                snps = \"\"\n",
- "\n",
- "        # Correct diseases (assuming a dictionary of known diseases for each gene)\n",
- "        disease_corrections = {\n",
- "            \"GCK\": {\"GCK-MODY (MODY2), PNDM, CHI\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
- "            \"SLC2A2\": {\"FBS\": \"FBS\"},\n",
- "            \"NEUROD1\": {\"MODY6 and PNDM\": \"MODY6 and PNDM\"},\n",
- "            \"WFS1\": {\"WFS1, sometimes referred to as DIDMOAD\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
- "            \"GLIS3\": {\"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
- "            \"FTO\": {\"Obesity\": \"Obesity\"}\n",
- "        }\n",
- "        if diseases and genes in disease_corrections:\n",
- "            if diseases not in disease_corrections[genes]:\n",
- "                diseases = \"\"\n",
- "\n",
- "        # Add corrected entry to the list\n",
- "        if genes and snps and diseases:\n",
- "            corrected_data.append({\"Genes\": genes, \"SNPs\": snps, \"Diseases\": diseases})\n",
- "\n",
- "    return json.dumps(corrected_data)\n",
 "\n",
- "# Input data\n",
 "data = [\n",
 " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
 " {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
 " {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
 " {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
 " {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
- " {\"Genes\": \"FT0\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
 "]\n",
 "\n",
- "# Correct and output the data\n",
- "print(correct_gene_data(data))\n",
- "```\n",
- "This will output the corrected data in the same format as the input:\n",
- "```\n",
- "[\n",
- " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
- " {\"Genes\": \"SLC2A2\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
- " {\"Genes\": \"NEUROD1\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
- " {\"Genes\": \"WFS1\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
- " {\"Genes\": \"GLIS3\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
- " {\"Genes\": \"FTO\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"}\n",
- "]\n",
 "```\n",
- "Note that this implementation assumes a dictionary of known gene names, SNPs, and diseases for correction. You may need to expand or modify these dictionaries based on your specific use case.\n"
 ]
 }
 ],
 "source": [
 "from langchain_openai import ChatOpenAI\n",
 "\n",
 "llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
 "\n",
 "prompt = \"\"\"\n",
- "# CONTEXT #\n",
 "In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided in a list of JSON format, with each JSON object representing a single row in a tabular structure. \n",
 "The problem is because the data is extracted using OCR, some gene names and SNPs may have a typo.\n",
 "\n",
@@ -2320,7 +2549,6 @@
 " },\n",
 "]\n",
 "\n",
- "# OBJECTIVE #\n",
 "Given the provided table data, the following tasks need to be completed:\n",
 "\n",
 "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
@@ -2329,7 +2557,6 @@
 "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
 "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
 "\n",
- "# RESPONSE #\n",
 "The output must be STRICTLY ONLY a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
 "[\n",
 " {{\n",
@@ -2348,54 +2575,500 @@
 "cell_type": "code",
 "execution_count": 2,
 "metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py:1429: FutureWarning: The repository for bigbio/euadr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bigbio/euadr\n",
- "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
- "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
- " warnings.warn(\n"
 ]
 },
 {
- "ename": "ConnectionError",
- "evalue": "Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mConnectionError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m<ipython-input-2-8057498175ab>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mdatasets\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"bigbio/euadr\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py\u001b[0m in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[0;32m 2547\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2548\u001b[0m \u001b[1;31m# Download and prepare data\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2549\u001b[1;33m builder_instance.download_and_prepare(\n\u001b[0m\u001b[0;32m 2550\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2551\u001b[0m \u001b[0mdownload_mode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36mdownload_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m 1003\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mnum_proc\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1004\u001b[0m \u001b[0mprepare_split_kwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"num_proc\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnum_proc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1005\u001b[1;33m self._download_and_prepare(\n\u001b[0m\u001b[0;32m 1006\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1007\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m 1765\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1766\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_download_and_prepare\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mprepare_splits_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1767\u001b[1;33m super()._download_and_prepare(\n\u001b[0m\u001b[0;32m 1768\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1769\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m 1076\u001b[0m \u001b[0msplit_dict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSplitDict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataset_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1077\u001b[0m \u001b[0msplit_generators_kwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_split_generators_kwargs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprepare_split_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1078\u001b[1;33m \u001b[0msplit_generators\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_split_generators\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0msplit_generators_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1079\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1080\u001b[0m \u001b[1;31m# Checksums verification\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32m~\\.cache\\huggingface\\modules\\datasets_modules\\datasets\\bigbio--euadr\\38388d88a335f2d91807b0f813bdfd809fec0e9dcbc32e2d9bfea7275d70f75c\\euadr.py\u001b[0m in \u001b[0;36m_split_generators\u001b[1;34m(self, dl_manager)\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_split_generators\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 106\u001b[0m \u001b[0murls\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_URL\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 107\u001b[1;33m \u001b[0mdatapath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload_and_extract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 108\u001b[0m return [\n\u001b[0;32m 109\u001b[0m datasets.SplitGenerator(\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36mdownload_and_extract\u001b[1;34m(self, url_or_urls)\u001b[0m\n\u001b[0;32m 560\u001b[0m \u001b[0mextracted_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mextracted\u001b[0m \u001b[0mpaths\u001b[0m \u001b[0mof\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mURL\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 561\u001b[0m \"\"\"\n\u001b[1;32m--> 562\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_urls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 563\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 564\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mget_recorded_sizes_checksums\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36mdownload\u001b[1;34m(self, url_or_urls)\u001b[0m\n\u001b[0;32m 424\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 425\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 426\u001b[1;33m downloaded_path_or_paths = map_nested(\n\u001b[0m\u001b[0;32m 427\u001b[0m \u001b[0mdownload_func\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 428\u001b[0m \u001b[0murl_or_urls\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\py_utils.py\u001b[0m in \u001b[0;36mmap_nested\u001b[1;34m(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, types, disable_tqdm, desc)\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;31m# Singleton\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 459\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 460\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 461\u001b[0m \u001b[0miterable\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32melse\u001b[0m \u001b[0mdata_struct\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36m_download\u001b[1;34m(self, url_or_filename, download_config)\u001b[0m\n\u001b[0;32m 449\u001b[0m \u001b[1;31m# append the relative path to the base_path\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 450\u001b[0m \u001b[0murl_or_filename\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0murl_or_path_join\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_base_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 451\u001b[1;33m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcached_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 452\u001b[0m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtracked_str\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 453\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_origin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\file_utils.py\u001b[0m in \u001b[0;36mcached_path\u001b[1;34m(url_or_filename, download_config, **download_kwargs)\u001b[0m\n\u001b[0;32m 186\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mis_remote_url\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[1;31m# URL, so get it from the cache (downloading if necessary)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 188\u001b[1;33m output_path = get_from_cache(\n\u001b[0m\u001b[0;32m 189\u001b[0m \u001b[0murl_or_filename\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[0mcache_dir\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\file_utils.py\u001b[0m in \u001b[0;36mget_from_cache\u001b[1;34m(url, cache_dir, force_download, proxies, etag_timeout, resume_download, user_agent, local_files_only, use_etag, max_retries, token, use_auth_token, ignore_url_params, storage_options, download_desc)\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[0m_raise_if_offline_mode_is_enabled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Tried to reach {url}\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 572\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhead_error\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 573\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Couldn't reach {url} ({repr(head_error)})\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 574\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 575\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Couldn't reach {url} (error {response.status_code})\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;31mConnectionError\u001b[0m: Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))"
 ]
 }
 ],
 "source": [
- "from datasets import load_dataset\n",
 "\n",
- "dataset = load_dataset(\"bigbio/euadr\")"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
- "dataset"
 ]
 }
 ],
 "metadata": {
 
  " print('{:<10} {:<10} Not Match'.format(gene, snp))\n"
  ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_excel('../result/monogenic diabetes_8000.xlsx', sheet_name=\"Result\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['STARDIO', 'STARDI0', 'STARD1O', 'STARD10']"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mistakes = {'I': '1', 'O': '0'}\n",
+ "\n",
+ "def permutate(word):\n",
+ "\n",
+ "    if len(word) == 0:\n",
+ "        return ['']\n",
+ "\n",
+ "    change = []\n",
+ "    res = permutate(word[1:])\n",
+ "\n",
+ "    if word[0] in mistakes:\n",
+ "        change = [mistakes[word[0]] + r for r in res]\n",
+ "\n",
+ "    return [word[0] + r for r in res] + change\n",
+ "\n",
+ "permutate('STARDIO')"
+ ]
+ },
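The recursive `permutate` above enumerates every way to undo the OCR confusions listed in `mistakes`, so a word with k ambiguous characters yields 2**k candidate spellings. An equivalent non-recursive sketch using itertools, producing the same candidates in the same order:

```python
# Iterative equivalent of permutate() via a cartesian product over per-character
# options: each character contributes itself, plus its correction if it is a
# known OCR confusion.
from itertools import product

mistakes = {'I': '1', 'O': '0'}

def permutate_iter(word):
    options = [(ch,) if ch not in mistakes else (ch, mistakes[ch]) for ch in word]
    return [''.join(p) for p in product(*options)]

print(permutate_iter('STARDIO'))
# ['STARDIO', 'STARDI0', 'STARD1O', 'STARD10']
```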
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GCK rs1799884 Match\n",
+ "GCK rs4607517 Match\n",
+ "SLC2A2 rs5393 Match\n",
+ "SLC2A2 rs5394 Match\n",
+ "SLC2A2 rs5400 Match\n",
+ "SLC2A2 rs5404 Match\n",
+ "HNF4A rs2144908 Match\n",
+ "HNF4A rs3818247 Match\n",
+ "HNF4A rs884614 Not Match\n",
+ "HNF4A rs4810424 Not Match\n",
+ "HNF4A rs1884613 Not Match\n",
+ "HNF1B rs757210 Match\n",
+ "TCF2 rs757210 Not Match\n",
+ "HNF1B rs4430796 Match\n",
+ "TCF2 rs4430796 Not Match\n",
+ "HNF1B rs7501939 Match\n",
+ "TCF2 rs7501939 Not Match\n",
+ "PAX4 rs10229583 Match\n",
+ "NEUROD1 rs1801262 Match\n",
+ "BETA2 rs1801262 Not Match\n",
+ "WFS1 rs10010131 Match\n",
+ "WFS1 rs6446482 Match\n",
+ "WFS1 rs734312 Match\n",
+ "PPARG rs1801282 Match\n",
+ "PPARG rs4684847 Match\n",
+ "GLIS3 rs7020673 Match\n",
+ "GLIS3 rs7034200 Match\n",
+ "GLIS3 rs7041847 Match\n",
+ "HNF1A rs1801262 Not Match\n",
+ "INS rs1801282 Not Match\n",
+ "PPARG rs780094 Not Match\n"
+ ]
+ }
+ ],
+ "source": [
+ "import requests\n",
+ "\n",
+ "dbsnp = {}\n",
+ "\n",
+ "for i in df.index:\n",
+ "    snp = df.loc[i, 'SNPs']\n",
+ "    gene = df.loc[i, 'Genes']\n",
+ "\n",
+ "    if snp not in dbsnp:\n",
+ "        res = requests.get(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')\n",
+ "\n",
+ "        try:\n",
+ "            res = res.json()\n",
+ "            dbsnp[snp] = [r['gene']['geneName'] for r in res['genomicContexts']]\n",
+ "        except:\n",
+ "            dbsnp[snp] = []\n",
+ "\n",
+ "        res = requests.get(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]\n",
+ "        if 'error' not in res:\n",
+ "            dbsnp[snp].extend([r['name'] for r in res['genes']])\n",
+ "\n",
+ "        dbsnp[snp] = list(set(dbsnp[snp]))\n",
+ "\n",
+ "    if gene in dbsnp[snp]:\n",
+ "        print('{:<10} {:<10} Match'.format(gene, snp))\n",
+ "    else:\n",
+ "        for other in permutate(gene):\n",
+ "            if other in dbsnp[snp]:\n",
+ "                print('{:<10} {:<10} Match (corrected)'.format(other, snp))\n",
+ "                break\n",
+ "        else:\n",
+ "            print('{:<10} {:<10} Not Match'.format(gene, snp))\n"
+ ]
+ },
+ {
2184
+ "cell_type": "code",
2185
+ "execution_count": 2,
2186
+ "metadata": {},
2187
+ "outputs": [
2188
+ {
2189
+ "name": "stdout",
2190
+ "output_type": "stream",
2191
+ "text": [
2192
+ "0\n",
2193
+ "1\n",
2194
+ "2\n",
2195
+ "3\n",
2196
+ "4\n",
2197
+ "5\n",
2198
+ "6\n",
2199
+ "7\n",
2200
+ "8\n",
2201
+ "9\n",
2202
+ "10\n",
2203
+ "11\n",
2204
+ "12\n",
2205
+ "13\n",
2206
+ "14\n",
2207
+ "15\n",
2208
+ "16\n",
2209
+ "17\n",
2210
+ "18\n",
2211
+ "19\n",
2212
+ "20\n",
2213
+ "21\n",
2214
+ "22\n",
2215
+ "23\n",
2216
+ "24\n",
2217
+ "25\n",
2218
+ "26\n",
2219
+ "27\n",
2220
+ "28\n",
2221
+ "29\n",
2222
+ "30\n",
2223
+ "31\n",
2224
+ "32\n",
2225
+ "33\n",
2226
+ "34\n",
2227
+ "35\n",
2228
+ "36\n",
2229
+ "37\n",
2230
+ "38\n",
2231
+ "39\n",
2232
+ "40\n",
2233
+ "41\n",
2234
+ "42\n",
2235
+ "43\n",
2236
+ "44\n",
2237
+ "45\n",
2238
+ "46\n",
2239
+ "47\n",
2240
+ "48\n",
2241
+ "49\n",
2242
+ "50\n",
2243
+ "51\n",
2244
+ "52\n",
2245
+ "53\n",
2246
+ "54\n",
2247
+ "55\n",
2248
+ "56\n",
2249
+ "57\n",
2250
+ "58\n",
2251
+ "59\n",
2252
+ "60\n",
2253
+ "61\n",
2254
+ "62\n",
2255
+ "63\n",
2256
+ "64\n",
2257
+ "65\n",
2258
+ "66\n",
2259
+ "67\n",
2260
+ "68\n",
2261
+ "69\n",
2262
+ "70\n",
2263
+ "71\n",
2264
+ "72\n",
2265
+ "73\n",
2266
+ "74\n",
2267
+ "75\n",
2268
+ "76\n",
2269
+ "77\n",
2270
+ "78\n",
2271
+ "79\n",
2272
+ "80\n",
2273
+ "81\n",
2274
+ "82\n",
2275
+ "83\n",
2276
+ "84\n",
2277
+ "85\n",
2278
+ "86\n",
2279
+ "87\n",
2280
+ "88\n",
2281
+ "89\n",
2282
+ "90\n",
2283
+ "91\n",
2284
+ "92\n",
2285
+ "93\n",
2286
+ "94\n",
2287
+ "95\n",
2288
+ "96\n",
2289
+ "97\n",
2290
+ "98\n",
2291
+ "99\n"
2292
+ ]
2293
+ }
2294
+ ],
2295
+ "source": [
2296
+ "import requests\n",
2297
+ "import time\n",
2298
+ "\n",
2299
+ "snp = 'rs972283'\n",
2300
+ "for i in range(100):\n",
2301
+ " print(i)\n",
2302
+ " while True:\n",
2303
+ " try:\n",
2304
+ " res = requests.get(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')\n",
2305
+ " break\n",
2306
+ " except Exception as e:\n",
2307
+ " print('sleep')\n",
2308
+ " time.sleep(1)\n"
2309
+ ]
2310
+ },
2311
  {
2312
  "cell_type": "code",
2313
  "execution_count": 2,
 
2431
  },
2432
  {
2433
  "cell_type": "code",
2434
+ "execution_count": 3,
2435
  "metadata": {},
2436
  "outputs": [
2437
  {
2438
  "name": "stdout",
2439
  "output_type": "stream",
2440
  "text": [
2441
+ "To accomplish this task, we'll need a reference dataset of correct gene names, SNPs, and diseases. Let's assume we have a dictionary `gene_ref` that maps gene names to their corresponding SNPs and diseases.\n",
2442
+ "\n",
2443
+ "Here's a Python script that should accomplish the tasks:\n",
2444
  "```python\n",
2445
  "import json\n",
2446
  "\n",
2447
+ "# Reference dataset (example)\n",
2448
+ "gene_ref = {\n",
2449
+ " \"GCK\": {\"SNPs\": [\"rs1799884\"], \"Diseases\": [\"GCK-MODY (MODY2)\", \"PNDM\", \"CHI\"]},\n",
2450
+ " \"SLC2A2\": {\"SNPs\": [\"rs5393\"], \"Diseases\": [\"FBS\"]},\n",
2451
+ " \"NEUROD1\": {\"SNPs\": [\"rs1801262\"], \"Diseases\": [\"MODY6\", \"PNDM\"]},\n",
2452
+ " \"WFS1\": {\"SNPs\": [\"rs6446482\"], \"Diseases\": [\"WFS1\", \"DIDMOAD\"]},\n",
2453
+ " \"GLIS3\": {\"SNPs\": [\"rs7020673\"], \"Diseases\": [\"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"]},\n",
2454
+ " \"FTO\": {\"SNPs\": [\"rs9937290\"], \"Diseases\": [\"Obesity\"]},\n",
2455
+ " # Add more gene references as needed\n",
2456
  "}\n",
2457
  "\n",
2458
+ "def correct_gene_name(gene_name):\n",
2459
+ " # Check for combined names\n",
2460
+ " for ref_gene in gene_ref:\n",
2461
+ " if ref_gene in gene_name:\n",
2462
+ " return [ref_gene]\n",
2463
+ " # Check for OCR errors\n",
2464
+ " for ref_gene in gene_ref:\n",
2465
+ " if len(set(gene_name) & set(ref_gene)) > len(ref_gene) / 2:\n",
2466
+ " return [ref_gene]\n",
2467
+ " return []\n",
2468
+ "\n",
2469
+ "def validate_data(data):\n",
2470
+ " validated_data = []\n",
2471
+ " for row in data:\n",
2472
+ " gene_name = row[\"Genes\"]\n",
2473
+ " corrected_genes = correct_gene_name(gene_name)\n",
2474
+ " if not corrected_genes:\n",
2475
+ " continue # Remove row if gene name is invalid\n",
2476
+ " for corrected_gene in corrected_genes:\n",
2477
+ " new_row = row.copy()\n",
2478
+ " new_row[\"Genes\"] = corrected_gene\n",
2479
+ " # Check and correct SNP\n",
2480
+ " if row[\"SNPs\"]:\n",
2481
+ " if row[\"SNPs\"] not in gene_ref[corrected_gene][\"SNPs\"]:\n",
2482
+ " new_row[\"SNPs\"] = gene_ref[corrected_gene][\"SNPs\"][0]\n",
2483
+ " # Check and correct diseases\n",
2484
+ " if row[\"Diseases\"]:\n",
2485
+ " diseases = [disease.strip() for disease in row[\"Diseases\"].split(\",\")]\n",
2486
+ " if not all(disease in gene_ref[corrected_gene][\"Diseases\"] for disease in diseases):\n",
2487
+ " new_row[\"Diseases\"] = \", \".join(gene_ref[corrected_gene][\"Diseases\"])\n",
2488
+ " validated_data.append(new_row)\n",
2489
+ " return json.dumps(validated_data)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2490
  "\n",
 
2491
  "data = [\n",
2492
  " {\"Genes\": \"GCK\", \"SNPs\": \"rs1799884\", \"Diseases\": \"GCK-MODY (MODY2), PNDM, CHI\"},\n",
2493
  " {\"Genes\": \"SLC242\", \"SNPs\": \"rs5393\", \"Diseases\": \"FBS\"},\n",
2494
  " {\"Genes\": \"NEUROD1IBETA2\", \"SNPs\": \"rs1801262\", \"Diseases\": \"MODY6 and PNDM\"},\n",
2495
  " {\"Genes\": \"WFSI\", \"SNPs\": \"rs6446482\", \"Diseases\": \"WFS1, sometimes referred to as DIDMOAD\"},\n",
2496
  " {\"Genes\": \"GLI53\", \"SNPs\": \"rs7020673\", \"Diseases\": \"Neonatal diabetes syndrome associated with congenital hypothyroidism, and polycystic kidneys\"},\n",
2497
+ " {\"Genes\": \"FT0\", \"SNPs\": \"rs9937290\", \"Diseases\": \"Obesity\"},\n",
2498
  "]\n",
2499
  "\n",
2500
+ "print(validate_data(data))\n",
 
 
 
 
 
 
 
 
 
 
 
 
2501
  "```\n",
2502
+ "This script will output a string containing a list of JSON objects with corrected gene names, SNPs, and diseases.\n",
2503
+ "\n",
2504
+ "Note that this implementation assumes a simple reference dataset and may not cover all possible OCR errors or combined gene names. You may need to expand the `gene_ref` dictionary and the `correct_gene_name` function to handle more complex cases.\n"
2505
  ]
2506
  }
2507
  ],
2508
  "source": [
2509
  "from langchain_openai import ChatOpenAI\n",
2510
+ "import os\n",
2511
  "\n",
2512
  "llm = ChatOpenAI(temperature=0, api_key=os.environ['PERPLEXITY_API_KEY'], base_url=\"https://api.perplexity.ai\")\n",
2513
  "\n",
2514
  "prompt = \"\"\"\n",
 
2515
  "In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided in a list of JSON format, with each JSON object representing a single row in a tabular structure. \n",
2516
  "The problem is because the data is extracted using OCR, some gene names and SNPs may have a typo.\n",
2517
  "\n",
 
2549
  " },\n",
2550
  "]\n",
2551
  "\n",
 
2552
  "Given the provided table data, the following tasks need to be completed:\n",
2553
  "\n",
2554
  "1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:\n",
 
2557
  "2. If SNP is not empty, check whether the gene name corresponds with the SNP. Fix it with the correct SNP if the original SNP is wrong.\n",
2558
  "3. If diseases are not empty, check whether the gene name corresponds with the diseases. Fix it with the correct diseases if the original disease is wrong.\n",
2559
  "\n",
 
2560
  "The output must be STRICTLY ONLY a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:\n",
2561
  "[\n",
2562
  " {{\n",
 
2575
  "cell_type": "code",
2576
  "execution_count": 2,
2577
  "metadata": {},
2578
+ "outputs": [],
2579
+ "source": [
2580
+ "import nest_asyncio\n",
2581
+ "\n",
2582
+ "nest_asyncio.apply()\n",
2583
+ "\n",
2584
+ "from llama_parse import LlamaParse"
2585
+ ]
2586
+ },
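Why `nest_asyncio` here: Jupyter already runs an asyncio event loop, and llama_parse's synchronous helpers drive async code internally, which would otherwise raise "this event loop is already running". A small illustration of the pattern (the `demo` coroutine is hypothetical):

```python
# nest_asyncio patches the running loop so nested asyncio.run() calls work,
# which is what lets llama_parse's sync-style API run inside a notebook.
import asyncio
import nest_asyncio

nest_asyncio.apply()

async def demo():
    return 'ok'

print(asyncio.run(demo()))  # safe inside a notebook after apply()
```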
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Started parsing the file under job_id cb5d7891-1366-47b7-98e2-d6cfbd5d3b87\n",
+ ".."
+ ]
+ }
+ ],
+ "source": [
+ "from dotenv import load_dotenv\n",
+ "\n",
+ "load_dotenv()\n",
+ "\n",
+ "parser = LlamaParse(\n",
+ "    # api_key=os.environ['LLAMA_'],  # can also be set in your env as LLAMA_CLOUD_API_KEY\n",
+ "    result_type=\"markdown\",  # \"markdown\" and \"text\" are available\n",
+ "    num_workers=4,  # if multiple files passed, split in `num_workers` API calls\n",
+ "    verbose=True,\n",
+ "    language=\"en\",  # Optionally you can define a language, default=en\n",
+ ")\n",
+ "\n",
+ "# sync\n",
+ "objs = parser.get_json_result(\"papers/ukmss-34421.pdf\")"
+ ]
+ },
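As the next cell shows, `get_json_result` returns one dict per input file, each holding a `pages` list with per-page raw `text`. A small helper sketch over that shape, useful for finding candidate table pages before picking one by hand (field names assumed from this run):

```python
# Scan the parsed pages for ones that likely contain SNP tables, using the
# payload shape observed here: objs[0]['pages'][i]['text'].
import re

def pages_with_rsids(objs):
    return [i + 1 for i, page in enumerate(objs[0]['pages'])
            if re.search(r'rs\d{4,}', page['text'])]
```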
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Table 2 Voight et al. Page 22',\n",
+ " 'Expression QTL results for T2D-associated variants in blood and adipose tissue',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " 'e',\n",
+ " 'SNP with strongest correlation with trait',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " 'SNPChr.PositionNearbyRiskGene (transcript)TissueP valueP value',\n",
+ " 'cd2fg',\n",
+ " 'Effect (s.e.m.)P SNP (r)P',\n",
+ " 'B36 (bp) Europe PMC Funders Author Manuscriptsadj adj',\n",
+ " 'ab',\n",
+ " 'geneallele',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " 'Novel loci reported in this study',\n",
+ " 'rs4457053576,460,705ZBED3GPDE8B (NM_003719)Adipose0.302 (0.070)−50.80rs6864250−17−13',\n",
+ " '2.8 × 103.1 × 105.8 × 10',\n",
+ " '(0.18)',\n",
+ " 'ZBED3 (NM_032367)Adipose0.429 (0.068)−90.011rs4704389−16−9',\n",
+ " '1.0 × 103.9 × 106.0 × 10',\n",
+ " '(0.20)',\n",
+ " '−11−12',\n",
+ " 'rs9722837130,117,394KLF14GKLF14 (NM_138693)Adipose−0.387 (0.058)0.058rs7381340.0014',\n",
+ " '8.1 × 102.2 × 10',\n",
+ " '(0.30)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−5−7',\n",
+ " 'rs896854896,029,687TP53INP1TCCNE2 (NM_057749)Blood−0.225 (0.053)0.78rs47353390.0051',\n",
+ " '3.8 × 105.8 × 10',\n",
+ " '(0.61)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−7−24−19',\n",
+ " 'rs15522241172,110,746CENTD2ASTARD10 (NM_006645)Blood0.337 (0.066)0.026rs519790',\n",
+ " '8.6 × 102.7 × 101.6 × 10',\n",
+ " '(0.04)',\n",
+ " 'rs795719712119,945,069HNF1ATACADS (NM_000017)Adipose−0.248 (0.067)−40.29rs9204−53−50',\n",
+ " '3.7 × 101.3 × 105.9 × 10',\n",
+ " '(0.02)',\n",
+ " 'PSMD9 (NM_002813)Blood0.240 (0.065)−40.0088rs3741593−8−6',\n",
+ " '3.9 × 108.3 × 101.7 × 10',\n",
+ " '(0.00)',\n",
+ " '−6−7',\n",
+ " 'OASL (NM_003733)Adipose0.318 (0.068)0.13rs22598830.0018',\n",
+ " '6.4 × 101.1 × 10',\n",
+ " '(0.19)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−6−22−16',\n",
+ " 'OASL (NM_003733)Blood0.319 (0.064)0.37rs4556628',\n",
+ " '1.3 × 104.4 × 101.4 × 10',\n",
+ " '(0.21)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−4−39−35',\n",
+ " 'COQ5 (NM_032314)Blood0.248 (0.065)0.92rs10774561',\n",
+ " '2.1 × 108.7 × 104.9 × 10',\n",
+ " '(0.02)',\n",
+ " 'UNC119B (NM_032661)Blood−0.254 (0.064)−40.048rs11065202−12−9',\n",
+ " '1.4 × 107.8 × 102.3 × 10',\n",
+ " '(0.09)',\n",
+ " 'CAMKK2 (NM_172215)Adipose−0.497 (0.068)−120.18rs11065504−117−98',\n",
+ " '1.2 × 102.7 × 103.8 × 10',\n",
+ " '(0.08)',\n",
+ " '−8−105−94',\n",
+ " 'CAMKK2 (NM_172215)Blood−0.360 (0.063)0.68rs11065504',\n",
+ " '3.4 × 107.0 × 105.7 × 10',\n",
+ " '(0.08)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−6−6−17−17',\n",
+ " 'P2RX4 (NM_175568)Blood0.312 (0.065)rs25644',\n",
+ " '3.4 × 102.0 × 103.4 × 101.9 × 10',\n",
+ " '(0.03)',\n",
+ " '−10−21−12Europe PMC Funders Author Manuscripts',\n",
+ " 'rs80426801589,322,341PRC1AVPS33B (NM_018668)Blood−0.371 (0.057)0.50rs12595616',\n",
+ " '2.9 × 102.3 × 104.5 × 10',\n",
+ " '(0.57)',\n",
+ " 'Previously reported loci',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−5−5',\n",
+ " 'rs75783262226,728,897IRS1AIRS1 (Contig50189_RC)Adipose−0.251 (0.059)0.89rs29436530.69',\n",
+ " '3.7 × 103.4 × 10',\n",
+ " '(0.93)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " '−8−10',\n",
+ " 'IRS1 (NM_005544)Adipose−0.331 (0.059)0.58rs21760400.0042',\n",
+ " '5.7 × 107.8 × 10',\n",
+ " '(0.74)',\n",
+ " 'rs13081389312,264,800PPARGAIQSEC1 (NM_014869)Adipose−0.630 (0.131)−6−4rs9211−96−94',\n",
+ " '2.9 × 101.4 × 101.1 × 107.4 × 10',\n",
+ " '(0.01)',\n",
+ " 'rs6795735364,680,405ADAMTS9CBC040632 (AK022320)Adipose−0.229 (0.056)−50.28rs4521216−13−10',\n",
+ " '7.6 × 103.0 × 108.7 × 10',\n",
+ " '(0.02)',\n",
+ " '',\n",
+ " '',\n",
+ " '',\n",
+ " ' Nat Genet. Author manuscript; available in PMC 2011 April 21.']"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "page = 22\n",
+ "text_parts = objs[0]['pages'][page - 1]['text'].split('\\n')\n",
+ "text_parts"
+ ]
+ },
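The flattened table text above mixes gene symbols, rsIDs, and statistics into single strings, but rsIDs keep a recognizable shape. A quick regex pass can recover them before any LLM step (a sketch over `text_parts` from the cell above):

```python
# Pull unique rsIDs out of the noisy per-line table text; useful as a sanity
# check against whatever the LLM extraction below returns.
import re

rs_ids = sorted({m.group(0) for part in text_parts
                 for m in re.finditer(r'rs\d+', part)})
print(rs_ids[:5])
```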
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Gene Names and Diseases': [{'Gene Name': 'ZBED3',\n",
+ "   'SNP': 'rs6864250',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'ZBED3',\n",
+ "   'SNP': 'rs4704389',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'KLF14',\n",
+ "   'SNP': 'rs972283',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'TP53INP1',\n",
+ "   'SNP': 'rs896854',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'CENTD2',\n",
+ "   'SNP': 'rs1552224',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'HNF1A',\n",
+ "   'SNP': 'rs7957197',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'PSMD9',\n",
+ "   'SNP': 'rs3741593',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'OASL',\n",
+ "   'SNP': 'rs2259883',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'OASL',\n",
+ "   'SNP': 'rs4556628',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'COQ5',\n",
+ "   'SNP': 'rs10774561',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'UNC119B',\n",
+ "   'SNP': 'rs11065202',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'CAMKK2',\n",
+ "   'SNP': 'rs11065504',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'P2RX4',\n",
+ "   'SNP': 'rs25644',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'PRC1',\n",
+ "   'SNP': 'rs8042680',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'IRS1',\n",
+ "   'SNP': 'rs7578326',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'IRS1',\n",
+ "   'SNP': 'rs2943653',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'IRS1',\n",
+ "   'SNP': 'rs2176040',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'PPARG',\n",
+ "   'SNP': 'rs13081389',\n",
+ "   'Diseases': 'T2D-associated variants'},\n",
+ "  {'Gene Name': 'ADAMTS9',\n",
+ "   'SNP': 'rs6795735',\n",
+ "   'Diseases': 'T2D-associated variants'}]}"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from openai import OpenAI\n",
+ "\n",
+ "client = OpenAI()\n",
+ "response = client.chat.completions.create(\n",
+ "    model=\"gpt-4-0125-preview\",\n",
+ "    response_format={\"type\": \"json_object\"},\n",
+ "    messages=[\n",
+ "        {\"role\": \"system\", \"content\": \"You are a helpful assistant designed to output JSON.\"},\n",
+ "        {\"role\": \"user\", \"content\": f\"Given a text like this: {text_parts}, automatically extract, return multiple Gene Names, potential diseases and their corresponding SNPs in the format like this: {{\\\"Gene Name\\\": \\\"FTO\\\", \\\"SNP\\\": \\\"rs9939609\\\", \\\"Diseases\\\": \\\"Obesity\\\"}}, from table format at text (this is just an example, don't return this)\"}\n",
+ "    ]\n",
+ ")\n",
+ "res = response.choices[0].message.content\n",
+ "eval(res)"
+ ]
+ },
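Because `response_format={"type": "json_object"}` forces the model to emit syntactically valid JSON, `json.loads` is a safer decode here than `eval`, which would execute arbitrary Python expressions. A sketch, reusing `res` from the cell above (the top-level key is whatever the model chose in this run):

```python
# Decode the JSON-mode response without eval().
import json

parsed = json.loads(res)
rows = parsed['Gene Names and Diseases']  # key as returned in this particular run
print(rows[0])
```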
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
+ "\n",
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:68: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
+ "\n"
+ ]
+ },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tf2onnx\\tf_loader.py:72: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n",
+ "\n"
 ]
 },
 {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\normalization\\batch_normalization.py:979: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.\n",
+ "\n"
 ]
 }
 ],
 "source": [
+ "import sys\n",
+ "sys.path.append('..')\n",
 "\n",
+ "import os\n",
+ "import torch\n",
+ "from pdf2image import convert_from_path\n",
+ "from table_detector import detection_transform, device, model, ocr, outputs_to_objects\n",
+ "import io\n",
+ "from img2table.document import Image"
 ]
 },
2925
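Most of the captured output above is TensorFlow deprecation noise emitted while table_detector loads its model. If the notebook is re-run, the warnings can be silenced before that import; a sketch, assuming TensorFlow is installed in the environment:

    import os
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # hide C++-side INFO/WARNING logs

    import tensorflow as tf
    tf.get_logger().setLevel('ERROR')  # hide Python-side deprecation warnings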
  {
  "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1-s2.0-S0002916523016155-main.pdf\n",
+ "4\n",
+ "1329.pdf\n",
+ "8\n",
+ "41467_2020_Article_15421.pdf\n",
+ "11\n",
+ "berndt2013.pdf\n",
+ "14\n",
+ "BMD.pdf\n",
+ "17\n",
+ "clock and eat timing.pdf\n",
+ "23\n",
+ "COMT breast cancer metaanalysis chinese.pdf\n",
+ "26\n",
+ "dubois2010.pdf\n",
+ "30\n",
+ "EMMM-8-688.pdf\n",
+ "40\n",
+ "EMS120610.pdf\n",
+ "45\n",
+ "file.pdf\n",
+ "51\n",
+ "journal.pbio.3001547.pdf\n",
+ "54\n",
+ "lipid.pdf\n",
+ "60\n",
+ "monogenic diabetes.pdf\n",
+ "62\n",
+ "nihms-1651539.pdf\n",
+ "62\n",
+ "nihms-1792335.pdf\n",
+ "73\n",
+ "nihms-668049.pdf\n",
+ "87\n",
+ "nihms364577.pdf\n",
+ "90\n",
+ "nihms510594.pdf\n",
+ "110\n",
+ "pgen.1009952.pdf\n",
+ "116\n",
+ "PIIS0091674919313661.pdf\n",
+ "121\n",
+ "s12881-019-0830-y.pdf\n",
+ "128\n",
+ "s41576-021-00414-z (1).pdf\n",
+ "132\n",
+ "s41588-018-0047-6.pdf\n",
+ "137\n",
+ "s41588-022-01024-z (1).pdf\n",
+ "150\n",
+ "stroke genetic AHA.pdf\n",
+ "154\n",
+ "surendran2016.pdf\n",
+ "158\n",
+ "teslovich2010.pdf\n",
+ "161\n",
+ "testing\n",
+ "ukmss-34421.pdf\n",
+ "167\n",
+ "wightman2021.pdf\n",
+ "173\n"
+ ]
+ }
+ ],
+ "source": [
+ "tables = []\n",
+ "\n",
+ "for path in os.listdir('papers/'):\n",
+ "    print(path)\n",
+ "\n",
+ "    if path[-3:] != 'pdf':\n",
+ "        continue\n",
+ "\n",
+ "    images = convert_from_path('papers/' + path)\n",
+ "\n",
+ "    # Loop pages\n",
+ "    for image in images:\n",
+ "\n",
+ "        pixel_values = detection_transform(image).unsqueeze(0).to(device)\n",
+ "        with torch.no_grad():\n",
+ "            outputs = model(pixel_values)\n",
+ "\n",
+ "        id2label = model.config.id2label\n",
+ "        id2label[len(model.config.id2label)] = \"no object\"\n",
+ "        detected_tables = outputs_to_objects(outputs, image.size, id2label)\n",
+ "\n",
+ "        # Loop table in page (if any)\n",
+ "        for idx in range(len(detected_tables)):\n",
+ "            cropped_table = image.crop(detected_tables[idx][\"bbox\"])\n",
+ "            if detected_tables[idx][\"label\"] == 'table rotated':\n",
+ "                cropped_table = cropped_table.rotate(270, expand=True)\n",
+ "\n",
+ "            # TODO: what is the perfect threshold?\n",
+ "            if detected_tables[idx]['score'] > 0.9:\n",
+ "                # print(detected_tables[idx])\n",
+ "                tables.append(cropped_table)\n",
+ "\n",
+ "    print(len(tables))\n"
+ ]
+ },
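The TODO about the threshold in the cell above is a fair question: the detector's scores are not calibrated probabilities, so 0.9 is simply a conservative cutoff. One way to tune it, sketched here with hypothetical helper names rather than anything in the repository, is to keep near-threshold detections for manual review instead of discarding them:

    # Hypothetical helper: split detections into confident and borderline groups
    # so different cutoffs can be compared without re-running the model.
    def split_by_score(detections, threshold=0.9, margin=0.1):
        kept = [d for d in detections if d['score'] > threshold]
        borderline = [d for d in detections if threshold - margin < d['score'] <= threshold]
        return kept, borderline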
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tables[0].save(\n",
+ "    'table.pdf', \"PDF\", resolution=100.0, save_all=True, append_images=tables[1:]\n",
+ ")"
+ ]
+ },
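The cell above relies on Pillow's multi-page PDF writer (save_all=True plus append_images), and tables[0] raises an IndexError when nothing was detected. A small guard, as a sketch:

    if tables:
        tables[0].save(
            'table.pdf', "PDF", resolution=100.0, save_all=True, append_images=tables[1:]
        )
    else:
        print('no tables detected, nothing to save')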
+ {
+ {
+ "cell_type": "code",
+ "execution_count": 27,
  "metadata": {},
  "outputs": [],
  "source": [
+ "import fitz\n",
+ "\n",
+ "# Open the PDF file\n",
+ "doc = fitz.open(\"table.pdf\")\n",
+ "\n",
+ "# Keep only the pages with usable tables (listed 1-based, shifted to 0-based below)\n",
+ "pages = [2,3,4,7,8,10,12,13,16,17,28,29,33,34,35,46,47,48,49,56,57,59,60,62,76,77,78,79,80,81,82,84,85,86,87,88,89,90,105,106,107,108,109,110,112,113,118,119,120,123,124,125,130,138,139,154,155,156,159,160,164,166,167,168]\n",
+ "pages = [(x - 1) for x in pages]\n",
+ "doc.select(pages)\n",
+ "\n",
+ "# Save the selected pages to a new PDF\n",
+ "doc.save(\"out_file_name.pdf\")\n",
+ "doc.close()\n"
  ]
+ },
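Here fitz is PyMuPDF: doc.select(pages) keeps exactly the listed 0-based pages, in the given order, which is why the hand-picked 1-based list is shifted by one first. The same selection can be written so the document is closed even if the save fails; a sketch reusing the names from the cell above:

    import fitz  # PyMuPDF

    with fitz.open("table.pdf") as doc:
        doc.select([x - 1 for x in pages])  # fitz page numbers are 0-based
        doc.save("out_file_name.pdf")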
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
  }
  ],
  "metadata": {