fadliaulawi committed on
Commit 862259b
1 Parent(s): fb4710e

Generate result with LLM validation

Files changed (4):
  1. app.py +5 -4
  2. process.py +35 -18
  3. prompt.py +7 -5
  4. resources/experiment.ipynb +53 -0
app.py CHANGED
@@ -33,7 +33,7 @@ uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_m
 
     chunk_option = st.selectbox(
         'Tokens amounts per process :',
-        (32000, 16000, 8000, 0), key='table_hv'
+        (32000, 16000, 8000), key='table_hv'
     )
     chunk_overlap = 0
 
@@ -112,14 +112,15 @@ if uploaded_files:
     dataframe = dataframe[['Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name', 'Publication Year', 'Population', 'Sample Size', 'Study Methodology', 'Study Level', 'Conclusion']]
     dataframe.drop_duplicates(['Genes', 'SNPs'], inplace=True)
     dataframe.reset_index(drop=True, inplace=True)
-    cleaned_dataframe = validate(dataframe)
+    cleaned_df, cleaned_llm_df = validate(dataframe)
 
     end_time = datetime.now()
     st.write("Success in ", round((end_time.timestamp() - start_time.timestamp()) / 60, 2), "minutes")
 
-    st.dataframe(cleaned_dataframe)
+    st.dataframe(cleaned_df)
     with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
-        cleaned_dataframe.to_excel(writer, sheet_name='Result')
+        cleaned_llm_df.to_excel(writer, sheet_name='Result with LLM')
+        cleaned_df.to_excel(writer, sheet_name='Result')
         dataframe.to_excel(writer, sheet_name='Original')
         writer.close()
 
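A note on the write-out above: the with pd.ExcelWriter(...) block already closes the writer on exit, so the explicit writer.close() call is redundant. A minimal standalone sketch of the three-sheet pattern, assuming buffer is an io.BytesIO as in app.py; the dataframes and the download hookup below are hypothetical stand-ins, not part of this commit:

import io

import pandas as pd
import streamlit as st

# Hypothetical stand-ins for the frames returned by validate(dataframe).
cleaned_df = pd.DataFrame({'Genes': ['ABC'], 'SNPs': ['rs123'], 'Diseases': ['']})
cleaned_llm_df = cleaned_df.copy()
dataframe = cleaned_df.copy()

buffer = io.BytesIO()
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
    cleaned_llm_df.to_excel(writer, sheet_name='Result with LLM')
    cleaned_df.to_excel(writer, sheet_name='Result')
    dataframe.to_excel(writer, sheet_name='Original')
# The context manager has closed the writer; buffer now holds the workbook.

st.download_button('Download results', data=buffer.getvalue(), file_name='result.xlsx')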
process.py CHANGED
@@ -8,7 +8,7 @@ from langchain.chains.llm import LLMChain
 from langchain.prompts import PromptTemplate
 from langchain_openai import ChatOpenAI
 from pdf2image import convert_from_path
-from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table
+from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table, prompt_validation
 from table_detector import detection_transform, device, model, ocr, outputs_to_objects
 
 import io
@@ -180,6 +180,7 @@ def get_table(path):
 
 def validate(df):
 
+    df = df[df['Genes'].notna()].reset_index(drop=True)
     df = df.fillna('')
     df['Genes'] = df['Genes'].str.upper()
     df['SNPs'] = df['SNPs'].str.lower()
@@ -191,32 +192,48 @@ def validate(df):
         for s in sym:
             if s in gene:
                 genes = gene.split(s)
-                df.loc[len(df)] = df.loc[i]
-                df.loc[i, 'Genes'] = genes[0]
-                df.loc[len(df) - 1, 'Genes'] = genes[1]
+                df.loc[i + 0.5] = df.loc[i]
+                df = df.sort_index().reset_index(drop=True)
+                df.loc[i, 'Genes'], df.loc[i + 1, 'Genes'] = genes[0], genes[1]
 
     # Check if there is SNPs without 'rs'
     for i in df.index:
         safe = True
         snp = df.loc[i, 'SNPs']
-        if not re.fullmatch('rs(\d)+|', snp):
-            if not re.fullmatch('s(\d)+', snp):
-                if not re.fullmatch('(\d)+', snp):
-                    safe = False
-                    df = df.drop(i)
-                else:
-                    snp = 'rs' + snp
-            else:
-                snp = 'r' + snp
-
+        if re.fullmatch('rs(\d)+|', snp):
+            pass
+        elif re.fullmatch('ts(\d)+', snp):
+            snp = 'r' + snp[1:]
+        elif re.fullmatch('s(\d)+', snp):
+            snp = 'r' + snp
+        elif re.fullmatch('(\d)+', snp):
+            snp = 'rs' + snp
+        else:
+            safe = False
+            df = df.drop(i)
+
         if safe:
             df.loc[i, 'SNPs'] = snp
 
     df.reset_index(drop=True, inplace=True)
 
-    # TODO: How to validate genes and SNPs?
-
-    # TODO: Validate genes and diseases with LLM
-    result = llm_p.invoke(model='mistral-7b-instruct', input='How many stars?')
-
-    return df
+    # Validate genes and diseases with LLM
+    json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
+    str_json_table = json.dumps(json.loads(json_table), indent=2)
+
+    result = llm_p.invoke(model='mistral-7b-instruct', input=prompt_validation.format(str_json_table)).content
+    print('val')
+    print(result)
+
+    result = result[result.find('['):result.rfind(']')+1]
+    try:
+        result = eval(result)
+    except SyntaxError:
+        result = []
+
+    df_val = pd.DataFrame(result)
+    df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
+
+    # TODO: How to validate genes and SNPs?
+
+    return df, df_val
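Two notes on the validate() changes above. First, the gene-splitting rewrite: assigning a copy of row i to the fractional label i + 0.5 and then sorting the index inserts the duplicate directly after row i, where the old code appended it at the end of the frame. A minimal sketch of the trick on hypothetical data (it assumes a default integer index):

import pandas as pd

df = pd.DataFrame({'Genes': ['A/B', 'C'], 'SNPs': ['rs1', 'rs2']})

i = 0
genes = df.loc[i, 'Genes'].split('/')
df.loc[i + 0.5] = df.loc[i]                  # enlargement: copy of row i at label 0.5
df = df.sort_index().reset_index(drop=True)  # the copy now sits directly after row i
df.loc[i, 'Genes'], df.loc[i + 1, 'Genes'] = genes[0], genes[1]

print(df)  # rows: (A, rs1), (B, rs1), (C, rs2)

Second, the LLM output is parsed by slicing from the first '[' to the last ']' and calling eval() on model-generated text. A sketch of the same step with ast.literal_eval instead (an alternative, not what this commit ships): literal_eval still accepts the Python-style single quotes models sometimes emit, which json.loads would reject, but it cannot execute arbitrary expressions.

import ast

def parse_llm_rows(text):
    # Keep only the first '[' through the last ']' to drop any chatter
    # the model wraps around the list.
    span = text[text.find('['):text.rfind(']') + 1]
    try:
        return ast.literal_eval(span)
    except (SyntaxError, ValueError):
        return []

print(parse_llm_rows('Sure! [{"Genes": "ABC", "SNPs": "rs123", "Diseases": ""}]'))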
prompt.py CHANGED
@@ -265,8 +265,8 @@ If there is no specific extracted entities provided from the table, just leave t
 
 prompt_validation = """
 # CONTEXT #
-In my capacity as a genomics specialist, I have table data containing gene names with its corresponding SNPs and diseases. The data is provided in a list of JSONs format, with each JSON object representing a single row in a tabular structure.
-The problem is because the data is extracetd using OCR, some gene names and SNPs maybe have a typo.
+In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided as a list of JSON objects, with each object representing a single row in a tabular structure.
+Because the data was extracted using OCR, some gene names and SNPs may contain typos.
 
 This is the data:
 {}
@@ -274,11 +274,13 @@ This is the data:
 # OBJECTIVE #
 Given the provided table data, the following tasks need to be completed:
 
-1. Check whether the gene name is a correct gene name. If the gene name is suspected to be a typo, fix it into a correct form. If not, eliminate this row data because the gene name is not valid.
-2. If diseases not empty, check whether the gene name is correspond with the gene names. Fix it with the correct diseases if the original disease is wrong.
+1. Check whether the gene name is a valid gene name. If the gene name is suspected to contain a typo, fix it into the correct form. If the gene name seems entirely mistaken or invalid, remove the data row. Common errors include:
+   - Combined Names: Two gene names erroneously merged into one. Separate these using "and": "A and B".
+   - OCR Errors: Similar characters misread by the system. Correct these to the intended form.
+2. If the diseases field is not empty, check whether the disease corresponds with the gene name. Fix it with the correct disease if the original disease is wrong.
 
 # RESPONSE #
-The output should only be a string containing list of JSON objects, each representing an validated entry with the following structure:
+The output must be only a string containing a list of JSON objects, adhering to the structure of the original input data. Each object represents a validated entry with the following structure:
 [
 {{
 "Genes": "A",
resources/experiment.ipynb CHANGED
@@ -2316,6 +2316,59 @@
    "result = llm.invoke(model='llama-3-70b-instruct', input=prompt)\n",
    "print(result.content)"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py:1429: FutureWarning: The repository for bigbio/euadr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bigbio/euadr\n",
+      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
+      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "ename": "ConnectionError",
+     "evalue": "Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))",
+     "output_type": "error",
+     "traceback": [
+      "---------------------------------------------------------------------------",
+      "ConnectionError                           Traceback (most recent call last)",
+      "<ipython-input-2-8057498175ab> in <module>",
+      "      1 from datasets import load_dataset",
+      "----> 3 dataset = load_dataset(\"bigbio/euadr\")",
+      "[... intermediate frames in datasets/load.py, builder.py, euadr.py, download/download_manager.py, utils/py_utils.py and utils/file_utils.py elided; ANSI color codes stripped ...]",
+      "ConnectionError: Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))"
+     ]
+    }
+   ],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset(\"bigbio/euadr\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset"
+   ]
+  }
   }
  ],
  "metadata": {