ArneBinder committed on
Commit
b77f1d0
1 Parent(s): 4467900

Upload 7 files

Browse files
Files changed (2) hide show
  1. app.py +77 -17
  2. backend.py +3 -4
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import json
2
  import logging
3
  import os.path
 
4
  from functools import partial
5
  from typing import Dict, List, Optional, Tuple
6
 
@@ -71,7 +72,7 @@ def process_uploaded_files(
71
  str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
72
  ],
73
  vector_store: VectorStore[Tuple[str, str]],
74
- ) -> None:
75
  try:
76
  for file_name in file_names:
77
  if file_name.lower().endswith(".txt"):
@@ -86,6 +87,8 @@ def process_uploaded_files(
86
  except Exception as e:
87
  raise gr.Error(f"Failed to process uploaded files: {e}")
88
 
 
 
89
 
90
def open_accordion():
    """Build a Gradio Accordion update that expands the accordion."""
    opened = gr.Accordion(open=True)
    return opened
@@ -181,6 +184,41 @@ def set_relation_types(
181
  )
182
 
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  def main():
185
 
186
  example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
@@ -283,15 +321,22 @@ def main():
283
  )
284
 
285
  with gr.Column(scale=1):
286
- with gr.Accordion("Indexed Documents", open=False):
 
 
287
  processed_documents_df = gr.DataFrame(
288
  headers=["id", "num_adus", "num_relations"],
289
  interactive=False,
290
  )
 
 
 
 
 
291
 
292
- with gr.Accordion("Reference ADU", open=False):
293
- reference_adu_id = gr.Textbox(label="ID", elem_id="reference_adu_id")
294
- reference_adu_text = gr.Textbox(label="Text")
295
 
296
  with gr.Accordion("Retrieval Configuration", open=False):
297
  min_similarity = gr.Slider(
@@ -318,13 +363,14 @@ def main():
318
  relevant_adus = gr.DataFrame(
319
  label="Relevant ADUs from other documents",
320
  headers=[
321
- "text",
322
  "relation",
323
- "doc_id",
324
  "reference_adu",
 
325
  "sim_score",
326
  "rel_score",
327
  ],
 
328
  )
329
 
330
  render_event_kwargs = dict(
@@ -354,12 +400,10 @@ def main():
354
  )
355
 
356
  upload_btn.upload(
 
 
357
  fn=process_uploaded_files,
358
  inputs=[upload_btn, models_state, processed_documents_state, vector_store_state],
359
- outputs=[],
360
- ).success(
361
- fn=update_processed_documents_df,
362
- inputs=[processed_documents_state],
363
  outputs=[processed_documents_df],
364
  )
365
  processed_documents_df.select(
@@ -368,30 +412,46 @@ def main():
368
  outputs=[document_state],
369
  )
370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  retrieve_relevant_adus_event_kwargs = dict(
372
  fn=get_relevant_adus,
373
  inputs=[
374
- reference_adu_id,
375
  document_state,
376
  vector_store_state,
377
  processed_documents_state,
378
  min_similarity,
379
  top_k,
380
  relation_types,
 
381
  ],
382
  outputs=[relevant_adus],
383
  )
384
 
385
- reference_adu_id.change(
386
  fn=partial(get_annotation_from_document, annotation_layer="labeled_spans"),
387
- inputs=[document_state, reference_adu_id],
388
- outputs=[reference_adu_text],
389
  ).success(**retrieve_relevant_adus_event_kwargs)
390
 
391
  retrieve_similar_adus_btn.click(
392
  fn=get_similar_adus,
393
  inputs=[
394
- reference_adu_id,
395
  document_state,
396
  vector_store_state,
397
  processed_documents_state,
@@ -475,7 +535,7 @@ def main():
475
  }
476
  function setReferenceAduId(entityId) {
477
  // get the textarea element that holds the reference adu id
478
- let referenceAduIdDiv = document.querySelector('#reference_adu_id textarea');
479
  // set the value of the input field
480
  referenceAduIdDiv.value = entityId;
481
  // trigger an input event to update the state
 
1
  import json
2
  import logging
3
  import os.path
4
+ import tempfile
5
  from functools import partial
6
  from typing import Dict, List, Optional, Tuple
7
 
 
72
  str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
73
  ],
74
  vector_store: VectorStore[Tuple[str, str]],
75
+ ) -> pd.DataFrame:
76
  try:
77
  for file_name in file_names:
78
  if file_name.lower().endswith(".txt"):
 
87
  except Exception as e:
88
  raise gr.Error(f"Failed to process uploaded files: {e}")
89
 
90
+ return update_processed_documents_df(processed_documents)
91
+
92
 
93
  def open_accordion():
94
  return gr.Accordion(open=True)
 
184
  )
185
 
186
 
187
def download_processed_documents(
    processed_documents: Dict[
        str, "TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions"
    ],
    file_name: str = "processed_documents.json",
) -> str:
    """Serialize all processed documents to a JSON file in the system temp dir.

    Each document is converted via its ``asdict()`` method and keyed by its
    document id. The file is written to ``tempfile.gettempdir()`` so it can be
    served by a Gradio DownloadButton.

    Args:
        processed_documents: Mapping from document id to processed document.
        file_name: Base name of the JSON file to create (no directory parts).

    Returns:
        The absolute path of the written JSON file.
    """
    # NOTE: the project class in the annotation is quoted (forward reference)
    # so the annotation is not eagerly evaluated at import time; Dict is used
    # for consistency with the file's existing `from typing import Dict`.
    processed_documents_json = {
        doc_id: document.asdict() for doc_id, document in processed_documents.items()
    }
    file_path = os.path.join(tempfile.gettempdir(), file_name)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(processed_documents_json, f, indent=2)
    return file_path
200
+
201
+
202
def upload_processed_documents(
    file_name: str,
    processed_documents: dict[
        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
    ],
) -> Dict[str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
    """Load documents from a JSON file and merge them into the given mapping.

    Args:
        file_name: Path of the JSON file produced by the download counterpart.
        processed_documents: Mapping (mutated in place) from document id to
            processed document; existing ids are overwritten with a warning.

    Returns:
        The same ``processed_documents`` mapping, updated with the loaded docs.
    """
    with open(file_name, "r", encoding="utf-8") as f:
        serialized_docs = json.load(f)
    for doc_id, doc_json in serialized_docs.items():
        restored = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions.fromdict(
            doc_json
        )
        # metadata is not automatically deserialized, so we need to set it manually
        restored.metadata["embeddings"] = doc_json["metadata"]["embeddings"]
        if doc_id in processed_documents:
            gr.Warning(f"Document '{doc_id}' already exists. Overwriting.")
        processed_documents[doc_id] = restored
    return processed_documents
220
+
221
+
222
  def main():
223
 
224
  example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
 
321
  )
322
 
323
  with gr.Column(scale=1):
324
+ with gr.Accordion(
325
+ "Indexed Documents", open=False
326
+ ) as processed_documents_accordion:
327
  processed_documents_df = gr.DataFrame(
328
  headers=["id", "num_adus", "num_relations"],
329
  interactive=False,
330
  )
331
+ with gr.Row():
332
+ download_processed_documents_btn = gr.DownloadButton("Download")
333
+ upload_processed_documents_btn = gr.UploadButton(
334
+ "Upload", file_types=["json"]
335
+ )
336
 
337
+ with gr.Accordion("Selected ADU", open=False):
338
+ selected_adu_id = gr.Textbox(label="ID", elem_id="selected_adu_id")
339
+ selected_adu_text = gr.Textbox(label="Text")
340
 
341
  with gr.Accordion("Retrieval Configuration", open=False):
342
  min_similarity = gr.Slider(
 
363
  relevant_adus = gr.DataFrame(
364
  label="Relevant ADUs from other documents",
365
  headers=[
 
366
  "relation",
367
+ "adu",
368
  "reference_adu",
369
+ "doc_id",
370
  "sim_score",
371
  "rel_score",
372
  ],
373
+ interactive=False,
374
  )
375
 
376
  render_event_kwargs = dict(
 
400
  )
401
 
402
  upload_btn.upload(
403
+ fn=open_accordion, inputs=[], outputs=[processed_documents_accordion]
404
+ ).then(
405
  fn=process_uploaded_files,
406
  inputs=[upload_btn, models_state, processed_documents_state, vector_store_state],
 
 
 
 
407
  outputs=[processed_documents_df],
408
  )
409
  processed_documents_df.select(
 
412
  outputs=[document_state],
413
  )
414
 
415
+ download_processed_documents_btn.click(
416
+ fn=download_processed_documents,
417
+ inputs=[processed_documents_state],
418
+ outputs=[download_processed_documents_btn],
419
+ )
420
+ upload_processed_documents_btn.upload(
421
+ fn=upload_processed_documents,
422
+ inputs=[upload_processed_documents_btn, processed_documents_state],
423
+ outputs=[processed_documents_state],
424
+ ).success(
425
+ fn=update_processed_documents_df,
426
+ inputs=[processed_documents_state],
427
+ outputs=[processed_documents_df],
428
+ )
429
+
430
  retrieve_relevant_adus_event_kwargs = dict(
431
  fn=get_relevant_adus,
432
  inputs=[
433
+ selected_adu_id,
434
  document_state,
435
  vector_store_state,
436
  processed_documents_state,
437
  min_similarity,
438
  top_k,
439
  relation_types,
440
+ relevant_adus,
441
  ],
442
  outputs=[relevant_adus],
443
  )
444
 
445
+ selected_adu_id.change(
446
  fn=partial(get_annotation_from_document, annotation_layer="labeled_spans"),
447
+ inputs=[document_state, selected_adu_id],
448
+ outputs=[selected_adu_text],
449
  ).success(**retrieve_relevant_adus_event_kwargs)
450
 
451
  retrieve_similar_adus_btn.click(
452
  fn=get_similar_adus,
453
  inputs=[
454
+ selected_adu_id,
455
  document_state,
456
  vector_store_state,
457
  processed_documents_state,
 
535
  }
536
  function setReferenceAduId(entityId) {
537
  // get the textarea element that holds the reference adu id
538
+ let referenceAduIdDiv = document.querySelector('#selected_adu_id textarea');
539
  // set the value of the input field
540
  referenceAduIdDiv.value = entityId;
541
  // trigger an input event to update the state
backend.py CHANGED
@@ -253,6 +253,7 @@ def get_relevant_adus(
253
  min_similarity: float,
254
  top_k: int,
255
  relation_types: List[str],
 
256
  ) -> pd.DataFrame:
257
  similar_entries = vector_store.retrieve_similar(
258
  ref_id=(ref_document.id, ref_annotation_id),
@@ -289,12 +290,10 @@ def get_relevant_adus(
289
  "sim_score": score,
290
  "rel_score": rel.score,
291
  "relation": rel.label,
292
- "text": str(rel.tail),
293
  }
294
  )
295
 
296
  # define column order
297
- df = pd.DataFrame(
298
- result, columns=["text", "relation", "doc_id", "reference_adu", "sim_score", "rel_score"]
299
- )
300
  return df
 
253
  min_similarity: float,
254
  top_k: int,
255
  relation_types: List[str],
256
+ previous_result: pd.DataFrame,
257
  ) -> pd.DataFrame:
258
  similar_entries = vector_store.retrieve_similar(
259
  ref_id=(ref_document.id, ref_annotation_id),
 
290
  "sim_score": score,
291
  "rel_score": rel.score,
292
  "relation": rel.label,
293
+ "adu": str(rel.tail),
294
  }
295
  )
296
 
297
  # define column order
298
+ df = pd.DataFrame(result, columns=previous_result.columns)
 
 
299
  return df