ArneBinder
committed on
Commit
•
b77f1d0
1
Parent(s):
4467900
Upload 7 files
Browse files- app.py +77 -17
- backend.py +3 -4
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import json
|
2 |
import logging
|
3 |
import os.path
|
|
|
4 |
from functools import partial
|
5 |
from typing import Dict, List, Optional, Tuple
|
6 |
|
@@ -71,7 +72,7 @@ def process_uploaded_files(
|
|
71 |
str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
|
72 |
],
|
73 |
vector_store: VectorStore[Tuple[str, str]],
|
74 |
-
) ->
|
75 |
try:
|
76 |
for file_name in file_names:
|
77 |
if file_name.lower().endswith(".txt"):
|
@@ -86,6 +87,8 @@ def process_uploaded_files(
|
|
86 |
except Exception as e:
|
87 |
raise gr.Error(f"Failed to process uploaded files: {e}")
|
88 |
|
|
|
|
|
89 |
|
90 |
def open_accordion():
|
91 |
return gr.Accordion(open=True)
|
@@ -181,6 +184,41 @@ def set_relation_types(
|
|
181 |
)
|
182 |
|
183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
def main():
|
185 |
|
186 |
example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
|
@@ -283,15 +321,22 @@ def main():
|
|
283 |
)
|
284 |
|
285 |
with gr.Column(scale=1):
|
286 |
-
with gr.Accordion(
|
|
|
|
|
287 |
processed_documents_df = gr.DataFrame(
|
288 |
headers=["id", "num_adus", "num_relations"],
|
289 |
interactive=False,
|
290 |
)
|
|
|
|
|
|
|
|
|
|
|
291 |
|
292 |
-
with gr.Accordion("
|
293 |
-
|
294 |
-
|
295 |
|
296 |
with gr.Accordion("Retrieval Configuration", open=False):
|
297 |
min_similarity = gr.Slider(
|
@@ -318,13 +363,14 @@ def main():
|
|
318 |
relevant_adus = gr.DataFrame(
|
319 |
label="Relevant ADUs from other documents",
|
320 |
headers=[
|
321 |
-
"text",
|
322 |
"relation",
|
323 |
-
"
|
324 |
"reference_adu",
|
|
|
325 |
"sim_score",
|
326 |
"rel_score",
|
327 |
],
|
|
|
328 |
)
|
329 |
|
330 |
render_event_kwargs = dict(
|
@@ -354,12 +400,10 @@ def main():
|
|
354 |
)
|
355 |
|
356 |
upload_btn.upload(
|
|
|
|
|
357 |
fn=process_uploaded_files,
|
358 |
inputs=[upload_btn, models_state, processed_documents_state, vector_store_state],
|
359 |
-
outputs=[],
|
360 |
-
).success(
|
361 |
-
fn=update_processed_documents_df,
|
362 |
-
inputs=[processed_documents_state],
|
363 |
outputs=[processed_documents_df],
|
364 |
)
|
365 |
processed_documents_df.select(
|
@@ -368,30 +412,46 @@ def main():
|
|
368 |
outputs=[document_state],
|
369 |
)
|
370 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
371 |
retrieve_relevant_adus_event_kwargs = dict(
|
372 |
fn=get_relevant_adus,
|
373 |
inputs=[
|
374 |
-
|
375 |
document_state,
|
376 |
vector_store_state,
|
377 |
processed_documents_state,
|
378 |
min_similarity,
|
379 |
top_k,
|
380 |
relation_types,
|
|
|
381 |
],
|
382 |
outputs=[relevant_adus],
|
383 |
)
|
384 |
|
385 |
-
|
386 |
fn=partial(get_annotation_from_document, annotation_layer="labeled_spans"),
|
387 |
-
inputs=[document_state,
|
388 |
-
outputs=[
|
389 |
).success(**retrieve_relevant_adus_event_kwargs)
|
390 |
|
391 |
retrieve_similar_adus_btn.click(
|
392 |
fn=get_similar_adus,
|
393 |
inputs=[
|
394 |
-
|
395 |
document_state,
|
396 |
vector_store_state,
|
397 |
processed_documents_state,
|
@@ -475,7 +535,7 @@ def main():
|
|
475 |
}
|
476 |
function setReferenceAduId(entityId) {
|
477 |
// get the textarea element that holds the reference adu id
|
478 |
-
let referenceAduIdDiv = document.querySelector('#
|
479 |
// set the value of the input field
|
480 |
referenceAduIdDiv.value = entityId;
|
481 |
// trigger an input event to update the state
|
|
|
1 |
import json
|
2 |
import logging
|
3 |
import os.path
|
4 |
+
import tempfile
|
5 |
from functools import partial
|
6 |
from typing import Dict, List, Optional, Tuple
|
7 |
|
|
|
72 |
str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
|
73 |
],
|
74 |
vector_store: VectorStore[Tuple[str, str]],
|
75 |
+
) -> pd.DataFrame:
|
76 |
try:
|
77 |
for file_name in file_names:
|
78 |
if file_name.lower().endswith(".txt"):
|
|
|
87 |
except Exception as e:
|
88 |
raise gr.Error(f"Failed to process uploaded files: {e}")
|
89 |
|
90 |
+
return update_processed_documents_df(processed_documents)
|
91 |
+
|
92 |
|
93 |
def open_accordion():
|
94 |
return gr.Accordion(open=True)
|
|
|
184 |
)
|
185 |
|
186 |
|
187 |
+
def download_processed_documents(
    processed_documents: dict[
        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
    ],
    file_name: str = "processed_documents.json",
) -> str:
    """Serialize all processed documents to a JSON file and return its path.

    Each document is converted via its ``asdict()`` method and dumped as one
    top-level JSON object keyed by document id. The returned path is meant to
    be handed to a Gradio ``DownloadButton``.

    Args:
        processed_documents: Mapping from document id to processed document.
        file_name: Basename of the file offered for download.

    Returns:
        Absolute path of the written JSON file.
    """
    processed_documents_json = {
        doc_id: document.asdict() for doc_id, document in processed_documents.items()
    }
    # Write into a fresh per-call temp directory instead of a fixed path under
    # gettempdir(): a fixed path collides when several sessions download at
    # once (each call would overwrite the previous file). Keeping `file_name`
    # as the basename preserves the suggested download name.
    file_path = os.path.join(tempfile.mkdtemp(), file_name)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(processed_documents_json, f, indent=2)
    return file_path
|
200 |
+
|
201 |
+
|
202 |
+
def upload_processed_documents(
    file_name: str,
    processed_documents: dict[
        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
    ],
) -> Dict[str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
    """Load documents from a JSON file and merge them into ``processed_documents``.

    The file is expected to contain one top-level object mapping document ids
    to serialized documents (the format written by the download counterpart).
    Existing entries with the same id are overwritten, with a Gradio warning.

    Args:
        file_name: Path of the uploaded JSON file.
        processed_documents: Mapping to merge the loaded documents into
            (mutated in place).

    Returns:
        The updated ``processed_documents`` mapping.
    """
    with open(file_name, "r", encoding="utf-8") as f:
        loaded_json = json.load(f)
    for doc_id, doc_json in loaded_json.items():
        restored = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions.fromdict(
            doc_json
        )
        # fromdict() does not restore metadata, so copy the embeddings by hand.
        restored.metadata["embeddings"] = doc_json["metadata"]["embeddings"]
        if doc_id in processed_documents:
            gr.Warning(f"Document '{doc_id}' already exists. Overwriting.")
        processed_documents[doc_id] = restored
    return processed_documents
|
220 |
+
|
221 |
+
|
222 |
def main():
|
223 |
|
224 |
example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
|
|
|
321 |
)
|
322 |
|
323 |
with gr.Column(scale=1):
|
324 |
+
with gr.Accordion(
|
325 |
+
"Indexed Documents", open=False
|
326 |
+
) as processed_documents_accordion:
|
327 |
processed_documents_df = gr.DataFrame(
|
328 |
headers=["id", "num_adus", "num_relations"],
|
329 |
interactive=False,
|
330 |
)
|
331 |
+
with gr.Row():
|
332 |
+
download_processed_documents_btn = gr.DownloadButton("Download")
|
333 |
+
upload_processed_documents_btn = gr.UploadButton(
|
334 |
+
"Upload", file_types=["json"]
|
335 |
+
)
|
336 |
|
337 |
+
with gr.Accordion("Selected ADU", open=False):
|
338 |
+
selected_adu_id = gr.Textbox(label="ID", elem_id="selected_adu_id")
|
339 |
+
selected_adu_text = gr.Textbox(label="Text")
|
340 |
|
341 |
with gr.Accordion("Retrieval Configuration", open=False):
|
342 |
min_similarity = gr.Slider(
|
|
|
363 |
relevant_adus = gr.DataFrame(
|
364 |
label="Relevant ADUs from other documents",
|
365 |
headers=[
|
|
|
366 |
"relation",
|
367 |
+
"adu",
|
368 |
"reference_adu",
|
369 |
+
"doc_id",
|
370 |
"sim_score",
|
371 |
"rel_score",
|
372 |
],
|
373 |
+
interactive=False,
|
374 |
)
|
375 |
|
376 |
render_event_kwargs = dict(
|
|
|
400 |
)
|
401 |
|
402 |
upload_btn.upload(
|
403 |
+
fn=open_accordion, inputs=[], outputs=[processed_documents_accordion]
|
404 |
+
).then(
|
405 |
fn=process_uploaded_files,
|
406 |
inputs=[upload_btn, models_state, processed_documents_state, vector_store_state],
|
|
|
|
|
|
|
|
|
407 |
outputs=[processed_documents_df],
|
408 |
)
|
409 |
processed_documents_df.select(
|
|
|
412 |
outputs=[document_state],
|
413 |
)
|
414 |
|
415 |
+
download_processed_documents_btn.click(
|
416 |
+
fn=download_processed_documents,
|
417 |
+
inputs=[processed_documents_state],
|
418 |
+
outputs=[download_processed_documents_btn],
|
419 |
+
)
|
420 |
+
upload_processed_documents_btn.upload(
|
421 |
+
fn=upload_processed_documents,
|
422 |
+
inputs=[upload_processed_documents_btn, processed_documents_state],
|
423 |
+
outputs=[processed_documents_state],
|
424 |
+
).success(
|
425 |
+
fn=update_processed_documents_df,
|
426 |
+
inputs=[processed_documents_state],
|
427 |
+
outputs=[processed_documents_df],
|
428 |
+
)
|
429 |
+
|
430 |
retrieve_relevant_adus_event_kwargs = dict(
|
431 |
fn=get_relevant_adus,
|
432 |
inputs=[
|
433 |
+
selected_adu_id,
|
434 |
document_state,
|
435 |
vector_store_state,
|
436 |
processed_documents_state,
|
437 |
min_similarity,
|
438 |
top_k,
|
439 |
relation_types,
|
440 |
+
relevant_adus,
|
441 |
],
|
442 |
outputs=[relevant_adus],
|
443 |
)
|
444 |
|
445 |
+
selected_adu_id.change(
|
446 |
fn=partial(get_annotation_from_document, annotation_layer="labeled_spans"),
|
447 |
+
inputs=[document_state, selected_adu_id],
|
448 |
+
outputs=[selected_adu_text],
|
449 |
).success(**retrieve_relevant_adus_event_kwargs)
|
450 |
|
451 |
retrieve_similar_adus_btn.click(
|
452 |
fn=get_similar_adus,
|
453 |
inputs=[
|
454 |
+
selected_adu_id,
|
455 |
document_state,
|
456 |
vector_store_state,
|
457 |
processed_documents_state,
|
|
|
535 |
}
|
536 |
function setReferenceAduId(entityId) {
|
537 |
// get the textarea element that holds the reference adu id
|
538 |
+
let referenceAduIdDiv = document.querySelector('#selected_adu_id textarea');
|
539 |
// set the value of the input field
|
540 |
referenceAduIdDiv.value = entityId;
|
541 |
// trigger an input event to update the state
|
backend.py
CHANGED
@@ -253,6 +253,7 @@ def get_relevant_adus(
|
|
253 |
min_similarity: float,
|
254 |
top_k: int,
|
255 |
relation_types: List[str],
|
|
|
256 |
) -> pd.DataFrame:
|
257 |
similar_entries = vector_store.retrieve_similar(
|
258 |
ref_id=(ref_document.id, ref_annotation_id),
|
@@ -289,12 +290,10 @@ def get_relevant_adus(
|
|
289 |
"sim_score": score,
|
290 |
"rel_score": rel.score,
|
291 |
"relation": rel.label,
|
292 |
-
"
|
293 |
}
|
294 |
)
|
295 |
|
296 |
# define column order
|
297 |
-
df = pd.DataFrame(
|
298 |
-
result, columns=["text", "relation", "doc_id", "reference_adu", "sim_score", "rel_score"]
|
299 |
-
)
|
300 |
return df
|
|
|
253 |
min_similarity: float,
|
254 |
top_k: int,
|
255 |
relation_types: List[str],
|
256 |
+
previous_result: pd.DataFrame,
|
257 |
) -> pd.DataFrame:
|
258 |
similar_entries = vector_store.retrieve_similar(
|
259 |
ref_id=(ref_document.id, ref_annotation_id),
|
|
|
290 |
"sim_score": score,
|
291 |
"rel_score": rel.score,
|
292 |
"relation": rel.label,
|
293 |
+
"adu": str(rel.tail),
|
294 |
}
|
295 |
)
|
296 |
|
297 |
# define column order
|
298 |
+
df = pd.DataFrame(result, columns=previous_result.columns)
|
|
|
|
|
299 |
return df
|