ArneBinder committed on
Commit
9f76503
1 Parent(s): fece3f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +178 -12
app.py CHANGED
@@ -1,27 +1,23 @@
 
 
1
  import gradio as gr
2
  from pie_modules.models import * # noqa: F403
3
  from pie_modules.taskmodules import * # noqa: F403
4
- from prettytable import PrettyTable
5
  from pytorch_ie.annotations import LabeledSpan
6
  from pytorch_ie.auto import AutoPipeline
7
  from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
8
  from pytorch_ie.models import * # noqa: F403
9
  from pytorch_ie.taskmodules import * # noqa: F403
10
 
11
-
12
- def predict(text):
13
- document = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions(text=text)
14
-
15
- # add single partition from the whole text (the model only considers text in partitions)
16
- document.labeled_partitions.append(LabeledSpan(start=0, end=len(text), label="text"))
17
-
18
- # execute NER pipeline
19
- pipeline(document)
20
 
21
  t = PrettyTable()
22
  t.field_names = ["head", "tail", "relation"]
23
  t.align = "l"
24
- for relation in document.binary_relations.predictions:
25
  t.add_row([str(relation.head), str(relation.tail), relation.label])
26
 
27
  html = t.get_html_string(format=True)
@@ -30,9 +26,159 @@ def predict(text):
30
  return html
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  if __name__ == "__main__":
34
 
35
  model_name_or_path = "ArneBinder/sam-pointer-bart-base-v0.3"
 
36
  # model_name_or_path = "models/dataset-sciarg/task-ner_re/v0.3/2024-03-01_18-25-32"
37
 
38
  pipeline = AutoPipeline.from_pretrained(model_name_or_path, device=-1, num_workers=0)
@@ -43,14 +189,34 @@ if __name__ == "__main__":
43
  # taskmodule_kwargs=dict(create_relation_candidates=True),
44
  )
45
 
 
 
 
 
 
 
 
46
  iface = gr.Interface(
47
  fn=predict,
48
  inputs=[
49
  gr.Textbox(
50
  lines=20,
51
  value="Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent.",
52
- )
 
 
 
 
 
 
 
 
 
 
 
 
53
  ],
 
54
  outputs=["html"],
55
  )
56
  iface.launch()
 
1
+ import json
2
+
3
  import gradio as gr
4
  from pie_modules.models import * # noqa: F403
5
  from pie_modules.taskmodules import * # noqa: F403
 
6
  from pytorch_ie.annotations import LabeledSpan
7
  from pytorch_ie.auto import AutoPipeline
8
  from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
9
  from pytorch_ie.models import * # noqa: F403
10
  from pytorch_ie.taskmodules import * # noqa: F403
11
 
12
def render_pretty_table(
    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions, **render_kwargs
):
    """Render the document's binary relations (gold + predicted) as an HTML table.

    Args:
        document: document whose ``binary_relations`` (annotations and predictions)
            are listed row by row as (head, tail, relation label).
        **render_kwargs: accepted for signature compatibility with ``render_spacy``;
            unused in the visible body.

    Returns:
        The table as an HTML string.
    """
    # local import so prettytable is only needed when this renderer is selected
    from prettytable import PrettyTable

    t = PrettyTable()
    t.field_names = ["head", "tail", "relation"]
    t.align = "l"
    # one row per relation; gold annotations first, then model predictions
    for relation in list(document.binary_relations) + list(document.binary_relations.predictions):
        t.add_row([str(relation.head), str(relation.tail), relation.label])

    html = t.get_html_string(format=True)
    # NOTE(review): the diff view elides two lines between get_html_string and the
    # return (possibly a scrollable <div> wrapper as in render_spacy) — verify
    # against the full file.
    return html
27
 
28
 
29
def render_spacy(
    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
    style="ent",
    inject_relations=True,
    **render_kwargs,
):
    """Render the document's labeled spans (gold + predicted) with spaCy's displacy.

    Args:
        document: document whose ``labeled_spans`` (annotations and predictions) are
            rendered as entities over ``document.text``.
        style: displacy rendering style, passed through to ``displacy.render``.
        inject_relations: if True, post-process the HTML with
            ``inject_relation_data`` so that hovering an entity highlights its
            relation partners (gold + predicted ``binary_relations``).
        **render_kwargs: forwarded to ``displacy.render`` (e.g. ``options``).

    Returns:
        An HTML string wrapped in a scrollable ``<div>``.
    """
    # local import so spacy is only needed when this renderer is selected
    from spacy import displacy

    # Sort once by (start, end): displacy's manual mode expects entities in order of
    # appearance, and inject_relation_data relies on the rendered entity order
    # matching this sorted list when it assigns element ids. Previously the spans
    # were passed to displacy unsorted and sorted only for the injection step,
    # which could misalign ids (and mis-render) whenever predictions were not
    # already in document order.
    spans = sorted(
        list(document.labeled_spans) + list(document.labeled_spans.predictions),
        key=lambda span: (span.start, span.end),
    )
    spacy_doc = {
        "text": document.text,
        "ents": [
            {"start": entity.start, "end": entity.end, "label": entity.label} for entity in spans
        ],
        "title": None,
    }

    html = displacy.render(
        spacy_doc, page=True, manual=True, minify=True, style=style, **render_kwargs
    )
    html = "<div style='max-width:100%; max-height:360px; overflow:auto'>" + html + "</div>"
    if inject_relations:
        print("Injecting relation data")
        binary_relations = list(document.binary_relations) + list(
            document.binary_relations.predictions
        )
        html = inject_relation_data(
            html, sorted_entities=spans, binary_relations=binary_relations
        )
    else:
        print("Not injecting relation data")
    return html
62
+
63
+
64
def inject_relation_data(html: str, sorted_entities, binary_relations) -> str:
    """Add hover-highlighting of relation partners to displacy-rendered HTML.

    Args:
        html: full HTML page as produced by ``displacy.render(..., page=True)``.
        sorted_entities: span annotations in rendering order; must align
            one-to-one with the ``.entity`` elements found in *html*.
        binary_relations: relation annotations whose ``head``/``tail`` are members
            of *sorted_entities*.

    Returns:
        The HTML with a unique id per entity and an injected ``<script>`` that
        highlights head/tail partners on mouse-over and resets them on mouse-out.

    Raises:
        ValueError: if the text of a rendered entity does not match the
            annotation at the same index (order mismatch).
    """
    import json

    from bs4 import BeautifulSoup

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")

    # Add a stable unique id to each rendered entity and remember its original
    # background color so the script can restore it on mouse-out.
    entities = soup.find_all(class_="entity")
    entity2id = {}
    for idx, entity in enumerate(entities):
        entity["id"] = f"entity-{idx}"
        entity["data-original-color"] = (
            entity["style"].split("background:")[1].split(";")[0].strip()
        )
        entity_annotation = sorted_entities[idx]
        # sanity check: the rendered entity text must match the annotation at the
        # same index (the error message now shows the value actually compared,
        # entity.next, instead of the unrelated entity.text)
        if str(entity_annotation) != entity.next:
            raise ValueError(f"Entity text mismatch: {entity_annotation} != {entity.next}")
        entity2id[entity_annotation] = f"entity-{idx}"

    # Map annotation-level relations to the element ids assigned above.
    prefixed_relations = [
        {
            "head": entity2id[relation.head],
            "tail": entity2id[relation.tail],
            "label": relation.label,
        }
        for relation in binary_relations
    ]

    # Create the JavaScript function to handle mouse over and mouse out events.
    # json.dumps guarantees a valid JS literal; the previous %-interpolation of the
    # Python repr would produce broken JS for labels containing quotes/backslashes.
    script = (
        """
    <script>
    function highlightRelations(entityId, relations) {
        // Reset all entities' styles
        const entities = document.querySelectorAll('.entity');
        entities.forEach(entity => {
            entity.style.backgroundColor = entity.getAttribute('data-original-color');
            entity.style.color = '';
        });

        // If an entity is hovered, highlight it and its related entities with different colors
        if (entityId !== null) {
            const selectedEntity = document.getElementById(entityId);
            if (selectedEntity) {
                selectedEntity.style.backgroundColor = '#ffa';
                selectedEntity.style.color = '#000';
            }

            relations.forEach(relation => {
                if (relation.head === entityId) {
                    const tailEntity = document.getElementById(relation.tail);
                    if (tailEntity) {
                        tailEntity.style.backgroundColor = '#aff';
                        tailEntity.style.color = '#000';
                    }
                }
                if (relation.tail === entityId) {
                    const headEntity = document.getElementById(relation.head);
                    if (headEntity) {
                        headEntity.style.backgroundColor = '#faf';
                        headEntity.style.color = '#000';
                    }
                }
            });
        }
    }

    // Event listeners for mouse over and mouse out on each entity
    document.addEventListener('DOMContentLoaded', (event) => {
        const relations = %s;
        const entities = document.querySelectorAll('.entity');
        entities.forEach(entity => {
            entity.addEventListener('mouseover', () => {
                highlightRelations(entity.id, relations);
            });
            entity.addEventListener('mouseout', () => {
                highlightRelations(null, relations);
            });
        });
    });
    </script>
    """
        % json.dumps(prefixed_relations)
    )

    # Inject the script into the HTML
    soup.body.append(BeautifulSoup(script, "html.parser"))

    # Return the modified HTML as a string
    return str(soup)
156
+
157
+
158
def predict(text, render_as, render_kwargs_json):
    """Run the span/relation pipeline on *text* and render the result as HTML.

    Args:
        text: raw input text.
        render_as: output format, either "Pretty Table" or "spaCy".
        render_kwargs_json: JSON string with keyword arguments for the renderer.

    Returns:
        The rendered HTML string.

    Raises:
        ValueError: if *render_as* names no known renderer.
    """
    document = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions(text=text)

    # the model only considers text covered by partitions, so add a single
    # partition spanning the whole input
    document.labeled_partitions.append(LabeledSpan(start=0, end=len(text), label="text"))

    # run the prediction pipeline in-place on the document
    pipeline(document)

    render_kwargs = json.loads(render_kwargs_json)
    renderers = {"Pretty Table": render_pretty_table, "spaCy": render_spacy}
    try:
        renderer = renderers[render_as]
    except KeyError:
        raise ValueError(f"Unknown render_as value: {render_as}") from None
    return renderer(document, **render_kwargs)
176
+
177
+
178
  if __name__ == "__main__":
179
 
180
  model_name_or_path = "ArneBinder/sam-pointer-bart-base-v0.3"
181
+ # local path
182
  # model_name_or_path = "models/dataset-sciarg/task-ner_re/v0.3/2024-03-01_18-25-32"
183
 
184
  pipeline = AutoPipeline.from_pretrained(model_name_or_path, device=-1, num_workers=0)
 
189
  # taskmodule_kwargs=dict(create_relation_candidates=True),
190
  )
191
 
192
    # default keyword arguments for the renderer, shown (as JSON) in an editable
    # textbox so users can tweak e.g. the displacy colors
    default_render_kwargs = {
        "style": "ent",
        "options": {
            # displacy background colors per span label
            "colors": {"own_claim": "#009933", "background_claim": "#0033cc", "data": "#993399"}
        },
    }

    iface = gr.Interface(
        fn=predict,
        inputs=[
            # main text input, pre-filled with a paper abstract as an example
            gr.Textbox(
                lines=20,
                value="Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent.",
            ),
        ],
        # extra controls collapsed into a "Render Options" accordion:
        # output format selector + renderer kwargs as editable JSON
        additional_inputs=[
            gr.Dropdown(
                label="Render as",
                choices=["Pretty Table", "spaCy"],
                value="spaCy",
            ),
            gr.Textbox(
                label="Render Arguments",
                lines=5,
                value=json.dumps(default_render_kwargs, indent=2),
            ),
        ],
        additional_inputs_accordion=gr.Accordion(label="Render Options", open=False),
        outputs=["html"],
    )
    iface.launch()