lterriel committed on
Commit
74e2066
·
1 Parent(s): 1b0028c

clean & refactor components + add doc

Browse files
app.py CHANGED
@@ -1,451 +1,102 @@
1
  #!/usr/bin/env python3
2
  # -*- coding:utf-8 -*-
3
- import requests.exceptions
4
- import zipfile
5
 
6
  import streamlit as st
7
- from streamlit.components.v1 import html
8
- from n4a_analytics_lib.analytics import (GlobalStatistics, IaaStatistics)
9
- from n4a_analytics_lib.constants import (DESCRIPTION)
10
 
11
-
12
- # Set application
13
- st.set_page_config(layout="wide")
14
- # sidebar: meta, inputs etc.
15
- sidebar = st.sidebar
16
- # cols: display results
17
- col1, col2 = st.columns(2)
18
-
19
- # description
20
- sidebar.markdown(DESCRIPTION)
21
-
22
-
23
-
24
-
25
- # to st components
26
- #def clear_cache():
27
- # st.session_state = {}
28
-
29
- def check_login(username, password):
30
- if (len(username) == 0) or (len(password) == 0):
31
- return False
32
- return True
33
-
34
- def logout():
35
- pass
36
-
37
-
38
-
39
-
40
-
41
-
42
- # Level to analyze
43
- option = sidebar.selectbox('Which statistics level?', ('Inter-Annotator Agreement results',
44
- 'Global project statistics'))
45
-
46
- # IAA results view
47
- if option == "Inter-Annotator Agreement results":
48
- annotations = sidebar.file_uploader("Upload IAA annotations (.zip format only): ")
49
- baseline_text = sidebar.file_uploader("Upload baseline text (.txt format only): ")
50
-
51
-
52
- if baseline_text is not None and annotations is not None:
53
- project_analyzed = IaaStatistics(zip_project=annotations, baseline_text=baseline_text.getvalue())
54
- baseline_analyzer = project_analyzed.analyze_text()
55
-
56
- col2.markdown(f"""
57
- ### BASELINE TEXT: {baseline_text.name}
58
-
59
- - sentences: {baseline_analyzer[0]}
60
- - words: {baseline_analyzer[1]}
61
- - characters: {baseline_analyzer[2]}
62
- """)
63
-
64
-
65
-
66
-
67
- #print(project_analyzed.annotations_per_coders)
68
-
69
- commune_mentions = [l for i,j in project_analyzed.mentions_per_coder.items() for l in j]
70
- commune_mentions = list(dict.fromkeys(commune_mentions))
71
- #print(commune_mentions)
72
- #print(project_analyzed.annotations)
73
- #print(project_analyzed.labels_per_coder)
74
- import pandas as pd
75
- from collections import defaultdict, Counter
76
- from itertools import combinations
77
- import seaborn as sn
78
- import matplotlib as plt
79
- import matplotlib.pyplot as pylt
80
-
81
- dicts_coders = []
82
- for coder, annotations in project_analyzed.annotations_per_coders.items():
83
- nombre_annotations = []
84
- # print(f'* {coder}')
85
- for annotation, label in annotations.items():
86
- nombre_annotations.append(label)
87
- # print(f"Nombre total d'annotations : {len(nombre_annotations)}")
88
- dict_coder = dict(Counter(nombre_annotations))
89
- dicts_coders.append(dict_coder)
90
- # print(f'==========================')
91
-
92
- labels = [label for label in dicts_coders[0]]
93
-
94
- from n4a_analytics_lib.metrics_utils import interpret_kappa, fleiss_kappa_function, cohen_kappa_function
95
- df = pd.DataFrame(project_analyzed.annotations_per_coders, index=commune_mentions)
96
-
97
- for ann in project_analyzed.annotators:
98
- df[ann] = 'None'
99
- for mention, value in project_analyzed.annotations_per_coders[ann].items():
100
- df.loc[mention, ann] = value
101
-
102
- total_annotations = len(df)
103
-
104
- # print(f'* Total des annotations : {total_annotations}')
105
-
106
- df_n = df.apply(pd.Series.value_counts, 1).fillna(0).astype(int)
107
- matrix = df_n.values
108
-
109
- pairs = list(combinations(project_analyzed.annotations_per_coders, 2))
110
-
111
- # Display in app
112
- #cont_kappa = st.container()
113
- st.title("Inter-Annotator Agreement (IAA) results")
114
- #tab1, tab2, tab3, tab4, tab5 = st.tabs(
115
- # ["📈 IAA metrics", "🗃 IAA Metrics Legend", "✔️ Agree annotations", "❌ Disagree annotations",
116
- # "🏷️ Global Labels Statistics"])
117
- st.markdown("## 📈 IAA metrics")
118
- col1_kappa, col2_kappa = st.columns(2)
119
- col1_kappa.subheader("Fleiss Kappa (global score for group):")
120
-
121
-
122
- col1_kappa.markdown(interpret_kappa(round(fleiss_kappa_function(matrix), 2)), unsafe_allow_html=True)
123
- col1_kappa.subheader("Cohen Kappa Annotators Matrix (score between annotators):")
124
- # tab1.dataframe(df)
125
- data = []
126
- for coder_1, coder_2 in pairs:
127
- cohen_function = cohen_kappa_function(project_analyzed.labels_per_coder[coder_1], project_analyzed.labels_per_coder[coder_2])
128
- data.append(((coder_1, coder_2), cohen_function))
129
- col1_kappa.markdown(f"* {coder_1} <> {coder_2} : {interpret_kappa(cohen_function)}", unsafe_allow_html=True)
130
- # print(f"* {coder_1} <> {coder_2} : {cohen_function}")
131
-
132
- intermediary = defaultdict(Counter)
133
- for (src, tgt), count in data:
134
- intermediary[src][tgt] = count
135
-
136
- letters = sorted({key for inner in intermediary.values() for key in inner} | set(intermediary.keys()))
137
-
138
- confusion_matrix = [[intermediary[src][tgt] for tgt in letters] for src in letters]
139
- import numpy as np
140
-
141
- df_cm = pd.DataFrame(confusion_matrix, letters, letters)
142
- mask = df_cm.values == 0
143
- sn.set(font_scale=0.7) # for label size
144
- colors = ["#e74c3c", "#f39c12", "#f4d03f", "#5dade2", "#58d68d", "#28b463"]
145
- width = st.slider("matrix width", 1, 10, 14)
146
- height = st.slider("matrix height", 1, 10, 4)
147
- fig, ax = pylt.subplots(figsize=(width, height))
148
- sn.heatmap(df_cm, cmap=colors, annot=True, mask=mask, annot_kws={"size": 7}, vmin=0, vmax=1, ax=ax) # font size
149
- # plt.show()
150
- st.pyplot(ax.figure)
151
- col2_kappa.markdown("""
152
- <div>
153
- <div id="legend" style="right: 70em;">
154
- <h3>🗃 IAA Metrics Legend</h3>
155
- <table>
156
- <thead>
157
- <tr>
158
- <th
159
- colspan="2"> Kappa
160
- interpretation
161
- legend </th>
162
- </tr>
163
- </thead>
164
- <tbody>
165
- <tr>
166
- <td> Kappa
167
- score(k) </td>
168
- <td>Agreement</td>
169
- </tr>
170
- <tr
171
- style = "background-color: #e74c3c;">
172
- <td> k < 0 </td>
173
- <td> Less
174
- chance
175
- agreement </td>
176
- </tr>
177
- <tr
178
- style = "background-color: #f39c12;">
179
- <td> 0.01 < k < 0.20 </td>
180
- <td> Slight
181
- agreement </td>
182
- </tr>
183
- <tr
184
- style = "background-color: #f4d03f;">
185
- <td> 0.21 < k < 0.40 </td>
186
- <td> Fair
187
- agreement </td>
188
- </tr>
189
- <tr
190
- style = "background-color: #5dade2;">
191
- <td> 0.41 < k < 0.60 </td>
192
- <td> Moderate
193
- agreement </td>
194
- </tr>
195
- <tr
196
- style = "background-color: #58d68d;">
197
- <td> 0.61 < k < 0.80 </td>
198
- <td> Substantial
199
- agreement </td>
200
- </tr>
201
- <tr
202
- style = "background-color: #28b463;">
203
- <td> 0.81 < k < 0.99 </td>
204
- <td> Almost
205
- perfect
206
- agreement </td>
207
- </tr>
208
- </tbody>
209
- </table></div></div>"""
210
-
211
- , unsafe_allow_html = True)
212
-
213
-
214
- ## commune
215
- @st.cache
216
- def convert_df(df_ex):
217
- return df_ex.to_csv(encoding="utf-8").encode('utf-8')
218
-
219
-
220
- ## Agree part
221
-
222
- columns_to_compare = project_analyzed.annotators
223
-
224
-
225
- def check_all_equal(iterator):
226
- return len(set(iterator)) <= 1
227
-
228
-
229
- df_agree = df[df[columns_to_compare].apply(lambda row: check_all_equal(row), axis=1)]
230
- total_unanime = len(df_agree)
231
-
232
- csv_agree = convert_df(df_agree)
233
-
234
- st.subheader("✔️ Agree annotations")
235
- st.markdown(f"{total_unanime} / {len(df)} annotations ({round((total_unanime / len(df)) * 100, 2)} %)")
236
- st.download_button(
237
- "Press to Download CSV",
238
- csv_agree,
239
- "csv_annotators_agree.csv",
240
- "text/csv",
241
- key='download-csv-1'
242
  )
243
- st.dataframe(df_agree)
244
-
245
-
246
- ## Disagree part
247
-
248
- def check_all_not_equal(iterator):
249
- return len(set(iterator)) > 1
250
-
251
-
252
- df_disagree = df[df[columns_to_compare].apply(lambda row: check_all_not_equal(row), axis=1)]
253
- total_desaccord = len(df_disagree)
254
- csv_disagree = convert_df(df_disagree)
255
- st.subheader("❌ Disagree annotations")
256
- st.markdown(
257
- f"{total_desaccord} / {len(df)} annotations ({round((total_desaccord / len(df)) * 100, 2)} %)")
258
- st.download_button(
259
- "Press to Download CSV",
260
- csv_disagree,
261
- "csv_annotators_disagree.csv",
262
- "text/csv",
263
- key='download-csv-2'
264
  )
265
- st.dataframe(df_disagree)
266
-
267
-
268
- ## alignement chart labels
269
- def count_total_annotations_label(dataframe, labels):
270
- pairs = []
271
- for label in labels:
272
- total = dataframe.astype(object).eq(label).any(1).sum()
273
- pairs.append((label, total))
274
- return pairs
275
-
276
-
277
- totals_annotations_per_labels = count_total_annotations_label(df, labels)
278
 
279
-
280
- # Récupérer le nombre de mention portant la même classe selon les annotateurs
281
-
282
- def total_agree_disagree_per_label(dataframe, pairs_totals_labels):
283
- new_pairs = []
284
- for t in pairs_totals_labels:
285
- # t[0] : label
286
- # t[1] : total_rows_with_label
287
- agree_res = df[df.nunique(1).eq(1)].eq(t[0]).any(1).sum()
288
- disagree_res = t[1] - agree_res
289
- agree_percent = (agree_res / t[1]) * 100
290
- disagree_percent = (disagree_res / t[1]) * 100
291
- new_pairs.append((t[0], t[1], agree_percent, disagree_percent))
292
- return new_pairs
293
-
294
- to_pie = total_agree_disagree_per_label(df, totals_annotations_per_labels)
295
-
296
-
297
- def plot_pies(tasks_to_pie):
298
- my_labels = 'agree', 'disagree'
299
- my_colors = ['#47DBCD', '#F5B14C']
300
- my_explode = (0, 0.1)
301
- counter = 0
302
- fig, axes = pylt.subplots(1, len(tasks_to_pie), figsize=(20, 3))
303
- for t in tasks_to_pie:
304
- tasks = [t[2], t[3]]
305
- axes[counter].pie(tasks, autopct='%1.1f%%', startangle=15, shadow=True, colors=my_colors,
306
- explode=my_explode)
307
- axes[counter].set_title(t[0])
308
- axes[counter].axis('equal')
309
- counter += 1
310
- fig.set_facecolor("white")
311
- fig.legend(labels=my_labels, loc="center right", borderaxespad=0.1, title="Labels alignement")
312
- # plt.savefig(f'./out/pie_alignement_labels_{filename_no_extension}.png', dpi=400)
313
- return fig
314
-
315
- f = plot_pies(to_pie)
316
- st.subheader("🏷️ Global Labels Statistics")
317
- st.pyplot(f.figure)
318
-
319
- # global project results view
320
- # st_session = {"gs_local":True, "gs_remote":False, "gs_obj":<object>}
321
-
322
- def display_data():
323
- col1.metric("Total curated annotations",
324
- f"{st.session_state['gs_obj'].total_annotations_project} Named entities")
325
- col1.dataframe(st.session_state['gs_obj'].df_i)
326
- selected_data = col1.selectbox('Select specific data to display bar plot:',
327
- st.session_state['gs_obj'].documents, key="selector_data")
328
- col2.pyplot(st.session_state['gs_obj'].create_plot(selected_data))
329
-
330
- def init_session_statistics(remote: bool, local: bool, data: tuple) -> None:
331
- # clear session
332
- st.session_state = {}
333
-
334
- # create a session variable
335
- st.session_state["gs_local"] = local
336
- st.session_state["gs_remote"] = remote
337
-
338
- # create a new object:
339
- # if remote fetch data from API Host first
340
- if remote and not(local):
341
- st.success('Fetch curated documents from host INCEpTION API in progress...')
342
- fetch_curated_data_from_remote(
343
- username=data[0],
344
- password=data[1]
345
- )
346
-
347
- if local and not(remote):
348
- st.session_state["gs_obj"] = GlobalStatistics(zip_project=data, remote=False)
349
-
350
-
351
-
352
-
353
-
354
- from pycaprio import Pycaprio, mappings
355
- from zipfile import ZipFile
356
- import io
357
- import requests
358
-
359
- def fetch_curated_data_from_remote(username: str,
360
- password: str,
361
- endpoint: str = "https://inception.dhlab.epfl.ch/prod",
362
- project_title: str = "ner4archives-template"):
363
- # open a client
364
- try:
365
- client = Pycaprio(inception_host=endpoint, authentication=(str(username), str(password)))
366
- except requests.exceptions.JSONDecodeError:
367
- # username / password incorrect
368
- st.error('Username or Password is incorrect please retry.')
369
-
370
- # get project object
371
- project_name = [p for p in client.api.projects() if p.project_name == project_title]
372
-
373
- # get all documents from project
374
- documents = client.api.documents(project_name[0].project_id)
375
-
376
- curations = []
377
- zipfiles = []
378
- count = 0
379
- flag = "a"
380
- # iterate over all documents and retrieve only curated into ZIP container
381
- for document in documents:
382
- if count > 0:
383
- flag = "r"
384
- if document.document_state == mappings.DocumentState.CURATION_COMPLETE:
385
- curated_content = client.api.curation(project_name[0].project_id, document,
386
- curation_format=mappings.InceptionFormat.UIMA_CAS_XMI_XML_1_1)
387
- curations.append(curated_content)
388
- for curation in curations:
389
- z = ZipFile(io.BytesIO(curation), mode=flag)
390
- zipfiles.append(z)
391
-
392
- count += 1
393
-
394
- # Merge all zip in one
395
- with zipfiles[0] as z1:
396
- for fname in zipfiles[1:]:
397
- zf = fname
398
- # print(zf.namelist())
399
- for n in zf.namelist():
400
- if n not in z1.namelist():
401
- z1.writestr(n, zf.open(n).read())
402
-
403
- # Create a new object
404
- st.session_state["gs_obj"] = GlobalStatistics(zip_project=z1, remote=True)
405
-
406
-
407
-
408
-
409
- if option == "Global project statistics":
410
- # User input controllers
411
- mode = sidebar.radio("Choose mode to retrieve curated data: ", (
412
- "Local directory", "INCEpTION API Host remote"
413
- ))
414
- data = None
415
- if mode == "Local directory":
416
- project = sidebar.file_uploader("Folder that contains curated annotations in XMI 1.1 (.zip format only): ",
417
- type="zip")
418
- data = project
419
- if mode == "INCEpTION API Host remote":
420
- username = sidebar.text_input("Username: ")
421
- password = sidebar.text_input("Password: ", type='password')
422
- data = (username, password)
423
-
424
- # Validate inputs
425
- btn_process = sidebar.button('Process', key='process')
426
-
427
- # Access data with local ressources
428
- if btn_process and mode == "Local directory":
429
- if data is not None:
430
- # create a new session
431
- init_session_statistics(remote=False, local=True, data=data)
432
-
433
- # Access data with remote ressources
434
- if btn_process and mode == "INCEpTION API Host remote":
435
- if data is not None:
436
- if check_login(username=data[0], password=data[1]):
437
  # create a new session
438
- init_session_statistics(remote=True, local=False, data=data)
439
- else:
440
- st.error("Sorry! Username or Password is empty.")
441
-
442
- # Change data values and visualize new plot
443
- if "gs_obj" in st.session_state:
444
- if st.session_state["gs_local"] or st.session_state["gs_remote"]:
445
- display_data()
446
-
447
-
448
 
 
 
 
 
 
 
 
 
449
 
 
 
 
 
450
 
451
 
 
 
 
1
  #!/usr/bin/env python3
2
  # -*- coding:utf-8 -*-
 
 
3
 
4
  import streamlit as st
 
 
 
5
 
6
+ from n4a_analytics_lib.constants import DESCRIPTION
7
+
8
+ from n4a_analytics_lib.st_components import (check_login,
9
+ init_session_statistics,
10
+ init_session_iaa,
11
+ display_data)
12
+
13
+
14
+ def n4a_analytics_dashboard() -> None:
15
+ """Main function to manage dashboard app frontend
16
+ -------------------------------------------------
17
+ * General architecture:
18
+ *
19
+ * metrics_utils.py (collection of statistics calculation)
20
+ * ↓
21
+ * project.py (features extraction from XMI) → analytics.py
22
+ * ↑ (project analyzer: computation/visualisation)
23
+ * ↑ ↓
24
+ * st_components.py (manage data input/output and pipelines with streamlit snippets)
25
+ * ↑ ↓
26
+ * app.py (manage frontend)
27
+ *
28
+ ---------------------------------------------------
29
+ """
30
+ # Set window application
31
+ st.set_page_config(layout="wide")
32
+
33
+ # Sidebar: metadata, inputs etc.
34
+ sidebar = st.sidebar
35
+ # Cols: display results
36
+ col1, col2 = st.columns(2)
37
+
38
+ # Set general description
39
+ sidebar.markdown(DESCRIPTION)
40
+
41
+ # Level to analyze
42
+ option = sidebar.selectbox('Which statistics level?', ('Inter-Annotator Agreement results',
43
+ 'Global project statistics'))
44
+
45
+ # IAA results view
46
+ if option == "Inter-Annotator Agreement results":
47
+ annotations = sidebar.file_uploader(
48
+ "Upload IAA annotations (.zip format only): ",
49
+ type='zip'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  )
51
+ baseline_text = sidebar.file_uploader(
52
+ "Upload baseline text (.txt format only): ",
53
+ type='txt'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
+ if baseline_text is not None and annotations is not None:
57
+ init_session_iaa(data=annotations, baseline=baseline_text, col=col2)
58
+
59
+ # Global statistics
60
+ if option == "Global project statistics":
61
+ # User input controllers
62
+ mode = sidebar.radio("Choose mode to retrieve curated data: ", (
63
+ "Local directory", "INCEpTION API Host remote"
64
+ ))
65
+ data = None
66
+ if mode == "Local directory":
67
+ project = sidebar.file_uploader(
68
+ "Folder that contains curated annotations in XMI 1.1 (.zip format only): ",
69
+ type="zip"
70
+ )
71
+ data = project
72
+ if mode == "INCEpTION API Host remote":
73
+ username = sidebar.text_input("Username: ")
74
+ password = sidebar.text_input("Password: ", type='password')
75
+ data = (username, password)
76
+
77
+ # Validate inputs
78
+ btn_process = sidebar.button('Process', key='process')
79
+
80
+ # Access data with local ressources
81
+ if btn_process and mode == "Local directory":
82
+ if data is not None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  # create a new session
84
+ init_session_statistics(remote=False, local=True, data=data)
 
 
 
 
 
 
 
 
 
85
 
86
+ # Access data with remote ressources
87
+ if btn_process and mode == "INCEpTION API Host remote":
88
+ if data is not None:
89
+ if check_login(username=data[0], password=data[1]):
90
+ # create a new session
91
+ init_session_statistics(remote=True, local=False, data=data)
92
+ else:
93
+ st.error("Username or Password is empty, please check and retry.")
94
 
95
+ # Change data values and visualize new plot
96
+ if "gs_obj" in st.session_state:
97
+ if st.session_state["gs_local"] or st.session_state["gs_remote"]:
98
+ display_data(col1)
99
 
100
 
101
+ if __name__ == "__main__":
102
+ n4a_analytics_dashboard()
datatest/curation_rapid_global.zip DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:804a01b2ffae53103cd67fa51671ccbbbc988cf2796ec40ccb20f1e9283c1b47
3
- size 4670583
 
 
 
 
datatest/{test.zip → exemple_IAA_annotations.zip} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab5ced7fa96a8b65ad8077d69761f517b19a57d8ec74e86608101d3bb66c6a54
3
- size 74199
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a8058de8efe999f8b2ec4c6162691b4991dbfeee107117f078bdc895c463c6b
3
+ size 91754
n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc CHANGED
Binary files a/n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc differ
 
n4a_analytics_lib/__pycache__/metrics_utils.cpython-38.pyc CHANGED
Binary files a/n4a_analytics_lib/__pycache__/metrics_utils.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/metrics_utils.cpython-38.pyc differ
 
n4a_analytics_lib/__pycache__/project.cpython-38.pyc CHANGED
Binary files a/n4a_analytics_lib/__pycache__/project.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/project.cpython-38.pyc differ
 
n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc CHANGED
Binary files a/n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc differ
 
n4a_analytics_lib/analytics.py CHANGED
@@ -1,17 +1,20 @@
1
  # -*- coding:utf-8 -*-
2
 
 
 
 
3
  import pandas as pd
4
  import seaborn as sns
5
- import matplotlib
6
 
7
- matplotlib.use('Agg')
8
 
9
  import nltk
10
-
11
  nltk.download('punkt')
12
  from nltk.tokenize import sent_tokenize, word_tokenize
13
 
14
  from n4a_analytics_lib.project import Project
 
15
 
16
 
17
  class GlobalStatistics(Project):
@@ -24,7 +27,7 @@ class GlobalStatistics(Project):
24
 
25
  self.total_annotations_project = self.df_i['TOTAL'].sum()
26
 
27
- def create_plot(self, type_data):
28
  # apply data filter
29
  data_tab_filtered = self.df_details.loc[self.df_details['SOURCE_FILE'] == type_data]
30
  # create a new plot
@@ -37,25 +40,38 @@ class GlobalStatistics(Project):
37
  return ax.figure
38
 
39
 
40
-
41
-
42
-
43
  class IaaStatistics(Project):
44
- def __init__(self, zip_project, baseline_text):
45
- super().__init__(zip_project=zip_project, type="iaa")
46
  self.baseline_text = baseline_text.decode('utf-8')
47
 
48
- # self.docs = {}
49
- # self.pairwise = {}
50
- # self.similar_mention = []
51
  self.mentions_per_coder = self.extract_refs(self.annotations, self.annotators, type="mentions")
52
  self.labels_per_coder = self.extract_refs(self.annotations, self.annotators, type="labels")
53
-
54
  self.annotations_per_coders = {coder: dict(zip(ann[1]['mentions'], ann[1]['labels'])) for coder, ann in zip(self.annotators, self.annotations.items())}
 
 
 
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  @staticmethod
58
- def extract_refs(annotations, annotators, type):
59
  return {
60
  coder: data for coder, ann in zip(
61
  annotators,
@@ -63,7 +79,82 @@ class IaaStatistics(Project):
63
  ) for ref, data in ann[1].items() if ref == type
64
  }
65
 
66
- def analyze_text(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  """returns total sentences, words and characters
68
  in list format
69
  """
 
1
  # -*- coding:utf-8 -*-
2
 
3
+ from itertools import combinations
4
+ from collections import defaultdict, Counter
5
+
6
  import pandas as pd
7
  import seaborn as sns
8
+ import matplotlib as plt
9
 
10
+ plt.use('Agg')
11
 
12
  import nltk
 
13
  nltk.download('punkt')
14
  from nltk.tokenize import sent_tokenize, word_tokenize
15
 
16
  from n4a_analytics_lib.project import Project
17
+ from n4a_analytics_lib.metrics_utils import (fleiss_kappa_function, cohen_kappa_function, percentage_agreement_pov)
18
 
19
 
20
  class GlobalStatistics(Project):
 
27
 
28
  self.total_annotations_project = self.df_i['TOTAL'].sum()
29
 
30
+ def create_plot(self, type_data: str) -> sns.barplot:
31
  # apply data filter
32
  data_tab_filtered = self.df_details.loc[self.df_details['SOURCE_FILE'] == type_data]
33
  # create a new plot
 
40
  return ax.figure
41
 
42
 
 
 
 
43
  class IaaStatistics(Project):
44
+ def __init__(self, zip_project, baseline_text, remote=False):
45
+ super().__init__(zip_project=zip_project, remote=remote, type="iaa")
46
  self.baseline_text = baseline_text.decode('utf-8')
47
 
 
 
 
48
  self.mentions_per_coder = self.extract_refs(self.annotations, self.annotators, type="mentions")
49
  self.labels_per_coder = self.extract_refs(self.annotations, self.annotators, type="labels")
 
50
  self.annotations_per_coders = {coder: dict(zip(ann[1]['mentions'], ann[1]['labels'])) for coder, ann in zip(self.annotators, self.annotations.items())}
51
+ self.coders_pairs = list(combinations(self.annotations_per_coders, 2))
52
+ self.similar_mention = list(dict.fromkeys([l for i,j in self.mentions_per_coder.items() for l in j]))
53
+
54
+ self.labels_schema = list(dict.fromkeys([label for _, labels in self.labels_per_coder.items() for label in labels]))
55
 
56
+ # dataframes and matrix analysis
57
+ self.base_df = self.build_base_df()
58
+ self.df_agree = self.base_df [self.base_df[self.annotators].apply(lambda row: self.check_all_equal(row), axis=1)]
59
+ self.df_disagree = self.base_df[self.base_df[self.annotators].apply(lambda row: self.check_all_not_equal(row), axis=1)]
60
+ self.coders_matrix = self.base_df.apply(pd.Series.value_counts, 1).fillna(0).astype(int).values
61
+
62
+ # totals
63
+ self.total_annotations = len(self.base_df)
64
+ self.total_agree = len(self.df_agree)
65
+ self.total_disagree = len(self.df_disagree)
66
+
67
+ # access to metrics
68
+ self.fleiss_kappa = round(fleiss_kappa_function(self.coders_matrix), 2)
69
+ self.cohen_kappa_pairs = self.compute_pairs_cohen_kappa()
70
+ self.percent_agree = percentage_agreement_pov(self.total_agree, self.total_annotations)
71
+ self.percent_disagree = percentage_agreement_pov(self.total_disagree, self.total_annotations)
72
 
73
  @staticmethod
74
+ def extract_refs(annotations: dict, annotators: list, type: str) -> dict:
75
  return {
76
  coder: data for coder, ann in zip(
77
  annotators,
 
79
  ) for ref, data in ann[1].items() if ref == type
80
  }
81
 
82
+ @staticmethod
83
+ def check_all_equal(iterator: list) -> bool:
84
+ return len(set(iterator)) <= 1
85
+
86
+ @staticmethod
87
+ def check_all_not_equal(iterator: list) -> bool:
88
+ return len(set(iterator)) > 1
89
+
90
+ def plot_confusion_matrix(self, width: int, height: int) -> plt.pyplot.subplots:
91
+ intermediary = defaultdict(Counter)
92
+ for (src, tgt), count in self.cohen_kappa_pairs.items():
93
+ intermediary[src][tgt] = count
94
+
95
+ letters = sorted({key for inner in intermediary.values() for key in inner} | set(intermediary.keys()))
96
+
97
+ confusion_matrix = [[intermediary[src][tgt] for tgt in letters] for src in letters]
98
+
99
+ df_cm = pd.DataFrame(confusion_matrix, letters, letters)
100
+ mask = df_cm.values == 0
101
+ sns.set(font_scale=0.7) # for label size
102
+ colors = ["#e74c3c", "#f39c12", "#f4d03f", "#5dade2", "#58d68d", "#28b463"]
103
+
104
+ fig, ax = plt.pyplot.subplots(figsize=(width, height))
105
+ sns.heatmap(df_cm, cmap=colors, annot=True, mask=mask, annot_kws={"size": 7}, vmin=0, vmax=1, ax=ax) # font size
106
+ return ax
107
+
108
+ def build_base_df(self) -> pd.DataFrame:
109
+ df = pd.DataFrame(self.annotations_per_coders, index=self.similar_mention)
110
+ for ann in self.annotators:
111
+ df[ann] = 'None'
112
+ for mention, value in self.annotations_per_coders[ann].items():
113
+ df.loc[mention, ann] = value
114
+ return df
115
+
116
+ def compute_pairs_cohen_kappa(self) -> dict:
117
+ return {
118
+ (c1, c2): cohen_kappa_function(self.labels_per_coder[c1],
119
+ self.labels_per_coder[c2]) for c1, c2 in self.coders_pairs
120
+ }
121
+
122
+ def count_total_annotations_label(self) -> list:
123
+ return [
124
+ (label, self.base_df.astype(object).eq(label).any(1).sum()) for label in self.labels_schema
125
+ ]
126
+
127
+ def total_agree_disagree_per_label(self) -> list:
128
+ # t[0] : label
129
+ # t[1] : total_rows_with_label
130
+ return [(
131
+ t[0],
132
+ t[1],
133
+ (self.base_df[self.base_df.nunique(1).eq(1)].eq(t[0]).any(1).sum() / t[1]) * 100,
134
+ ((t[1] - self.base_df[self.base_df.nunique(1).eq(1)].eq(t[0]).any(1).sum()) / t[1]) * 100
135
+ )
136
+ for t in self.count_total_annotations_label()]
137
+
138
+ def plot_agreement_pies(self) -> plt.pyplot.subplots:
139
+ my_labels = 'agree', 'disagree'
140
+ my_colors = ['#47DBCD', '#F5B14C']
141
+ my_explode = (0, 0.1)
142
+ counter = 0
143
+ tasks_to_pie = self.total_agree_disagree_per_label()
144
+ fig, axes = plt.pyplot.subplots(1, len(tasks_to_pie), figsize=(20, 3))
145
+ for t in tasks_to_pie:
146
+ tasks = [t[2], t[3]]
147
+ axes[counter].pie(tasks, autopct='%1.1f%%', startangle=15, shadow=True, colors=my_colors,
148
+ explode=my_explode)
149
+ axes[counter].set_title(t[0])
150
+ axes[counter].axis('equal')
151
+ counter += 1
152
+ fig.set_facecolor("white")
153
+ fig.legend(labels=my_labels, loc="center right", borderaxespad=0.1, title="Labels alignement")
154
+ # plt.savefig(f'./out/pie_alignement_labels_{filename_no_extension}.png', dpi=400)
155
+ return fig
156
+
157
+ def analyze_text(self) -> list:
158
  """returns total sentences, words and characters
159
  in list format
160
  """
n4a_analytics_lib/constants.py CHANGED
@@ -11,4 +11,79 @@ of NER4Archives (Inria/Archives nationales).
11
  - This tool provides two statistics levels:
12
  - *Global project statistics*: Analyze named entities in overall curated documents in project;
13
  - *Inter-Annotator Agreement results*: Analyze results of IAA experiment.
14
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  - This tool provides two statistics levels:
12
  - *Global project statistics*: Analyze named entities in overall curated documents in project;
13
  - *Inter-Annotator Agreement results*: Analyze results of IAA experiment.
14
+ """
15
+
16
+ KAPPA_LEGEND = """
17
+ <div>
18
+ <div id="legend" style="right: 70em;">
19
+ <h3>🗃 IAA Metrics Legend</h3>
20
+ <table>
21
+ <thead>
22
+ <tr>
23
+ <th colspan="2">
24
+ Kappa interpretation legend
25
+ </th>
26
+ </tr>
27
+ </thead>
28
+ <tbody>
29
+ <tr>
30
+ <td>
31
+ Kappa score (k)
32
+ </td>
33
+ <td>
34
+ Agreement
35
+ </td>
36
+ </tr>
37
+ <tr style = "background-color: #e74c3c;">
38
+ <td>
39
+ k < 0
40
+ </td>
41
+ <td>
42
+ Less chance agreement
43
+ </td>
44
+ </tr>
45
+ <tr style = "background-color: #f39c12;">
46
+ <td>
47
+ 0.01 < k < 0.20
48
+ </td>
49
+ <td>
50
+ Slight agreement
51
+ </td>
52
+ </tr>
53
+ <tr style = "background-color: #f4d03f;">
54
+ <td>
55
+ 0.21 < k < 0.40
56
+ </td>
57
+ <td>
58
+ Fair agreement
59
+ </td>
60
+ </tr>
61
+ <tr style = "background-color: #5dade2;">
62
+ <td>
63
+ 0.41 < k < 0.60
64
+ </td>
65
+ <td>
66
+ Moderate agreement
67
+ </td>
68
+ </tr>
69
+ <tr style = "background-color: #58d68d;">
70
+ <td>
71
+ 0.61 < k < 0.80
72
+ </td>
73
+ <td>
74
+ Substantial agreement
75
+ </td>
76
+ </tr>
77
+ <tr style = "background-color: #28b463;">
78
+ <td>
79
+ 0.81 < k < 0.99
80
+ </td>
81
+ <td>
82
+ Almost perfect agreement
83
+ </td>
84
+ </tr>
85
+ </tbody>
86
+ </table>
87
+ </div>
88
+ </div>
89
+ """
n4a_analytics_lib/metrics_utils.py CHANGED
@@ -1,31 +1,52 @@
1
  # -*- coding:utf-8 -*-
2
 
 
 
 
3
  import numpy as np
4
 
5
- def fleiss_kappa_function(M):
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  """Computes Fleiss' kappa for group of annotators.
7
- :param M: a matrix of shape (:attr:'N', :attr:'k') with 'N' = number of subjects and 'k' = the number of categories.
8
- 'M[i, j]' represent the number of raters who assigned the 'i'th subject to the 'j'th category.
9
- :type: numpy matrix
 
 
10
  :rtype: float
11
  :return: Fleiss' kappa score
12
  """
13
- N, k = M.shape # N is # of items, k is # of categories
14
- n_annotators = float(np.sum(M[0, :])) # # of annotators
15
  tot_annotations = N * n_annotators # the total # of annotations
16
- category_sum = np.sum(M, axis=0) # the sum of each category over all items
17
 
18
  # chance agreement
19
  p = category_sum / tot_annotations # the distribution of each category over all annotations
20
  PbarE = np.sum(p * p) # average chance agreement over all categories
21
 
22
  # observed agreement
23
- P = (np.sum(M * M, axis=1) - n_annotators) / (n_annotators * (n_annotators - 1))
24
- Pbar = np.sum(P) / N # add all observed agreement chances per item and divide by amount of items
 
 
25
 
26
  return round((Pbar - PbarE) / (1 - PbarE), 4)
27
 
28
- def cohen_kappa_function(ann1, ann2):
 
29
  """Computes Cohen kappa for pair-wise annotators.
30
  :param ann1: annotations provided by first annotator
31
  :type ann1: list
@@ -50,19 +71,3 @@ def cohen_kappa_function(ann1, ann2):
50
 
51
  return round((A - E) / (1 - E), 4)
52
 
53
- def interpret_kappa(score):
54
- color = ""
55
- if score < 0:
56
- color= "#e74c3c;"
57
- elif 0.01 <= score <= 0.20:
58
- color= "#f39c12;"
59
- elif 0.21 <= score <= 0.40:
60
- color= "#f4d03f;"
61
- elif 0.41 <= score <= 0.60:
62
- color= "#5dade2;"
63
- elif 0.61 <= score <= 0.80:
64
- color= "#58d68d;"
65
- elif 0.81 <= score <= 0.99:
66
- color= "#28b463;"
67
-
68
- return f"<span style='font-size:30px; color: {color}'>{round(score*100, 2)} %</span>"
 
1
  # -*- coding:utf-8 -*-
2
 
3
+ """Collection of statistics functions.
4
+ """
5
+
6
  import numpy as np
7
 
8
+
9
def percentage_agreement_pov(total_pov: int, total_annotations: int) -> float:
    """Computes the share of annotations for one point of view, as a percentage.

    :param total_pov: total agree/disagree annotations
    :type total_pov: int
    :param total_annotations: total annotations in project
    :type total_annotations: int
    :rtype: float
    :return: percentage rounded to 2 decimals; 0.0 when the project
        contains no annotation at all
    """
    # An empty project has nothing to agree or disagree on: report 0 %
    # instead of failing with ZeroDivisionError.
    if total_annotations == 0:
        return 0.0
    return round((total_pov / total_annotations) * 100, 2)
19
+
20
+
21
def fleiss_kappa_function(matrix: list) -> float:
    """Computes Fleiss' kappa for group of annotators.

    :param matrix: a matrix of shape (:attr:'N', :attr:'k') with
        'N' = number of subjects and 'k' = the number of categories.
        'matrix[i, j]' represent the number of raters who assigned
        the 'i'th subject to the 'j'th category. Nested lists and
        numpy arrays are both accepted.
    :type matrix: list or numpy array
    :rtype: float
    :return: Fleiss' kappa score, rounded to 4 decimals
    """
    # Normalise the input once: nested lists have no ``.shape`` and
    # ``np.matrix`` would turn ``*`` into a matrix product, whereas the
    # formula below needs element-wise arithmetic.
    ratings = np.asarray(matrix, dtype=float)

    N, _ = ratings.shape  # N is # of items (the # of categories is not needed)
    # every item is assumed to be rated by the same # of annotators,
    # so the first row is used to count them
    n_annotators = float(np.sum(ratings[0, :]))
    tot_annotations = N * n_annotators  # the total # of annotations
    category_sum = np.sum(ratings, axis=0)  # the sum of each category over all items

    # chance agreement
    p = category_sum / tot_annotations  # the distribution of each category over all annotations
    PbarE = np.sum(p * p)  # average chance agreement over all categories

    # observed agreement:
    # add all observed agreement chances per item and divide by amount of items
    P = (np.sum(ratings * ratings, axis=1) - n_annotators) / (n_annotators * (n_annotators - 1))
    Pbar = np.sum(P) / N

    return round((Pbar - PbarE) / (1 - PbarE), 4)
47
 
48
+
49
+ def cohen_kappa_function(ann1: list, ann2: list) -> float:
50
  """Computes Cohen kappa for pair-wise annotators.
51
  :param ann1: annotations provided by first annotator
52
  :type ann1: list
 
71
 
72
  return round((A - E) / (1 - E), 4)
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
n4a_analytics_lib/project.py CHANGED
@@ -1,15 +1,35 @@
1
  # -*- coding:utf-8 -*-
2
- import zipfile
3
  from io import BytesIO
4
  import re
5
  from zipfile import ZipFile
6
  import os
7
  from pathlib import Path
8
 
9
-
10
  from cassis import load_typesystem, load_cas_from_xmi
11
 
12
- from n4a_analytics_lib.st_components import st_pb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  class Project:
@@ -44,8 +64,7 @@ class Project:
44
  """
45
  self.annotations = {}
46
 
47
-
48
- if isinstance(self.zip_project, zipfile.ZipFile) and self.remote and self.type == "global":
49
  for fp in self.zip_project.namelist():
50
  if self.typesystem is None:
51
  self.typesystem = load_typesystem(BytesIO(self.zip_project.open('TypeSystem.xml').read()))
@@ -53,43 +72,40 @@ class Project:
53
  self.documents.append(fp)
54
  self.xmi_documents.append(str(self.zip_project.open(fp).read().decode("utf-8")))
55
 
56
-
57
  else:
58
- with ZipFile(self.zip_project) as project_zip:
59
- if self.type == "global":
60
- regex = re.compile('.*curation/.*/(?!\._).*zip$')
61
- elif self.type == "iaa":
62
- regex = re.compile('.*xm[il]$')
63
-
64
- annotation_fps = (fp for fp in project_zip.namelist() if regex.match(fp))
65
- for fp in annotation_fps:
66
- if self.type == "global":
67
- with ZipFile(BytesIO(project_zip.read(fp))) as annotation_zip:
68
- if self.typesystem is None:
69
- self.typesystem = load_typesystem(BytesIO(annotation_zip.read('TypeSystem.xml')))
70
- for f in annotation_zip.namelist():
71
- if f.endswith('.xmi'):
72
- # store source filename
73
- self.documents.append(Path(fp).parent.name)
74
- # annotators = []
75
- # store XMI representation
76
- self.xmi_documents.append(str(annotation_zip.read(f).decode("utf-8")))
77
- elif self.type == "iaa":
78
- if self.typesystem is None and fp.endswith('.xml'):
79
- self.typesystem = load_typesystem(BytesIO(project_zip.read('TypeSystem.xml')))
80
- else:
81
- if fp.endswith('.xmi'):
82
- # store source filename
83
- self.documents.append(fp)
84
- # set annotators
85
- self.annotators.append(os.path.splitext(fp)[0])
86
- # store XMI representation
87
- self.xmi_documents.append(str(project_zip.read(fp).decode("utf-8")))
88
-
89
 
90
  self.extract_ne()
91
 
92
-
93
  @st_pb
94
  def extract_ne(self):
95
  count = 0
@@ -112,4 +128,3 @@ class Project:
112
 
113
 
114
 
115
-
 
1
  # -*- coding:utf-8 -*-
2
+
3
  from io import BytesIO
4
  import re
5
  from zipfile import ZipFile
6
  import os
7
  from pathlib import Path
8
 
9
+ import streamlit as st
10
  from cassis import load_typesystem, load_cas_from_xmi
11
 
12
+
13
def st_pb(method):
    """Streamlit decorator that displays a progress bar.

    The decorated ``method`` must be a generator. Each yielded item is
    indexed as ``item[0]`` (value passed to the progress bar),
    ``item[1]`` (text appended to the status message) and ``item[2]``
    (truthy when the step succeeded, falsy otherwise).

    :param method: generator method to wrap
    :return: wrapper that drives the generator, updates the Streamlit
        widgets and returns the generator's final return value
    """
    def progress_bar(ref, *args, **kwargs):
        # placeholder for the status message + the progress bar itself
        container = st.empty()
        bar = st.progress(0)
        pg_gen = method(ref, *args, **kwargs)
        try:
            while True:
                progress = next(pg_gen)
                bar.progress(progress[0])
                if progress[2]:
                    container.write("✅ Processing... " + progress[1])
                else:
                    container.write("❌️ Error with..." + progress[1])
        except StopIteration as result:
            # a generator's ``return`` value travels on StopIteration
            return result.value

    return progress_bar
33
 
34
 
35
  class Project:
 
64
  """
65
  self.annotations = {}
66
 
67
+ if isinstance(self.zip_project, ZipFile) and self.remote and self.type == "global":
 
68
  for fp in self.zip_project.namelist():
69
  if self.typesystem is None:
70
  self.typesystem = load_typesystem(BytesIO(self.zip_project.open('TypeSystem.xml').read()))
 
72
  self.documents.append(fp)
73
  self.xmi_documents.append(str(self.zip_project.open(fp).read().decode("utf-8")))
74
 
 
75
  else:
76
+ with ZipFile(self.zip_project) as project_zip:
77
+ if self.type == "global":
78
+ regex = re.compile('.*curation/.*/(?!\._).*zip$')
79
+ elif self.type == "iaa":
80
+ regex = re.compile('.*xm[il]$')
81
+
82
+ annotation_fps = (fp for fp in project_zip.namelist() if regex.match(fp))
83
+ for fp in annotation_fps:
84
+ if self.type == "global":
85
+ with ZipFile(BytesIO(project_zip.read(fp))) as annotation_zip:
86
+ if self.typesystem is None:
87
+ self.typesystem = load_typesystem(BytesIO(annotation_zip.read('TypeSystem.xml')))
88
+ for f in annotation_zip.namelist():
89
+ if f.endswith('.xmi'):
90
+ # store source filename
91
+ self.documents.append(Path(fp).parent.name)
92
+ # annotators = []
93
+ # store XMI representation
94
+ self.xmi_documents.append(str(annotation_zip.read(f).decode("utf-8")))
95
+ elif self.type == "iaa":
96
+ if self.typesystem is None and fp.endswith('.xml'):
97
+ self.typesystem = load_typesystem(BytesIO(project_zip.read('TypeSystem.xml')))
98
+ else:
99
+ if fp.endswith('.xmi'):
100
+ # store source filename
101
+ self.documents.append(fp)
102
+ # set annotators
103
+ self.annotators.append(os.path.splitext(fp)[0])
104
+ # store XMI representation
105
+ self.xmi_documents.append(str(project_zip.read(fp).decode("utf-8")))
 
106
 
107
  self.extract_ne()
108
 
 
109
  @st_pb
110
  def extract_ne(self):
111
  count = 0
 
128
 
129
 
130
 
 
n4a_analytics_lib/st_components.py CHANGED
@@ -1,22 +1,198 @@
1
  # -*- coding:utf-8 -*-
2
 
 
 
 
3
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
 
6
- def st_pb(method):
7
- def progress_bar(ref):
8
- container = st.empty()
9
- bar = st.progress(0)
10
- pg_gen = method(ref)
 
 
 
 
 
 
 
11
  try:
12
- while True:
13
- progress = next(pg_gen)
14
- bar.progress(progress[0])
15
- if progress[2]:
16
- container.write("βœ… Processing... " + progress[1])
17
- else:
18
- container.write("❌️ Errror with..." + progress[1])
19
- except StopIteration as result:
20
- return result.value
21
-
22
- return progress_bar
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # -*- coding:utf-8 -*-
2
 
3
+ import io
4
+
5
+ import pandas
6
  import streamlit as st
7
+ from pycaprio import Pycaprio, mappings
8
+ from zipfile import ZipFile
9
+ from requests.exceptions import JSONDecodeError
10
+
11
+ from n4a_analytics_lib.analytics import (GlobalStatistics,
12
+ IaaStatistics)
13
+ from n4a_analytics_lib.constants import KAPPA_LEGEND
14
+
15
+
16
@st.cache
def convert_df(df_ex: pandas.DataFrame) -> bytes:
    """Serialize a dataframe to CSV and return it as UTF-8 encoded bytes.

    :param df_ex: dataframe to export
    :type df_ex: pandas.DataFrame
    :rtype: bytes
    :return: CSV content encoded in UTF-8
    """
    csv_text = df_ex.to_csv(encoding="utf-8")
    return csv_text.encode('utf-8')
19
+
20
+
21
def check_login(username: str, password: str) -> bool:
    """Tell whether both credential fields have been filled in.

    :param username: user name typed in the login form
    :type username: str
    :param password: password typed in the login form
    :type password: str
    :rtype: bool
    :return: False when at least one of the two values is empty, True otherwise
    """
    return len(username) != 0 and len(password) != 0
25
+
26
+
27
def display_data(col: st.columns) -> None:
    """Render the global statistics stored in the session into a column.

    Reads the object stored under ``st.session_state['gs_obj']`` and shows
    its total number of annotations, its dataframe, a document selector
    and the plot created for the selected document.

    :param col: streamlit column that receives the widgets
    :rtype: None
    """
    stats = st.session_state['gs_obj']

    total_label = f"{stats.total_annotations_project} Named entities"
    col.metric("Total curated annotations", total_label)

    col.dataframe(stats.df_i)

    selected_data = col.selectbox(
        'Select specific data to display bar plot:',
        stats.documents,
        key="selector_data",
    )
    figure = stats.create_plot(selected_data)
    col.pyplot(figure)
34
+
35
+
36
def template_agreement_dataframe(title: str,
                                 df: pandas.DataFrame,
                                 total_pov: int,
                                 total_annotations: int,
                                 percentage_pov: float,
                                 mode: str) -> None:
    """Display one agreement section: title, ratio, CSV download and table.

    :param title: section title
    :type title: str
    :param df: dataframe to display and to offer as a CSV download
    :type df: pandas.DataFrame
    :param total_pov: number of annotations shown in the ratio
    :type total_pov: int
    :param total_annotations: total annotations shown in the ratio
    :type total_annotations: int
    :param percentage_pov: percentage shown next to the ratio
    :type percentage_pov: float
    :param mode: suffix used in the CSV file name and in the widget key
    :type mode: str
    :rtype: None
    """
    st.subheader(title)

    summary = f"{total_pov} / {total_annotations} annotations ({percentage_pov} %)"
    st.markdown(summary)

    csv_bytes = convert_df(df)
    st.download_button(
        label="Press to Download CSV",
        data=csv_bytes,
        file_name=f"csv_annotators_{mode}.csv",
        mime="text/csv",
        key=f'download-csv_{mode}',
    )

    st.dataframe(df)
52
+
53
+
54
def init_session_iaa(data: st.file_uploader,
                     baseline: st.file_uploader,
                     col: st.columns) -> None:
    """Compute and display the Inter-Annotator Agreement (IAA) view.

    Builds an ``IaaStatistics`` object from the uploaded files, then renders
    the baseline text figures, the kappa scores with their legend, an
    optional confusion matrix, the agree/disagree tables and the label pies.

    :param data: uploaded archive, passed as ``zip_project`` to ``IaaStatistics``
    :param baseline: uploaded baseline text; its ``getvalue()`` content and
        ``name`` are used
    :param col: streamlit column that receives the baseline text summary
    :rtype: None
    """
    project_analyzed = IaaStatistics(zip_project=data, baseline_text=baseline.getvalue())
    baseline_analyzer = project_analyzed.analyze_text()

    col.markdown(f"""
    ### BASELINE TEXT: {baseline.name}

    - sentences: {baseline_analyzer[0]}
    - words: {baseline_analyzer[1]}
    - characters: {baseline_analyzer[2]}
    """)

    st.markdown("## 📈 IAA metrics")
    col1_kappa, col2_kappa = st.columns(2)

    # Display Kappa group
    col1_kappa.subheader("Fleiss Kappa (global score for group):")
    col1_kappa.markdown(interpret_kappa(project_analyzed.fleiss_kappa), unsafe_allow_html=True)

    # Display pairs kappa
    col1_kappa.subheader("Cohen Kappa (score for annotators pair):")
    for coders, c_k in project_analyzed.compute_pairs_cohen_kappa().items():
        col1_kappa.markdown(f"* {coders[0]} <> {coders[1]} : {interpret_kappa(c_k)}", unsafe_allow_html=True)

    # Display Kappa legend
    col2_kappa.markdown(KAPPA_LEGEND, unsafe_allow_html=True)

    # Plot confusion matrix
    if st.checkbox('Display confusion matrix'):
        # The default width (14) must lie inside [min_value, max_value];
        # the former upper bound of 10 made the slider raise as soon as
        # the checkbox was ticked.
        width = st.slider("matrix width", 1, 20, 14)
        height = st.slider("matrix height", 1, 10, 4)
        st.pyplot(project_analyzed.plot_confusion_matrix(width=width, height=height).figure)

    # Agree CSV
    template_agreement_dataframe(title="✅️ Agree annotations",
                                 df=project_analyzed.df_agree,
                                 total_pov=project_analyzed.total_agree,
                                 total_annotations=project_analyzed.total_annotations,
                                 percentage_pov=project_analyzed.percent_agree,
                                 mode="agree")
    # Disagree CSV
    template_agreement_dataframe(title="❌ Disagree annotations",
                                 df=project_analyzed.df_disagree,
                                 total_pov=project_analyzed.total_disagree,
                                 total_annotations=project_analyzed.total_annotations,
                                 percentage_pov=project_analyzed.percent_disagree,
                                 mode="disagree")
    # Pie plot
    st.subheader("🏷️ Global Labels Statistics")
    st.pyplot(project_analyzed.plot_agreement_pies().figure)
106
 
107
 
108
def _clear_session_state() -> None:
    """Remove every key from the current Streamlit session state."""
    # Delete keys in place: rebinding ``st.session_state`` to ``{}`` would
    # replace the module attribute with a plain dict instead of emptying
    # the per-session state.
    for key in list(st.session_state.keys()):
        del st.session_state[key]


def init_session_statistics(remote: bool, local: bool, data: tuple) -> None:
    """Reset the session and build the global statistics object.

    :param remote: True when the curated documents are fetched from the
        INCEpTION API host
    :type remote: bool
    :param local: True when the project comes from a local upload
    :type local: bool
    :param data: in remote mode, ``data[0]`` is the username and ``data[1]``
        the password; in local mode, the value is passed as ``zip_project``
        to ``GlobalStatistics``
    :rtype: None
    """
    # clear session
    _clear_session_state()

    # create a session variable
    st.session_state["gs_local"] = local
    st.session_state["gs_remote"] = remote

    # create a new object:
    # if remote fetch data from API Host first
    if remote and not local:
        st.success('Fetch curated documents from host INCEpTION API in progress...')
        try:
            fetch_curated_data_from_remote(
                username=data[0],
                password=data[1]
            )
        except JSONDecodeError:
            # username / password incorrect
            st.error('Username or Password is incorrect please retry.')
            _clear_session_state()

    if local and not remote:
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=data, remote=False)
132
+
133
+
134
def fetch_curated_data_from_remote(username: str,
                                   password: str,
                                   endpoint: str = "https://inception.dhlab.epfl.ch/prod",
                                   project_title: str = "ner4archives-template") -> None:
    """Download the curated documents of a remote project and store the statistics.

    Every document whose state is ``CURATION_COMPLETE`` is exported as a
    ZIP archive; the archives are merged into the first one, which is then
    used to build the ``GlobalStatistics`` object stored under
    ``st.session_state["gs_obj"]``.

    :param username: account name used to authenticate on the host
    :type username: str
    :param password: account password used to authenticate on the host
    :type password: str
    :param endpoint: URL of the INCEpTION host
    :type endpoint: str
    :param project_title: name of the project to fetch
    :type project_title: str
    :rtype: None
    """
    # open a client
    client = Pycaprio(inception_host=endpoint, authentication=(str(username), str(password)))

    # get project object
    projects = [p for p in client.api.projects() if p.project_name == project_title]
    if not projects:
        st.error(f"Project '{project_title}' was not found on the host.")
        return
    project_id = projects[0].project_id

    # retrieve the export of each curated document exactly once
    curations = [
        client.api.curation(project_id, document,
                            curation_format=mappings.InceptionFormat.UIMA_CAS_XMI_XML_1_1)
        for document in client.api.documents(project_id)
        if document.document_state == mappings.DocumentState.CURATION_COMPLETE
    ]
    if not curations:
        st.error(f"Project '{project_title}' has no curated document.")
        return

    # Merge all zip in one: the first archive is opened in append mode and
    # receives the members of the others that it does not contain yet.
    with ZipFile(io.BytesIO(curations[0]), mode="a") as z1:
        known_names = set(z1.namelist())
        for content in curations[1:]:
            with ZipFile(io.BytesIO(content), mode="r") as zf:
                for n in zf.namelist():
                    if n not in known_names:
                        z1.writestr(n, zf.open(n).read())
                        known_names.add(n)

        # Create a new object while the merged archive is still open,
        # so that its members can be read
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=z1, remote=True)
176
+
177
+
178
def interpret_kappa(score: float) -> str:
    """Format a kappa score as a coloured HTML percentage.

    The colour follows the agreement bands of the kappa legend. The bands
    are contiguous, so every score gets a colour, including values that
    fall between two legend rows (e.g. 0.0, 0.205 or 1.0).

    :param score: kappa score
    :type score: float
    :rtype: str
    :return: HTML ``<span>`` showing the score as a percentage
    """
    if score < 0:
        color = "#e74c3c;"
    elif score <= 0.20:
        color = "#f39c12;"
    elif score <= 0.40:
        color = "#f4d03f;"
    elif score <= 0.60:
        color = "#5dade2;"
    elif score <= 0.80:
        color = "#58d68d;"
    else:
        color = "#28b463;"

    return f"<span style='font-size:30px; color: {color}'>{round(score*100, 2)} %</span>"
194
+
195
+
196
+
197
+
198
+