blurb added, q3 now depends on q2, if q2 no suggestion, q3 auto no link
- app.py +11 -5
- overview.py +44 -3
app.py CHANGED
@@ -3,8 +3,6 @@ import transformers as tf
 import pandas as pd
 
 from overview import NQDOverview
-from fullreport import NQDFullReport
-
 
 # Function to load and cache models
 @st.experimental_singleton(show_spinner=False)
@@ -33,9 +31,13 @@ def run_models(model_names, models, c):
 
 st.title('Assess the *QuAL*ity of your feedback')
 st.caption(
-    """Medical education
-
-
+    """Medical education requires high-quality *written* feedback,
+    but evaluating these *supervisor narrative comments* is time-consuming.
+    The QuAL score has validity evidence for measuring the quality of short
+    comments in this context. We developed an NLP/ML-powered tool to
+    assess written comment quality via the QuAL score with high accuracy.
+
+    *Try it for yourself!*
     """)
 
 ### Load models
@@ -83,6 +85,10 @@ with st.form('comment_form'):
     st.experimental_rerun()
 
 results = run_models(models_to_load, models, st.session_state['comment'])
+# Modify results to sum the QuAL score and to ignore Q3 if Q2 found no suggestion
+if results['q2i']['label'] == 1:
+    results['q3i']['label'] = 1  # can't have a connection if there is no suggestion
+results['qual']['label'] = results['q1']['label'] + (not results['q2i']['label']) + (not results['q3i']['label'])
 
 overview = NQDOverview(st, results)
 overview.draw()
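The added block implements the commit's dependency rule before anything is drawn. As a standalone illustration, here is a minimal, runnable sketch of the same gating-and-summing logic, using a hypothetical results dict shaped like the run_models output (q2i and q3i are inverted binary labels, where 1 means "no"):

# Hypothetical model output, same shape as the run_models() results
results = {
    'q1':  {'label': 2},  # evidence subscore, 0-3
    'q2i': {'label': 1},  # 1 = no suggestion given
    'q3i': {'label': 0},  # would otherwise claim a linked suggestion
}

# If Q2 found no suggestion, Q3 cannot report a linked one
if results['q2i']['label'] == 1:
    results['q3i']['label'] = 1

# QuAL total (0-5): evidence (0-3) plus one point each for
# "suggestion given" and "suggestion linked" (inverted labels)
results['qual'] = {'label': results['q1']['label']
                   + (not results['q2i']['label'])
                   + (not results['q3i']['label'])}
print(results['qual']['label'])  # -> 2

Because the Q3 label is overridden before the sum, a comment with no suggestion can never be credited with a linked suggestion, which is exactly the "q3 auto no link" behavior described in the commit message.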
overview.py CHANGED
@@ -1,6 +1,44 @@
 from matplotlib.cm import get_cmap
 import plotly.graph_objects as go
 
+about_blurb = '''
+### About the QuAL Score
+
+The Quality of Assessment for Learning score (QuAL score)
+was created to evaluate short qualitative comments that are related to specific
+scores entered into a workplace-based assessment,
+common within the competency-based medical education (CBME) context.
+
+It is rated on a scale of 0-5, with 0 signifying very low quality and 5 very high quality.
+It consists of three subscores which are summed to calculate the overall QuAL score:
+
+1. Evidence - Does the rater provide sufficient evidence about resident performance? (0-no comment at all, 1-no, but comment present, 2-somewhat, 3-yes/full description)
+2. Suggestion - Does the rater provide a suggestion for improvement? (0-no/1-yes)
+3. Connection - Is the rater's suggestion linked to the behavior described? (0-no/1-yes)
+
+The QuAL score has validity evidence for accurately measuring the quality of evaluation comments in CBME.
+
+For more information, see the paper [here](https://doi.org/10.1080/10401334.2019.1708365).
+
+### About this Tool
+
+The QuAL score accurately rates the quality of narrative comments in CBME, but
+it still requires time-consuming manual rating. With large volumes of text generated in a
+typical CBME program, large-scale assessment of comment quality is impractical.
+This tool uses machine learning (ML) and natural language processing (NLP) to automate
+the rating of the QuAL score on narrative comments.
+
+We trained a machine learning model to predict each of the three subscores described above.
+The resulting models are accurate:
+1. Evidence - Balanced accuracy of 61.5% for a 0-3 result, within-one accuracy of 96.4%
+2. Suggestion - Accuracy of 85%, sensitivity for lack of suggestion 86.2%
+3. Connection - Accuracy of 82%, sensitivity for lack of connection 90%
+
+The models are highly accurate, but not perfect! You may experience times where
+the results are not consistent with your interpretation of the text. If you do, please
+leave us [feedback](https://forms.gle/PfXxcGmvLYvd9jWz5). This tool is intended as a demonstration only
+and should not be used for high-stakes assessment (yet!).
+'''
 class NQDOverview(object):
     def __init__(self, parent, results,
                  dial_cmap='RdYlGn'):
@@ -32,6 +70,9 @@ class NQDOverview(object):
     def draw(self):
         st = self.p
 
+        with st.expander('About the QuAL Score and this Tool', expanded=False):
+            st.markdown(about_blurb)
+
         fig = self._build_figure()
 
         cols = st.columns([7, 3])
@@ -48,7 +89,7 @@ class NQDOverview(object):
         elif q1lab == 3:
             md_str = '😃 High'
         cols[1].metric('Level of Detail', md_str,
-            help='
+            help='Q1 - Evidence - Does the rater provide sufficient evidence about resident performance? (0-no comment at all, 1-no, but comment present, 2-somewhat, 3-yes/full description)')
 
         q2lab = self.results['q2i']['label']
         if q2lab == 0:
@@ -56,7 +97,7 @@ class NQDOverview(object):
         else:
             md_str = '❌ No'
         cols[1].metric('Suggestion Given', (md_str),
-            help='
+            help='Q2 - Suggestion - Does the rater provide a suggestion for improvement? (0-no/1-yes)')
 
         q3lab = self.results['q3i']['label']
         if q3lab == 0:
@@ -64,4 +105,4 @@ class NQDOverview(object):
         else:
             md_str = '❌ No'
         cols[1].metric('Suggestion Linked', md_str,
-            help='Is the suggestion
+            help='Q3 - Connection - Is the rater’s suggestion linked to the behavior described? (0-no/1-yes)')