Spaces:
Running
Running
Add in option to ignore punctuation and case
Browse files
For many applications, it may be useful to ignore punctuation and/or case when evaluating word error rate (WER). This PR adds checkboxes to ignore punctuation and case, and then applies the corresponding transforms to both the ground truth and the hypothesis text before the WER is computed.
![ignore-punctuation-and-case.png](https://cdn-uploads.huggingface.co/production/uploads/6708a28ad08850e483b6f928/3t7a3ESb1kz8PqUJsTy8B.png)
I contemplated using jiwer's off-the-shelf transformations, as documented at https://jitsi.github.io/jiwer/reference/transformations/, which can be passed directly into the `process_words` function. However, that approach did not offer an option to remove punctuation.
Additionally, I added type annotations to the Python file and formatted it.
app.py
CHANGED
@@ -1,38 +1,75 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
-
from jiwer import
|
3 |
|
4 |
-
|
|
|
|
|
5 |
return " ".join(words)
|
6 |
|
7 |
-
|
8 |
-
def highlight_errors(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
highlighted_text = []
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# Process each alignment operation in measures
|
15 |
-
for alignment, ref, hyp in zip(
|
|
|
|
|
16 |
for chunk in alignment:
|
17 |
-
if chunk.type ==
|
18 |
# Add equal words without highlighting
|
19 |
-
highlighted_text.extend(ref[chunk.ref_start_idx:chunk.ref_end_idx])
|
20 |
|
21 |
-
elif chunk.type ==
|
22 |
# Highlight inserted words in green
|
23 |
-
highlighted_text.append(
|
|
|
|
|
|
|
24 |
|
25 |
-
elif chunk.type ==
|
26 |
# Highlight substitutions in purple: ground truth is striked through
|
27 |
-
highlighted_text.append(
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
elif chunk.type ==
|
31 |
# Highlight deleted words in red with strikethrough
|
32 |
-
highlighted_text.append(
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
-
highlighted_text_str = ' '.join(highlighted_text)
|
35 |
-
|
36 |
# Color Legend HTML
|
37 |
legend_html = """
|
38 |
<div style="margin-top: 10px;">
|
@@ -46,20 +83,32 @@ def highlight_errors(ground_truth, hypothesis):
|
|
46 |
# Combine highlighted output and legend
|
47 |
combined_output = f"{legend_html}<br>{highlighted_text_str}"
|
48 |
|
49 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
# Gradio Interface
|
52 |
interface = gr.Interface(
|
53 |
fn=highlight_errors,
|
54 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
55 |
outputs=[
|
56 |
gr.HTML(label="Highlighted Transcript"),
|
57 |
gr.Number(label="Word Error Rate"),
|
58 |
gr.Number(label="Substitutions"),
|
59 |
gr.Number(label="Insertions"),
|
60 |
-
gr.Number(label="Deletions")
|
61 |
],
|
62 |
-
title="WER Analysis"
|
63 |
)
|
64 |
|
65 |
interface.launch()
|
|
|
1 |
+
import typing as T
|
2 |
import gradio as gr
|
3 |
+
from jiwer import process_words, RemovePunctuation, ToLowerCase, Compose
|
4 |
|
5 |
+
|
6 |
+
def make_string(words: T.List[str]) -> str:
|
7 |
+
"""Converts list of strings to a string"""
|
8 |
return " ".join(words)
|
9 |
|
10 |
+
|
11 |
+
def highlight_errors(
|
12 |
+
ground_truth: str,
|
13 |
+
hypothesis: str,
|
14 |
+
remove_punctuation: bool,
|
15 |
+
to_lower_case: bool,
|
16 |
+
) -> T.Tuple[str, float, int, int, int]:
|
17 |
+
"""
|
18 |
+
Takes in a ground truth and hypothesis string, applies transformations as specified by
|
19 |
+
remove_punctuation and to_lower_case, and returns data to visualize word error rate.
|
20 |
+
|
21 |
+
Specifically, this returns an HTML string with insertions, deletions, and substitutions
|
22 |
+
highlighted as well as the computed WER, and # of substitutions, insertions, and deletions.
|
23 |
+
"""
|
24 |
|
25 |
highlighted_text = []
|
26 |
|
27 |
+
transforms = [
|
28 |
+
RemovePunctuation() if remove_punctuation else None,
|
29 |
+
ToLowerCase() if to_lower_case else None,
|
30 |
+
]
|
31 |
+
transform = Compose([t for t in transforms if t is not None])
|
32 |
+
|
33 |
+
processed = process_words(
|
34 |
+
reference=transform(ground_truth), hypothesis=transform(hypothesis)
|
35 |
+
)
|
36 |
|
37 |
# Process each alignment operation in measures
|
38 |
+
for alignment, ref, hyp in zip(
|
39 |
+
processed.alignments, processed.references, processed.hypotheses
|
40 |
+
):
|
41 |
for chunk in alignment:
|
42 |
+
if chunk.type == "equal":
|
43 |
# Add equal words without highlighting
|
44 |
+
highlighted_text.extend(ref[chunk.ref_start_idx : chunk.ref_end_idx])
|
45 |
|
46 |
+
elif chunk.type == "insert":
|
47 |
# Highlight inserted words in green
|
48 |
+
highlighted_text.append(
|
49 |
+
f'<span style="color:green;">'
|
50 |
+
f"{make_string(hyp[chunk.hyp_start_idx:chunk.hyp_end_idx])}</span>"
|
51 |
+
)
|
52 |
|
53 |
+
elif chunk.type == "substitute":
|
54 |
# Highlight substitutions in purple: ground truth is striked through
|
55 |
+
highlighted_text.append(
|
56 |
+
f'<span style="color:purple;">'
|
57 |
+
f"{make_string(hyp[chunk.hyp_start_idx:chunk.hyp_end_idx])}</span>"
|
58 |
+
) # Hypothesis word
|
59 |
+
highlighted_text.append(
|
60 |
+
f'<span style="color:purple; text-decoration:line-through;">'
|
61 |
+
f"{make_string(ref[chunk.ref_start_idx:chunk.ref_end_idx])}</span>"
|
62 |
+
) # Ground truth word
|
63 |
|
64 |
+
elif chunk.type == "delete":
|
65 |
# Highlight deleted words in red with strikethrough
|
66 |
+
highlighted_text.append(
|
67 |
+
f'<span style="color:red; text-decoration:line-through;">'
|
68 |
+
f"{make_string(ref[chunk.ref_start_idx:chunk.ref_end_idx])}</span>"
|
69 |
+
)
|
70 |
+
|
71 |
+
highlighted_text_str = make_string(highlighted_text)
|
72 |
|
|
|
|
|
73 |
# Color Legend HTML
|
74 |
legend_html = """
|
75 |
<div style="margin-top: 10px;">
|
|
|
83 |
# Combine highlighted output and legend
|
84 |
combined_output = f"{legend_html}<br>{highlighted_text_str}"
|
85 |
|
86 |
+
return (
|
87 |
+
combined_output,
|
88 |
+
processed.wer,
|
89 |
+
processed.substitutions,
|
90 |
+
processed.insertions,
|
91 |
+
processed.deletions,
|
92 |
+
)
|
93 |
+
|
94 |
|
95 |
# Gradio Interface
|
96 |
interface = gr.Interface(
|
97 |
fn=highlight_errors,
|
98 |
+
inputs=[
|
99 |
+
gr.Textbox(label="Ground Truth"),
|
100 |
+
gr.Textbox(label="Hypothesis"),
|
101 |
+
gr.Checkbox(label="Ignore Punctuation"),
|
102 |
+
gr.Checkbox(label="Ignore Case"),
|
103 |
+
],
|
104 |
outputs=[
|
105 |
gr.HTML(label="Highlighted Transcript"),
|
106 |
gr.Number(label="Word Error Rate"),
|
107 |
gr.Number(label="Substitutions"),
|
108 |
gr.Number(label="Insertions"),
|
109 |
+
gr.Number(label="Deletions"),
|
110 |
],
|
111 |
+
title="WER Analysis",
|
112 |
)
|
113 |
|
114 |
interface.launch()
|