divi212 committed on
Commit
d4e7648
·
verified ·
1 Parent(s): e4806e6

Add option to ignore punctuation and case

Browse files

For many applications, it is useful to ignore punctuation and/or case when evaluating the word error rate (WER). This PR adds checkboxes to ignore punctuation and to ignore case, and applies the corresponding transforms to both the ground truth and the hypothesis text before the WER is computed.

![ignore-punctuation-and-case.png](https://cdn-uploads.huggingface.co/production/uploads/6708a28ad08850e483b6f928/3t7a3ESb1kz8PqUJsTy8B.png)

I considered using jiwer's off-the-shelf transformations, as defined here - https://jitsi.github.io/jiwer/reference/transformations/, which can be passed directly into the `process_words` function. However, that approach did not offer an option to remove punctuation.

Additionally, I added type annotations to the Python file and reformatted it.

Files changed (1) hide show
  1. app.py +70 -21
app.py CHANGED
@@ -1,38 +1,75 @@
 
1
  import gradio as gr
2
- from jiwer import wer, process_words
3
 
4
- def make_string(words):
 
 
5
  return " ".join(words)
6
 
7
- # Function to highlight errors
8
- def highlight_errors(ground_truth, hypothesis):
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  highlighted_text = []
11
 
12
- processed = process_words(ground_truth, hypothesis)
 
 
 
 
 
 
 
 
13
 
14
  # Process each alignment operation in measures
15
- for alignment, ref, hyp in zip(processed.alignments, processed.references, processed.hypotheses):
 
 
16
  for chunk in alignment:
17
- if chunk.type == 'equal':
18
  # Add equal words without highlighting
19
- highlighted_text.extend(ref[chunk.ref_start_idx:chunk.ref_end_idx])
20
 
21
- elif chunk.type == 'insert':
22
  # Highlight inserted words in green
23
- highlighted_text.append(f'<span style="color:green;">{make_string(hyp[chunk.hyp_start_idx:chunk.hyp_end_idx])}</span>')
 
 
 
24
 
25
- elif chunk.type == 'substitute':
26
  # Highlight substitutions in purple: ground truth is struck through
27
- highlighted_text.append(f'<span style="color:purple;">{make_string(hyp[chunk.hyp_start_idx:chunk.hyp_end_idx])}</span>') # Hypothesis word
28
- highlighted_text.append(f'<span style="color:purple; text-decoration:line-through;">{make_string(ref[chunk.ref_start_idx:chunk.ref_end_idx])}</span>') # Ground truth word
 
 
 
 
 
 
29
 
30
- elif chunk.type == 'delete':
31
  # Highlight deleted words in red with strikethrough
32
- highlighted_text.append(f'<span style="color:red; text-decoration:line-through;">{make_string(ref[chunk.ref_start_idx:chunk.ref_end_idx])}</span>')
 
 
 
 
 
33
 
34
- highlighted_text_str = ' '.join(highlighted_text)
35
-
36
  # Color Legend HTML
37
  legend_html = """
38
  <div style="margin-top: 10px;">
@@ -46,20 +83,32 @@ def highlight_errors(ground_truth, hypothesis):
46
  # Combine highlighted output and legend
47
  combined_output = f"{legend_html}<br>{highlighted_text_str}"
48
 
49
- return combined_output, processed.wer, processed.substitutions, processed.insertions, processed.deletions
 
 
 
 
 
 
 
50
 
51
  # Gradio Interface
52
  interface = gr.Interface(
53
  fn=highlight_errors,
54
- inputs=["text", "text"],
 
 
 
 
 
55
  outputs=[
56
  gr.HTML(label="Highlighted Transcript"),
57
  gr.Number(label="Word Error Rate"),
58
  gr.Number(label="Substitutions"),
59
  gr.Number(label="Insertions"),
60
- gr.Number(label="Deletions")
61
  ],
62
- title="WER Analysis"
63
  )
64
 
65
  interface.launch()
 
1
+ import typing as T
2
  import gradio as gr
3
+ from jiwer import process_words, RemovePunctuation, ToLowerCase, Compose
4
 
5
+
6
+ def make_string(words: T.List[str]) -> str:
7
+ """Converts list of strings to a string"""
8
  return " ".join(words)
9
 
10
+
11
+ def highlight_errors(
12
+ ground_truth: str,
13
+ hypothesis: str,
14
+ remove_punctuation: bool,
15
+ to_lower_case: bool,
16
+ ) -> T.Tuple[str, float, int, int, int]:
17
+ """
18
+ Takes in a ground truth and hypothesis string, applies transformations as specified by
19
+ remove_punctuation and to_lower_case, and returns data to visualize word error rate.
20
+
21
+ Specifically, this returns an HTML string with insertions, deletions, and substitutions
22
+ highlighted as well as the computed WER, and # of substitutions, insertions, and deletions.
23
+ """
24
 
25
  highlighted_text = []
26
 
27
+ transforms = [
28
+ RemovePunctuation() if remove_punctuation else None,
29
+ ToLowerCase() if to_lower_case else None,
30
+ ]
31
+ transform = Compose([t for t in transforms if t is not None])
32
+
33
+ processed = process_words(
34
+ reference=transform(ground_truth), hypothesis=transform(hypothesis)
35
+ )
36
 
37
  # Process each alignment operation in measures
38
+ for alignment, ref, hyp in zip(
39
+ processed.alignments, processed.references, processed.hypotheses
40
+ ):
41
  for chunk in alignment:
42
+ if chunk.type == "equal":
43
  # Add equal words without highlighting
44
+ highlighted_text.extend(ref[chunk.ref_start_idx : chunk.ref_end_idx])
45
 
46
+ elif chunk.type == "insert":
47
  # Highlight inserted words in green
48
+ highlighted_text.append(
49
+ f'<span style="color:green;">'
50
+ f"{make_string(hyp[chunk.hyp_start_idx:chunk.hyp_end_idx])}</span>"
51
+ )
52
 
53
+ elif chunk.type == "substitute":
54
  # Highlight substitutions in purple: ground truth is struck through
55
+ highlighted_text.append(
56
+ f'<span style="color:purple;">'
57
+ f"{make_string(hyp[chunk.hyp_start_idx:chunk.hyp_end_idx])}</span>"
58
+ ) # Hypothesis word
59
+ highlighted_text.append(
60
+ f'<span style="color:purple; text-decoration:line-through;">'
61
+ f"{make_string(ref[chunk.ref_start_idx:chunk.ref_end_idx])}</span>"
62
+ ) # Ground truth word
63
 
64
+ elif chunk.type == "delete":
65
  # Highlight deleted words in red with strikethrough
66
+ highlighted_text.append(
67
+ f'<span style="color:red; text-decoration:line-through;">'
68
+ f"{make_string(ref[chunk.ref_start_idx:chunk.ref_end_idx])}</span>"
69
+ )
70
+
71
+ highlighted_text_str = make_string(highlighted_text)
72
 
 
 
73
  # Color Legend HTML
74
  legend_html = """
75
  <div style="margin-top: 10px;">
 
83
  # Combine highlighted output and legend
84
  combined_output = f"{legend_html}<br>{highlighted_text_str}"
85
 
86
+ return (
87
+ combined_output,
88
+ processed.wer,
89
+ processed.substitutions,
90
+ processed.insertions,
91
+ processed.deletions,
92
+ )
93
+
94
 
95
  # Gradio Interface
96
  interface = gr.Interface(
97
  fn=highlight_errors,
98
+ inputs=[
99
+ gr.Textbox(label="Ground Truth"),
100
+ gr.Textbox(label="Hypothesis"),
101
+ gr.Checkbox(label="Ignore Punctuation"),
102
+ gr.Checkbox(label="Ignore Case"),
103
+ ],
104
  outputs=[
105
  gr.HTML(label="Highlighted Transcript"),
106
  gr.Number(label="Word Error Rate"),
107
  gr.Number(label="Substitutions"),
108
  gr.Number(label="Insertions"),
109
+ gr.Number(label="Deletions"),
110
  ],
111
+ title="WER Analysis",
112
  )
113
 
114
  interface.launch()