add conversion for INTELLIGIBILITY SCORE and NATURALNESS SCORE
Browse files- .gitignore +22 -0
- app.py +10 -3
- flagged/log.csv +4 -0
- local/WER2INTELI.png +0 -0
- local/convert_metrics.py +71 -0
- local/nat2avaMOS.png +0 -0
.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python
|
2 |
+
__pycache__/
|
3 |
+
*.pyc
|
4 |
+
*.pyo
|
5 |
+
*.pyd
|
6 |
+
__pycache__/
|
7 |
+
*.db
|
8 |
+
*.sqlite3
|
9 |
+
*.sqlite
|
10 |
+
*.log
|
11 |
+
*.bak
|
12 |
+
*.swp
|
13 |
+
*.swo
|
14 |
+
*.tmp
|
15 |
+
*.tmp.*
|
16 |
+
*~
|
17 |
+
|
18 |
+
# flagged
|
19 |
+
flagged/
|
20 |
+
|
21 |
+
#
|
22 |
+
*.wav
|
app.py
CHANGED
@@ -6,6 +6,7 @@ import torch.nn as nn
|
|
6 |
import lightning_module
|
7 |
import pdb
|
8 |
import jiwer
|
|
|
9 |
|
10 |
# ASR part
|
11 |
from transformers import pipeline
|
@@ -57,6 +58,10 @@ def calc_mos(audio_path, ref):
|
|
57 |
trans = p(audio_path)["text"]
|
58 |
# WER
|
59 |
wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
|
|
|
|
|
|
|
|
|
60 |
# MOS
|
61 |
batch = {
|
62 |
'wav': out_wavs,
|
@@ -66,6 +71,8 @@ def calc_mos(audio_path, ref):
|
|
66 |
with torch.no_grad():
|
67 |
output = model(batch)
|
68 |
predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
|
|
|
|
|
69 |
# Phonemes per minute (PPM)
|
70 |
with torch.no_grad():
|
71 |
logits = phoneme_model(out_wavs).logits
|
@@ -75,7 +82,7 @@ def calc_mos(audio_path, ref):
|
|
75 |
wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
|
76 |
ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
|
77 |
|
78 |
-
return
|
79 |
|
80 |
|
81 |
description ="""
|
@@ -93,9 +100,9 @@ iface = gr.Interface(
|
|
93 |
fn=calc_mos,
|
94 |
inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
|
95 |
gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
|
96 |
-
outputs=[gr.Textbox(placeholder="Naturalness
|
97 |
gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
|
98 |
-
gr.Textbox(placeholder="
|
99 |
gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
|
100 |
gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
|
101 |
title="Laronix's Voice Quality Checking System Demo",
|
|
|
6 |
import lightning_module
|
7 |
import pdb
|
8 |
import jiwer
|
9 |
+
from local.convert_metrics import nat2avaMOS, WER2INTELI
|
10 |
|
11 |
# ASR part
|
12 |
from transformers import pipeline
|
|
|
58 |
trans = p(audio_path)["text"]
|
59 |
# WER
|
60 |
wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
|
61 |
+
|
62 |
+
# Convert WER to Intelligibility score
|
63 |
+
INTELI_score = WER2INTELI(wer*100)
|
64 |
+
|
65 |
# MOS
|
66 |
batch = {
|
67 |
'wav': out_wavs,
|
|
|
71 |
with torch.no_grad():
|
72 |
output = model(batch)
|
73 |
predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
|
74 |
+
# MOS to AVA MOS
|
75 |
+
AVA_MOS = nat2avaMOS(predic_mos)
|
76 |
# Phonemes per minute (PPM)
|
77 |
with torch.no_grad():
|
78 |
logits = phoneme_model(out_wavs).logits
|
|
|
82 |
wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
|
83 |
ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
|
84 |
|
85 |
+
return AVA_MOS, trans, INTELI_score, phone_transcription, ppm
|
86 |
|
87 |
|
88 |
description ="""
|
|
|
100 |
fn=calc_mos,
|
101 |
inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
|
102 |
gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
|
103 |
+
outputs=[gr.Textbox(placeholder="Naturalness Score, ranged from 0 to 5, the higher the better.", label="Naturalness Score, ranged from 0 to 5, the higher the better."),
|
104 |
gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
|
105 |
+
gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better"),
|
106 |
gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
|
107 |
gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
|
108 |
title="Laronix's Voice Quality Checking System Demo",
|
flagged/log.csv
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Audio to evaluate,Reference,flag,username,timestamp
|
2 |
+
/mnt/ssdisk/Laronix_voice_quality_checking_system_FILEIO/flagged/Audio to evaluate/733000b063d55f6ef81b9319254d71e0b9f3575a/tmpvs5jv1ic.wav,"Once upon a time, there was a young rat named Arthur who couldn't make up his mind",,,2023-10-23 15:15:26.722712
|
3 |
+
/mnt/ssdisk/Laronix_voice_quality_checking_system_FILEIO/flagged/Audio to evaluate/61f172b4530dab3c975a0f17861216d097df5df1/tmp5ihs8ctx.wav,"Once upon a time, there was a young rat named Arthur who couldn't make up his mind",,,2023-10-23 15:18:15.832797
|
4 |
+
/mnt/ssdisk/Laronix_voice_quality_checking_system_FILEIO/flagged/Audio to evaluate/82c854b768a5a5350164fe165b63b40c85d59b26/tmpjyg2550a.wav,"Once upon a time, there was a young rat named Arthur who couldn't make up his mind",,,2023-10-23 15:20:15.140430
|
local/WER2INTELI.png
ADDED
local/convert_metrics.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
|
4 |
+
# Natural MOS to AVA MOS
|
5 |
+
|
6 |
+
def linear_function(x):
    """Return the lower segment of the naturalness mapping: the line 8x - 8.

    It evaluates to 0 at x = 1 and to 4 at x = 1.5.
    """
    return 8 * x - 8


def quadratic_function(x):
    """Return the upper segment of the naturalness mapping.

    This is a downward parabola with its maximum of 5 at x = 5; at x = 1.5
    it evaluates to roughly 4, so it joins the linear segment there.
    """
    return -0.0816 * (x - 5) ** 2 + 5


# Natural MOS to AVA MOS
def nat2avaMOS(x):
    """Convert a natural MOS value to an AVA MOS value.

    The mapping is piecewise: linear on [1, 1.5] and quadratic on (1.5, 5].
    Inputs outside [1, 5] are clamped to the nearest end of that interval,
    so the result always lies in [0, 5]. Without the clamp, a value above 5
    matched no branch (the function returned ``None``) and a value below 1
    produced a negative score.

    Args:
        x: Natural MOS as a scalar (Python number, NumPy scalar or 0-d array).

    Returns:
        The converted AVA MOS score, between 0 and 5.
    """
    # Clamp to the domain on which the two segments are defined.
    if x < 1:
        x = 1.0
    elif x > 5:
        x = 5.0

    if x <= 1.5:
        return linear_function(x)
    return quadratic_function(x)
|
18 |
+
|
19 |
+
# Word error rate to Intelligibility Score (x is a percentage)
|
20 |
+
def WER2INTELI(x):
    """Convert a word error rate (in percent) to an intelligibility score.

    The mapping is piecewise and non-increasing in ``x``:

    * ``x <= 10``: score is 100.
    * ``10 < x <= 100``: score falls linearly from 100 (at 10) to 30 (at 100).
    * ``x > 100``: score decays exponentially from 30 towards 0.

    The last segment starts from 30 so that it continues where the linear
    segment ends. Starting it from 100 made a WER just above 100% score
    close to 100, higher than any WER between 10% and 100%.

    Args:
        x: Word error rate as a percentage (may exceed 100).

    Returns:
        The intelligibility score, in the range (0, 100].
    """
    if x <= 10:
        return 100
    if x <= 100:
        # Straight line through (10, 100) and (100, 30).
        slope = (30 - 100) / (100 - 10)
        intercept = 100 - slope * 10
        return slope * x + intercept
    # Continue from the linear segment's end value (30 at x = 100).
    return 30 * np.exp(-0.01 * (x - 100))
|
29 |
+
|
30 |
+
# # Generate the x values
|
31 |
+
# x = np.linspace(0, 200, 400) # generate 400 points from 0 to 200
|
32 |
+
|
33 |
+
# # Compute the corresponding y values
|
34 |
+
# y = [WER2INTELI(xi) for xi in x]
|
35 |
+
|
36 |
+
# # Plot the function
|
37 |
+
# plt.plot(x, y)
|
38 |
+
# plt.xlabel('x')
|
39 |
+
# plt.ylabel('f(x)')
|
40 |
+
# plt.title('Custom Function')
|
41 |
+
# plt.grid(True)
|
42 |
+
# plt.show()
|
43 |
+
|
44 |
+
# # Generate the ranges of x values
|
45 |
+
# x1 = np.linspace(1, 1.5, 100)
|
46 |
+
# x2 = np.linspace(1.5, 5, 100)
|
47 |
+
|
48 |
+
# # Compute the corresponding y values
|
49 |
+
# y1 = linear_function(x1)
|
50 |
+
# y2 = quadratic_function(x2)
|
51 |
+
|
52 |
+
# # Plot the linear part
|
53 |
+
# plt.plot(x1, y1, label='Linear Function (1 <= x <= 1.5)')
|
54 |
+
|
55 |
+
# # Plot the quadratic part
|
56 |
+
# plt.plot(x2, y2, label='Quadratic Function (1.5 <= x <= 5)')
|
57 |
+
|
58 |
+
# # Add axis labels and a title
|
59 |
+
# plt.xlabel('Natural Mean Opinion Score')
|
60 |
+
# plt.ylabel('AVA Mean Opinion Score')
|
61 |
+
# plt.title('nat2avaMOS')
|
62 |
+
|
63 |
+
# # Add a legend
|
64 |
+
# plt.legend()
|
65 |
+
|
66 |
+
# # 显示图形
|
67 |
+
# plt.grid(True)
|
68 |
+
|
69 |
+
# # 显示图像
|
70 |
+
# plt.savefig("./local/nat2avaMOS.png")
|
71 |
+
# plt.savefig("./local/WER2INT.png")
|
local/nat2avaMOS.png
ADDED