KevinGeng committed on
Commit
4497f4b
1 Parent(s): ac46a87

Add new phoneme mode.

Browse files
Files changed (1) hide show
  1. app.py +9 -10
app.py CHANGED
@@ -23,8 +23,8 @@ transformation = jiwer.Compose([
23
 
24
  # WPM part
25
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
26
- # processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
27
- # phoneme_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
28
  # phoneme_model = pipeline(model="facebook/wav2vec2-xlsr-53-espeak-cv-ft")
29
  class ChangeSampleRate(nn.Module):
30
  def __init__(self, input_rate: int, output_rate: int):
@@ -79,10 +79,10 @@ def calc_mos(audio_path, ref):
79
  MOS_fig = Naturalness_Plot(AVA_MOS)
80
 
81
  # Phonemes per minute (PPM)
82
- # with torch.no_grad():
83
- # logits = phoneme_model(out_wavs).logits
84
- # phone_predicted_ids = torch.argmax(logits, dim=-1)
85
- # phone_transcription = processor.batch_decode(phone_predicted_ids)
86
 
87
  # Disable PPM for now
88
  phone_transcription = ['D U M M Y']
@@ -95,8 +95,7 @@ def calc_mos(audio_path, ref):
95
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
96
 
97
  # pdb.set_trace()
98
- # return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm, f0_db_fig
99
- return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, f0_db_fig
100
 
101
 
102
  with open("local/description.md") as f:
@@ -118,6 +117,8 @@ iface = gr.Interface(
118
  gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better", visible=False),
119
  gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
120
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
 
 
121
  gr.Plot(label="Pitch Contour and dB Analysis", show_label=True, container=True)],
122
  title="Speech Analysis by Laronix AI",
123
  description=description,
@@ -125,7 +126,5 @@ iface = gr.Interface(
125
  examples=examples,
126
  )
127
  # Currently remove PPM and Phonemes
128
- # gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
129
- # gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False),
130
  # add password to protect the interface
131
  iface.launch(share=False, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask kevin@laronix.com for password.\n Thanks for your cooperation!")
 
23
 
24
  # WPM part
25
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
26
+ processor = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
27
+ model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
28
  # phoneme_model = pipeline(model="facebook/wav2vec2-xlsr-53-espeak-cv-ft")
29
  class ChangeSampleRate(nn.Module):
30
  def __init__(self, input_rate: int, output_rate: int):
 
79
  MOS_fig = Naturalness_Plot(AVA_MOS)
80
 
81
  # Phonemes per minute (PPM)
82
+ with torch.no_grad():
83
+ logits = phoneme_model(out_wavs).logits
84
+ phone_predicted_ids = torch.argmax(logits, dim=-1)
85
+ phone_transcription = processor.batch_decode(phone_predicted_ids)
86
 
87
  # Disable PPM for now
88
  phone_transcription = ['D U M M Y']
 
95
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
96
 
97
  # pdb.set_trace()
98
+ return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm, f0_db_fig
 
99
 
100
 
101
  with open("local/description.md") as f:
 
117
  gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better", visible=False),
118
  gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
119
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
120
+ gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
121
+ gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False),
122
  gr.Plot(label="Pitch Contour and dB Analysis", show_label=True, container=True)],
123
  title="Speech Analysis by Laronix AI",
124
  description=description,
 
126
  examples=examples,
127
  )
128
  # Currently remove PPM and Phonemes
 
 
129
  # add password to protect the interface
130
  iface.launch(share=False, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask kevin@laronix.com for password.\n Thanks for your cooperation!")