Aryan Wadhawan committed on
Commit
e59b0bd
•
1 Parent(s): a7fd32e

Implemented everything

Browse files
Files changed (2) hide show
  1. app.py +61 -7
  2. requirements.txt +2 -1
app.py CHANGED
@@ -3,23 +3,31 @@ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
3
  import torch
4
  import phonemizer
5
  import librosa
 
6
  import io
7
  import base64
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  def lark(audioAsB64):
11
  # base64 to wav data conversion
12
  wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
13
 
14
- # processing
15
  processor = Wav2Vec2Processor.from_pretrained(
16
  "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
17
  )
18
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
19
 
20
- waveform, sample_rate = librosa.load(
21
- io.BytesIO(wav_data), sr=16000
22
- ) # Downsample 44.1kHz to 8kHz
23
 
24
  input_values = processor(
25
  waveform, sampling_rate=sample_rate, return_tensors="pt"
@@ -29,10 +37,56 @@ def lark(audioAsB64):
29
  logits = model(input_values).logits
30
 
31
  predicted_ids = torch.argmax(logits, dim=-1)
32
- transcription = processor.batch_decode(predicted_ids)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- return transcription
35
 
36
 
37
- iface = gr.Interface(fn=lark, inputs="text", outputs="text")
38
  iface.launch()
 
3
  import torch
4
  import phonemizer
5
  import librosa
6
+ import math
7
  import io
8
  import base64
9
+ from strsimpy.jaro_winkler import JaroWinkler
10
+
11
+ # base64 to audio ✅
11
+ # audio to transcription ✅
12
+ # audio to text ✅
13
+ # text to phoneme ✅
14
+ # accuracy = jarowinkler(transcription, phoneme) ✅
15
+ # band = getBandFromAccuracy(accuracy) ✅
16
+ # return accuracy, band ✅
18
 
19
 
20
  def lark(audioAsB64):
21
  # base64 to wav data conversion
22
  wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
23
 
24
+ # audio to transcription
25
  processor = Wav2Vec2Processor.from_pretrained(
26
  "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
27
  )
28
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
29
 
30
+ waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)
 
 
31
 
32
  input_values = processor(
33
  waveform, sampling_rate=sample_rate, return_tensors="pt"
 
37
  logits = model(input_values).logits
38
 
39
  predicted_ids = torch.argmax(logits, dim=-1)
40
+ speechToPhonemeTranscription = processor.batch_decode(predicted_ids)[0]
41
+
42
+ # audio to text
43
+ processorSTT = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
44
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
45
+
46
+ input_values = processorSTT(
47
+ waveform, sampling_rate=sample_rate, return_tensors="pt"
48
+ ).input_values
49
+
50
+ logits = model(input_values).logits
51
+
52
+ predicted_ids = torch.argmax(logits, dim=-1)
53
+ speechToTextTranscripition = processor.batch_decode(predicted_ids)[0]
54
+
55
+ # text to phoneme
56
+ graphemeToPhonemeTranscription = phonemizer.phonemize(speechToTextTranscripition)
57
+
58
+ # accuracy = jaroWinkler(transcription, phoneme)
59
+
60
+ jarowinkler = JaroWinkler()
61
+ similarity_score = jarowinkler.similarity(
62
+ speechToPhonemeTranscription, graphemeToPhonemeTranscription
63
+ )
64
+
65
+ # ielts pronunciation band estimation
66
+ def getBandFromSimilarityScore(similarity_score):
67
+ if similarity_score >= 0.91:
68
+ return 9
69
+ elif similarity_score >= 0.81:
70
+ return 8
71
+ elif similarity_score >= 0.73:
72
+ return 7
73
+ elif similarity_score >= 0.65:
74
+ return 6
75
+ elif similarity_score >= 0.60:
76
+ return 5
77
+ elif similarity_score >= 0.46:
78
+ return 4
79
+ elif similarity_score >= 0.35:
80
+ return 3
81
+ elif similarity_score >= 0.1:
82
+ return 2
83
+ else:
84
+ return 1
85
+
86
+ IELTSband = getBandFromSimilarityScore(similarity_score)
87
 
88
+ return [similarity_score, IELTSband]
89
 
90
 
91
+ iface = gr.Interface(fn=lark, inputs="text", outputs=["text", "text"])
92
  iface.launch()
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  phonemizer
2
  librosa
3
  transformers
4
- torch
 
 
1
  phonemizer
2
  librosa
3
  transformers
4
+ torch
5
+ strsimpy