Aryan Wadhawan committed on
Commit
5473c42
1 Parent(s): e981df1

Add application file

Browse files
Files changed (1) hide show
  1. app.py +38 -0
app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
3
+ import torch
4
+ import phonemizer
5
+ import librosa
6
+ import base64
7
+
8
+
9
# Checkpoint used for both the feature processor and the CTC model.
_LARK_CHECKPOINT = "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
# Scratch file the decoded upload is written to before librosa reads it back.
_LARK_WAV_PATH = "audio.wav"
# Rate passed to librosa.load and forwarded to the processor.
_LARK_SAMPLE_RATE = 16000
# Filled on first use so the checkpoint is not reloaded on every request.
_LARK_CACHE = {}


def _load_lark_pipeline():
    """Return the ``(processor, model)`` pair, loading it only on first use."""
    if "pipeline" not in _LARK_CACHE:
        _LARK_CACHE["pipeline"] = (
            Wav2Vec2Processor.from_pretrained(_LARK_CHECKPOINT),
            Wav2Vec2ForCTC.from_pretrained(_LARK_CHECKPOINT),
        )
    return _LARK_CACHE["pipeline"]


def lark(audioAsB64):
    """Transcribe base64-encoded audio with the wav2vec2 espeak checkpoint.

    The payload is base64-decoded, written to ``audio.wav``, read back with
    librosa at 16 kHz, and run through the model.

    Args:
        audioAsB64: Base64 text of an audio file that librosa can read.

    Returns:
        The list produced by ``processor.batch_decode`` for the single input.

    Raises:
        ValueError: If ``audioAsB64`` is empty, or if base64 decoding fails
            (``binascii.Error`` is a ``ValueError`` subclass).
    """
    if not audioAsB64:
        raise ValueError("audioAsB64 must be a non-empty base64 string")

    # Decode (not encode) the payload back to raw bytes and persist it so
    # librosa can open it by path.
    with open(_LARK_WAV_PATH, "wb") as wav_file:
        wav_file.write(base64.b64decode(audioAsB64))

    processor, model = _load_lark_pipeline()

    # Load the file that was just written; librosa resamples to the
    # requested rate while loading.
    waveform, sample_rate = librosa.load(_LARK_WAV_PATH, sr=_LARK_SAMPLE_RATE)

    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(input_values).logits

    # Take the highest-scoring token id along the last logits axis.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)
35
+
36
+
37
# Wire `lark` into a text-in / text-out Gradio UI and start serving it.
iface = gr.Interface(lark, "text", "text")
iface.launch()