viktor-enzell committed on
Commit
38229d4
1 Parent(s): 3e60481

Basic speech-to-text interface.

Browse files
Files changed (2) hide show
  1. app.py +45 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
3
+ import torch
4
+ import torchaudio
5
+ import torchaudio.functional as F
6
+
7
# Page metadata shown in the browser tab.
st.set_page_config(page_title='Swedish Speech-to-Text', page_icon='🎙️')


# Load the pretrained CTC model and its language-model-backed processor.
# NOTE(review): Streamlit presumably re-executes this script on every user
# interaction, which would repeat these loads -- consider caching them once
# the targeted Streamlit version is confirmed.
model_name = 'viktor-enzell/wav2vec2-large-voxrex-swedish-4gram'

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = Wav2Vec2ForCTC.from_pretrained(model_name)
model = model.to(device)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
18
+
19
+
20
def run_inference(file):
    """Transcribe an audio file and return the lowercased text.

    The audio is loaded with ``torchaudio.load``, resampled to 16 kHz when
    its sample rate differs, and reduced to the entry at index 0 of the
    loaded tensor (presumably the first channel -- confirm against the
    torchaudio layout). The module-level ``processor`` and ``model`` then
    produce logits, which ``processor.batch_decode`` turns into text.

    Args:
        file: Anything accepted by ``torchaudio.load`` (the caller passes
            the object returned by ``st.file_uploader``).

    Returns:
        The first decoded transcript, converted to lower case.
    """
    target_rate = 16_000

    audio, source_rate = torchaudio.load(file)
    if source_rate != target_rate:
        audio = F.resample(audio, source_rate, target_rate)
    first_channel = audio[0]

    features = processor(
        first_channel,
        sampling_rate=target_rate,
        return_tensors='pt',
        padding=True,
    )
    features = features.to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**features)

    # The decoder is fed a NumPy array, so move the logits to the CPU first.
    decoded = processor.batch_decode(output.logits.cpu().numpy())
    return decoded.text[0].lower()
39
+
40
+
41
# Let the user pick a .wav file; the widget returns None until one is chosen.
uploaded_file = st.file_uploader('Choose a file', type=['.wav'])

# Once a file is available, transcribe it and render the text on the page.
if uploaded_file is not None:
    transcript = run_inference(uploaded_file)
    st.write(transcript)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch==1.10.1
2
+ torchaudio==0.10.1