viktor-enzell committed
Commit 5b95586 • 1 Parent(s): 36d5632

Prettier interface and ability to download transcript.

Files changed (2):
  1. README.md +3 -3
  2. app.py +29 -8
README.md CHANGED

@@ -1,8 +1,8 @@
 ---
 title: Wav2vec2 Large Voxrex Swedish 4gram
-emoji: 📈
-colorFrom: purple
-colorTo: green
+emoji: 🎙️
+colorFrom: orange
+colorTo: black
 sdk: streamlit
 sdk_version: 1.9.0
 app_file: app.py
app.py CHANGED

@@ -1,18 +1,27 @@
 import streamlit as st
 from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
+import ffmpeg
 import torch
 import torchaudio
 import torchaudio.functional as F
 
+
 st.set_page_config(
-    page_title='Swedish Speech-to-Text',
-    page_icon='🎙️'
+    page_title="Swedish Speech-to-Text",
+    page_icon="🎙️"
+)
+st.image(
+    "https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/320/apple/325/studio-microphone_1f399-fe0f.png",
+    width=100,
 )
+st.markdown("""
+# Swedish high-quality transcription
 
+Generate Swedish transcripts for download from an audio file with this high-quality speech-to-text model. The model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
+""")
 
-# Import model and processor
-model_name = 'viktor-enzell/wav2vec2-large-voxrex-swedish-4gram'
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
 processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
 
@@ -28,7 +37,7 @@ def run_inference(file):
     inputs = processor(
         waveform,
         sampling_rate=16_000,
-        return_tensors='pt',
+        return_tensors="pt",
         padding=True
     ).to(device)
 
@@ -38,8 +47,20 @@ def run_inference(file):
     return processor.batch_decode(logits.cpu().numpy()).text[0].lower()
 
 
-uploaded_file = st.file_uploader('Choose a file', type=['.wav'])
+uploaded_file = st.file_uploader("Choose a file", type=[".wav"])
 if uploaded_file is not None:
+    if uploaded_file.type != "audio/wav":
+        pass
+        # TODO: convert to wav
+        # bytes = uploaded_file.getvalue()
+        # audio_input = ffmpeg.input(bytes).audio
+        # audio_output = ffmpeg.output(audio_input, "tmp.wav", format="wav")
+        # ffmpeg.run(audio_output)
+
     transcript = run_inference(uploaded_file)
 
-    st.write(transcript)
+    st.download_button("Download transcript", transcript,
+                       f"{uploaded_file.name}-swedish-transcript.txt")
+
+    with st.expander("Transcript", expanded=True):
+        st.write(transcript)
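
Note: the format conversion is left as a TODO in this commit, and the commented-out lines would not run as written, because ffmpeg-python's `ffmpeg.input()` takes a filename or pipe specifier rather than raw bytes. Below is a minimal sketch of one way the branch could be completed, assuming the `ffmpeg-python` package; `convert_to_wav` is a hypothetical helper, not part of the committed code.

```python
import io

import ffmpeg


def convert_to_wav(uploaded_file):
    """Hypothetical helper: convert an uploaded audio file to 16 kHz mono WAV.

    The upload is piped through ffmpeg's stdin and the WAV result is read
    back from stdout, avoiding temporary files on the Space.
    """
    wav_bytes, _ = (
        ffmpeg
        .input("pipe:0")
        .output("pipe:1", format="wav", acodec="pcm_s16le", ac=1, ar=16_000)
        .run(input=uploaded_file.getvalue(), capture_stdout=True, capture_stderr=True)
    )
    return io.BytesIO(wav_bytes)
```

With such a helper, the `if uploaded_file.type != "audio/wav":` branch could replace `pass` with `uploaded_file = convert_to_wav(uploaded_file)` before `run_inference` is called, provided the installed torchaudio backend accepts file-like objects.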
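
For context, the hunks above show only fragments of `run_inference`; its full body is unchanged by this commit and not shown here. The sketch below is an assumption about how those fragments plausibly fit together, based on the torchaudio imports and the lines that do appear (load and resample to 16 kHz, a no-grad CTC forward pass, then decoding through the 4-gram language model), not the file's actual contents.

```python
def run_inference(file):
    # Assumed preprocessing: load the upload, downmix to mono, and resample
    # to the 16 kHz rate the wav2vec 2.0 model expects.
    audio, sample_rate = torchaudio.load(file)
    waveform = F.resample(audio.mean(dim=0), sample_rate, 16_000).numpy()

    inputs = processor(
        waveform,
        sampling_rate=16_000,
        return_tensors="pt",
        padding=True
    ).to(device)

    # CTC forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # The LM-aware processor decodes the logits with the 4-gram language model.
    return processor.batch_decode(logits.cpu().numpy()).text[0].lower()
```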