roman committed on
Commit
62e68d5
1 Parent(s): 8dd0371

update requirements.txt, add whisper small ukr

Files changed (3)
  1. app.py +39 -11
  2. app2.py +41 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,22 +1,39 @@
 import streamlit as st
-import whisper
+from transformers import AutoModelForSpeechSeq2Seq, Wav2Vec2Processor
+import torch
 import tempfile
 from pydub import AudioSegment
+import numpy as np
 
 # Define available models
-available_models = ["tiny", "base", "small", "medium", "large"]
+# available_models = [
+#     "facebook/s2t-small-mustc-en-fr-st",
+#     "facebook/s2t-medium-mustc-en-fr-st",
+#     "facebook/s2t-large-mustc-en-fr-st"
+# ]
 
-st.title("Voice Recognition App")
+available_models = ["Yehor/whisper-small-ukrainian"]
 
-st.write("Upload an audio file and choose a Whisper model to transcribe it to text.")
+st.title("Voice Recognition App using SpeechSeq2Seq")
+
+st.write("Upload an audio file and choose a model to transcribe it to text.")
 
 # Model selection dropdown
-model_choice = st.selectbox("Choose a Whisper model", available_models)
+model_choice = st.selectbox("Choose a SpeechSeq2Seq model", available_models)
+
+
+# Load the selected model and processor
+@st.cache_resource
+def load_model_and_processor(model_name):
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
+    processor = Wav2Vec2Processor.from_pretrained(model_name)
+    return model, processor
+
 
-# Load the selected Whisper model
 st.write(f"Loading {model_choice} model...")
-model = whisper.load_model(model_choice)
+model, processor = load_model_and_processor(model_choice)
 st.write(f"{model_choice} model loaded successfully.")
+
 # File uploader for audio file
 uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])
 
@@ -26,7 +43,7 @@ if uploaded_file is not None:
         temp_file.write(uploaded_file.read())
         temp_file_path = temp_file.name
 
-    # Convert audio file to a format supported by Whisper (if necessary)
+    # Convert audio file to a format supported by the processor (if necessary)
     audio = AudioSegment.from_file(temp_file_path)
     temp_wav_path = tempfile.mktemp(suffix=".wav")
     audio.export(temp_wav_path, format="wav")
@@ -35,7 +52,18 @@ if uploaded_file is not None:
 
     st.write("Transcribing audio...")
 
-    # Transcribe audio using Whisper model
-    result = model.transcribe(temp_wav_path)
+    # Load audio
+    audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
+    audio_input = np.array(audio_input.get_array_of_samples())
+
+    # Process the audio
+    input_features = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
+
+    # Generate transcription
+    with torch.no_grad():
+        predicted_ids = model.generate(input_features)
+
+    transcription = processor.batch_decode(predicted_ids)[0]
+
     st.write("Transcription:")
-    st.write(result["text"])
+    st.write(transcription)
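A caveat on the new transcription path: `Yehor/whisper-small-ukrainian` is a Whisper checkpoint, yet the code loads its processor with `Wav2Vec2Processor` and reads `.input_values`. Whisper processors expose log-mel spectrograms under `.input_features`, and `pydub` yields raw int16 samples that Whisper feature extractors expect scaled to float32. A minimal sketch of how the Whisper-style path could look, assuming the checkpoint ships a standard Whisper processor config (the `audio.wav` filename is illustrative):

```python
import numpy as np
import torch
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

model_name = "Yehor/whisper-small-ukrainian"
processor = AutoProcessor.from_pretrained(model_name)  # resolves to a WhisperProcessor
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)

# Resample to 16 kHz mono and scale int16 samples to float32 in [-1, 1]
audio = AudioSegment.from_file("audio.wav").set_frame_rate(16000).set_channels(1)
samples = np.array(audio.get_array_of_samples()).astype(np.float32) / 32768.0

# Whisper feature extractors return log-mel spectrograms as .input_features
inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    predicted_ids = model.generate(inputs.input_features)

transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(transcription)
```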
app2.py ADDED
@@ -0,0 +1,41 @@
+import streamlit as st
+import whisper
+import tempfile
+from pydub import AudioSegment
+
+# Define available models
+available_models = ["tiny", "base", "small", "medium", "large"]
+
+st.title("Voice Recognition App")
+
+st.write("Upload an audio file and choose a Whisper model to transcribe it to text.")
+
+# Model selection dropdown
+model_choice = st.selectbox("Choose a Whisper model", available_models)
+
+# Load the selected Whisper model
+st.write(f"Loading {model_choice} model...")
+model = whisper.load_model(model_choice)
+st.write(f"{model_choice} model loaded successfully.")
+# File uploader for audio file
+uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])
+
+if uploaded_file is not None:
+    # Save the uploaded file temporarily
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        temp_file.write(uploaded_file.read())
+        temp_file_path = temp_file.name
+
+    # Convert audio file to a format supported by Whisper (if necessary)
+    audio = AudioSegment.from_file(temp_file_path)
+    temp_wav_path = tempfile.mktemp(suffix=".wav")
+    audio.export(temp_wav_path, format="wav")
+
+    st.audio(uploaded_file, format="audio/wav")
+
+    st.write("Transcribing audio...")
+
+    # Transcribe audio using Whisper model
+    result = model.transcribe(temp_wav_path)
+    st.write("Transcription:")
+    st.write(result["text"])
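app2.py preserves the original openai-whisper flow unchanged. If it is kept as a fallback, two details may be worth noting: `model.transcribe` accepts decoding options such as a `language` hint (relevant here for Ukrainian), and `tempfile.mktemp` is deprecated in favor of `NamedTemporaryFile`. A sketch of just the temp-file and transcription steps, assuming the multilingual `small` checkpoint:

```python
import tempfile
import whisper

model = whisper.load_model("small")  # multilingual checkpoint, covers Ukrainian

# NamedTemporaryFile replaces the deprecated, race-prone tempfile.mktemp
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
    temp_wav_path = tmp.name
# ... export the uploaded audio to temp_wav_path, as app2.py does ...

# language="uk" skips auto-detection; fp16=False silences the FP16 warning on CPU
result = model.transcribe(temp_wav_path, language="uk", fp16=False)
print(result["text"])
```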
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 streamlit
 transformers
 pydub
-openai-whisper
+openai-whisper
+torch
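One practical note on the dependency list: `pydub` (and openai-whisper's own audio loading) shells out to the `ffmpeg` binary, which pip does not install; on a Hugging Face Space that typically means listing `ffmpeg` in a `packages.txt` file next to `requirements.txt`. The packages here are also unpinned, so pinning versions would make the Space more reproducible.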