salomonsky commited on
Commit
d6b9b98
·
verified ·
1 Parent(s): 32cf407

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -27
app.py CHANGED
@@ -1,12 +1,16 @@
1
  import io
2
  import base64
3
- import numpy as np
4
- import soundfile as sf
5
  from gtts import gTTS
6
  import streamlit as st
7
  import speech_recognition as sr
8
  from huggingface_hub import InferenceClient
9
  from streamlit_mic_recorder import mic_recorder
 
 
 
 
 
 
10
 
11
  if "history" not in st.session_state:
12
  st.session_state.history = []
@@ -14,24 +18,22 @@ if "history" not in st.session_state:
14
  if "pre_prompt_sent" not in st.session_state:
15
  st.session_state.pre_prompt_sent = False
16
 
17
- pre_prompt_text = "eres una IA conductual, tus respuestas serán breves."
18
-
19
- def recognize_speech(audio_data, sample_rate, show_messages=True):
20
  recognizer = sr.Recognizer()
21
 
22
- try:
23
- adjusted_audio_data = sf.resample(audio_data, sample_rate, 16000, subtype='PCM_16')
24
- audio_text = recognizer.recognize_google(adjusted_audio_data, language="es-ES")
25
- if show_messages:
26
- st.subheader("Texto Reconocido:")
27
- st.write(audio_text)
28
- st.success("Reconocimiento de voz completado.")
29
- except sr.UnknownValueError:
30
- st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
31
- audio_text = ""
32
- except sr.RequestError:
33
- st.error("Hablame para comenzar!")
34
- audio_text = ""
35
 
36
  return audio_text
37
 
@@ -100,15 +102,19 @@ def main():
100
 
101
  audio = mic_recorder(start_prompt="▶️", stop_prompt="🛑", key='recorder')
102
 
103
- if audio:
104
- st.audio(audio['bytes'], format="audio/wav")
105
- audio_bytes = np.frombuffer(audio["bytes"], dtype=np.int16)
106
- sample_rate = audio["sample_rate"]
107
-
108
- audio_text = recognize_speech(audio_bytes, sample_rate)
109
-
110
- if audio_text:
111
- st.session_state.history.append((audio_text, ""))
 
 
 
 
112
 
113
  if __name__ == "__main__":
114
  main()
 
1
  import io
2
  import base64
 
 
3
  from gtts import gTTS
4
  import streamlit as st
5
  import speech_recognition as sr
6
  from huggingface_hub import InferenceClient
7
  from streamlit_mic_recorder import mic_recorder
8
+ import wave
9
+ import numpy as np
10
+ import os
11
+
12
+ pre_prompt_text = "eres una IA conductual, tus respuestas serán breves."
13
+ temp_audio_file_path = "./output.wav"
14
 
15
  if "history" not in st.session_state:
16
  st.session_state.history = []
 
18
  if "pre_prompt_sent" not in st.session_state:
19
  st.session_state.pre_prompt_sent = False
20
 
21
def recognize_speech(audio_data, show_messages=True):
    """Transcribe recorded audio to Spanish text via Google Speech Recognition.

    Parameters
    ----------
    audio_data : bytes
        The audio payload as raw bytes; assumed to be a complete WAV file
        (RIFF header included) -- TODO confirm against the caller, which
        writes the mic bytes out through the ``wave`` module.
    show_messages : bool
        When True, render the recognized text and status messages in the
        Streamlit UI.

    Returns
    -------
    str
        The recognized text, or ``""`` when recognition fails.
    """
    recognizer = sr.Recognizer()
    audio_text = ""

    with io.BytesIO(audio_data) as audio_file:
        try:
            # BUG FIX: recognize_google() requires an sr.AudioData instance,
            # not a raw file object -- decode the WAV stream through
            # sr.AudioFile and record() first.
            with sr.AudioFile(audio_file) as source:
                captured = recognizer.record(source)
            audio_text = recognizer.recognize_google(captured, language="es-ES")
            if show_messages:
                st.subheader("Texto Reconocido:")
                st.write(audio_text)
                st.success("Reconocimiento de voz completado.")
        except sr.UnknownValueError:
            st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
        except sr.RequestError:
            st.error("Hablame para comenzar!")

    return audio_text
39
 
 
102
 
103
  audio = mic_recorder(start_prompt="▶️", stop_prompt="🛑", key='recorder')
104
 
105
+ if audio:
106
+ st.audio(audio['bytes'])
107
+
108
+ audio_bytes = audio["bytes"]
109
+ sample_width = audio["sample_width"] # 2 bytes per sample for 16-bit PCM
110
+ sample_rate = audio["sample_rate"] # 44.1 kHz sample rate
111
+ num_channels = 1 # 1 channel for mono, 2 for stereo
112
+
113
+ with wave.open(temp_audio_file_path, 'w') as wave_file:
114
+ wave_file.setnchannels(num_channels)
115
+ wave_file.setsampwidth(sample_width)
116
+ wave_file.setframerate(sample_rate)
117
+ wave_file.writeframes(audio_bytes)
118
 
119
  if __name__ == "__main__":
120
  main()