salomonsky committed on
Commit
3d57cbf
verified
1 Parent(s): 28d3a0e

Update app.py

Files changed (1)
  app.py +103 -56
app.py CHANGED
@@ -1,59 +1,106 @@
  import streamlit as st
- import sounddevice as sd
- from pydub import AudioSegment
  import speech_recognition as sr

- input_devices = sd.query_devices(kind='input')
- print("Dispositivos de entrada de audio disponibles:")
- for i, device in enumerate(input_devices):
-     print(f"{i + 1}. {device['name']}")
-
- # Select the audio input device
- selected_device_index = int(input("Selecciona el número del dispositivo de entrada de audio: ")) - 1
- selected_device = input_devices[selected_device_index]
-
- # Print the selected device's details
- print(f"\nDispositivo seleccionado: {selected_device['name']}")
- print(f"Índice: {selected_device['index']}")
- print(f"Canales: {selected_device['max_input_channels']}")
- print(f"Frecuencia de muestreo: {selected_device['default_samplerate']} Hz")
-
- # Audio recorder settings
- fs = 44100  # Sampling rate
- duration = 10  # Maximum recording duration in seconds
-
- # Show the start/stop recording button
- if st.button("Iniciar/Parar Grabación"):
-     st.write("Grabando...")
-
-     # Record audio
-     audio_data = sd.rec(int(fs * duration), samplerate=fs, channels=1, dtype="int16")
-     sd.wait()
-
-     st.write("Grabación completada.")
-
-     # Save the recording to a temporary file
-     with st.spinner("Procesando grabación..."):
-         temp_file_path = "grabacion.wav"
-         audio_segment = AudioSegment.from_int(signal=audio_data, sample_width=2, frame_rate=fs, channels=1)
-         audio_segment.export(temp_file_path, format="wav")
-
-     # Play back the recording
-     st.audio(temp_file_path, format="audio/wav", start_time=0)
-
-     # Speech recognition with Google
-     recognizer = sr.Recognizer()
-     audio_file = sr.AudioFile(temp_file_path)
-     with audio_file as source:
-         try:
-             audio_text = recognizer.recognize_google(
-                 source, language="es-ES", show_all=False
-             )
-             st.write("Texto reconocido:", audio_text)
-         except sr.UnknownValueError:
-             st.warning("No se detectó ninguna entrada de audio.")
-         except sr.RequestError as e:
-             st.error(f"Error en la solicitud al servicio de reconocimiento de voz: {e}")
-
-     # Remove the temporary file
-     st.audio_recorder_cleanup()
 
  import streamlit as st
+ import base64
+ import io
+ from huggingface_hub import InferenceClient
+ from gtts import gTTS
+ from audiorecorder import audiorecorder
  import speech_recognition as sr

+ if "history" not in st.session_state:
+     st.session_state.history = []
+
+ def recognize_speech(audio_data, show_messages=True):
+     recognizer = sr.Recognizer()
+     audio_recording = sr.AudioFile(audio_data)
+
+     with audio_recording as source:
+         audio = recognizer.record(source)
+
+     try:
+         audio_text = recognizer.recognize_google(audio, language="es-ES")
+         if show_messages:
+             st.subheader("Texto Reconocido:")
+             st.write(audio_text)
+             st.success("Reconocimiento de voz completado.")
+     except sr.UnknownValueError:
+         st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
+         audio_text = ""
+     except sr.RequestError:
+         st.error("Hablame para comenzar!")
+         audio_text = ""
+
+     return audio_text
+
+ def format_prompt(message, history):
+     prompt = "<s>"
+
+     for user_prompt, bot_response in history:
+         prompt += f"[INST] {user_prompt} [/INST]"
+         prompt += f" {bot_response}</s> "
+
+     prompt += f"[INST] {message} [/INST]"
+     return prompt
+
+ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
+     client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
+
+     temperature = float(temperature) if temperature is not None else 0.9
+     if temperature < 1e-2:
+         temperature = 1e-2
+     top_p = float(top_p)
+
+     generate_kwargs = dict(
+         temperature=temperature,
+         max_new_tokens=max_new_tokens,
+         top_p=top_p,
+         repetition_penalty=repetition_penalty,
+         do_sample=True,
+         seed=42,)
+
+     formatted_prompt = format_prompt(audio_text, history)
+     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=True)
+     response = ""
+
+     for response_token in stream:
+         response += response_token.token.text
+
+     response = ' '.join(response.split()).replace('</s>', '')
+     audio_file = text_to_speech(response, speed=1.3)
+     return response, audio_file
+
+ def text_to_speech(text, speed=1.3):
+     tts = gTTS(text=text, lang='es')
+     audio_fp = io.BytesIO()
+     tts.write_to_fp(audio_fp)
+     audio_fp.seek(0)
+     return audio_fp
+
+ def audio_play(audio_fp):
+     st.audio(audio_fp.read(), format="audio/mp3", start_time=0)
+
+ def display_recognition_result(audio_text, output, audio_file):
+     if audio_text:
+         st.session_state.history.append((audio_text, output))
+
+     if audio_file is not None:
+         st.markdown(
+             f"""<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>""",
+             unsafe_allow_html=True)
+
+ def main():
+     st.title("Chatbot +VAD to TTS")
+     audio_data = audiorecorder("Habla para grabar", "Deteniendo la grabación...")
+     icon_path = "https://www.iconfinder.com/icons/171506/download/png/512"
+     st.image(icon_path, caption='Habla para grabar', use_column_width=True)
+
+     if not audio_data.empty():
+         st.audio(audio_data.export().read(), format="audio/wav")
+         audio_data.export("audio.wav", format="wav")
+         audio_text = recognize_speech("audio.wav")
+
+         if audio_text:
+             output, audio_file = generate(audio_text, history=st.session_state.history)
+             display_recognition_result(audio_text, output, audio_file)
+
+ if __name__ == "__main__":
+     main()
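
For reference, a minimal standalone sketch (not part of the commit) of the prompt layout that format_prompt builds for mistralai/Mixtral-8x7B-Instruct-v0.1; the history and message strings below are made-up examples:

    # Copy of format_prompt from app.py above, exercised with illustrative inputs.
    def format_prompt(message, history):
        prompt = "<s>"
        for user_prompt, bot_response in history:
            prompt += f"[INST] {user_prompt} [/INST]"
            prompt += f" {bot_response}</s> "
        prompt += f"[INST] {message} [/INST]"
        return prompt

    history = [("Hola", "¡Hola! ¿En qué puedo ayudarte?")]
    print(format_prompt("¿Qué hora es?", history))
    # Prints: <s>[INST] Hola [/INST] ¡Hola! ¿En qué puedo ayudarte?</s> [INST] ¿Qué hora es? [/INST]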