salomonsky committed on
Commit
5bbd161
verified
1 Parent(s): 1c2b820

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -60
app.py CHANGED
@@ -1,32 +1,28 @@
 
1
  import base64
2
  import io
3
  from huggingface_hub import InferenceClient
4
  from gtts import gTTS
5
- from pydub import AudioSegment
6
- from pydub.playback import play
7
- from streamlit_webrtc import webrtc_streamer, AudioProcessorBase
8
- import cv2
9
- import numpy as np
10
  import speech_recognition as sr
11
- import subprocess
12
- import os
13
- os.environ['STREAMLIT_SERVER_RUN_ON_SAVE'] = 'false'
14
- import streamlit as st
15
 
16
  if "history" not in st.session_state:
17
  st.session_state.history = []
18
 
19
- recognizer = sr.Recognizer()
 
 
20
 
21
- # Reconociendo voz en tiempo real
22
- def recognize_speech_with_vad(audio_data, show_messages=True):
23
- try:
24
- audio_text = recognizer.recognize_google(audio_data, language="es-ES")
25
 
 
 
26
  if show_messages:
27
  st.subheader("Texto Reconocido:")
28
  st.write(audio_text)
29
-
30
  except sr.UnknownValueError:
31
  st.warning("No se pudo reconocer el audio. 驴Intentaste grabar algo?")
32
  audio_text = ""
@@ -36,35 +32,6 @@ def recognize_speech_with_vad(audio_data, show_messages=True):
36
 
37
  return audio_text
38
 
39
- # Procesador de voice activity detection con streamlit_webrtc
40
- class VADProcessor(AudioProcessorBase):
41
- def __init__(self):
42
- self.buffer = np.zeros((0,))
43
- self.vad_active = True
44
-
45
- def recv(self, audio_data):
46
- if self.vad_active:
47
- audio_array = np.frombuffer(audio_data, dtype=np.int16)
48
- self.buffer = np.concatenate((self.buffer, audio_array), axis=None)
49
-
50
- if len(self.buffer) >= 44100 * 5: # 5 seconds of audio
51
- st.audio(self.buffer, format="audio/wav")
52
- audio_text = recognize_speech_with_vad(self.buffer)
53
-
54
- if audio_text:
55
- st.success("Frase detectada. Procesando audio...")
56
- output, audio_file = generate(audio_text, history=st.session_state.history)
57
-
58
- if audio_file is not None:
59
- play(audio_file)
60
-
61
- # Desactiva el VAD despu茅s de detectar una frase
62
- self.vad_active = False
63
-
64
- self.buffer = np.zeros((0,))
65
-
66
-
67
- # Preparando entrada para el modelo de lenguaje
68
  def format_prompt(message, history):
69
  prompt = "<s>"
70
 
@@ -75,7 +42,6 @@ def format_prompt(message, history):
75
  prompt += f"[INST] {message} [/INST]"
76
  return prompt
77
 
78
- # Generando respuesta en texto
79
  def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
80
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
81
 
@@ -104,7 +70,6 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
104
  audio_file = text_to_speech(response, speed=1.3)
105
  return response, audio_file
106
 
107
- # Texto a voz
108
  def text_to_speech(text, speed=1.3):
109
  tts = gTTS(text=text, lang='es')
110
  audio_fp = io.BytesIO()
@@ -117,22 +82,28 @@ def text_to_speech(text, speed=1.3):
117
  modified_audio_fp.seek(0)
118
  return modified_audio_fp
119
 
120
- # Reproductor de texto a voz
121
- def audio_player_markup(audio_file):
122
- return f"""
123
- <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
124
- """
125
-
126
- # Interfaz de usuario con streamlit_webrtc
127
  def main():
128
  st.title("Chatbot de Voz a Voz")
129
-
130
- webrtc_ctx = webrtc_streamer(
131
- key="vad",
132
- audio_processor_factory=VADProcessor,
133
- async_processing=True,
134
- media_stream_constraints={"video": False, "audio": True},
135
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  if __name__ == "__main__":
138
  main()
 
1
+ import streamlit as st
2
  import base64
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
6
+ from audiorecorder import audiorecorder
 
 
 
 
7
  import speech_recognition as sr
8
+ from pydub import AudioSegment
 
 
 
9
 
10
  if "history" not in st.session_state:
11
  st.session_state.history = []
12
 
13
+ def recognize_speech(audio_data, show_messages=True):
14
+ recognizer = sr.Recognizer()
15
+ audio_recording = sr.AudioFile(audio_data)
16
 
17
+ with audio_recording as source:
18
+ audio = recognizer.record(source)
 
 
19
 
20
+ try:
21
+ audio_text = recognizer.recognize_google(audio, language="es-ES")
22
  if show_messages:
23
  st.subheader("Texto Reconocido:")
24
  st.write(audio_text)
25
+ st.success("Reconocimiento de voz completado.")
26
  except sr.UnknownValueError:
27
  st.warning("No se pudo reconocer el audio. 驴Intentaste grabar algo?")
28
  audio_text = ""
 
32
 
33
  return audio_text
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def format_prompt(message, history):
36
  prompt = "<s>"
37
 
 
42
  prompt += f"[INST] {message} [/INST]"
43
  return prompt
44
 
 
45
  def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
46
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
47
 
 
70
  audio_file = text_to_speech(response, speed=1.3)
71
  return response, audio_file
72
 
 
73
  def text_to_speech(text, speed=1.3):
74
  tts = gTTS(text=text, lang='es')
75
  audio_fp = io.BytesIO()
 
82
  modified_audio_fp.seek(0)
83
  return modified_audio_fp
84
 
 
 
 
 
 
 
 
85
def main():
    """Streamlit entry point for the voice-to-voice chatbot.

    Flow: record microphone audio with `audiorecorder`, transcribe it via
    `recognize_speech`, feed the text to `generate` (LLM + TTS), append the
    exchange to the session history, and autoplay the synthesized reply.

    Side effects: writes the recording to "audio.wav" in the working
    directory and mutates `st.session_state.history`.
    """
    st.title("Chatbot de Voz a Voz")
    # NOTE: label fixed from mojibake "grabaci贸n" (UTF-8 read as GBK) to "grabación".
    audio_data = audiorecorder("Habla para grabar", "Deteniendo la grabación...")

    if not audio_data.empty():
        # Let the user review the recording, then persist it for the recognizer,
        # which expects a file path (sr.AudioFile).
        st.audio(audio_data.export().read(), format="audio/wav")
        audio_data.export("audio.wav", format="wav")
        audio_text = recognize_speech("audio.wav")

        if audio_text:
            output, audio_file = generate(audio_text, history=st.session_state.history)
            # Bug fix: the original re-tested `if audio_text:` here, inside a
            # branch already guarded by the same condition — removed as redundant.
            st.session_state.history.append((audio_text, output))

            if audio_file is not None:
                # Embed the MP3 as a base64 data URI so the browser autoplays it;
                # st.audio has no autoplay, hence raw HTML with unsafe_allow_html.
                st.markdown(
                    f"""
                    <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
                    """,
                    unsafe_allow_html=True
                )

if __name__ == "__main__":
    main()