Spaces:

salomonsky
/

xaman2

Sleeping

App Files Community

salomonsky committed on Jan 20, 2024

Commit

5bbd161

verified ·

1 Parent(s): 1c2b820

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -60

app.py CHANGED Viewed

@@ -1,32 +1,28 @@
 import base64
 import io
 from huggingface_hub import InferenceClient
 from gtts import gTTS
-from pydub import AudioSegment
-from pydub.playback import play
-from streamlit_webrtc import webrtc_streamer, AudioProcessorBase
-import cv2
-import numpy as np
 import speech_recognition as sr
-import subprocess
-import os
-os.environ['STREAMLIT_SERVER_RUN_ON_SAVE'] = 'false'
-import streamlit as st
 if "history" not in st.session_state:
     st.session_state.history = []
-recognizer = sr.Recognizer()
-# Reconociendo voz en tiempo real
-def recognize_speech_with_vad(audio_data, show_messages=True):
-    try:
-        audio_text = recognizer.recognize_google(audio_data, language="es-ES")
         if show_messages:
             st.subheader("Texto Reconocido:")
             st.write(audio_text)
     except sr.UnknownValueError:
         st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
         audio_text = ""
@@ -36,35 +32,6 @@ def recognize_speech_with_vad(audio_data, show_messages=True):
     return audio_text
-# Procesador de voice activity detection con streamlit_webrtc
-class VADProcessor(AudioProcessorBase):
-    def __init__(self):
-        self.buffer = np.zeros((0,))
-        self.vad_active = True
-    def recv(self, audio_data):
-        if self.vad_active:
-            audio_array = np.frombuffer(audio_data, dtype=np.int16)
-            self.buffer = np.concatenate((self.buffer, audio_array), axis=None)
-            if len(self.buffer) >= 44100 * 5:  # 5 seconds of audio
-                st.audio(self.buffer, format="audio/wav")
-                audio_text = recognize_speech_with_vad(self.buffer)
-                if audio_text:
-                    st.success("Frase detectada. Procesando audio...")
-                    output, audio_file = generate(audio_text, history=st.session_state.history)
-                    if audio_file is not None:
-                        play(audio_file)
-                    # Desactiva el VAD después de detectar una frase
-                    self.vad_active = False
-                self.buffer = np.zeros((0,))
-# Preparando entrada para el modelo de lenguaje
 def format_prompt(message, history):
     prompt = "<s>"
@@ -75,7 +42,6 @@ def format_prompt(message, history):
     prompt += f"[INST] {message} [/INST]"
     return prompt
-# Generando respuesta en texto
 def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
     client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
@@ -104,7 +70,6 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
     audio_file = text_to_speech(response, speed=1.3)
     return response, audio_file
-# Texto a voz
 def text_to_speech(text, speed=1.3):
     tts = gTTS(text=text, lang='es')
     audio_fp = io.BytesIO()
@@ -117,22 +82,28 @@ def text_to_speech(text, speed=1.3):
     modified_audio_fp.seek(0)
     return modified_audio_fp
-# Reproductor de texto a voz
-def audio_player_markup(audio_file):
-    return f"""
-        <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
-    """
-# Interfaz de usuario con streamlit_webrtc
 def main():
     st.title("Chatbot de Voz a Voz")
-    webrtc_ctx = webrtc_streamer(
-        key="vad",
-        audio_processor_factory=VADProcessor,
-        async_processing=True,
-        media_stream_constraints={"video": False, "audio": True},
-    )
 if __name__ == "__main__":
     main()

+import streamlit as st
 import base64
 import io
 from huggingface_hub import InferenceClient
 from gtts import gTTS
+from audiorecorder import audiorecorder
 import speech_recognition as sr
+from pydub import AudioSegment
 if "history" not in st.session_state:
     st.session_state.history = []
+def recognize_speech(audio_data, show_messages=True):
+    recognizer = sr.Recognizer()
+    audio_recording = sr.AudioFile(audio_data)
+    with audio_recording as source:
+        audio = recognizer.record(source)
+    try:
+        audio_text = recognizer.recognize_google(audio, language="es-ES")
         if show_messages:
             st.subheader("Texto Reconocido:")
             st.write(audio_text)
+            st.success("Reconocimiento de voz completado.")
     except sr.UnknownValueError:
         st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
         audio_text = ""
     return audio_text
 def format_prompt(message, history):
     prompt = "<s>"
     prompt += f"[INST] {message} [/INST]"
     return prompt
 def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
     client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
     audio_file = text_to_speech(response, speed=1.3)
     return response, audio_file
 def text_to_speech(text, speed=1.3):
     tts = gTTS(text=text, lang='es')
     audio_fp = io.BytesIO()
     modified_audio_fp.seek(0)
     return modified_audio_fp
 def main():
     st.title("Chatbot de Voz a Voz")
+    audio_data = audiorecorder("Habla para grabar", "Deteniendo la grabación...")
+    if not audio_data.empty():
+        st.audio(audio_data.export().read(), format="audio/wav")
+        audio_data.export("audio.wav", format="wav")
+        audio_text = recognize_speech("audio.wav")
+        if audio_text:
+            output, audio_file = generate(audio_text, history=st.session_state.history)
+            if audio_text:
+                st.session_state.history.append((audio_text, output))
+            if audio_file is not None:
+                st.markdown(
+                    f"""
+                    <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
+                    """,
+                    unsafe_allow_html=True
+                )
 if __name__ == "__main__":
     main()