Spaces:
Sleeping
Sleeping
File size: 3,771 Bytes
3d57cbf 1760204 713e319 3d57cbf 7255a1d d49accf 7255a1d 8842007 84e2e9f 3d57cbf 6f460e4 0173625 6f460e4 713e319 3d57cbf f82511a 713e319 f82511a 3d57cbf 0173625 3d57cbf 0173625 6f460e4 3d57cbf 574c2e1 badb078 713e319 badb078 713e319 ff663fa 713e319 ff663fa 713e319 badb078 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import io
import base64
import numpy as np
import soundfile as sf
from gtts import gTTS
import streamlit as st
import speech_recognition as sr
from huggingface_hub import InferenceClient
from streamlit_mic_recorder import mic_recorder
# Seed the per-session keys only when they are absent, so values already
# stored in the session are left untouched.
# "history" holds (user_text, bot_response) tuples.
if "history" not in st.session_state:
    st.session_state["history"] = []
# "pre_prompt_sent" controls whether format_prompt() prepends the pre-prompt.
if "pre_prompt_sent" not in st.session_state:
    st.session_state["pre_prompt_sent"] = False

# Instruction text that format_prompt() places ahead of the conversation.
pre_prompt_text = "eres una IA conductual, tus respuestas serán breves."
def recognize_speech(audio_data, sample_rate, show_messages=True):
    """Transcribe recorded Spanish speech with the Google web recognizer.

    Args:
        audio_data: 16-bit PCM samples (assumed mono), given either as a
            NumPy array or as raw bytes.
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        show_messages: When true, render the transcript and a success
            notice on the Streamlit page.

    Returns:
        The recognized text, or ``""`` when the audio could not be
        understood or the recognition request failed.
    """
    recognizer = sr.Recognizer()

    # recognize_google() expects an sr.AudioData object rather than a bare
    # sample array, and soundfile offers no resample() function. AudioData
    # carries the sample rate, so no manual resampling is needed here.
    if isinstance(audio_data, (bytes, bytearray, memoryview)):
        raw_pcm = bytes(audio_data)
    else:
        raw_pcm = np.asarray(audio_data, dtype=np.int16).tobytes()
    audio = sr.AudioData(raw_pcm, int(sample_rate), 2)  # 2 bytes per int16 sample

    # Keep the try body down to the single call that raises the sr errors.
    try:
        audio_text = recognizer.recognize_google(audio, language="es-ES")
    except sr.UnknownValueError:
        st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
        return ""
    except sr.RequestError:
        st.error("Hablame para comenzar!")
        return ""

    if show_messages:
        st.subheader("Texto Reconocido:")
        st.write(audio_text)
        st.success("Reconocimiento de voz completado.")
    return audio_text
def format_prompt(message, history):
    """Assemble the ``[INST]``-tagged prompt string for the next model call.

    Args:
        message: The new user message, placed last in the prompt.
        history: Iterable of ``(user_prompt, bot_response)`` pairs that are
            replayed, in order, before ``message``.

    Returns:
        The prompt string. It starts with ``<s>`` and includes the
        module-level pre-prompt only while
        ``st.session_state.pre_prompt_sent`` is false.
    """
    pieces = ["<s>"]
    if not st.session_state.pre_prompt_sent:
        pieces.append(f"[INST]{pre_prompt_text}[/INST]")
    # One "[INST] user [/INST] answer</s> " segment per past exchange.
    pieces.extend(
        f"[INST] {user_prompt} [/INST]" f" {bot_response}</s> "
        for user_prompt, bot_response in history
    )
    pieces.append(f"[INST] {message} [/INST]")
    return "".join(pieces)
def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
    """Ask the hosted Mixtral model for a reply and synthesize it as speech.

    Args:
        audio_text: The user's message (the recognized speech).
        history: Past ``(user_prompt, bot_response)`` pairs, passed on to
            ``format_prompt``.
        temperature: Sampling temperature; ``None`` selects 0.9. Values
            below 0.01 are raised to 0.01.
        max_new_tokens: Upper bound on generated tokens.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Repetition penalty forwarded to the model.

    Returns:
        A ``(response, audio_file)`` tuple. ``response`` is the generated
        text with whitespace runs collapsed and ``</s>`` removed.
        ``audio_file`` is the stream returned by ``text_to_speech``, or
        ``None`` when the response contains nothing to speak.
    """
    client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

    temperature = 0.9 if temperature is None else float(temperature)
    temperature = max(temperature, 1e-2)  # clamp to a small positive floor
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(audio_text, history)
    stream = client.text_generation(
        formatted_prompt,
        **generate_kwargs,
        stream=True,
        details=True,
        return_full_text=True,
    )

    # Collect the streamed tokens with one join instead of repeated +=.
    response = "".join(chunk.token.text for chunk in stream)
    response = ' '.join(response.split()).replace('</s>', '')

    # gTTS rejects empty text, so skip synthesis when there is nothing to
    # say. display_recognition_result() already treats a None audio file
    # as "no audio".
    audio_file = text_to_speech(response, speed=1.3) if response.strip() else None
    return response, audio_file
def text_to_speech(text, speed=1.3):
    """Synthesize ``text`` as Spanish speech into an in-memory stream.

    Args:
        text: The text to speak.
        speed: Accepted for interface compatibility; this implementation
            does not use it.

    Returns:
        An ``io.BytesIO`` holding the gTTS output, rewound to the start.
    """
    buffer = io.BytesIO()
    gTTS(text=text, lang='es').write_to_fp(buffer)
    buffer.seek(0)  # rewind so callers can read from the beginning
    return buffer
def audio_play(audio_fp):
    """Show a Streamlit audio player for the unread contents of ``audio_fp``.

    Args:
        audio_fp: A readable binary stream; its remaining bytes are passed
            to the player with the ``audio/mp3`` format label.
    """
    payload = audio_fp.read()
    st.audio(payload, format="audio/mp3", start_time=0)
def display_recognition_result(audio_text, output, audio_file):
    """Record the exchange in the session history and autoplay the reply.

    Args:
        audio_text: The user's recognized text; when non-empty, the pair
            ``(audio_text, output)`` is appended to
            ``st.session_state.history``.
        output: The model's reply text.
        audio_file: Readable binary stream with the spoken reply, or
            ``None`` to skip the audio player.
    """
    if audio_text:
        st.session_state.history.append((audio_text, output))

    if audio_file is None:
        return

    # Embed the audio as a base64 data URI so the browser can autoplay it.
    encoded_audio = base64.b64encode(audio_file.read()).decode()
    player_html = f"""<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{encoded_audio}" type="audio/mp3" id="audio_player"></audio>"""
    st.markdown(player_html, unsafe_allow_html=True)
def main():
    """Run one pass of the page: record audio, transcribe it, log the text.

    Marks the pre-prompt as sent, shows the microphone recorder and, once a
    recording is available, plays it back and passes it to
    ``recognize_speech``. A non-empty transcript is appended to
    ``st.session_state.history`` paired with an empty response.
    """
    # NOTE(review): the flag is set before any prompt is formatted, and this
    # function does not call generate() or display_recognition_result() --
    # confirm whether that is intended.
    if not st.session_state.pre_prompt_sent:
        st.session_state.pre_prompt_sent = True

    recording = mic_recorder(start_prompt="▶️", stop_prompt="🛑", key='recorder')
    if not recording:
        return

    st.audio(recording['bytes'], format="audio/wav")
    # Reinterpret the recorder's byte payload as 16-bit samples.
    samples = np.frombuffer(recording["bytes"], dtype=np.int16)
    transcript = recognize_speech(samples, recording["sample_rate"])
    if transcript:
        st.session_state.history.append((transcript, ""))
# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()