import gradio as gr
from asr import transcribe_auto
from huggingface_hub import InferenceClient
from ttsmms import download, TTS
from langdetect import detect
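# transcribe_auto is presumably provided by this Space's local asr.py module;
# ttsmms and langdetect supply Meta MMS text-to-speech and lightweight language detection.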
# Initialize text generation client
client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")
# Download and load TTS models for Swahili and English
swahili_dir = download("swh", "./data/swahili")
english_dir = download("eng", "./data/english") # Ensure an English TTS model is available
swahili_tts = TTS(swahili_dir)
english_tts = TTS(english_dir)
def is_uncertain(question, response):
    """Heuristically flag unreliable answers: too short, echoing the question, or hedging."""
    if len(response.split()) < 4 or response.lower() in question.lower():
        return True
    # Hedging phrases that signal low confidence (Swahili: "According to research",
    # "It is believed that", "I think"; plus English equivalents)
    uncertain_phrases = ["Kulingana na utafiti", "Inaaminika kuwa", "Ninadhani",
                         "It is believed that", "Some people say"]
    return any(phrase.lower() in response.lower() for phrase in uncertain_phrases)
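# Illustrative behaviour (hypothetical strings): a reply like "Ninadhani ni Dodoma"
# is flagged twice over, for being under four words and for the hedge "Ninadhani".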
def generate_text(prompt):
    """Generate a response from the text generation model."""
    messages = [{"role": "user", "content": prompt}]
    response = ""
    for message in client.chat_completion(messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95):
        token = message.choices[0].delta.content or ""  # delta.content can be None on some stream chunks
        response += token
    if is_uncertain(prompt, response):
        return "AI is uncertain about the response."
    return response
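# Non-streaming alternative (a sketch against the same huggingface_hub client;
# the response shape follows the chat_completion API):
#   out = client.chat_completion(messages, max_tokens=512, temperature=0.7, top_p=0.95)
#   response = out.choices[0].message.content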
# Detect the language of the text and synthesize speech with the matching TTS model
def text_to_speech(text):
    try:
        lang = detect(text)  # langdetect raises on empty or whitespace-only input
    except Exception:
        lang = "en"  # fall back to English when detection fails
    wav_path = "./output.wav"
    if lang == "sw":  # Swahili
        swahili_tts.synthesis(text, wav_path=wav_path)
    else:  # Default to English for any other detected language
        english_tts.synthesis(text, wav_path=wav_path)
    return wav_path
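# Note: langdetect returns ISO 639-1 codes ("sw" for Swahili), while the MMS models
# above are fetched by ISO 639-3 codes ("swh", "eng"), so the mapping here is manual.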
def process_audio(audio):
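    """Full pipeline: ASR transcription -> LLM response -> TTS audio."""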
# Step 1: Transcribe the audio
transcription = transcribe_auto(audio)
# Step 2: Generate text based on the transcription
generated_text = generate_text(transcription)
# Step 3: Convert the generated text to speech
speech = text_to_speech(generated_text)
return transcription, generated_text, speech
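# Illustrative call (hypothetical input file): process_audio("clip.wav") returns
# (transcription, generated_text, "./output.wav"), the three outputs wired up below.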
# Gradio Interface
with gr.Blocks() as demo:
gr.Markdown("<p align='center' style='font-size: 20px;'>End-to-End ASR, Text Generation, and TTS</p>")
gr.HTML("<center>Upload or record audio. The model will transcribe, generate a response, and read it out.</center>")
audio_input = gr.Audio(label="Input Audio", type="filepath")
text_output = gr.Textbox(label="Transcription")
generated_text_output = gr.Textbox(label="Generated Text")
audio_output = gr.Audio(label="Output Speech")
submit_btn = gr.Button("Submit")
submit_btn.click(
fn=process_audio,
inputs=audio_input,
outputs=[text_output, generated_text_output, audio_output]
)
if __name__ == "__main__":
demo.launch()