# --- Hugging Face Space page residue (not code; preserved as comments) ---
# Spaces: Sleeping
# File size: 2,845 Bytes
# Commits: 4cc3c9c d7c7caa 4cc3c9c 5e021b3 4cc3c9c
import gradio as gr
from asr import transcribe_auto
from huggingface_hub import InferenceClient
from ttsmms import download, TTS
from langdetect import detect
# Initialize text generation client
# NOTE: performed at import time; requires network access to the HF Inference API.
client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")
# Download and load TTS models for Swahili and English
# NOTE: download() fetches MMS-TTS checkpoints on first run and caches them under ./data.
swahili_dir = download("swh", "./data/swahili")
english_dir = download("eng", "./data/english") # Ensure an English TTS model is available
swahili_tts = TTS(swahili_dir)
english_tts = TTS(english_dir)
def is_uncertain(question, response):
    """Heuristically decide whether *response* looks unreliable.

    A response is flagged when it is very short (fewer than four words),
    when it merely echoes part of the question (case-insensitive substring
    test), or when it contains a known hedging phrase in Swahili or English.

    Returns:
        bool: True if the response should be treated as uncertain.
    """
    # Too short, or the response text already appears inside the question.
    too_short = len(response.split()) < 4
    echoes_question = response.lower() in question.lower()
    if too_short or echoes_question:
        return True
    hedges = [
        "Kulingana na utafiti",
        "Inaaminika kuwa",
        "Ninadhani",
        "It is believed that",
        "Some people say",
    ]
    lowered = response.lower()
    return any(hedge.lower() in lowered for hedge in hedges)
def generate_text(prompt):
    """Generate a response from the text generation model.

    Streams a chat completion from the HF Inference API, accumulates the
    token deltas, then applies the is_uncertain() heuristic to the full text.

    Args:
        prompt: The user message to send to the model.

    Returns:
        str: The generated response, or a fixed notice when the heuristic
        flags the response as uncertain.
    """
    messages = [{"role": "user", "content": prompt}]
    response = ""
    for message in client.chat_completion(messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95):
        token = message.choices[0].delta.content
        # BUGFIX: streamed deltas can carry content=None (e.g. the initial
        # role chunk or the final chunk); "response += None" would raise
        # TypeError. Skip empty/None deltas.
        if token:
            response += token
    if is_uncertain(prompt, response):
        return "AI is uncertain about the response."
    return response
# Function to detect language and generate speech
def text_to_speech(text):
    """Synthesize *text* to a WAV file, picking the TTS voice by language.

    Detects the language with langdetect; Swahili ("sw") uses the Swahili
    model, anything else falls back to the English model.

    Args:
        text: The text to speak.

    Returns:
        str: Path of the written WAV file ("./output.wav", overwritten on
        every call).
    """
    detected = detect(text)  # Detect language
    output_path = "./output.wav"
    # Default to English whenever the detected language is not Swahili.
    engine = swahili_tts if detected == "sw" else english_tts
    engine.synthesis(text, wav_path=output_path)
    return output_path
def process_audio(audio):
    """Run the full ASR -> text generation -> TTS pipeline on one audio input.

    Args:
        audio: Filepath of the uploaded/recorded audio clip (from Gradio).

    Returns:
        tuple[str, str, str]: (transcription, generated reply, path to the
        synthesized speech WAV).
    """
    # 1) Speech -> text.
    transcription = transcribe_auto(audio)
    # 2) Text -> model reply.
    reply = generate_text(transcription)
    # 3) Reply -> speech.
    speech_path = text_to_speech(reply)
    return transcription, reply, speech_path
# Gradio Interface
# Builds the UI: one audio input, three outputs (transcription, reply text,
# synthesized speech), wired to process_audio via the Submit button.
with gr.Blocks() as demo:
    gr.Markdown("<p align='center' style='font-size: 20px;'>End-to-End ASR, Text Generation, and TTS</p>")
    gr.HTML("<center>Upload or record audio. The model will transcribe, generate a response, and read it out.</center>")
    # type="filepath" hands process_audio a path string, not raw samples.
    audio_input = gr.Audio(label="Input Audio", type="filepath")
    text_output = gr.Textbox(label="Transcription")
    generated_text_output = gr.Textbox(label="Generated Text")
    audio_output = gr.Audio(label="Output Speech")
    submit_btn = gr.Button("Submit")
    submit_btn.click(
        fn=process_audio,
        inputs=audio_input,
        outputs=[text_output, generated_text_output, audio_output]
    )
if __name__ == "__main__":
    # BUGFIX: removed a stray trailing "|" (web-scrape gutter artifact) that
    # made this line a SyntaxError. Launch the Gradio app when run directly.
    demo.launch()