File size: 2,845 Bytes
4cc3c9c
 
 
 
 
 
 
d7c7caa
4cc3c9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e021b3
4cc3c9c
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import gradio as gr
from asr import transcribe_auto
from huggingface_hub import InferenceClient
from ttsmms import download, TTS
from langdetect import detect

# Initialize the text generation client. The string is handed to
# InferenceClient as the model to query; generate_text() sends chat requests
# through this single shared instance.
client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")

# Fetch the TTS models for Swahili ("swh") and English ("eng") at import time.
# NOTE(review): download() presumably returns the local model directory, since
# its result is passed straight to TTS() below -- confirm against ttsmms.
swahili_dir = download("swh", "./data/swahili")
english_dir = download("eng", "./data/english")  # English is the fallback voice used by text_to_speech()

# One synthesizer per language, built once at module import and reused by
# text_to_speech() for every call.
swahili_tts = TTS(swahili_dir)
english_tts = TTS(english_dir)

def is_uncertain(question, response):
    """Return True when the model's response looks unreliable.

    A response is treated as unreliable when any of the following holds:

    * it contains fewer than four whitespace-separated words;
    * it merely echoes (part of) the question, compared case-insensitively
      and ignoring leading/trailing whitespace in the response;
    * it contains one of a fixed set of Swahili/English hedging phrases.

    Args:
        question: The prompt that was sent to the model.
        response: The text the model produced.

    Returns:
        bool: True if the response should not be trusted, False otherwise.
    """
    # Normalize once. Stripping matters for the echo check: generated text
    # often carries a leading space or trailing newline, which would otherwise
    # stop an exact echo of the question from matching as a substring.
    normalized = response.strip().lower()

    if len(normalized.split()) < 4 or normalized in question.lower():
        return True

    uncertain_phrases = (
        "Kulingana na utafiti",
        "Inaaminika kuwa",
        "Ninadhani",
        "It is believed that",
        "Some people say",
    )
    return any(phrase.lower() in normalized for phrase in uncertain_phrases)

def generate_text(prompt):
    """Generate a response to *prompt* from the text generation model.

    The prompt is sent as a single user message through the module-level
    ``client``; the reply is streamed and assembled from its chunks.

    Args:
        prompt: The user's message text.

    Returns:
        str: The model's complete reply, or the fixed string
        "AI is uncertain about the response." when ``is_uncertain`` flags
        the reply as unreliable.
    """
    messages = [{"role": "user", "content": prompt}]

    stream = client.chat_completion(
        messages,
        max_tokens=512,
        stream=True,
        temperature=0.7,
        top_p=0.95,
    )

    pieces = []
    for chunk in stream:
        # A stream chunk may carry no choices, or a delta whose content is
        # None (e.g. the closing chunk). Concatenating None onto a str raises
        # TypeError, so such chunks are skipped.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token:
            pieces.append(token)
    response = "".join(pieces)

    if is_uncertain(prompt, response):
        return "AI is uncertain about the response."

    return response

# Function to detect language and generate speech
def text_to_speech(text):
    """Synthesize *text* to a WAV file, picking the voice by detected language.

    The language of *text* is detected with ``langdetect``. Swahili ("sw")
    is spoken with ``swahili_tts``; every other result, including a failed
    detection, is spoken with ``english_tts``.

    Args:
        text: The text to read out.

    Returns:
        str: Path of the written WAV file (always "./output.wav", so each
        call overwrites the previous output).
    """
    # Local import: the exception type is not among the file-level imports.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        lang = detect(text)
    except LangDetectException:
        # detect() raises when the text has no usable features (empty, or
        # only digits/punctuation). Fall back to the default English voice
        # rather than failing the whole request.
        lang = None

    wav_path = "./output.wav"

    # English is the default for anything that is not Swahili.
    tts = swahili_tts if lang == "sw" else english_tts
    tts.synthesis(text, wav_path=wav_path)

    return wav_path

def process_audio(audio):
    """Run the full pipeline on one audio input: ASR, then generation, then TTS.

    Args:
        audio: The value supplied by the audio input component; it is passed
            unchanged to ``transcribe_auto``.

    Returns:
        tuple: ``(transcription, generated_text, speech)`` where
        ``transcription`` is the result of ``transcribe_auto``,
        ``generated_text`` is the result of ``generate_text`` on that
        transcription, and ``speech`` is whatever ``text_to_speech`` returns
        for the generated text.
    """
    # Speech -> text.
    heard = transcribe_auto(audio)

    # Text -> model reply.
    reply = generate_text(heard)

    # Model reply -> speech, returned alongside both intermediate texts.
    return heard, reply, text_to_speech(reply)

# Gradio Interface
# Components are declared inside the Blocks context in the order they should
# appear on the page: header text, audio input, two text outputs, audio
# output, and finally the submit button.
with gr.Blocks() as demo:
    gr.Markdown("<p align='center' style='font-size: 20px;'>End-to-End ASR, Text Generation, and TTS</p>")
    gr.HTML("<center>Upload or record audio. The model will transcribe, generate a response, and read it out.</center>")

    # type="filepath": the callback receives the audio as a file path.
    audio_input = gr.Audio(label="Input Audio", type="filepath")
    text_output = gr.Textbox(label="Transcription")
    generated_text_output = gr.Textbox(label="Generated Text")
    audio_output = gr.Audio(label="Output Speech")

    submit_btn = gr.Button("Submit")

    # The three outputs are listed in the same order as the tuple returned by
    # process_audio: (transcription, generated_text, speech).
    submit_btn.click(
        fn=process_audio,
        inputs=audio_input,
        outputs=[text_output, generated_text_output, audio_output]
    )

# Start the web app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()