File size: 3,855 Bytes
37e79cc
b9f656e
 
 
 
1ca46ec
37e79cc
b9f656e
323f6ed
b9f656e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ca46ec
 
 
 
 
 
 
 
 
b9f656e
 
 
1ca46ec
 
b9f656e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ca46ec
b9f656e
 
1ca46ec
 
b9f656e
1ca46ec
 
b9f656e
 
 
 
 
 
 
 
 
 
 
 
1ca46ec
b9f656e
 
 
 
 
1ca46ec
b9f656e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import io
import os
import tempfile

import gradio as gr
import groq
import numpy as np
import pyttsx3  # Text-to-Speech engine
import soundfile as sf

# Read the Groq API key from the environment instead of committing it with the
# source. Export GROQ_API_KEY before launching the app. The empty-string
# fallback keeps the module importable; requests made without a key fail at
# call time with an authentication error.
# NOTE: any key that was previously hard-coded here should be treated as
# compromised and rotated.
API_KEY = os.environ.get("GROQ_API_KEY", "")

def transcribe_audio(audio):
    """Transcribe recorded audio to text with Groq's Distil-Whisper model.

    Args:
        audio: ``None`` when nothing was recorded, otherwise an indexable
            pair whose first item is the sample rate and whose second item
            is the sample data (it is handed to ``soundfile.write`` as-is).

    Returns:
        The transcription returned by the API (requested in ``"text"``
        format), an empty string when there is no audio to transcribe, or a
        string starting with ``"Error in transcription:"`` if encoding or
        the API call fails.
    """
    if audio is None:
        return ""

    sample_rate, samples = audio[0], audio[1]
    if len(samples) == 0:
        # A zero-length recording cannot be transcribed; skip the API call.
        return ""

    try:
        # Encode the samples as an in-memory WAV file for upload.
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, samples, sample_rate, format="wav")
        wav_buffer.seek(0)

        # Client construction and encoding live inside the try so that every
        # failure is reported through the same error-string channel.
        client = groq.Client(api_key=API_KEY)
        return client.audio.transcriptions.create(
            model="distil-whisper-large-v3-en",
            file=("audio.wav", wav_buffer),
            response_format="text",
        )
    except Exception as e:  # UI boundary: surface the failure as text
        return f"Error in transcription: {str(e)}"

def generate_response(transcription):
    """Ask the Llama 3 70B chat model on Groq to answer the transcription.

    Args:
        transcription: The user's message. Any falsy value (``None``, ``""``)
            short-circuits without calling the API.

    Returns:
        The content of the first completion choice, a fixed "no
        transcription" notice for falsy input, or a string starting with
        ``"Error in response generation:"`` if the API call raises.
    """
    if not transcription:
        return "No transcription available. Please try speaking again."

    groq_client = groq.Client(api_key=API_KEY)

    # Fixed system prompt followed by the user's transcribed speech.
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": transcription},
    ]

    try:
        reply = groq_client.chat.completions.create(
            model="llama3-70b-8192",
            messages=chat_messages,
        )
        first_choice = reply.choices[0]
        return first_choice.message.content
    except Exception as err:
        return f"Error in response generation: {str(err)}"

def text_to_speech(response_text):
    """Synthesize ``response_text`` to an audio file with pyttsx3.

    ``pyttsx3.Engine.save_to_file`` writes to a *filename*, so the speech is
    rendered into a temporary ``.wav`` file and the path is returned; a file
    path is a value the Gradio ``Audio`` output component can display.

    Args:
        response_text: The text to speak.

    Returns:
        The path of the generated audio file, or ``None`` when there is no
        text to speak. The temporary file is not deleted here because the
        caller still has to read it.
    """
    if not response_text:
        return None

    # Reserve a unique file name; the handle is closed right away so the TTS
    # engine can open the path itself.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name

    engine = pyttsx3.init()
    engine.save_to_file(response_text, audio_path)
    # save_to_file only queues the command; runAndWait performs the synthesis.
    engine.runAndWait()
    return audio_path

def process_audio(audio):
    """Run the full pipeline: transcribe, generate a reply, synthesize speech.

    Args:
        audio: The value of the audio input component, passed straight to
            ``transcribe_audio``.

    Returns:
        A ``(transcription, response, audio_response)`` tuple matching the
        three output components. When transcription fails, the error text is
        returned in the first slot with an empty response and no audio.
    """
    transcription = transcribe_audio(audio)

    # transcribe_audio reports failures as a string with this prefix. Do not
    # forward that to the LLM as if the user had said it, and do not speak it.
    if isinstance(transcription, str) and transcription.startswith(
        "Error in transcription:"
    ):
        return transcription, "", None

    response = generate_response(transcription)
    audio_response = text_to_speech(response)
    return transcription, response, audio_response

# Stylesheet text for the app: sets the container background colour, recolours
# the primary/secondary button classes with the #f55036 accent, and pins the
# element with id "groq-badge" to the bottom-right corner of the viewport.
custom_css = """
.gradio-container {
    background-color: #f5f5f5;
}
.gr-button-primary {
    background-color: #f55036 !important;
    border-color: #f55036 !important;
}
.gr-button-secondary {
    color: #f55036 !important;
    border-color: #f55036 !important;
}
#groq-badge {
    position: fixed;
    bottom: 20px;
    right: 20px;
    z-index: 1000;
}
"""

# Build the UI. `custom_css` is passed to Blocks so the stylesheet (including
# the fixed positioning of #groq-badge) is actually applied to the page.
with gr.Blocks(theme=gr.themes.Default(), css=custom_css) as demo:
    gr.Markdown("# 🎙️ LLAVA Voice-Powered AI Assistant")

    with gr.Row():
        # NOTE(review): streaming=True is combined with a button-triggered
        # handler and the help text below mentions file upload; confirm that
        # streaming mode is really intended for this input.
        audio_input = gr.Audio(label="Speak!", type="numpy", streaming=True)  # Enable real-time streaming

    with gr.Row():
        transcription_output = gr.Textbox(label="Transcription", interactive=False)
        response_output = gr.Textbox(label="AI Assistant Response", interactive=False)

    audio_output = gr.Audio(label="AI Response Audio", interactive=False)

    submit_button = gr.Button("Process", variant="primary")

    # Badge element styled by the #groq-badge rule in custom_css
    gr.HTML("""
    <div id="groq-badge">
        <div style="color: #f55036; font-weight: bold;">POWERED BY LLAVA</div>
    </div>
    """)

    # One click runs the whole pipeline and fills all three outputs.
    submit_button.click(
        process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, response_output, audio_output]
    )

    gr.Markdown("""
    ## How to use this app:
    1. Click on the microphone icon and speak your message (or upload an audio file). Supported audio files include mp3, mp4, mpeg, mpga, m4a, wav, and webm file types.
    2. The system will automatically transcribe your speech, generate a response, and play it as audio.
    3. The transcription and AI assistant response will appear in the respective text boxes.
    """)

demo.launch()