# Hosting-page metadata captured with this file (not part of the program):
#   file size: 6,293 bytes, revision: 859119c
"""
This application provides an interface for transcription and summarization using models powered by Groq.
The interface allows users to record audio or provide an audio file in supported formats.
The user will receive a transcription and a generated summary.
"""

import gradio as gr
import groq
import io
import numpy as np
import soundfile as sf

def transcribe_audio(audio: tuple, api_key: str) -> str:
    """
    Transcribes the given audio using the Whisper Large v3 Turbo model via the Groq API.

    The samples are encoded as an in-memory WAV file and uploaded under the
    name "audio.wav"; nothing is written to disk.

    Args:
        audio (tuple): A tuple where the first element is the sample rate and the second is a numpy array with audio data.
        api_key (str): API key for Groq.

    Returns:
        str: The transcription text, an empty string when there is no audio
        (``None`` or zero samples), or a message starting with
        "Error in transcription:" if encoding or the API call fails.
    """
    if audio is None:
        return ""

    sample_rate = audio[0]
    samples = audio[1]

    # An empty recording has nothing to transcribe; treat it like missing
    # audio instead of uploading a zero-length WAV file.
    if np.size(samples) == 0:
        return ""

    try:
        # Client construction and WAV encoding live inside the try so that
        # any failure is reported as the documented error string rather
        # than propagating as an exception to the UI.
        client = groq.Client(api_key=api_key)

        # Encode the raw samples as WAV in memory, then rewind so the
        # upload reads from the start of the buffer.
        buffer = io.BytesIO()
        sf.write(buffer, samples, sample_rate, format='wav')
        buffer.seek(0)

        # Use Whisper Large v3 Turbo powered by Groq for transcription
        completion = client.audio.transcriptions.create(
            model="whisper-large-v3-turbo",
            file=("audio.wav", buffer),
            response_format="text"
        )
        return completion

    except Exception as e:
        return f"Error in transcription: {str(e)}"

def generate_response(transcription: str, api_key: str) -> str:
    """
    Generate a response summary from the provided transcription using a Groq model.

    Args:
        transcription (str): The text transcription of the audio.
        api_key (str): The API key to authenticate the request to Groq.

    Returns:
        str: The generated summary, a fixed "No transcription available"
        notice when the transcription is missing or only whitespace, or a
        message starting with "Error in response generation:" if the
        request fails.
    """
    # A blank or whitespace-only transcription carries no content to
    # summarize, so avoid spending an API call on it.
    if not transcription or not transcription.strip():
        return "No transcription available. Please try recording again."

    try:
        # Constructed inside the try so a client-setup failure is reported
        # as an error string like any other request failure.
        client = groq.Client(api_key=api_key)

        # Use Llama 3.1 70B powered by Groq for text generation
        # NOTE(review): confirm this model id is still served by Groq;
        # hosted model ids are retired over time.
        completion = client.chat.completions.create(
            model="llama-3.1-70b-versatile",
            messages=[
                {
                    "role": "system", 
                    "content": (
                        "You are a helpful assistant powered by Groq's Language "
                        "Processing Units (LPU), designed for fast AI inference. "
                        "Use the following transcription of an audio file and generate 5 "
                        "bullet points that summarize what is covered in the audio. "
                        "Maintain a professional and conversational tone. Do not use "
                        "images or emojis in your answer. Prioritize accuracy and only "
                        "provide information directly supported by the text transcription."
                    )
                },
                {"role": "user", "content": transcription}
            ],
        )

        # Only the first choice's message text is returned to the caller.
        return completion.choices[0].message.content

    except Exception as e:
        return f"Error in response generation: {str(e)}"

def process_audio(audio: object, api_key: str) -> tuple[str, str]:
    """
    Process the given audio by first transcribing it and then generating a response
    using the Groq API.

    Args:
        audio (object): The audio to be processed; it is passed unchanged to
            transcribe_audio.
        api_key (str): The API key to authenticate the request to Groq.
            Surrounding whitespace is ignored.

    Returns:
        tuple: A pair (transcription, summary). When the API key or the audio
        is missing, both elements are user-facing notices. When transcription
        fails, the first element is the error message and the summary step is
        skipped.
    """
    # Pasted keys often carry stray whitespace; a key that is blank after
    # stripping is treated the same as no key at all.
    api_key = (api_key or "").strip()
    if not api_key:
        return "Please enter your Groq API key.", "API key is required."

    if not audio:
        return "No audio provided.", "Audio input is required for transcription."

    transcription = transcribe_audio(audio, api_key)

    # transcribe_audio reports failures as a string with this prefix. Passing
    # that on would make the model "summarize" an error message, so stop here.
    if isinstance(transcription, str) and transcription.startswith("Error in transcription:"):
        return transcription, "Summary skipped because transcription failed."

    response = generate_response(transcription, api_key)

    return transcription, response

# Custom CSS for the Groq badge and color scheme.
# It is passed to gr.Blocks below; without that the rules are never applied.
custom_css = """
.gradio-container {
    background-color: #f5f5f5;
}
.gr-button-primary {
    background-color: #f55036 !important;
    border-color: #f55036 !important;
}
.gr-button-secondary {
    color: #f55036 !important;
    border-color: #f55036 !important;
}
#groq-badge {
    position: fixed;
    bottom: 20px;
    right: 20px;
    z-index: 1000;
}
"""

# Define the Gradio interface
with gr.Blocks(theme=gr.themes.Default(), css=custom_css) as demo:
    gr.Markdown("# Groq Scribe")

    # Input for Groq API key (masked as a password field)
    api_key_input = gr.Textbox(type="password", label="Enter your Groq API Key")

    # Row for audio input; type="numpy" delivers the audio to the callback
    # as the (sample_rate, samples) tuple that transcribe_audio expects.
    with gr.Row():
        audio_input = gr.Audio(label="Audio", type="numpy")

    # Row for transcription and summary outputs
    with gr.Row():
        transcription_output = gr.Textbox(label="Transcription")
        response_output = gr.Textbox(label="Summary")

    # Submit button
    submit_button = gr.Button("Process", variant="primary")
    
    # Add the Groq badge; its id matches the #groq-badge rule in custom_css
    gr.HTML(
        """
        <div id="groq-badge">
            <div style="color: #f55036; font-weight: bold;">POWERED BY GROQ</div>
        </div>
        """
    )

    # Connect button click to the process_audio function; the input and
    # output orders match its parameters and its returned pair.
    submit_button.click(
        process_audio,
        inputs=[audio_input, api_key_input],
        outputs=[transcription_output, response_output]
    )

    # Markdown instructions for using the app
    gr.Markdown(
        """
        ## How to use this app:
        1. Enter your [Groq API Key](https://console.groq.com/keys) in the provided field.
        2. Click on the microphone icon to record audio or provide a file in mp3, mp4, mpeg, mpga, m4a, wav, or webm format.
        3. Click the "Process" button to transcribe the audio and generate a summary.
        4. The transcription and summary will appear in the respective text boxes.
        """
    )

# Launch the Gradio interface
demo.launch()