Daryl Lim committed on
Commit
859119c
·
1 Parent(s): fc9d693

Add application file

Browse files
Files changed (1) hide show
  1. app.py +189 -0
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This application provides an interface for transcription and summarization using models powered by Groq.
3
+ The interface allows users to record audio or provide an audio file in supported formats.
4
+ The user will receive a transcription and a generated summary.
5
+ """
6
+
7
+ import gradio as gr
8
+ import groq
9
+ import io
10
+ import numpy as np
11
+ import soundfile as sf
12
+
13
def transcribe_audio(audio: tuple, api_key: str) -> str:
    """
    Transcribe the given audio using the Whisper Large v3 Turbo model via the Groq API.

    The model supports mp3, mp4, mpeg, mpga, m4a, wav, and webm file types.

    Args:
        audio (tuple): A tuple where the first element is the sample rate and the
            second is a numpy array with the audio samples.
        api_key (str): API key for Groq.

    Returns:
        str: Transcription result, or an error message if the API call fails.
    """
    if audio is None:
        return ""

    client = groq.Client(api_key=api_key)

    # Convert the raw numpy samples to an in-memory WAV container so the API
    # receives a format it supports.
    sample_rate, audio_data = audio
    buffer = io.BytesIO()
    sf.write(buffer, audio_data, sample_rate, format="wav")
    buffer.seek(0)  # Rewind so the upload starts from the first byte

    # NOTE(fix): removed a dead second buffer (`np.save` into an unused
    # BytesIO) that was written but never read anywhere.

    try:
        # Use Whisper Large v3 Turbo powered by Groq for transcription
        completion = client.audio.transcriptions.create(
            model="whisper-large-v3-turbo",
            file=("audio.wav", buffer),
            response_format="text",
        )
        return completion
    except Exception as e:
        # Surface the failure as text so the UI can display it in place
        # of the transcription.
        return f"Error in transcription: {str(e)}"
54
+
55
def generate_response(transcription: str, api_key: str) -> str:
    """
    Produce a bullet-point summary of a transcription using a Groq-hosted model.

    Args:
        transcription (str): The text transcription of the audio.
        api_key (str): The API key to authenticate the request to Groq.

    Returns:
        str: Generated response summary or an error message.
    """
    # Nothing to summarize — tell the user to retry instead of calling the API.
    if not transcription:
        return "No transcription available. Please try recording again."

    # Instructions for the summarizer: five accurate, text-grounded bullets,
    # professional tone, no images or emojis.
    system_prompt = (
        "You are a helpful assistant powered by Groq's Language "
        "Processing Units (LPU), designed for fast AI inference. "
        "Use the following transcription of an audio file and generate 5 "
        "bullet points that summarize what is covered in the audio. "
        "Maintain a professional and conversational tone. Do not use "
        "images or emojis in your answer. Prioritize accuracy and only "
        "provide information directly supported by the text transcription."
    )

    client = groq.Client(api_key=api_key)

    try:
        # Use Llama 3.1 70B powered by Groq for text generation
        chat_completion = client.chat.completions.create(
            model="llama-3.1-70b-versatile",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": transcription},
            ],
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"
96
+
97
def process_audio(audio: object, api_key: str) -> tuple[str, str]:
    """
    Process the given audio: transcribe it, then generate a summary via Groq.

    Args:
        audio (object): The audio to process — expected as the
            (sample_rate, numpy array) tuple produced by the Gradio
            audio component.
        api_key (str): The API key to authenticate the request to Groq.

    Returns:
        tuple[str, str]: The transcription and the generated summary, or
        human-readable error messages when a step cannot run.
    """
    # Guard clauses: both inputs are required before any API call is made.
    if not api_key:
        return "Please enter your Groq API key.", "API key is required."

    if not audio:
        return "No audio provided.", "Audio input is required for transcription."

    transcription = transcribe_audio(audio, api_key)

    # Robustness fix: if transcription failed, don't forward the error text
    # to the LLM to be "summarized" — short-circuit with a clear message.
    if transcription.startswith("Error in transcription:"):
        return transcription, "Summary unavailable because transcription failed."

    response = generate_response(transcription, api_key)
    return transcription, response
120
+
121
# Custom CSS for the Groq badge and color scheme
custom_css = """
.gradio-container {
    background-color: #f5f5f5;
}
.gr-button-primary {
    background-color: #f55036 !important;
    border-color: #f55036 !important;
}
.gr-button-secondary {
    color: #f55036 !important;
    border-color: #f55036 !important;
}
#groq-badge {
    position: fixed;
    bottom: 20px;
    right: 20px;
    z-index: 1000;
}
"""

# Define the Gradio interface.
# BUG FIX: custom_css was defined but never used — pass it to gr.Blocks so
# the badge positioning (#groq-badge) and color scheme actually apply.
with gr.Blocks(theme=gr.themes.Default(), css=custom_css) as demo:
    gr.Markdown("# Groq Scribe")

    # Input for Groq API key (password protected)
    api_key_input = gr.Textbox(type="password", label="Enter your Groq API Key")

    # Row for audio input; type="numpy" yields the (sample_rate, array)
    # tuple that process_audio/transcribe_audio expect.
    with gr.Row():
        audio_input = gr.Audio(label="Audio", type="numpy")

    # Row for transcription and summary outputs
    with gr.Row():
        transcription_output = gr.Textbox(label="Transcription")
        response_output = gr.Textbox(label="Summary")

    # Submit button
    submit_button = gr.Button("Process", variant="primary")

    # Add the Groq badge (styled and pinned by the #groq-badge CSS rule)
    gr.HTML(
        """
        <div id="groq-badge">
            <div style="color: #f55036; font-weight: bold;">POWERED BY GROQ</div>
        </div>
        """
    )

    # Connect button click to the process_audio function
    submit_button.click(
        process_audio,
        inputs=[audio_input, api_key_input],
        outputs=[transcription_output, response_output]
    )

    # Markdown instructions for using the app
    gr.Markdown(
        """
        ## How to use this app:
        1. Enter your [Groq API Key](https://console.groq.com/keys) in the provided field.
        2. Click on the microphone icon to record audio or provide a file in mp3, mp4, mpeg, mpga, m4a, wav, or webm format.
        3. Click the "Process" button to transcribe the audio and generate a summary.
        4. The transcription and summary will appear in the respective text boxes.
        """
    )

# Launch the Gradio interface
demo.launch()