Gabriel C committed
Commit 1003643
1 Parent(s): b57b0c3

Update app.py

Files changed (1): app.py +29 -50
app.py CHANGED
@@ -1,83 +1,62 @@
  import os
- import time

  import gradio as gr
  import numpy as np
- import soundfile as sf
-
  from groq import Groq
- from openai import OpenAI
+ from transformers import pipeline
+
+ transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

  groq_client = Groq(api_key=os.getenv('GROQ_API_KEY'))
- openai_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

- def transcribe(audio_path):
+ def transcribe(stream, new_chunk):
      """
-     Transcribe the audio segment using Whisper.
+     Transcribes using whisper
      """
-     with open(audio_path, 'rb') as audio_file:
-         transcription = openai_client.audio.transcriptions.create(
-             file=audio_file,
-             language="en",
-             model="whisper-1"
-         )
-     return transcription.text
+     sr, y = new_chunk
+     y = y.astype(np.float32)
+     y /= np.max(np.abs(y))

- def autocomplete(text):
+     if stream is not None:
+         stream = np.concatenate([stream, y])
+     else:
+         stream = y
+     return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]
+
+ def autocomplete(text):
      """
      Autocomplete the text using Gemma.
      """
      if text != "":
          response = groq_client.chat.completions.create(
              model='gemma-7b-it',
-             messages=[{"role": "system", "content": "You are a friendly assistant."},
+             messages=[{"role": "system", "content": "You are a friendly assistant named Gemma."},
                        {"role": "user", "content": text}]
          )

          return response.choices[0].message.content

- def process_audio(input_audio):
+ def process_audio(input_audio, new_chunk):
      """
      Process the audio input by transcribing and completing the sentences.
+     Accumulate results to return to Gradio interface.
      """
-     # Now you can use the audio_file_path with soundfile.read()
-     audio_data, sample_rate = sf.read(input_audio)
-
-     # Ensure mono audio
-     if len(audio_data.shape) > 1:
-         audio_data = np.mean(audio_data, axis=1)
-
-     transcription_list = []
-     for start in range(0, len(audio_data), sample_rate):
-         end = start + sample_rate
-         segment = audio_data[start:end]
-
-         # Temporarily saving each segment to a file (Whisper requires a file input)
-         segment_filename = f"/tmp/audio_segment_{start}.wav"
-         sf.write(segment_filename, segment, sample_rate)
-
-         # Transcribe the audio segment
-         transcription = transcribe(segment_filename)
-
-         transcription_list.append(transcription)

-     # # Send the transcription for completion
-     completion_result = autocomplete(transcription)
-
-     text = f"Qn: {transcription} \n \n Ans: {completion_result}"
+     stream, transcription = transcribe(input_audio, new_chunk)
+     text = autocomplete(transcription)
+
+     print (transcription, text)
+     return stream, text

-     return text

- # Define the Gradio interface
- interface = gr.Interface(
-     fn=process_audio,
-     inputs=gr.Audio(sources="microphone", streaming=True, type="filepath"),
-     outputs=gr.Markdown(),
+ demo = gr.Interface(
+     fn = process_audio,
+     inputs = ["state", gr.Audio(sources=["microphone"], streaming=True)],
+     outputs = ["state", gr.Markdown()],
      title="Dear Gemma",
-     description="Talk to the AI assistant. It completes your sentences in real time.",
+     description="Talk to the AI assistant.",
      live=True,
      allow_flagging="never"
  )

- if __name__ == "__main__":
-     interface.launch()
+ demo.launch()
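
For context on the new streaming path: `transcribe` now keeps a growing buffer of raw audio and re-runs the local Whisper pipeline over the whole buffer on every chunk, and `process_audio` threads that buffer through the `"state"` input/output of the Gradio interface. Below is a minimal sketch of the accumulation pattern, exercised outside Gradio with synthetic chunks; the 440 Hz tone, 16 kHz sample rate, and three-chunk loop are illustrative assumptions, not part of this commit.

```python
import numpy as np
from transformers import pipeline

# Same checkpoint the updated app.py loads; downloaded on first use.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

def transcribe(stream, new_chunk):
    """Accumulate raw audio across chunks and transcribe the full buffer."""
    sr, y = new_chunk
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))  # peak-normalise the incoming chunk
    stream = y if stream is None else np.concatenate([stream, y])
    return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]

# Feed three synthetic 1-second "microphone" chunks (a 440 Hz tone) to show
# how the buffer grows while each call transcribes everything heard so far.
sr = 16_000
t = np.arange(sr) / sr
chunk = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)

stream = None
for _ in range(3):
    stream, text = transcribe(stream, (sr, chunk))
    print(len(stream) / sr, "s buffered ->", text)
```

Because the whole buffer is re-transcribed on every chunk, per-call cost grows with the length of the recording; that trade-off is what keeps the partial transcript coherent as new audio arrives.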