freddyaboulton HF staff committed on
Commit
1ee1757
·
1 Parent(s): bcdc799
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +48 -37
  3. requirements.txt +1 -1
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: Llama 3.2 3b Voice
3
- emoji: 👁
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
 
1
  ---
2
+ title: Llama 3.2 3b WebRTC
3
+ emoji: ⚡️
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
app.py CHANGED
@@ -25,12 +25,6 @@ else:
25
  rtc_configuration = None
26
 
27
 
28
- def create_client(api_key):
29
- return openai.OpenAI(
30
- base_url="https://llama3-1-8b.lepton.run/api/v1/",
31
- api_key=api_key
32
- )
33
-
34
 
35
  def update_or_append_conversation(conversation, id, role, content):
36
  # Find if there's an existing message with the given id
@@ -43,11 +37,12 @@ def update_or_append_conversation(conversation, id, role, content):
43
 
44
 
45
  def generate_response_and_audio(audio_bytes: bytes, lepton_conversation: list[dict],
46
- client: openai.OpenAI, output_format: str):
47
  if client is None:
48
  raise gr.Error("Please enter a valid API key first.")
49
 
50
- bitrate = 128 if output_format == "mp3" else 32 # Higher bitrate for MP3, lower for OPUS
 
51
  audio_data = base64.b64encode(audio_bytes).decode()
52
 
53
  try:
@@ -55,7 +50,7 @@ def generate_response_and_audio(audio_bytes: bytes, lepton_conversation: list[di
55
  extra_body={
56
  "require_audio": True,
57
  "tts_preset_id": "jessica",
58
- "tts_audio_format": format_,
59
  "tts_audio_bitrate": bitrate
60
  },
61
  model="llama3.1-8b",
@@ -68,40 +63,48 @@ def generate_response_and_audio(audio_bytes: bytes, lepton_conversation: list[di
68
  id = str(time.time())
69
  full_response = ""
70
  asr_result = ""
 
71
 
72
- for chunk in stream:
73
  if not chunk.choices:
74
  continue
75
  delta = chunk.choices[0].delta
76
- content = delta.get("content", "")
77
  audio = getattr(chunk.choices[0], "audio", [])
78
  asr_results = getattr(chunk.choices[0], "asr_results", [])
79
 
80
  if asr_results:
 
81
  asr_result += "".join(asr_results)
82
  yield id, None, asr_result, None
83
 
84
  if content:
 
85
  full_response += content
86
  yield id, full_response, None, None
87
 
88
  if audio:
 
89
  # Accumulate audio bytes and yield them
90
  audio_bytes_accumulated = b''.join([base64.b64decode(a) for a in audio])
91
- audio = AudioSegment.from_file(io.BytesIO(audio_bytes_accumulated))
 
92
  audio_array = np.array(audio.get_array_of_samples(), dtype=np.int16).reshape(1, -1)
93
- print("audio.shape", audio_array.shape)
94
- print("sampling_rate", audio.frame_rate)
95
 
96
  yield id, None, None, (audio.frame_rate, audio_array)
 
 
 
 
97
 
98
  yield id, full_response, asr_result, None
99
-
100
  except Exception as e:
101
  raise gr.Error(f"Error during audio streaming: {e}")
102
 
103
  def response(audio: tuple[int, np.ndarray], lepton_conversation: list[dict],
104
- gradio_conversation: list[dict], client: openai.OpenAI, output_format: str):
105
 
106
  audio_buffer = io.BytesIO()
107
  segment = AudioSegment(
@@ -110,55 +113,63 @@ def response(audio: tuple[int, np.ndarray], lepton_conversation: list[dict],
110
  sample_width=audio[1].dtype.itemsize,
111
  channels=1,
112
  )
113
- segment.export(audio_buffer, format="wav")
114
 
115
- generator = generate_response_and_audio(audio_buffer.getvalue(), lepton_conversation, client, output_format)
116
 
117
  for id, text, asr, audio in generator:
118
  if asr:
119
  update_or_append_conversation(lepton_conversation, id, "user", asr)
120
  update_or_append_conversation(gradio_conversation, id, "user", asr)
 
121
  if text:
122
  update_or_append_conversation(lepton_conversation, id, "assistant", text)
123
  update_or_append_conversation(gradio_conversation, id, "assistant", text)
 
124
  if audio:
125
- yield audio, AdditionalOutputs(lepton_conversation, gradio_conversation)
126
  else:
127
  yield AdditionalOutputs(lepton_conversation, gradio_conversation)
128
 
129
 
130
- def set_api_key(api_key):
131
- if not api_key:
132
- raise gr.Error("Please enter a valid API key.")
133
- client = create_client(api_key)
134
- gr.Info("Set API Key Successfully")
135
- return client, gr.skip()
 
 
 
 
136
 
137
 
138
  with gr.Blocks() as demo:
139
  with gr.Group():
140
  with gr.Row():
141
  chatbot = gr.Chatbot(label="Conversation", type="messages")
142
- with gr.Row(equal_height=True):
143
- with gr.Column(scale=1):
144
- format_dropdown = gr.Dropdown(choices=["mp3", "opus"], value="mp3", label="Output Audio Format")
145
- api_key_input = gr.Textbox(type="password", label="Enter your Lepton API Key")
146
- set_key_button = gr.Button("Set API Key", variant="primary")
147
- with gr.Column(scale=3):
148
- audio = WebRTC(modality="audio", mode="send-receive",
149
  label="Audio Stream",
150
  rtc_configuration=rtc_configuration)
 
 
 
 
151
 
152
  client_state = gr.State(None)
153
- lepton_conversation = gr.State([])
154
-
155
- set_key_button.click(set_api_key, inputs=[api_key_input], outputs=[client_state, set_key_button])
156
 
 
 
157
  audio.stream(
158
  ReplyOnPause(response),
159
- inputs=[audio, lepton_conversation, chatbot, client_state, format_dropdown],
160
  outputs=[audio]
161
  )
162
- audio.on_additional_outputs(lambda l, g: (l, g), outputs=[lepton_conversation, chatbot])
 
163
 
164
  demo.launch()
 
25
  rtc_configuration = None
26
 
27
 
 
 
 
 
 
 
28
 
29
  def update_or_append_conversation(conversation, id, role, content):
30
  # Find if there's an existing message with the given id
 
37
 
38
 
39
  def generate_response_and_audio(audio_bytes: bytes, lepton_conversation: list[dict],
40
+ client: openai.OpenAI):
41
  if client is None:
42
  raise gr.Error("Please enter a valid API key first.")
43
 
44
+ # mp3 bitrate
45
+ bitrate = 128
46
  audio_data = base64.b64encode(audio_bytes).decode()
47
 
48
  try:
 
50
  extra_body={
51
  "require_audio": True,
52
  "tts_preset_id": "jessica",
53
+ "tts_audio_format": "mp3",
54
  "tts_audio_bitrate": bitrate
55
  },
56
  model="llama3.1-8b",
 
63
  id = str(time.time())
64
  full_response = ""
65
  asr_result = ""
66
+ all_audio = b""
67
 
68
+ for i, chunk in enumerate(stream):
69
  if not chunk.choices:
70
  continue
71
  delta = chunk.choices[0].delta
72
+ content = delta.content
73
  audio = getattr(chunk.choices[0], "audio", [])
74
  asr_results = getattr(chunk.choices[0], "asr_results", [])
75
 
76
  if asr_results:
77
+ print(i, "asr_results")
78
  asr_result += "".join(asr_results)
79
  yield id, None, asr_result, None
80
 
81
  if content:
82
+ print(i, "content")
83
  full_response += content
84
  yield id, full_response, None, None
85
 
86
  if audio:
87
+ print(i, "audio")
88
  # Accumulate audio bytes and yield them
89
  audio_bytes_accumulated = b''.join([base64.b64decode(a) for a in audio])
90
+ all_audio += audio_bytes_accumulated
91
+ audio = AudioSegment.from_file(io.BytesIO(audio_bytes_accumulated), format="mp3")
92
  audio_array = np.array(audio.get_array_of_samples(), dtype=np.int16).reshape(1, -1)
93
+ print("audio.frame_rate", audio.frame_rate)
 
94
 
95
  yield id, None, None, (audio.frame_rate, audio_array)
96
+
97
+ if all_audio:
98
+ all_audio = AudioSegment.from_file(io.BytesIO(all_audio), format="mp3")
99
+ all_audio.export("all_audio.mp3", format="mp3")
100
 
101
  yield id, full_response, asr_result, None
102
+ print("finishing loop")
103
  except Exception as e:
104
  raise gr.Error(f"Error during audio streaming: {e}")
105
 
106
  def response(audio: tuple[int, np.ndarray], lepton_conversation: list[dict],
107
+ gradio_conversation: list[dict], client: openai.OpenAI):
108
 
109
  audio_buffer = io.BytesIO()
110
  segment = AudioSegment(
 
113
  sample_width=audio[1].dtype.itemsize,
114
  channels=1,
115
  )
116
+ segment.export(audio_buffer, format="mp3")
117
 
118
+ generator = generate_response_and_audio(audio_buffer.getvalue(), lepton_conversation, client)
119
 
120
  for id, text, asr, audio in generator:
121
  if asr:
122
  update_or_append_conversation(lepton_conversation, id, "user", asr)
123
  update_or_append_conversation(gradio_conversation, id, "user", asr)
124
+ yield AdditionalOutputs(lepton_conversation, gradio_conversation)
125
  if text:
126
  update_or_append_conversation(lepton_conversation, id, "assistant", text)
127
  update_or_append_conversation(gradio_conversation, id, "assistant", text)
128
+ yield AdditionalOutputs(lepton_conversation, gradio_conversation)
129
  if audio:
130
+ yield audio
131
  else:
132
  yield AdditionalOutputs(lepton_conversation, gradio_conversation)
133
 
134
 
135
def set_api_key(lepton_api_key):
    """Create the Lepton-hosted OpenAI client from the submitted API key.

    Wired to the API-key textbox's submit event, whose outputs are
    ``[client_state, mic_row, api_row]``.

    Args:
        lepton_api_key: The key entered in the API-key textbox.

    Returns:
        A 3-tuple in output order: the new client, an update making the
        first row visible, and an update hiding the second row.

    Raises:
        gr.Error: If the key is blank or the client cannot be constructed.
    """
    # Reject a blank key up front. NOTE(review): the client constructor is
    # not expected to contact the server, so without this guard an empty key
    # would presumably be accepted here and only fail mid-conversation.
    api_key = (lepton_api_key or "").strip()
    if not api_key:
        raise gr.Error("Invalid API keys. Please try again.")

    try:
        client = openai.OpenAI(
            base_url="https://llama3-1-8b.lepton.run/api/v1/",
            api_key=api_key,
        )
    except Exception as err:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate; chain the cause for debugging.
        raise gr.Error("Invalid API keys. Please try again.") from err

    gr.Info("Successfully set API keys.", duration=3)
    return client, gr.update(visible=True), gr.update(visible=False)
145
 
146
 
147
  with gr.Blocks() as demo:
148
  with gr.Group():
149
  with gr.Row():
150
  chatbot = gr.Chatbot(label="Conversation", type="messages")
151
+ with gr.Row(visible=False) as mic_row:
152
+ audio = WebRTC(modality="audio", mode="send-receive",
 
 
 
 
 
153
  label="Audio Stream",
154
  rtc_configuration=rtc_configuration)
155
+ with gr.Row(equal_height=True) as api_row:
156
+ api_key_input = gr.Textbox(type="password", value=os.getenv("LEPTONAI_API_KEY"),
157
+ label="Enter Your Lepton AI Key")
158
+
159
 
160
  client_state = gr.State(None)
161
+ lepton_conversation = gr.State([{"role": "system",
162
+ "content": "You are a knowledgeable assistant who will engage in spoken conversations with users. "
163
+ "Keep your answers short and natural as they will be read aloud."}])
164
 
165
+ api_key_input.submit(set_api_key, inputs=[api_key_input],
166
+ outputs=[client_state, mic_row, api_row])
167
  audio.stream(
168
  ReplyOnPause(response),
169
+ inputs=[audio, lepton_conversation, chatbot, client_state],
170
  outputs=[audio]
171
  )
172
+ audio.on_additional_outputs(lambda l, g: (l, g), outputs=[lepton_conversation, chatbot],
173
+ queue=False, show_progress="hidden")
174
 
175
  demo.launch()
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
- gradio_webrtc[vad]==0.0.11
2
  openai
3
  twilio
 
1
+ gradio_webrtc[vad]==0.0.12
2
  openai
3
  twilio