Helw150 committed
Commit 3268a02
1 Parent(s): 5ded772

Multi turn

Files changed (1)
  1. app.py +13 -5
app.py CHANGED
@@ -26,7 +26,7 @@ resampler = Audio(sampling_rate=16_000)
 
 @spaces.GPU
 @torch.no_grad
-def diva_audio(audio_input, do_sample=False, temperature=0.001):
+def diva_audio(audio_input, do_sample=False, temperature=0.001, prev_outs=None):
     sr, y = audio_input
     x = xxhash.xxh32(bytes(y)).hexdigest()
     y = y.astype(np.float32)
@@ -35,7 +35,12 @@ def diva_audio(audio_input, do_sample=False, temperature=0.001):
         resampler.encode_example({"array": y, "sampling_rate": sr})
     )
     yield from diva_model.generate_stream(
-        a["array"], None, do_sample=do_sample, max_new_tokens=256
+        a["array"],
+        None,
+        do_sample=do_sample,
+        max_new_tokens=256,
+        init_outputs=prev_outs,
+        return_outputs=True,
     )
 
 
@@ -70,7 +75,7 @@ def run_vad(ori_audio, sr):
 
 
 def warm_up():
-    frames = b"\x00\x00" * 1024 * 2  # 1024 frames of 2 bytes each
+    frames = np.ones(2048)  # 1024 frames of 2 bytes each
     dur, frames, tcost = run_vad(frames, 16000)
     print(f"warm up done, time_cost: {tcost:.3f} s")
 
@@ -86,6 +91,7 @@ class AppState:
     started_talking: bool = False
     stopped: bool = False
     conversation: list = field(default_factory=list)
+    model_outs: any = None
 
 
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
@@ -134,7 +140,9 @@ def response(state: AppState):
     )
 
     start = False
-    for resp in diva_audio((state.sampling_rate, state.stream)):
+    for resp, outs in diva_audio(
+        (state.sampling_rate, state.stream), prev_outs=state.model_outs
+    ):
         if not start:
             state.conversation.append({"role": "assistant", "content": resp})
             start = True
@@ -142,7 +150,7 @@ def response(state: AppState):
             state.conversation[-1]["content"] = resp
         yield state, state.conversation
 
-    yield AppState(conversation=state.conversation), state.conversation
+    yield AppState(conversation=state.conversation, model_outs=outs), state.conversation
 
 
 def start_recording_user(state: AppState):
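
Taken together, these changes thread the model's previous outputs through each turn: diva_audio gains a prev_outs argument and forwards it to generate_stream as init_outputs, return_outputs=True makes the stream yield (text, outs) pairs, AppState stores those outputs in the new model_outs field, and response() hands them back on the next call. A minimal sketch of the resulting turn loop outside Gradio (illustrative only; recorded_turns is a hypothetical list of (sampling_rate, samples) tuples, and the unpacking assumes generate_stream yields pairs once return_outputs=True is set):

# Hypothetical driver showing how model_outs is carried across turns.
# Assumes app.py's diva_audio and AppState behave as in this commit.
from app import AppState, diva_audio

recorded_turns = []  # fill with (sampling_rate, int16 np.ndarray) tuples from the mic
state = AppState()
for sr, samples in recorded_turns:
    resp, outs = "", None
    for resp, outs in diva_audio((sr, samples), prev_outs=state.model_outs):
        pass  # stream partial assistant text to the UI as it arrives
    state.conversation.append({"role": "assistant", "content": resp})
    # Keep the conversation and the model outputs, drop per-turn audio state,
    # mirroring the final yield in response():
    state = AppState(conversation=state.conversation, model_outs=outs)

The UI wiring is untouched; the multi-turn behavior comes entirely from carrying model_outs between calls to diva_audio.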