Commit 16388cf (parent: 81cb63c)

integrate whisperspeech

Files changed:
- llm_service.py              +32 -19
- main.py                      +4  -4
- tts_service.py              +33 -11
- whisper_live/trt_server.py   +6 -13
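Note: in outline, this commit adds a third stage to the pipeline. WhisperLive's transcription server now tags every prompt it publishes with an end-of-speech flag, the Mistral worker caches its last prompt/output and forwards generations to a new audio queue, and a WhisperSpeech TTS process started from main.py synthesizes that text and streams raw audio back to the client over a websocket. The queue payloads, as they appear in the diffs below (shapes shown for orientation only, not a formal schema):

    # Message shapes inferred from this commit; illustrative only.
    transcription_msg = {"uid": "<client uid>", "prompt": "<partial transcript>", "eos": False}   # trt_server.py -> llm_service.py
    audio_msg = {"llm_output": ["<generated text>"], "eos": False}                                # llm_service.py -> tts_service.py
    llm_msg = {"uid": "<client uid>", "llm_output": ["<generated text>"]}                         # llm_service.py -> llm_queue consumer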
llm_service.py    CHANGED

@@ -1,6 +1,9 @@
 import json
 from pathlib import Path
 from typing import Optional
+import logging
+logging.basicConfig(level = logging.INFO)
+
 import numpy as np
 import torch
 from transformers import AutoTokenizer

@@ -104,6 +107,8 @@ class MistralTensorRTLLM:
             debug_mode=False,
             lora_ckpt_source='hf')
         self.runner = self.runner_cls.from_dir(**self.runner_kwargs)
+        self.last_prompt = None
+        self.last_output = None

     def parse_input(
         self,

@@ -156,7 +161,7 @@ class MistralTensorRTLLM:
                 outputs = output_ids[batch_idx][beam][
                     output_begin:output_end].tolist()
                 output_text = self.tokenizer.decode(outputs)
-
+                logging.info(f"[LLM] output: {output_text}")
                 output.append(output_text)
         return output

@@ -177,7 +182,7 @@ class MistralTensorRTLLM:
         max_output_len=40,
         max_attention_window_size=4096,
         num_beams=1,
-        streaming=
+        streaming=False,
         streaming_interval=4,
         debug=False,
     ):

@@ -186,27 +191,26 @@ class MistralTensorRTLLM:
             tokenizer_path,
         )

-
+        logging.info("[LLM] loaded: True")
         while True:

            # Get the last transcription output from the queue
            transcription_output = transcription_queue.get()
            if transcription_queue.qsize() != 0:
-
+                logging.info("[LLM] interrupted by transcription queue!!!!!!!!!!!!!!!!!!!!!!!!")
                continue
-            # while True:
-            #     try:
-            #         transcription_output = transcription_queue.get_nowait()
-            #     except Exception as e:
-            #         print("[Queue] exception", e)
-            #         break
-
-            # transcription_output = transcription_queue.get()

            prompt = transcription_output['prompt'].strip()
            input_text=[self.format_prompt_qa(prompt)]
+            self.eos = transcription_output["eos"]

-
+            if self.last_prompt == prompt:
+                if self.last_output is not None:
+                    # logging.info(f"[LLM info:] Same prompt, adding last llm output to audio queue.")
+                    audio_queue.put({"llm_output": self.last_output, "eos": self.eos})
+                    continue
+
+            logging.info(f"[LLM INFO:] WhisperLive prompt: {prompt}, eos: {self.eos}")
            batch_input_ids = self.parse_input(
                input_text=input_text,
                add_special_tokens=True,

@@ -252,15 +256,16 @@ class MistralTensorRTLLM:
                    break
            # Interrupted by transcription queue
            if output is None:
-
+                logging.info(f"[LLM] interrupted by transcription queue!!!!!!!!!!!!!!!!!!!!!!!!")
                continue
            else:
                output_ids = outputs['output_ids']
                sequence_lengths = outputs['sequence_lengths']
                context_logits = None
                generation_logits = None
-                if runner.
+                if self.runner.gather_context_logits:
                    context_logits = outputs['context_logits']
+                if self.runner.gather_generation_logits:
                    generation_logits = outputs['generation_logits']
                output = self.decode_tokens(
                    output_ids,

@@ -268,8 +273,16 @@
                    sequence_lengths,
                    transcription_queue
                )
-
-
+
+                # if self.eos:
+                if output is not None:
+                    self.last_output = output
+                    self.last_prompt = prompt
+                    llm_queue.put({"uid": transcription_output["uid"], "llm_output": output})
+                    audio_queue.put({"llm_output": output, "eos": self.eos})
+
+                if self.eos:
+                    self.last_prompt = None


 if __name__=="__main__":

@@ -278,11 +291,11 @@ if __name__=="__main__":
        "/root/TensorRT-LLM/examples/llama/tmp/mistral/7B/trt_engines/fp16/1-gpu",
        "teknium/OpenHermes-2.5-Mistral-7B",
    )
-
+    logging.info("intialized")
    for i in range(1):
        output = llm(
            ["Born in north-east France, Soyer trained as a"], streaming=True
        )
-
+        logging.info(output)

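Note: the core of the new llm_service.py control flow is a drain-and-dedupe loop around the two queues. A minimal, self-contained sketch of that pattern follows (the worker function, parameter names, and the `generate` callable are illustrative, not the repo's exact API):

    def llm_worker(transcription_queue, audio_queue, generate):
        # Sketch of the pattern in llm_service.py: always act on the newest
        # transcription, reuse the cached answer for a repeated prompt, and
        # reset the cache once end-of-speech (eos) is reached.
        last_prompt, last_output = None, None
        while True:
            msg = transcription_queue.get()
            if transcription_queue.qsize() != 0:
                continue  # a newer transcription is already waiting; drop this one
            prompt, eos = msg["prompt"].strip(), msg["eos"]

            if prompt == last_prompt and last_output is not None:
                # Same prompt as last time: replay the cached output to TTS.
                audio_queue.put({"llm_output": last_output, "eos": eos})
                continue

            output = generate(prompt)   # stands in for the TensorRT-LLM generation call
            if output is not None:
                last_prompt, last_output = prompt, output
                audio_queue.put({"llm_output": output, "eos": eos})
            if eos:
                last_prompt = None      # utterance finished; start fresh on the next prompt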
main.py    CHANGED

@@ -105,10 +105,10 @@ if __name__ == "__main__":
     llm_process.start()

     # audio process
-
-
-
+    tts_runner = WhisperSpeechTTS()
+    tts_process = multiprocessing.Process(target=tts_runner.run, args=("0.0.0.0", 8888, audio_queue))
+    tts_process.start()

     llm_process.join()
     whisper_process.join()
-
+    tts_process.join()
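Note: the diff shows only the start and join of the new TTS process; `audio_queue` and the `WhisperSpeechTTS` import must already be set up earlier in main.py. A rough sketch of that surrounding wiring, under the assumption that plain multiprocessing queues are used (their actual construction is outside this diff):

    import multiprocessing
    from tts_service import WhisperSpeechTTS    # assumed import, matching the file in this commit

    if __name__ == "__main__":
        # Queues linking the three stages; creation is not shown in this commit.
        transcription_queue = multiprocessing.Queue()
        llm_queue = multiprocessing.Queue()
        audio_queue = multiprocessing.Queue()

        # ... whisper_process and llm_process are created and started as before ...

        tts_runner = WhisperSpeechTTS()
        tts_process = multiprocessing.Process(
            target=tts_runner.run, args=("0.0.0.0", 8888, audio_queue))
        tts_process.start()

        # ... llm_process.join(); whisper_process.join() ...
        tts_process.join()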
tts_service.py    CHANGED

@@ -1,4 +1,7 @@
 import functools
+import time
+import logging
+logging.basicConfig(level = logging.INFO)

 from websockets.sync.server import serve
 from whisperspeech.pipeline import Pipeline

@@ -9,8 +12,13 @@ class WhisperSpeechTTS:

     def initialize_model(self):
         self.pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')
+        self.last_llm_response = None
+
+    def run(self, host, port, audio_queue=None):
+        # initialize and warmup model
+        self.initialize_model()
+        for i in range(3): self.pipe.vocoder.decode(self.pipe.generate_atoks("Hello, I am warming up."))

-    def run(self, host, port=6080, audio_queue=None):
         with serve(
             functools.partial(self.start_whisperspeech_tts, audio_queue=audio_queue),
             host, port

@@ -18,19 +26,33 @@
             server.serve_forever()

     def start_whisperspeech_tts(self, websocket, audio_queue=None):
-        self.
+        self.eos = False
+        self.output_audio = None

         while True:
            if audio_queue.empty(): continue
-
-
-            audio = self.pipe.vocoder.decode(self.pipe.generate_atoks(llm_output.strip()))
-            audio = audio.cpu().numpy()
-            audio = audio * 32768.0
-
-            # send audio to client on another websocket
+
+            # check if this websocket exists
            try:
-                websocket.
+                websocket.ping()
            except Exception as e:
-
+                del websocket
+                break
+
+            llm_response = audio_queue.get()
+            llm_output = llm_response["llm_output"][0]
+            self.eos = llm_response["eos"]
+
+            # only process if the output updated
+            if self.last_llm_response != llm_output.strip():
+                logging.INFO("[WhisperSpeech INFO:] Tunning TTS inference ...")
+                audio = self.pipe.vocoder.decode(self.pipe.generate_atoks(llm_output.strip()))
+                self.output_audio = audio.cpu().numpy()
+                self.last_llm_response = llm_output.strip()
+
+            if self.eos and self.output_audio is not None:
+                try:
+                    websocket.send(self.output_audio.tobytes())
+                except Exception as e:
+                    logging.error("[WhisperSpeech INFO:] Audio error:", e)

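Note: once eos is seen, the TTS server pushes the synthesized speech to the client as the raw bytes of the float32 NumPy array returned by the vocoder (the earlier `* 32768.0` int16-style scaling was dropped). A minimal receiving-side sketch, assuming the port used in main.py (8888) and float32 samples; the sample rate is whatever the WhisperSpeech vocoder produces and is not specified by this diff:

    import numpy as np
    from websockets.sync.client import connect   # sync client from the same `websockets` package

    with connect("ws://localhost:8888") as ws:
        payload = ws.recv()                                  # bytes from websocket.send(output_audio.tobytes())
        audio = np.frombuffer(payload, dtype=np.float32)     # dtype assumed from audio.cpu().numpy()
        print(f"received {audio.shape[0]} samples")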
whisper_live/trt_server.py    CHANGED

@@ -150,6 +150,7 @@ class TranscriptionServer:
            except Exception as e:
                logging.error(e)
                return
+            print("[WhisperLive INFO:] adding frames ...")
            self.clients[websocket].add_frames(frame_np)

            elapsed_time = time.time() - self.clients_start_time[websocket]

@@ -379,6 +380,7 @@ class ServeClient:
            input_bytes = self.frames_np[int(samples_take):].copy()
            duration = input_bytes.shape[0] / self.RATE
            if duration<0.4:
+                time.sleep(0.01)    # 5ms sleep to wait for some voice active audio to arrive
                continue

            try:

@@ -401,23 +403,14 @@
            )
            logging.info(f"[INFO]: {segments}, eos: {self.eos}")

+            self.transcription_queue.put({"uid": self.client_uid, "prompt": self.prompt, "eos": self.eos})
            if self.eos:
                # self.append_segment(last_segment)
                self.timestamp_offset += duration
-
-
-                self.transcription_queue.put({"uid": self.client_uid, "prompt": self.prompt})
-
-                self.last_prompt = None
-                # self.set_eos(False)
-                logging.info(f"[INFO:] Processed : {self.timestamp_offset} seconds / {self.frames_np.shape[0] / self.RATE} seconds"
+                logging.info(
+                    f"[INFO:] Processed : {self.timestamp_offset} seconds / {self.frames_np.shape[0] / self.RATE} seconds"
                )
-
-            if self.last_prompt != self.prompt:
-                self.transcription_queue.put({"uid": self.client_uid, "prompt": self.prompt})
-
-                self.last_prompt = self.prompt
-
+


        except Exception as e:
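Note: the `time.sleep(0.01)` added before `continue` turns the duration check into a polling gate: if fewer than 0.4 s of new audio have accumulated, the loop backs off briefly instead of spinning. A standalone sketch of that gating pattern (function and parameter names are illustrative; the 0.01 s sleep and 0.4 s threshold mirror the diff):

    import time

    def wait_for_audio_chunk(get_frames, offset_samples, rate, min_duration=0.4):
        # `get_frames` returns the current float32 audio buffer and is expected to
        # grow over time as another thread appends frames, as ServeClient does.
        while True:
            pending = get_frames()[offset_samples:]
            if pending.shape[0] / rate >= min_duration:
                return pending.copy()
            time.sleep(0.01)   # back off briefly instead of busy-waiting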