Siddhant Arora committed
Commit • 330bd18
Parent(s): 38787ca
Update space
Files changed:
- LLM/__pycache__/chat.cpython-310.pyc +0 -0
- LLM/__pycache__/chat.cpython-39.pyc +0 -0
- LLM/mlx_language_model.py +97 -0
- VAD/__pycache__/vad_iterator.cpython-310.pyc +0 -0
- VAD/__pycache__/vad_iterator.cpython-39.pyc +0 -0
- app.py +96 -117
- flagged/log.csv +2 -0
- flagged/new_chunk/65327197a5439319f87d/audio.wav +0 -0
- main.js +74 -0
- mlx_models/distil-large-v3/config.json +13 -0
- mlx_models/distil-large-v3/weights.npz +3 -0
- record_button.js +40 -0
- recorder.js +112 -0
- requirements.txt +8 -4
LLM/__pycache__/chat.cpython-310.pyc
ADDED
Binary file (1.04 kB)

LLM/__pycache__/chat.cpython-39.pyc
ADDED
Binary file (1.03 kB)
LLM/mlx_language_model.py
ADDED
@@ -0,0 +1,97 @@
import logging
from LLM.chat import Chat
from baseHandler import BaseHandler
from mlx_lm import load, stream_generate, generate
from rich.console import Console
import torch

logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

console = Console()


class MLXLanguageModelHandler(BaseHandler):
    """
    Handles the language model part.
    """

    def setup(
        self,
        model_name="microsoft/Phi-3-mini-4k-instruct",
        device="mps",
        torch_dtype="float16",
        gen_kwargs={},
        user_role="user",
        chat_size=1,
        init_chat_role=None,
        init_chat_prompt="You are a helpful AI assistant.",
    ):
        self.model_name = model_name
        self.model, self.tokenizer = load(self.model_name)
        self.gen_kwargs = gen_kwargs

        self.chat = Chat(chat_size)
        if init_chat_role:
            if not init_chat_prompt:
                raise ValueError(
                    "An initial prompt needs to be specified when setting init_chat_role."
                )
            self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
        self.user_role = user_role

        self.warmup()

    def warmup(self):
        logger.info(f"Warming up {self.__class__.__name__}")

        dummy_input_text = "Write me a poem about Machine Learning."
        dummy_chat = [{"role": self.user_role, "content": dummy_input_text}]

        n_steps = 2

        for _ in range(n_steps):
            prompt = self.tokenizer.apply_chat_template(dummy_chat, tokenize=False)
            generate(
                self.model,
                self.tokenizer,
                prompt=prompt,
                max_tokens=self.gen_kwargs["max_new_tokens"],
                verbose=False,
            )

    def process(self, prompt):
        logger.debug("inferring language model...")

        self.chat.append({"role": self.user_role, "content": prompt})

        # Remove system messages if using a Gemma model
        if "gemma" in self.model_name.lower():
            chat_messages = [
                msg for msg in self.chat.to_list() if msg["role"] != "system"
            ]
        else:
            chat_messages = self.chat.to_list()

        prompt = self.tokenizer.apply_chat_template(
            chat_messages, tokenize=False, add_generation_prompt=True
        )
        output = ""
        curr_output = ""
        for t in stream_generate(
            self.model,
            self.tokenizer,
            prompt,
            max_tokens=self.gen_kwargs["max_new_tokens"],
        ):
            output += t
            curr_output += t
            if curr_output.endswith((".", "?", "!", "<|end|>")):
                yield curr_output.replace("<|end|>", "")
                curr_output = ""
        generated_text = output.replace("<|end|>", "")
        torch.mps.empty_cache()

        self.chat.append({"role": "assistant", "content": generated_text})
VAD/__pycache__/vad_iterator.cpython-310.pyc
ADDED
Binary file (2.98 kB)

VAD/__pycache__/vad_iterator.cpython-39.pyc
ADDED
Binary file (2.96 kB)
app.py
CHANGED
@@ -1,8 +1,43 @@
+# import base64
+# import pathlib
+# import tempfile
 import gradio as gr
+
+# recorder_js = pathlib.Path('recorder.js').read_text()
+# main_js = pathlib.Path('main.js').read_text()
+# record_button_js = pathlib.Path('record_button.js').read_text().replace('let recorder_js = null;', recorder_js).replace(
+#     'let main_js = null;', main_js)
+
+
+# def save_base64_video(base64_string):
+#     base64_video = base64_string
+#     video_data = base64.b64decode(base64_video)
+#     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+#         temp_filename = temp_file.name
+#         temp_file.write(video_data)
+#     print(f"Temporary MP4 file saved as: {temp_filename}")
+#     return temp_filename
+# import os
+
+# os.system('python -m unidic download')
 from transformers import pipeline
 import numpy as np
 from VAD.vad_iterator import VADIterator
 import torch
+import librosa
+from mlx_lm import load, stream_generate, generate
+from LLM.chat import Chat
+from lightning_whisper_mlx import LightningWhisperMLX
+from melo.api import TTS
+
+LM_model, LM_tokenizer = load("mlx-community/SmolLM-360M-Instruct")
+chat = Chat(2)
+chat.init_chat({"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise responses of less than 20 words."})
+user_role = "user"
+
+tts_model = TTS(language="EN_NEWEST", device="auto")
+speaker_id = tts_model.hps.data.spk2id["EN-Newest"]
+blocksize = 512
 
 def int2float(sound):
     """
@@ -16,10 +51,13 @@ def int2float(sound):
     sound = sound.squeeze()  # depends on the use case
     return sound
 
-
-
+text_str=""
+audio_output = None
+min_speech_ms=500
+max_speech_ms=float("inf")
+ASR_model = LightningWhisperMLX(model="distil-large-v3", batch_size=6, quant=None)
 transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
-vad_model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
+vad_model, _ = torch.hub.load("snakers4/silero-vad:v4.0", "silero_vad")
 vad_iterator = VADIterator(
     vad_model,
     threshold=0.3,
@@ -31,131 +69,72 @@ vad_iterator = VADIterator(
 
 def transcribe(stream, new_chunk):
     sr, y = new_chunk
-
-
+    global text_str
+    global chat
+    global user_role
+    global audio_output
+
     audio_int16 = np.frombuffer(y, dtype=np.int16)
     audio_float32 = int2float(audio_int16)
+    audio_float32=librosa.resample(audio_float32, orig_sr=sr, target_sr=16000)
+    sr=16000
+    print(sr)
+    print(audio_float32.shape)
     vad_output = vad_iterator(torch.from_numpy(audio_float32))
+
     if vad_output is not None and len(vad_output) != 0:
-
+        print("VAD: end of speech detected")
         array = torch.cat(vad_output).cpu().numpy()
         duration_ms = len(array) / sr * 1000
         if (not(duration_ms < min_speech_ms or duration_ms > max_speech_ms)):
-
-
-
-
+            prompt=ASR_model.transcribe(array)["text"].strip()
+            chat.append({"role": user_role, "content": prompt})
+            chat_messages = chat.to_list()
+            prompt = LM_tokenizer.apply_chat_template(
+                chat_messages, tokenize=False, add_generation_prompt=True
+            )
+            output = generate(
+                LM_model,
+                LM_tokenizer,
+                prompt,
+                max_tokens=128,
+            )
+            # import pdb;pdb.set_trace()
+            generated_text = output.replace("<|end|>", "")
+            torch.mps.empty_cache()
+
+            chat.append({"role": "assistant", "content": generated_text})
+            text_str=generated_text
+            # import pdb;pdb.set_trace()
+            audio_chunk = tts_model.tts_to_file(text_str, speaker_id, quiet=True)
+            audio_chunk = (audio_chunk * 32768).astype(np.int16)
+            audio_output=(44100, audio_chunk)
+    # else:
+    #     audio_output=None
+    text_str1=text_str
+
+    return stream, text_str1, audio_output
 
 demo = gr.Interface(
     transcribe,
     ["state", gr.Audio(sources=["microphone"], streaming=True, waveform_options=gr.WaveformOptions(sample_rate=16000))],
-    ["state", "text", gr.Audio(label="Output",
+    ["state", "text", gr.Audio(label="Output", autoplay=True)],
     live=True,
 )
-
+# with demo:
+#     start_button = gr.Button("Record Screen 🔴")
+#     video_component = gr.Video(interactive=True, show_share_button=True, include_audio=True)
+
+
+# def toggle_button_label(returned_string):
+#     if returned_string.startswith("Record"):
+#         return gr.Button(value="Stop Recording ⚪"), None
+#     else:
+#         try:
+#             temp_filename = save_base64_video(returned_string)
+#         except Exception as e:
+#             return gr.Button(value="Record Screen 🔴"), gr.Warning(f'Failed to convert video to mp4:\n{e}')
+#         return gr.Button(value="Record Screen 🔴"), gr.Video(value=temp_filename, interactive=True,
+#                                                              show_share_button=True)
+# start_button.click(toggle_button_label, start_button, [start_button, video_component], js=record_button_js)
 demo.launch()
-# from transformers import pipeline
-# import torch
-
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# classifier = pipeline(
-#     "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
-# )
-
-# from transformers.pipelines.audio_utils import ffmpeg_microphone_live
-
-
-# def launch_fn(
-#     wake_word="marvin",
-#     prob_threshold=0.5,
-#     chunk_length_s=2.0,
-#     stream_chunk_s=0.25,
-#     debug=False,
-# ):
-#     if wake_word not in classifier.model.config.label2id.keys():
-#         raise ValueError(
-#             f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
-#         )
-
-#     sampling_rate = classifier.feature_extractor.sampling_rate
-
-#     mic = ffmpeg_microphone_live(
-#         sampling_rate=sampling_rate,
-#         chunk_length_s=chunk_length_s,
-#         stream_chunk_s=stream_chunk_s,
-#     )
-
-#     print("Listening for wake word...")
-#     for prediction in classifier(mic):
-#         prediction = prediction[0]
-#         if debug:
-#             print(prediction)
-#         if prediction["label"] == wake_word:
-#             if prediction["score"] > prob_threshold:
-#                 return True
-
-# transcriber = pipeline(
-#     "automatic-speech-recognition", model="openai/whisper-base.en", device=device
-# )
-# import sys
-
-
-# def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
-#     sampling_rate = transcriber.feature_extractor.sampling_rate
-
-#     mic = ffmpeg_microphone_live(
-#         sampling_rate=sampling_rate,
-#         chunk_length_s=chunk_length_s,
-#         stream_chunk_s=stream_chunk_s,
-#     )
-
-#     print("Start speaking...")
-#     for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
-#         sys.stdout.write("\033[K")
-#         print(item["text"], end="\r")
-#         if not item["partial"][0]:
-#             break
-
-#     return item["text"]
-
-# from huggingface_hub import HfFolder
-# import requests
-
-
-# def query(text, model_id="tiiuae/falcon-7b-instruct"):
-#     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
-#     headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
-#     payload = {"inputs": text}
-
-#     print(f"Querying...: {text}")
-#     response = requests.post(api_url, headers=headers, json=payload)
-#     return response.json()[0]["generated_text"][len(text) + 1 :]
-
-# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-
-# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-
-# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
-# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-
-# from datasets import load_dataset
-
-# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-# def synthesise(text):
-#     inputs = processor(text=text, return_tensors="pt")
-#     speech = model.generate_speech(
-#         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
-#     )
-#     return speech.cpu()
-
-
-# if __name__ == "__main__":
-#     launch_fn(debug=True)
-#     # transcription = transcribe()
-#     # response = query(transcription)
-#     # audio = synthesise(response)
-
-#     # Audio(audio, rate=16000, autoplay=True)
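The body of int2float is collapsed in this diff (only its docstring opening and last two lines appear as context). For reference, a typical int16-to-float32 conversion used ahead of Silero VAD looks like the sketch below; this is an assumption about the collapsed lines, not necessarily the exact code in this repo.

import numpy as np

def int2float(sound: np.ndarray) -> np.ndarray:
    """Convert int16 PCM samples to float32 in [-1, 1] for the VAD."""
    sound = sound.astype("float32")
    abs_max = np.abs(sound).max()
    if abs_max > 0:
        sound *= 1.0 / 32768.0
    sound = sound.squeeze()  # depends on the use case
    return sound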
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
stream,new_chunk,stream,output 1,flag,username,timestamp
,flagged/new_chunk/65327197a5439319f87d/audio.wav,,,,,2024-09-07 15:26:52.280189

flagged/new_chunk/65327197a5439319f87d/audio.wav
ADDED
Binary file (34.6 kB)
main.js
ADDED
@@ -0,0 +1,74 @@
// main.js
if (!ScreenCastRecorder.isSupportedBrowser()) {
    console.error("Screen Recording not supported in this browser");
}
let recorder;
let outputBlob;
const stopRecording = () => __awaiter(void 0, void 0, void 0, function* () {
    let currentState = "RECORDING";
    // We should do nothing if the user tries to stop recording when it is not started
    if (currentState === "OFF" || recorder == null) {
        return;
    }
    // if (currentState === "COUNTDOWN") {
    //     this.setState({
    //         currentState: "OFF",
    //     })
    // }
    if (currentState === "RECORDING") {
        if (recorder.getState() === "inactive") {
            // this.setState({
            //     currentState: "OFF",
            // })
            console.log("Inactive");
        }
        else {
            outputBlob = yield recorder.stop();
            console.log("Done recording");
            // this.setState({
            //     outputBlob,
            //     currentState: "PREVIEW_FILE",
            // })
            window.currentState = "PREVIEW_FILE";
            const videoSource = URL.createObjectURL(outputBlob);
            window.videoSource = videoSource;
            const fileName = "recording";
            const link = document.createElement("a");
            link.setAttribute("href", videoSource);
            link.setAttribute("download", `${fileName}.webm`);
            link.click();
        }
    }
});
const startRecording = () => __awaiter(void 0, void 0, void 0, function* () {
    const recordAudio = true;
    recorder = new ScreenCastRecorder({
        recordAudio,
        onErrorOrStop: () => stopRecording(),
    });
    try {
        yield recorder.initialize();
    }
    catch (e) {
        console.warn(`ScreenCastRecorder.initialize error: ${e}`);
        // this.setState({ currentState: "UNSUPPORTED" })
        window.currentState = "UNSUPPORTED";
        return;
    }
    // this.setState({ currentState: "COUNTDOWN" })
    const hasStarted = recorder.start();
    if (hasStarted) {
        // this.setState({
        //     currentState: "RECORDING",
        // })
        console.log("Started recording");
        window.currentState = "RECORDING";
    }
    else {
        stopRecording().catch(err => console.warn(`withScreencast.stopRecording threw an error: ${err}`));
    }
});

// Set global functions to window.
window.startRecording = startRecording;
window.stopRecording = stopRecording;
mlx_models/distil-large-v3/config.json
ADDED
@@ -0,0 +1,13 @@
{
    "n_mels": 128,
    "n_audio_ctx": 1500,
    "n_audio_state": 1280,
    "n_audio_head": 20,
    "n_audio_layer": 32,
    "n_vocab": 51866,
    "n_text_ctx": 448,
    "n_text_state": 1280,
    "n_text_head": 20,
    "n_text_layer": 2,
    "model_type": "whisper"
}
mlx_models/distil-large-v3/weights.npz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8fd01bf050289525f91ff3d96e2880381367a34beb3520ad516181517b209ebc
size 1509130112
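These two files are the MLX export of Whisper distil-large-v3 used by lightning-whisper-mlx: config.json describes the encoder/decoder shapes and weights.npz (tracked via Git LFS) holds the parameters. A minimal sketch of exercising them, reusing the exact constructor and transcribe() call from app.py and assuming the library resolves "distil-large-v3" to this ./mlx_models/distil-large-v3 directory:

import numpy as np
from lightning_whisper_mlx import LightningWhisperMLX

# Same call as in app.py; batch_size and quant values are taken from there.
asr = LightningWhisperMLX(model="distil-large-v3", batch_size=6, quant=None)

# One second of 16 kHz silence, just to confirm the weights load and the
# transcription path runs end to end (expect an empty or trivial result).
dummy = np.zeros(16000, dtype=np.float32)
print(asr.transcribe(dummy)["text"])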
record_button.js
ADDED
@@ -0,0 +1,40 @@
// Setup if needed and start recording.
async () => {
    // Set up recording functions if not already initialized
    if (!window.startRecording) {
        let recorder_js = null;
        let main_js = null;
    }

    // Function to fetch and convert video blob to base64 using async/await without explicit Promise
    async function getVideoBlobAsBase64(objectURL) {
        const response = await fetch(objectURL);
        if (!response.ok) {
            throw new Error('Failed to fetch video blob.');
        }

        const blob = await response.blob();

        const reader = new FileReader();
        reader.readAsDataURL(blob);

        return new Promise((resolve, reject) => {
            reader.onloadend = () => {
                if (reader.result) {
                    resolve(reader.result.split(',')[1]); // Return the base64 string (without data URI prefix)
                } else {
                    reject('Failed to convert blob to base64.');
                }
            };
        });
    }

    if (window.currentState === "RECORDING") {
        await window.stopRecording();
        const base64String = await getVideoBlobAsBase64(window.videoSource);
        return base64String;
    } else {
        window.startRecording();
        return "Record";
    }
}
recorder.js
ADDED
@@ -0,0 +1,112 @@
// recorder.js
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
const BLOB_TYPE = "video/webm";
class ScreenCastRecorder {
    /** True if the current browser likely supports screencasts. */
    static isSupportedBrowser() {
        return (navigator.mediaDevices != null &&
            navigator.mediaDevices.getUserMedia != null &&
            navigator.mediaDevices.getDisplayMedia != null &&
            MediaRecorder.isTypeSupported(BLOB_TYPE));
    }
    constructor({ recordAudio, onErrorOrStop }) {
        this.recordAudio = recordAudio;
        this.onErrorOrStopCallback = onErrorOrStop;
        this.inputStream = null;
        this.recordedChunks = [];
        this.mediaRecorder = null;
    }
    /**
     * This asynchronous method will initialize the screen recording object,
     * asking the user for the permissions needed to start recording.
     */
    initialize() {
        return __awaiter(this, void 0, void 0, function* () {
            const desktopStream = yield navigator.mediaDevices.getDisplayMedia({
                video: true,
            });
            let tracks = desktopStream.getTracks();
            if (this.recordAudio) {
                const voiceStream = yield navigator.mediaDevices.getUserMedia({
                    video: false,
                    audio: true,
                });
                tracks = tracks.concat(voiceStream.getAudioTracks());
            }
            this.recordedChunks = [];
            this.inputStream = new MediaStream(tracks);
            this.mediaRecorder = new MediaRecorder(this.inputStream, {
                mimeType: BLOB_TYPE,
            });
            this.mediaRecorder.ondataavailable = e => this.recordedChunks.push(e.data);
        });
    }
    getState() {
        if (this.mediaRecorder) {
            return this.mediaRecorder.state;
        }
        return "inactive";
    }
    /**
     * This method will start the screen recording if the user has granted permissions
     * and the mediaRecorder has been initialized
     *
     * @returns {boolean}
     */
    start() {
        if (!this.mediaRecorder) {
            console.warn(`ScreenCastRecorder.start: mediaRecorder is null`);
            return false;
        }
        const logRecorderError = (e) => {
            console.warn(`mediaRecorder.start threw an error: ${e}`);
        };
        this.mediaRecorder.onerror = (e) => {
            logRecorderError(e);
            this.onErrorOrStopCallback();
        };
        this.mediaRecorder.onstop = () => this.onErrorOrStopCallback();
        try {
            this.mediaRecorder.start();
        }
        catch (e) {
            logRecorderError(e);
            return false;
        }
        return true;
    }
    /**
     * This method will stop recording and then return the generated Blob
     *
     * @returns {(Promise|undefined)}
     *     A Promise which will return the generated Blob
     *     Undefined if the MediaRecorder could not initialize
     */
    stop() {
        if (!this.mediaRecorder) {
            return undefined;
        }
        let resolver;
        const promise = new Promise(r => {
            resolver = r;
        });
        this.mediaRecorder.onstop = () => resolver();
        this.mediaRecorder.stop();
        if (this.inputStream) {
            this.inputStream.getTracks().forEach(s => s.stop());
            this.inputStream = null;
        }
        return promise.then(() => this.buildOutputBlob());
    }
    buildOutputBlob() {
        return new Blob(this.recordedChunks, { type: BLOB_TYPE });
    }
}
requirements.txt
CHANGED
@@ -1,7 +1,11 @@
-huggingface_hub==0.
+huggingface_hub==0.23.2
 transformers[sentencepiece]
-sentencepiece
+sentencepiece==0.2.0
 datasets
-huggingface_hub
 torch==2.4.0
-torchaudio
+torchaudio
+librosa
+lightning-whisper-mlx>=0.0.10
+mlx-lm>=0.14.0
+melotts @ git+https://github.com/andimarafioti/MeloTTS.git#egg=MeloTTS # made a copy of MeloTTS to have compatible versions of transformers
+sounddevice==0.5.0