Siddhant Arora committed
Commit • 330bd18
Parent(s): 38787ca
Update space
Files changed:
- LLM/__pycache__/chat.cpython-310.pyc +0 -0
- LLM/__pycache__/chat.cpython-39.pyc +0 -0
- LLM/mlx_language_model.py +97 -0
- VAD/__pycache__/vad_iterator.cpython-310.pyc +0 -0
- VAD/__pycache__/vad_iterator.cpython-39.pyc +0 -0
- app.py +96 -117
- flagged/log.csv +2 -0
- flagged/new_chunk/65327197a5439319f87d/audio.wav +0 -0
- main.js +74 -0
- mlx_models/distil-large-v3/config.json +13 -0
- mlx_models/distil-large-v3/weights.npz +3 -0
- record_button.js +40 -0
- recorder.js +112 -0
- requirements.txt +8 -4
LLM/__pycache__/chat.cpython-310.pyc
ADDED
Binary file (1.04 kB)

LLM/__pycache__/chat.cpython-39.pyc
ADDED
Binary file (1.03 kB)
LLM/mlx_language_model.py
ADDED
@@ -0,0 +1,97 @@
import logging
from LLM.chat import Chat
from baseHandler import BaseHandler
from mlx_lm import load, stream_generate, generate
from rich.console import Console
import torch

logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

console = Console()


class MLXLanguageModelHandler(BaseHandler):
    """
    Handles the language model part.
    """

    def setup(
        self,
        model_name="microsoft/Phi-3-mini-4k-instruct",
        device="mps",
        torch_dtype="float16",
        gen_kwargs={},
        user_role="user",
        chat_size=1,
        init_chat_role=None,
        init_chat_prompt="You are a helpful AI assistant.",
    ):
        self.model_name = model_name
        self.model, self.tokenizer = load(self.model_name)
        self.gen_kwargs = gen_kwargs

        self.chat = Chat(chat_size)
        if init_chat_role:
            if not init_chat_prompt:
                raise ValueError(
                    "An initial prompt needs to be specified when setting init_chat_role."
                )
            self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
        self.user_role = user_role

        self.warmup()

    def warmup(self):
        logger.info(f"Warming up {self.__class__.__name__}")

        dummy_input_text = "Write me a poem about Machine Learning."
        dummy_chat = [{"role": self.user_role, "content": dummy_input_text}]

        n_steps = 2

        for _ in range(n_steps):
            prompt = self.tokenizer.apply_chat_template(dummy_chat, tokenize=False)
            generate(
                self.model,
                self.tokenizer,
                prompt=prompt,
                max_tokens=self.gen_kwargs["max_new_tokens"],
                verbose=False,
            )

    def process(self, prompt):
        logger.debug("inferring language model...")

        self.chat.append({"role": self.user_role, "content": prompt})

        # Remove system messages if using a Gemma model
        if "gemma" in self.model_name.lower():
            chat_messages = [
                msg for msg in self.chat.to_list() if msg["role"] != "system"
            ]
        else:
            chat_messages = self.chat.to_list()

        prompt = self.tokenizer.apply_chat_template(
            chat_messages, tokenize=False, add_generation_prompt=True
        )
        output = ""
        curr_output = ""
        for t in stream_generate(
            self.model,
            self.tokenizer,
            prompt,
            max_tokens=self.gen_kwargs["max_new_tokens"],
        ):
            output += t
            curr_output += t
            if curr_output.endswith((".", "?", "!", "<|end|>")):
                yield curr_output.replace("<|end|>", "")
                curr_output = ""
        generated_text = output.replace("<|end|>", "")
        torch.mps.empty_cache()

        self.chat.append({"role": "assistant", "content": generated_text})
VAD/__pycache__/vad_iterator.cpython-310.pyc
ADDED
Binary file (2.98 kB)

VAD/__pycache__/vad_iterator.cpython-39.pyc
ADDED
Binary file (2.96 kB)
app.py
CHANGED
@@ -1,8 +1,43 @@
+# import base64
+# import pathlib
+# import tempfile
 import gradio as gr
+
+# recorder_js = pathlib.Path('recorder.js').read_text()
+# main_js = pathlib.Path('main.js').read_text()
+# record_button_js = pathlib.Path('record_button.js').read_text().replace('let recorder_js = null;', recorder_js).replace(
+#     'let main_js = null;', main_js)
+
+
+# def save_base64_video(base64_string):
+#     base64_video = base64_string
+#     video_data = base64.b64decode(base64_video)
+#     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+#         temp_filename = temp_file.name
+#         temp_file.write(video_data)
+#     print(f"Temporary MP4 file saved as: {temp_filename}")
+#     return temp_filename
+# import os
+
+# os.system('python -m unidic download')
 from transformers import pipeline
 import numpy as np
 from VAD.vad_iterator import VADIterator
 import torch
+import librosa
+from mlx_lm import load, stream_generate, generate
+from LLM.chat import Chat
+from lightning_whisper_mlx import LightningWhisperMLX
+from melo.api import TTS
+
+LM_model, LM_tokenizer = load("mlx-community/SmolLM-360M-Instruct")
+chat = Chat(2)
+chat.init_chat({"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise responses of less than 20 words."})
+user_role = "user"
+
+tts_model = TTS(language="EN_NEWEST", device="auto")
+speaker_id = tts_model.hps.data.spk2id["EN-Newest"]
+blocksize = 512
 
 def int2float(sound):
     """
@@ -16,10 +51,13 @@ def int2float(sound):
     sound = sound.squeeze()  # depends on the use case
     return sound
 
-
-
+text_str=""
+audio_output = None
+min_speech_ms=500
+max_speech_ms=float("inf")
+ASR_model = LightningWhisperMLX(model="distil-large-v3", batch_size=6, quant=None)
 transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
-vad_model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
+vad_model, _ = torch.hub.load("snakers4/silero-vad:v4.0", "silero_vad")
 vad_iterator = VADIterator(
     vad_model,
     threshold=0.3,
@@ -31,131 +69,72 @@ vad_iterator = VADIterator(
 
 def transcribe(stream, new_chunk):
     sr, y = new_chunk
-
-
+    global text_str
+    global chat
+    global user_role
+    global audio_output
+
     audio_int16 = np.frombuffer(y, dtype=np.int16)
     audio_float32 = int2float(audio_int16)
+    audio_float32=librosa.resample(audio_float32, orig_sr=sr, target_sr=16000)
+    sr=16000
+    print(sr)
+    print(audio_float32.shape)
     vad_output = vad_iterator(torch.from_numpy(audio_float32))
+
     if vad_output is not None and len(vad_output) != 0:
-
+        print("VAD: end of speech detected")
         array = torch.cat(vad_output).cpu().numpy()
         duration_ms = len(array) / sr * 1000
         if (not(duration_ms < min_speech_ms or duration_ms > max_speech_ms)):
-
-
-
-
+            prompt=ASR_model.transcribe(array)["text"].strip()
+            chat.append({"role": user_role, "content": prompt})
+            chat_messages = chat.to_list()
+            prompt = LM_tokenizer.apply_chat_template(
+                chat_messages, tokenize=False, add_generation_prompt=True
+            )
+            output = generate(
+                LM_model,
+                LM_tokenizer,
+                prompt,
+                max_tokens=128,
+            )
+            # import pdb;pdb.set_trace()
+            generated_text = output.replace("<|end|>", "")
+            torch.mps.empty_cache()
+
+            chat.append({"role": "assistant", "content": generated_text})
+            text_str=generated_text
+            # import pdb;pdb.set_trace()
+            audio_chunk = tts_model.tts_to_file(text_str, speaker_id, quiet=True)
+            audio_chunk = (audio_chunk * 32768).astype(np.int16)
+            audio_output=(44100, audio_chunk)
+    # else:
+    #     audio_output=None
+    text_str1=text_str
+
+    return stream, text_str1, audio_output
 
 demo = gr.Interface(
     transcribe,
     ["state", gr.Audio(sources=["microphone"], streaming=True, waveform_options=gr.WaveformOptions(sample_rate=16000))],
-    ["state", "text", gr.Audio(label="Output",
+    ["state", "text", gr.Audio(label="Output", autoplay=True)],
     live=True,
 )
-
+# with demo:
+#     start_button = gr.Button("Record Screen 🔴")
+#     video_component = gr.Video(interactive=True, show_share_button=True, include_audio=True)
+
+
+# def toggle_button_label(returned_string):
+#     if returned_string.startswith("Record"):
+#         return gr.Button(value="Stop Recording ⚪"), None
+#     else:
+#         try:
+#             temp_filename = save_base64_video(returned_string)
+#         except Exception as e:
+#             return gr.Button(value="Record Screen 🔴"), gr.Warning(f'Failed to convert video to mp4:\n{e}')
+#         return gr.Button(value="Record Screen 🔴"), gr.Video(value=temp_filename, interactive=True,
+#                                                              show_share_button=True)
+# start_button.click(toggle_button_label, start_button, [start_button, video_component], js=record_button_js)
 demo.launch()
-# from transformers import pipeline
-# import torch
-
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# classifier = pipeline(
-#     "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
-# )
-
-# from transformers.pipelines.audio_utils import ffmpeg_microphone_live
-
-
-# def launch_fn(
-#     wake_word="marvin",
-#     prob_threshold=0.5,
-#     chunk_length_s=2.0,
-#     stream_chunk_s=0.25,
-#     debug=False,
-# ):
-#     if wake_word not in classifier.model.config.label2id.keys():
-#         raise ValueError(
-#             f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
-#         )
-
-#     sampling_rate = classifier.feature_extractor.sampling_rate
-
-#     mic = ffmpeg_microphone_live(
-#         sampling_rate=sampling_rate,
-#         chunk_length_s=chunk_length_s,
-#         stream_chunk_s=stream_chunk_s,
-#     )
-
-#     print("Listening for wake word...")
-#     for prediction in classifier(mic):
-#         prediction = prediction[0]
-#         if debug:
-#             print(prediction)
-#         if prediction["label"] == wake_word:
-#             if prediction["score"] > prob_threshold:
-#                 return True
-
-# transcriber = pipeline(
-#     "automatic-speech-recognition", model="openai/whisper-base.en", device=device
-# )
-# import sys
-
-
-# def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
-#     sampling_rate = transcriber.feature_extractor.sampling_rate
-
-#     mic = ffmpeg_microphone_live(
-#         sampling_rate=sampling_rate,
-#         chunk_length_s=chunk_length_s,
-#         stream_chunk_s=stream_chunk_s,
-#     )
-
-#     print("Start speaking...")
-#     for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
-#         sys.stdout.write("\033[K")
-#         print(item["text"], end="\r")
-#         if not item["partial"][0]:
-#             break
-
-#     return item["text"]
-
-# from huggingface_hub import HfFolder
-# import requests
-
-
-# def query(text, model_id="tiiuae/falcon-7b-instruct"):
-#     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
-#     headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
-#     payload = {"inputs": text}
-
-#     print(f"Querying...: {text}")
-#     response = requests.post(api_url, headers=headers, json=payload)
-#     return response.json()[0]["generated_text"][len(text) + 1 :]
-
-# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-
-# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-
-# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
-# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-
-# from datasets import load_dataset
-
-# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-# def synthesise(text):
-#     inputs = processor(text=text, return_tensors="pt")
-#     speech = model.generate_speech(
-#         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
-#     )
-#     return speech.cpu()
-
-
-# if __name__ == "__main__":
-#     launch_fn(debug=True)
-#     # transcription = transcribe()
-#     # response = query(transcription)
-#     # audio = synthesise(response)
-
-#     # Audio(audio, rate=16000, autoplay=True)
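The body of int2float is collapsed in this diff (only its docstring opening and last two lines appear as context). For reference, a typical int16-to-float32 conversion used ahead of Silero VAD looks like the sketch below; this is an assumption about the collapsed lines, not necessarily the exact code in this repo.

import numpy as np

def int2float(sound: np.ndarray) -> np.ndarray:
    """Convert int16 PCM samples to float32 in [-1, 1] for the VAD."""
    sound = sound.astype("float32")
    abs_max = np.abs(sound).max()
    if abs_max > 0:
        sound *= 1.0 / 32768.0
    sound = sound.squeeze()  # depends on the use case
    return sound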
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
stream,new_chunk,stream,output 1,flag,username,timestamp
,flagged/new_chunk/65327197a5439319f87d/audio.wav,,,,,2024-09-07 15:26:52.280189

flagged/new_chunk/65327197a5439319f87d/audio.wav
ADDED
Binary file (34.6 kB)
main.js
ADDED
@@ -0,0 +1,74 @@
// main.js
if (!ScreenCastRecorder.isSupportedBrowser()) {
    console.error("Screen Recording not supported in this browser");
}
let recorder;
let outputBlob;
const stopRecording = () => __awaiter(void 0, void 0, void 0, function* () {
    let currentState = "RECORDING";
    // We should do nothing if the user tries to stop recording when it is not started
    if (currentState === "OFF" || recorder == null) {
        return;
    }
    // if (currentState === "COUNTDOWN") {
    //     this.setState({
    //         currentState: "OFF",
    //     })
    // }
    if (currentState === "RECORDING") {
        if (recorder.getState() === "inactive") {
            // this.setState({
            //     currentState: "OFF",
            // })
            console.log("Inactive");
        }
        else {
            outputBlob = yield recorder.stop();
            console.log("Done recording");
            // this.setState({
            //     outputBlob,
            //     currentState: "PREVIEW_FILE",
            // })
            window.currentState = "PREVIEW_FILE";
            const videoSource = URL.createObjectURL(outputBlob);
            window.videoSource = videoSource;
            const fileName = "recording";
            const link = document.createElement("a");
            link.setAttribute("href", videoSource);
            link.setAttribute("download", `${fileName}.webm`);
            link.click();
        }
    }
});
const startRecording = () => __awaiter(void 0, void 0, void 0, function* () {
    const recordAudio = true;
    recorder = new ScreenCastRecorder({
        recordAudio,
        onErrorOrStop: () => stopRecording(),
    });
    try {
        yield recorder.initialize();
    }
    catch (e) {
        console.warn(`ScreenCastRecorder.initialize error: ${e}`);
        // this.setState({ currentState: "UNSUPPORTED" })
        window.currentState = "UNSUPPORTED";
        return;
    }
    // this.setState({ currentState: "COUNTDOWN" })
    const hasStarted = recorder.start();
    if (hasStarted) {
        // this.setState({
        //     currentState: "RECORDING",
        // })
        console.log("Started recording");
        window.currentState = "RECORDING";
    }
    else {
        stopRecording().catch(err => console.warn(`withScreencast.stopRecording threw an error: ${err}`));
    }
});

// Set global functions to window.
window.startRecording = startRecording;
window.stopRecording = stopRecording;
mlx_models/distil-large-v3/config.json
ADDED
@@ -0,0 +1,13 @@
{
    "n_mels": 128,
    "n_audio_ctx": 1500,
    "n_audio_state": 1280,
    "n_audio_head": 20,
    "n_audio_layer": 32,
    "n_vocab": 51866,
    "n_text_ctx": 448,
    "n_text_state": 1280,
    "n_text_head": 20,
    "n_text_layer": 2,
    "model_type": "whisper"
}
mlx_models/distil-large-v3/weights.npz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8fd01bf050289525f91ff3d96e2880381367a34beb3520ad516181517b209ebc
size 1509130112
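These two files are the MLX export of Whisper distil-large-v3 used by lightning-whisper-mlx: config.json describes the encoder/decoder shapes and weights.npz (tracked via Git LFS) holds the parameters. A minimal sketch of exercising them, reusing the exact constructor and transcribe() call from app.py and assuming the library resolves "distil-large-v3" to this ./mlx_models/distil-large-v3 directory:

import numpy as np
from lightning_whisper_mlx import LightningWhisperMLX

# Same call as in app.py; batch_size and quant values are taken from there.
asr = LightningWhisperMLX(model="distil-large-v3", batch_size=6, quant=None)

# One second of 16 kHz silence, just to confirm the weights load and the
# transcription path runs end to end (expect an empty or trivial result).
dummy = np.zeros(16000, dtype=np.float32)
print(asr.transcribe(dummy)["text"])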
record_button.js
ADDED
@@ -0,0 +1,40 @@
// Setup if needed and start recording.
async () => {
    // Set up recording functions if not already initialized
    if (!window.startRecording) {
        let recorder_js = null;
        let main_js = null;
    }

    // Function to fetch and convert video blob to base64 using async/await without explicit Promise
    async function getVideoBlobAsBase64(objectURL) {
        const response = await fetch(objectURL);
        if (!response.ok) {
            throw new Error('Failed to fetch video blob.');
        }

        const blob = await response.blob();

        const reader = new FileReader();
        reader.readAsDataURL(blob);

        return new Promise((resolve, reject) => {
            reader.onloadend = () => {
                if (reader.result) {
                    resolve(reader.result.split(',')[1]); // Return the base64 string (without data URI prefix)
                } else {
                    reject('Failed to convert blob to base64.');
                }
            };
        });
    }

    if (window.currentState === "RECORDING") {
        await window.stopRecording();
        const base64String = await getVideoBlobAsBase64(window.videoSource);
        return base64String;
    } else {
        window.startRecording();
        return "Record";
    }
}
recorder.js
ADDED
@@ -0,0 +1,112 @@
// recorder.js
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
const BLOB_TYPE = "video/webm";
class ScreenCastRecorder {
    /** True if the current browser likely supports screencasts. */
    static isSupportedBrowser() {
        return (navigator.mediaDevices != null &&
            navigator.mediaDevices.getUserMedia != null &&
            navigator.mediaDevices.getDisplayMedia != null &&
            MediaRecorder.isTypeSupported(BLOB_TYPE));
    }
    constructor({ recordAudio, onErrorOrStop }) {
        this.recordAudio = recordAudio;
        this.onErrorOrStopCallback = onErrorOrStop;
        this.inputStream = null;
        this.recordedChunks = [];
        this.mediaRecorder = null;
    }
    /**
     * This asynchronous method will initialize the screen recording object,
     * asking the user for the permissions needed to start recording.
     */
    initialize() {
        return __awaiter(this, void 0, void 0, function* () {
            const desktopStream = yield navigator.mediaDevices.getDisplayMedia({
                video: true,
            });
            let tracks = desktopStream.getTracks();
            if (this.recordAudio) {
                const voiceStream = yield navigator.mediaDevices.getUserMedia({
                    video: false,
                    audio: true,
                });
                tracks = tracks.concat(voiceStream.getAudioTracks());
            }
            this.recordedChunks = [];
            this.inputStream = new MediaStream(tracks);
            this.mediaRecorder = new MediaRecorder(this.inputStream, {
                mimeType: BLOB_TYPE,
            });
            this.mediaRecorder.ondataavailable = e => this.recordedChunks.push(e.data);
        });
    }
    getState() {
        if (this.mediaRecorder) {
            return this.mediaRecorder.state;
        }
        return "inactive";
    }
    /**
     * This method will start the screen recording if the user has granted permissions
     * and the mediaRecorder has been initialized
     *
     * @returns {boolean}
     */
    start() {
        if (!this.mediaRecorder) {
            console.warn(`ScreenCastRecorder.start: mediaRecorder is null`);
            return false;
        }
        const logRecorderError = (e) => {
            console.warn(`mediaRecorder.start threw an error: ${e}`);
        };
        this.mediaRecorder.onerror = (e) => {
            logRecorderError(e);
            this.onErrorOrStopCallback();
        };
        this.mediaRecorder.onstop = () => this.onErrorOrStopCallback();
        try {
            this.mediaRecorder.start();
        }
        catch (e) {
            logRecorderError(e);
            return false;
        }
        return true;
    }
    /**
     * This method will stop recording and then return the generated Blob
     *
     * @returns {(Promise|undefined)}
     *     A Promise which will return the generated Blob
     *     Undefined if the MediaRecorder could not initialize
     */
    stop() {
        if (!this.mediaRecorder) {
            return undefined;
        }
        let resolver;
        const promise = new Promise(r => {
            resolver = r;
        });
        this.mediaRecorder.onstop = () => resolver();
        this.mediaRecorder.stop();
        if (this.inputStream) {
            this.inputStream.getTracks().forEach(s => s.stop());
            this.inputStream = null;
        }
        return promise.then(() => this.buildOutputBlob());
    }
    buildOutputBlob() {
        return new Blob(this.recordedChunks, { type: BLOB_TYPE });
    }
}
requirements.txt
CHANGED
@@ -1,7 +1,11 @@
-huggingface_hub==0.
+huggingface_hub==0.23.2
 transformers[sentencepiece]
-sentencepiece
+sentencepiece==0.2.0
 datasets
-huggingface_hub
 torch==2.4.0
-torchaudio
+torchaudio
+librosa
+lightning-whisper-mlx>=0.0.10
+mlx-lm>=0.14.0
+melotts @ git+https://github.com/andimarafioti/MeloTTS.git#egg=MeloTTS # made a copy of MeloTTS to have compatible versions of transformers
+sounddevice==0.5.0