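# Gradio speech-to-speech demo: streamed microphone audio -> Silero VAD
# -> Whisper ASR -> SmolLM chat response -> MeloTTS playback.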
# import base64
# import pathlib
# import tempfile
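# The unidic dictionary and NLTK POS tagger are downloaded at startup for MeloTTS.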
import os
os.system("python -m unidic download")
import nltk
nltk.download('averaged_perceptron_tagger_eng')
import gradio as gr

# recorder_js = pathlib.Path('recorder.js').read_text()
# main_js = pathlib.Path('main.js').read_text()
# record_button_js = pathlib.Path('record_button.js').read_text().replace('let recorder_js = null;', recorder_js).replace(
#     'let main_js = null;', main_js)


# def save_base64_video(base64_string):
#     base64_video = base64_string
#     video_data = base64.b64decode(base64_video)
#     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
#         temp_filename = temp_file.name
#         temp_file.write(video_data)
#     print(f"Temporary MP4 file saved as: {temp_filename}")
#     return temp_filename
# import os

# os.system('python -m unidic download')
import numpy as np
from VAD.vad_iterator import VADIterator
import torch
import librosa
# from mlx_lm import load, stream_generate, generate
from LLM.chat import Chat
# from lightning_whisper_mlx import LightningWhisperMLX
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
from melo.api import TTS

# LM_model, LM_tokenizer = load("mlx-community/SmolLM-360M-Instruct")
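# Conversation history buffer seeded with a concise-assistant system prompt.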
chat = Chat(2)
chat.init_chat({"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise responses of less than 20 words."})
user_role = "user"

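# MeloTTS synthesis model; the dummy call below warms it up before serving.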
tts_model = TTS(language="EN_NEWEST", device="auto")
speaker_id = tts_model.hps.data.spk2id["EN-Newest"]
blocksize = 512
tts_model.tts_to_file("text", speaker_id, quiet=True)
# Warm up Whisper on a dummy clip and time the forward pass with CUDA events.
dummy_input = torch.randn(
    (3000,), dtype=torch.float16, device="cpu"
).numpy()
transcriber = pipeline(
    "automatic-speech-recognition", model="openai/whisper-tiny.en", device="cuda"
)
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
transcriber({"sampling_rate": 16000, "raw": dummy_input})["text"]
end_event.record()
torch.cuda.synchronize()

def int2float(sound):
    """
    Taken from https://github.com/snakers4/silero-vad
    """

    abs_max = np.abs(sound).max()
    sound = sound.astype("float32")
    if abs_max > 0:
        sound *= 1 / 32768
    sound = sound.squeeze()  # depends on the use case
    return sound

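# Global state and speech-duration bounds used by the streaming callback.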
text_str=""
audio_output = None
min_speech_ms=500
max_speech_ms=float("inf")
# ASR_model = LightningWhisperMLX(model="distil-large-v3", batch_size=6, quant=None)
# ASR_processor = AutoProcessor.from_pretrained("distil-whisper/distil-large-v3")
# ASR_model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     "distil-whisper/distil-large-v3",
#     torch_dtype="float16",
# ).to("cpu")
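# SmolLM-360M-Instruct text-generation pipeline, warmed up and timed with CUDA events.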
LM_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M-Instruct")
LM_model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM-360M-Instruct", torch_dtype="float16", trust_remote_code=True
).to("cuda")
LM_pipe = pipeline(
    "text-generation", model=LM_model, tokenizer=LM_tokenizer, device="cuda"
)
dummy_input_text = "Write me a poem about Machine Learning."
dummy_chat = [{"role": "user", "content": dummy_input_text}]
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
LM_pipe(
    dummy_chat,
    max_new_tokens=32,
    min_new_tokens=0,
    temperature=0.0,
    do_sample=False,
)
end_event.record()
torch.cuda.synchronize()
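# Silero VAD iterator for detecting end of speech in the incoming stream.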
vad_model, _ = torch.hub.load("snakers4/silero-vad:v4.0", "silero_vad")
vad_iterator = VADIterator(
    vad_model,
    threshold=0.3,
    sampling_rate=16000,
    min_silence_duration_ms=250,
    speech_pad_ms=500,
)

import time
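# Streaming callback: each chunk is resampled to 16 kHz and fed to the VAD;
# when an utterance ends, it is transcribed, answered by the LLM, and spoken
# back with MeloTTS.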
def transcribe(stream, new_chunk):
    sr, y = new_chunk
    global text_str
    global chat
    global user_role
    global audio_output
    
    audio_int16 = np.frombuffer(y, dtype=np.int16)
    audio_float32 = int2float(audio_int16)
    # Resample the incoming chunk to the 16 kHz rate expected by the VAD and Whisper.
    audio_float32 = librosa.resample(audio_float32, orig_sr=sr, target_sr=16000)
    sr = 16000
    print(sr)
    print(audio_float32.shape)
    vad_output = vad_iterator(torch.from_numpy(audio_float32))
    
    if vad_output is not None and len(vad_output) != 0:
        print("VAD: end of speech detected")
        array = torch.cat(vad_output).cpu().numpy()
        duration_ms = len(array) / sr * 1000
        if min_speech_ms <= duration_ms <= max_speech_ms:
            # input_features = ASR_processor(
            #     array, sampling_rate=16000, return_tensors="pt"
            # ).input_features
            # print(input_features)
            # input_features = input_features.to("cpu", dtype=getattr(torch, "float16"))
            # pred_ids = ASR_model.generate(input_features, max_new_tokens=128, min_new_tokens=0, num_beams=1, return_timestamps=False,task="transcribe",language="en")
            # print(pred_ids)
            # prompt = ASR_processor.batch_decode(
            #     pred_ids, skip_special_tokens=True, decode_with_timestamps=False
            # )[0]
            # ASR: transcribe the detected utterance with Whisper.
            start_time = time.time()
            prompt = transcriber({"sampling_rate": sr, "raw": array})["text"]
            print(prompt)
            print("--- %s seconds ---" % (time.time() - start_time))
            # prompt=ASR_model.transcribe(array)["text"].strip()

            # LLM: append the user turn and generate a short assistant reply.
            chat.append({"role": user_role, "content": prompt})
            chat_messages = chat.to_list()
            output = LM_pipe(
                chat_messages,
                max_new_tokens=64,
                min_new_tokens=0,
                temperature=0.0,
                do_sample=False,
            )
            print("--- %s seconds ---" % (time.time() - start_time))
            generated_text = output[0]["generated_text"][-1]["content"]

            # torch.mps.empty_cache()

            chat.append({"role": "assistant", "content": generated_text})
            text_str = generated_text

            # TTS: synthesize the reply with MeloTTS and convert to 16-bit PCM.
            audio_chunk = tts_model.tts_to_file(text_str, speaker_id, quiet=True)
            audio_chunk = (audio_chunk * 32768).astype(np.int16)
            audio_output = (44100, audio_chunk)
            print("--- %s seconds ---" % (time.time() - start_time))
    # else:
    #     audio_output=None
    text_str1 = text_str
    
    return stream, text_str1, audio_output

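# Live Gradio interface: streamed microphone audio in; assistant text and synthesized speech out.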
demo = gr.Interface(
    transcribe,
    ["state", gr.Audio(sources=["microphone"], streaming=True, waveform_options=gr.WaveformOptions(sample_rate=16000))],
    ["state", "text", gr.Audio(label="Output", autoplay=True)],
    live=True,
)
# with demo:
#     start_button = gr.Button("Record Screen 🔴")
#     video_component = gr.Video(interactive=True, show_share_button=True, include_audio=True)


#     def toggle_button_label(returned_string):
#         if returned_string.startswith("Record"):
#             return gr.Button(value="Stop Recording ⚪"), None
#         else:
#             try:
#                 temp_filename = save_base64_video(returned_string)
#             except Exception as e:
#                 return gr.Button(value="Record Screen 🔴"), gr.Warning(f'Failed to convert video to mp4:\n{e}')
#             return gr.Button(value="Record Screen 🔴"), gr.Video(value=temp_filename, interactive=True,
#                                                                 show_share_button=True)
#     start_button.click(toggle_button_label, start_button, [start_button, video_component], js=record_button_js)
demo.launch(share=True)