Spaces: phamngoctukts
Commit: Update app.py

app.py CHANGED
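Summary of the change (as reflected in the diff below): the unused ollama import is dropped, the Llama-3.2-11B-Vision-Instruct model is moved from CPU to CUDA, and the voice assistant gains image support — new image, message, history and textout fields on AppState, an input_image upload and output_image preview in the Gradio UI, history-aware prompt building for the vision model, token streaming via TextIteratorStreamer on a background Thread, and try/except guards around generation and the gTTS read-aloud step.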
@@ -1,5 +1,4 @@
 import speech_recognition as sr
-import ollama
 from gtts import gTTS
 import gradio as gr
 from io import BytesIO
@@ -14,22 +13,27 @@ from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 import torch
 from huggingface_hub import login
 import os
-
-
+from PIL import Image
+from threading import Thread
+tk = os.environ.get("HF_TOKEN")
+#login(tk)
 ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-model = MllamaForConditionalGeneration.from_pretrained(ckpt,
-    torch_dtype=torch.bfloat16).to("cpu")
+model = MllamaForConditionalGeneration.from_pretrained(ckpt,torch_dtype=torch.bfloat16).to("cuda")
 processor = AutoProcessor.from_pretrained(ckpt)
 r = sr.Recognizer()
 
 @dataclass
 class AppState:
     stream: np.ndarray | None = None
+    image: dict = field(default_factory=dict)
     sampling_rate: int = 0
     pause_detected: bool = False
     started_talking: bool = False
     stopped: bool = False
+    message: dict = field(default_factory=dict)
+    history: list = field(default_factory=list)
     conversation: list = field(default_factory=list)
+    textout: str = ""
 
 def run_vad(ori_audio, sr):
     _st = time.time()
@@ -57,8 +61,8 @@ def run_vad(ori_audio, sr):
     print(msg)
     return -1, ori_audio, round(time.time() - _st, 4)
 
-def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
-    """
+def determine_pause(audio:np.ndarray,sampling_rate:int,state:AppState) -> bool:
+    """Phát hiện tạm dừng trong âm thanh."""
     temp_audio = audio
     dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
     duration = len(audio) / sampling_rate
@@ -69,19 +73,24 @@ def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
     return (duration - dur_vad) > 1
 
-def process_audio(audio:tuple, state:AppState):
+def process_audio(audio:tuple, image: Image, state:AppState):
     if state.stream is None:
         state.stream = audio[1]
         state.sampling_rate = audio[0]
     else:
         state.stream = np.concatenate((state.stream, audio[1]))
+    if image is None:
+        state.image = {"file":""}
+    else:
+        state.image = {"file":str(image)}
     pause_detected = determine_pause(state.stream, state.sampling_rate, state)
     state.pause_detected = pause_detected
     if state.pause_detected and state.started_talking:
         return gr.Audio(recording=False), state
     return None, state
 
-def response(state:AppState, message, history, max_new_tokens=250):
+def response(state:AppState = AppState()):
+    max_new_tokens = 1024
     if not state.pause_detected and not state.started_talking:
         return None, AppState()
     audio_buffer = BytesIO()
@@ -99,70 +108,81 @@ def response(state:AppState, message, history, max_new_tokens=250):
         textin=r.recognize_google(audio_data,language='vi')
     except:
         textin = ""
-    state.conversation.append({"role": "user", "content": "Bạn: " + textin})
+    #state.conversation.append({"role": "user", "content": "Bạn: " + textin})
+    textout = ""
     if textin != "":
         print("Đang nghĩ...")
-
-
+        state.message = {}
+        state.message={"text": textin,"files": state.image["file"]}
 
-
+        # phần phiên dịch
+        txt = state.message["text"]
+        messages= []
+        images = []
+        for i, msg in enumerate(state.history):
             if isinstance(msg[0], tuple):
-                messages.append({"role": "user", "content": [{"type": "text", "text": history[i
-                messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i
+                messages.append({"role": "user", "content": [{"type": "text", "text": state.history[i][0]}, {"type": "image"}]})
+                messages.append({"role": "assistant", "content": [{"type": "text", "text": state.history[i][1]}]})
                 images.append(Image.open(msg[0][0]).convert("RGB"))
-            elif isinstance(history[i
+            elif isinstance(state.history[i], tuple) and isinstance(msg[0], str):
                 # messages are already handled
                 pass
-            elif isinstance(history[i
+            elif isinstance(state.history[i][0], str) and isinstance(msg[0], str): # text only turn
                 messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
                 messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
 
         # add current message
-        if
-
-            image = Image.open(message["files"][0]).convert("RGB")
-        else: # regular input
-            image = Image.open(message["files"][0]["path"]).convert("RGB")
+        if state.message["files"] != "": # examples
+            image = Image.open(state.message["files"]).convert("RGB")
             images.append(image)
             messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
-        else:
+        else: # regular input
             messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
-
-
-
-
-
-
+        try:
+            texts = processor.apply_chat_template(messages, add_generation_prompt=True)
+            if images == []:
+                inputs = processor(text=texts, return_tensors="pt").to("cuda")
+            else:
+                inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
+            streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
+            generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+            thread = Thread(target=model.generate, kwargs=generation_kwargs)
+            thread.start()
+            buffer = ""
+            for new_text in streamer:
+                buffer += new_text
+                time.sleep(0.01)
+            state.textout=buffer
+            textout=buffer
+        except:
+            print("Chưa lấy được thông tin dịch")
+        if state.message["files"] != "":
+            state.history.append([(txt,state.image["file"]),buffer])
+            state.conversation.append({"role":"user","content":"Bạn: " + str(txt) + str(state.image["file"])})
+            state.conversation.append({"role":"assistant", "content": "Bot: " + str(buffer)})
         else:
-
-
-
-
+            state.history.append([txt,buffer])
+            state.conversation.append({"role": "user", "content":"Bạn: " + str(txt)})
+            state.conversation.append({"role": "assistant", "content":"Bot: " + str(buffer)})
+    else:
+        textout = "Tôi không nghe rõ"
 
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
 
-
-
-
-
-        yield buffer
-
-
-    textout = generated_text.replace('*','')
-    state.conversation.append({"role": "user", "content": "Trợ lý: " + textout})
-    if textout != "":
-        print("Đang đọc...")
+    #phần đọc chữ đã dịch
+    ssr = state.stream.tobytes()
+    print("Đang đọc...")
+    try:
         mp3 = gTTS(textout,tld='com.vn',lang='vi',slow=False)
         mp3_fp = BytesIO()
         mp3.write_to_fp(mp3_fp)
         srr=mp3_fp.getvalue()
+    except:
+        print("Lỗi không đọc được")
+    finally:
         mp3_fp.close()
-
-    yield srr, AppState(conversation=state.conversation)
+    yield srr, AppState(conversation=state.conversation, history=state.history)
 
-def start_recording_user(state:
+def start_recording_user(state:AppState): # Sửa lỗi tại đây
     if not state.stopped:
         return gr.Audio(recording=True)
 
@@ -173,14 +193,16 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             input_audio = gr.Audio(label="Nói cho tôi nghe nào", sources="microphone", type="numpy")
+            input_image = gr.Image(label="Hình ảnh của bạn", sources="upload", type="filepath")
         with gr.Column():
            chatbot = gr.Chatbot(label="Nội dung trò chuyện", type="messages")
            output_audio = gr.Audio(label="Trợ lý", autoplay=True)
+    with gr.Row():
+        output_image = gr.Image(label="Hình ảnh sau xử lý", sources="clipboard", type="filepath",visible=True)
     state = gr.State(value=AppState())
-
     stream = input_audio.stream(
         process_audio,
-        [input_audio, state],
+        [input_audio, input_image, state],
         [input_audio, state],
         stream_every=0.50,
         time_limit=30,
@@ -191,10 +213,10 @@ with gr.Blocks() as demo:
         [output_audio, state],
     )
     respond.then(lambda s: s.conversation, [state], [chatbot])
+    respond.then(lambda s: s.image, [state], [output_image])
    restart = output_audio.stop(
        start_recording_user,
-        [state],
+        [state, input_image],
        [input_audio],
    )
    cancel = gr.Button("Stop Conversation", variant="stop")
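For reference, below is a minimal, self-contained sketch of the TextIteratorStreamer + background Thread pattern that the updated response() relies on to stream tokens while generation is still running. It is only an illustration: it uses a small text-only checkpoint (gpt2) and a tokenizer instead of the Space's meta-llama/Llama-3.2-11B-Vision-Instruct with its AutoProcessor, so it runs anywhere without a GPU.

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder checkpoint for the sketch; the Space loads
# MllamaForConditionalGeneration + AutoProcessor for the vision model instead.
ckpt = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt)

inputs = tokenizer("Hello, ", return_tensors="pt")

# skip_prompt=True keeps the echoed prompt out of the stream, as in app.py.
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=50)

# generate() blocks, so it runs on a worker thread while the main thread
# consumes decoded text chunks from the streamer as they arrive.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text
    print(new_text, end="", flush=True)
thread.join()

In app.py the same loop accumulates the streamed text into buffer, which is then stored on AppState and passed to gTTS for the spoken reply.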