phamngoctukts committed
Commit 83498a5 · verified · Parent: 747b2d9

Update app.py

Files changed (1)
  1. app.py +76 -54
app.py CHANGED
@@ -1,5 +1,4 @@
 import speech_recognition as sr
-import ollama
 from gtts import gTTS
 import gradio as gr
 from io import BytesIO
@@ -14,22 +13,27 @@ from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 import torch
 from huggingface_hub import login
 import os
-login(HF_TOKEN)
-model_id = "meta-llama/Llama-3.2-1B"
+from PIL import Image
+from threading import Thread
+tk = os.environ.get("HF_TOKEN")
+#login(tk)
 ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-model = MllamaForConditionalGeneration.from_pretrained(ckpt,
-                                                       torch_dtype=torch.bfloat16).to("cpu")
+model = MllamaForConditionalGeneration.from_pretrained(ckpt,torch_dtype=torch.bfloat16).to("cuda")
 processor = AutoProcessor.from_pretrained(ckpt)
 r = sr.Recognizer()
 
 @dataclass
 class AppState:
     stream: np.ndarray | None = None
+    image: dict = field(default_factory=dict)
     sampling_rate: int = 0
     pause_detected: bool = False
     started_talking: bool = False
     stopped: bool = False
+    message: dict = field(default_factory=dict)
+    history: list = field(default_factory=list)
     conversation: list = field(default_factory=list)
+    textout: str = ""
 
 def run_vad(ori_audio, sr):
     _st = time.time()
@@ -57,8 +61,8 @@ def run_vad(ori_audio, sr):
         print(msg)
         return -1, ori_audio, round(time.time() - _st, 4)
 
-def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
-    """Take in the stream, determine if a pause happened"""
+def determine_pause(audio:np.ndarray,sampling_rate:int,state:AppState) -> bool:
+    """Detect a pause in the audio."""
     temp_audio = audio
     dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
     duration = len(audio) / sampling_rate
@@ -69,19 +73,24 @@ def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
     return (duration - dur_vad) > 1
 
-def process_audio(audio:tuple, state:AppState):
+def process_audio(audio:tuple, image: Image, state:AppState):
     if state.stream is None:
         state.stream = audio[1]
         state.sampling_rate = audio[0]
     else:
         state.stream = np.concatenate((state.stream, audio[1]))
+    if image is None:
+        state.image = {"file":""}
+    else:
+        state.image = {"file":str(image)}
     pause_detected = determine_pause(state.stream, state.sampling_rate, state)
     state.pause_detected = pause_detected
     if state.pause_detected and state.started_talking:
         return gr.Audio(recording=False), state
     return None, state
 
-def response(state:AppState, message, history, max_new_tokens=250):
+def response(state:AppState = AppState()):
+    max_new_tokens = 1024
     if not state.pause_detected and not state.started_talking:
         return None, AppState()
     audio_buffer = BytesIO()
@@ -99,70 +108,81 @@ def response(state:AppState, message, history, max_new_tokens=250):
         textin=r.recognize_google(audio_data,language='vi')
     except:
         textin = ""
-    state.conversation.append({"role": "user", "content": "Bạn: " + textin})
+    #state.conversation.append({"role": "user", "content": "Bạn: " + textin})
+    textout = ""
     if textin != "":
         print("Đang nghĩ...")
-        textout=str(text2text(textin))
-
+        state.message = {}
+        state.message={"text": textin,"files": state.image["file"]}
 
-        for i, msg in enumerate(history):
+        # translation part
+        txt = state.message["text"]
+        messages= []
+        images = []
+        for i, msg in enumerate(state.history):
             if isinstance(msg[0], tuple):
-                messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
-                messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
+                messages.append({"role": "user", "content": [{"type": "text", "text": state.history[i][0]}, {"type": "image"}]})
+                messages.append({"role": "assistant", "content": [{"type": "text", "text": state.history[i][1]}]})
                 images.append(Image.open(msg[0][0]).convert("RGB"))
-            elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+            elif isinstance(state.history[i], tuple) and isinstance(msg[0], str):
                 # messages are already handled
                 pass
-            elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
+            elif isinstance(state.history[i][0], str) and isinstance(msg[0], str): # text only turn
                 messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
                 messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
 
         # add current message
-        if len(message["files"]) == 1:
-            if isinstance(message["files"][0], str): # examples
-                image = Image.open(message["files"][0]).convert("RGB")
-            else: # regular input
-                image = Image.open(message["files"][0]["path"]).convert("RGB")
+        if state.message["files"] != "": # examples
+            image = Image.open(state.message["files"]).convert("RGB")
             images.append(image)
             messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
-        else:
+        else: # regular input
             messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
-
-
-        texts = processor.apply_chat_template(messages, add_generation_prompt=True)
-
-        if images == []:
-            inputs = processor(text=texts, return_tensors="pt").to("cpu")
+        try:
+            texts = processor.apply_chat_template(messages, add_generation_prompt=True)
+            if images == []:
+                inputs = processor(text=texts, return_tensors="pt").to("cuda")
+            else:
+                inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
+            streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
+            generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+            thread = Thread(target=model.generate, kwargs=generation_kwargs)
+            thread.start()
+            buffer = ""
+            for new_text in streamer:
+                buffer += new_text
                time.sleep(0.01)
+            state.textout=buffer
+            textout=buffer
+        except:
+            print("Chưa lấy được thông tin dịch")
+        if state.message["files"] != "":
+            state.history.append([(txt,state.image["file"]),buffer])
+            state.conversation.append({"role":"user","content":"Bạn: " + str(txt) + str(state.image["file"])})
+            state.conversation.append({"role":"assistant", "content": "Bot: " + str(buffer)})
         else:
-            inputs = processor(text=texts, images=images, return_tensors="pt").to("cpu")
-        streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
-        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-        generated_text = streamer
+            state.history.append([txt,buffer])
+            state.conversation.append({"role": "user", "content":"Bạn: " + str(txt)})
+            state.conversation.append({"role": "assistant", "content":"Bot: " + str(buffer)})
+    else:
+        textout = "Tôi không nghe rõ"
 
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
 
-        for new_text in streamer:
-            buffer += new_text
-            generated_text_without_prompt = buffer
-            time.sleep(0.01)
-            yield buffer
-
-
-        textout = generated_text.replace('*','')
-        state.conversation.append({"role": "user", "content": "Trợ lý: " + textout})
-    if textout != "":
-        print("Đang đọc...")
+    # read out the translated text
+    ssr = state.stream.tobytes()
+    print("Đang đọc...")
+    try:
         mp3 = gTTS(textout,tld='com.vn',lang='vi',slow=False)
         mp3_fp = BytesIO()
         mp3.write_to_fp(mp3_fp)
         srr=mp3_fp.getvalue()
+    except:
+        print("Lỗi không đọc được")
+    finally:
         mp3_fp.close()
-    #yield srr, state
-    yield srr, AppState(conversation=state.conversation)
+    yield srr, AppState(conversation=state.conversation, history=state.history)
 
-def start_recording_user(state: AppState):
+def start_recording_user(state:AppState): # bug fixed here
     if not state.stopped:
         return gr.Audio(recording=True)
 
@@ -173,14 +193,16 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
            input_audio = gr.Audio(label="Nói cho tôi nghe nào", sources="microphone", type="numpy")
+            input_image = gr.Image(label="Hình ảnh của bạn", sources="upload", type="filepath")
         with gr.Column():
             chatbot = gr.Chatbot(label="Nội dung trò chuyện", type="messages")
             output_audio = gr.Audio(label="Trợ lý", autoplay=True)
+    with gr.Row():
+        output_image = gr.Image(label="Hình ảnh sau xử lý", sources="clipboard", type="filepath",visible=True)
     state = gr.State(value=AppState())
-
     stream = input_audio.stream(
         process_audio,
-        [input_audio, state],
+        [input_audio, input_image, state],
         [input_audio, state],
         stream_every=0.50,
         time_limit=30,
@@ -191,10 +213,10 @@ with gr.Blocks() as demo:
         [output_audio, state],
     )
     respond.then(lambda s: s.conversation, [state], [chatbot])
-
+    respond.then(lambda s: s.image, [state], [output_image])
     restart = output_audio.stop(
         start_recording_user,
-        [state],
+        [state, input_image],
         [input_audio],
     )
     cancel = gr.Button("Stop Conversation", variant="stop")
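
The header change swaps the hard-coded login(HF_TOKEN) call for a token read from the environment, with the login() call itself left commented out. A minimal sketch of the guarded variant, assuming the same HF_TOKEN environment variable (the if-guard is an illustration, not part of the commit):

# Sketch: read the Hugging Face token from the environment as the new header does,
# and only call login() when a token is actually configured (the commit keeps it commented out).
import os
from huggingface_hub import login

tk = os.environ.get("HF_TOKEN")
if tk:  # guard added for illustration; not in the commit
    login(token=tk)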
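The rewritten response() wraps generation in a try/except: it builds a chat prompt with processor.apply_chat_template, then streams tokens from model.generate running on a worker thread through TextIteratorStreamer. A minimal, self-contained sketch of that pattern with the same checkpoint as the diff (the prompt text is illustrative only):

# Minimal sketch of the threaded streaming pattern used in response();
# the checkpoint and processor calls mirror the diff, the prompt is illustrative.
from threading import Thread
import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer

ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)

messages = [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, return_tensors="pt").to("cuda")

# generate() runs on a background thread; the streamer yields decoded text pieces as they arrive
streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=1024)).start()

buffer = ""
for new_text in streamer:
    buffer += new_text  # accumulate the reply incrementally
print(buffer)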
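On the UI side, the Blocks layout now streams microphone chunks, together with the uploaded image path, into process_audio every 0.5 s, plays the synthesized reply, and re-arms the microphone once playback stops. A reduced sketch of that turn-taking loop with the handlers stubbed out; the event that actually invokes response() sits outside the diff hunks, so stop_recording is an assumption here:

# Reduced sketch of the Gradio turn-taking loop from the diff; handlers are stubbed,
# and the stop_recording trigger for response() is assumed (not shown in the hunks).
import gradio as gr

def process_audio(audio, image, state):   # accumulate audio chunks, stop recording on pause
    return None, state

def response(state):                      # transcribe, generate, synthesize speech
    yield None, state

def start_recording_user(state):          # re-arm the microphone for the next turn
    return gr.Audio(recording=True)

with gr.Blocks() as demo:
    input_audio = gr.Audio(sources=["microphone"], type="numpy")
    input_image = gr.Image(sources=["upload"], type="filepath")
    output_audio = gr.Audio(autoplay=True)
    state = gr.State()

    # stream microphone chunks (plus the image path) into process_audio every 0.5 s
    input_audio.stream(process_audio, [input_audio, input_image, state],
                       [input_audio, state], stream_every=0.50, time_limit=30)
    # when recording stops, produce the spoken reply (the .then() chains that refresh
    # the chatbot and output image are omitted here)
    respond = input_audio.stop_recording(response, [state], [output_audio, state])
    # when playback finishes, start listening again
    output_audio.stop(start_recording_user, [state], [input_audio])

demo.launch()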