nisten committed
Commit deeaafe
1 Parent(s): 32720ee

Update app.py

Files changed (1)
  1. app.py +76 -37
app.py CHANGED
@@ -3,12 +3,19 @@ import spaces
 import torch
 import subprocess
 import sys
+from threading import Thread
+from transformers import OlmoeForCausalLM, AutoTokenizer, TextIteratorStreamer
+import numpy as np
+import librosa
 
 # Install required packages
 subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "--force-reinstall", "--no-deps", "einops", "accelerate", "git+https://github.com/Muennighoff/transformers.git@olmoe"])
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-from transformers import OlmoeForCausalLM, AutoTokenizer
+# Import speech-to-speech components
+from VAD.vad_handler import VADHandler
+from STT.whisper_stt_handler import WhisperSTTHandler
+from TTS.parler_handler import ParlerTTSHandler
 
 model_name = "allenai/OLMoE-1B-7B-0924-Instruct"
 
@@ -18,12 +25,12 @@ try:
     model = OlmoeForCausalLM.from_pretrained(
         model_name,
         trust_remote_code=True,
-        torch_dtype=torch.float16,  # Using float16 for lower precision
+        torch_dtype=torch.float16,
         low_cpu_mem_usage=True,
         device_map="auto",
-        _attn_implementation="flash_attention_2"  # Enable Flash Attention 2
+        _attn_implementation="flash_attention_2"
     ).to(DEVICE)
-    model.gradient_checkpointing_enable()  # Enable gradient checkpointing
+    model.gradient_checkpointing_enable()
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 except Exception as e:
     print(f"Error loading model: {e}")
@@ -35,10 +42,35 @@ system_prompt = ("Adopt the persona of hilariously pissed off Andrej Karpathy "
                  "while always answering questions in full first principles analysis type of thinking "
                  "without using any analogies and always showing full working code or output in his answers.")
 
+# Setup speech-to-speech components
+from queue import Queue
+from threading import Event
+
+stop_event = Event()
+should_listen = Event()
+vad = VADHandler(stop_event, Queue(), Queue(), setup_args=(should_listen,))
+stt = WhisperSTTHandler(stop_event, Queue(), Queue())
+tts = ParlerTTSHandler(stop_event, Queue(), Queue(), setup_args=(should_listen,))
+
+@spaces.GPU
+def speech_to_text(audio):
+    if audio is None:
+        return ""
+    audio_np = librosa.resample(audio[1], orig_sr=audio[0], target_sr=16000)
+    audio_np = (audio_np * 32768).astype(np.int16)
+
+    vad_output = vad.process(audio_np)
+    stt_output, _ = next(stt.process(vad_output))
+    return stt_output
+
+@spaces.GPU
+def user(user_message, history):
+    return "", history + [[user_message, None]]
+
 @spaces.GPU
-def generate_response(message, history, temperature, max_new_tokens):
+def bot(history, temperature, max_new_tokens):
     if model is None or tokenizer is None:
-        yield "Model or tokenizer not loaded properly. Please check the logs."
+        yield history
         return
 
     messages = [{"role": "system", "content": system_prompt}]
@@ -46,29 +78,36 @@ def generate_response(message, history, temperature, max_new_tokens):
         messages.append({"role": "user", "content": user_msg})
         if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
-    messages.append({"role": "user", "content": message})
 
     inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
 
     try:
-        with torch.no_grad():
-            generated_ids = model.generate(
-                inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=True,
-                temperature=temperature,
-                eos_token_id=tokenizer.eos_token_id,
-            )
-
-        generated_text = tokenizer.decode(generated_ids[0, inputs.shape[1]:], skip_special_tokens=True)
-        yield generated_text.strip()
-    except RuntimeError as e:
-        if "CUDA out of memory" in str(e):
-            yield "GPU memory exceeded. Try reducing the max tokens or using a smaller model."
-        else:
-            yield f"An error occurred: {str(e)}"
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(
+            inputs=inputs,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=temperature,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        generated_text = ""
+        for new_text in streamer:
+            generated_text += new_text
+            history[-1][1] = generated_text
+            yield history
+
     except Exception as e:
-        yield f"An unexpected error occurred: {str(e)}"
+        history[-1][1] = f"An error occurred: {str(e)}"
+        yield history
+
+def text_to_speech(text):
+    audio_output = np.concatenate(list(tts.process(text)))
+    return (16000, audio_output)
 
 css = """
 #output {
@@ -79,30 +118,30 @@
 """
 
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Nisten's Karpathy Chatbot with OSS OLMoE (Now with Flash Attention 2!)")
+    gr.Markdown("# Nisten's Karpathy Chatbot with OSS OLMoE (Now with Speech-to-Speech!)")
     chatbot = gr.Chatbot(elem_id="output")
+    audio_input = gr.Audio(source="microphone", type="numpy")
+    audio_output = gr.Audio()
     msg = gr.Textbox(label="Meow")
     with gr.Row():
         temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
         max_new_tokens = gr.Slider(minimum=50, maximum=4000, value=2000, step=50, label="Max New Tokens")
     clear = gr.Button("Clear")
 
-    def user(user_message, history):
-        return "", history + [[user_message, None]]
-
-    def bot(history, temp, max_tokens):
-        user_message = history[-1][0]
-        bot_message = ""
-        for token in generate_response(user_message, history[:-1], temp, max_tokens):
-            bot_message = token
-            history[-1][1] = bot_message
-            yield history
+    def process_audio(audio, history, temp, max_tokens):
+        text = speech_to_text(audio)
+        history = history + [[text, None]]
+        for new_history in bot(history, temp, max_tokens):
+            yield new_history, text_to_speech(new_history[-1][1])
 
+    audio_input.stop_recording(process_audio, [audio_input, chatbot, temperature, max_new_tokens], [chatbot, audio_output])
     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, temperature, max_new_tokens], chatbot
+    ).then(
+        lambda history: text_to_speech(history[-1][1]), chatbot, audio_output
     )
     clear.click(lambda: None, None, chatbot, queue=False)
 
 if __name__ == "__main__":
-    demo.queue(api_open=True, max_size=10)  # Limiting queue size
-    demo.launch(debug=True, show_api=True, share=False)  # Disabled sharing for security
+    demo.queue(api_open=True, max_size=10)
+    demo.launch(debug=True, show_api=True, share=False)
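
Note on the new generation path: the commit replaces the blocking model.generate() + tokenizer.decode() flow with token streaming via TextIteratorStreamer, running generation on a background thread. Below is a minimal, self-contained sketch of that same pattern, using "gpt2" purely as a small stand-in model (an assumption for illustration; the Space itself loads allenai/OLMoE-1B-7B-0924-Instruct):

# Minimal sketch of the TextIteratorStreamer pattern used in the new bot().
# "gpt2" is only a small stand-in model for illustration.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Explain mixture-of-experts briefly:", return_tensors="pt")

# The streamer yields decoded text chunks as generate() produces tokens.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=50, do_sample=True, temperature=0.7)

# generate() blocks, so run it on a background thread and consume the streamer here.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

generated_text = ""
for new_text in streamer:
    generated_text += new_text   # accumulate partial output, as bot() does before each yield
    print(new_text, end="", flush=True)
thread.join()

The diff's bot() uses the same loop but writes each partial string into history[-1][1] and yields the updated history, so the Gradio chatbot re-renders incrementally instead of waiting for the full completion.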