pabloce committed on
Commit
ec06a49
·
verified ·
1 Parent(s): c554b45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -38
app.py CHANGED
@@ -1,13 +1,18 @@
1
  import spaces
2
  import gradio as gr
3
- import torch
4
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
5
- from threading import Thread
 
6
 
7
  """
8
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
9
  """
10
  # client = InferenceClient("cognitivecomputations/dolphin-2.8-mistral-7b-v02")
 
 
 
 
11
 
12
  @spaces.GPU
13
  def respond(
@@ -18,42 +23,21 @@ def respond(
18
  temperature,
19
  top_p,
20
  ):
21
- torch.set_default_device("cuda")
22
-
23
- tokenizer = AutoTokenizer.from_pretrained(
24
- "cognitivecomputations/dolphin-2.8-mistral-7b-v02",
25
- trust_remote_code=True
26
- )
27
- model = AutoModelForCausalLM.from_pretrained(
28
- "cognitivecomputations/dolphin-2.8-mistral-7b-v02",
29
- torch_dtype="auto",
30
- load_in_4bit=True,
31
- trust_remote_code=True
32
  )
33
- history_transformer_format = history + [[message, ""]]
34
 
35
- system_prompt = f"<|im_start|>system\n{system_message}.<|im_end|>"
36
- messages = system_prompt + "".join(["".join(["\n<|im_start|>user\n" + item[0], "<|im_end|>\n<|im_start|>assistant\n" + item[1]]) for item in history_transformer_format])
37
- input_ids = tokenizer([messages], return_tensors="pt").to('cuda')
38
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
39
- generate_kwargs = dict(
40
- input_ids,
41
- streamer=streamer,
42
- max_new_tokens=max_tokens,
43
- do_sample=True,
44
- top_p=top_p,
45
- top_k=50,
46
- temperature=temperature,
47
- num_beams=1
48
- )
49
- t = Thread(target=model.generate, kwargs=generate_kwargs)
50
- t.start()
51
- partial_message = ""
52
- for new_token in streamer:
53
- partial_message += new_token
54
- if '<|im_end|>' in partial_message:
55
- break
56
- yield partial_message
57
 
58
  """
59
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
@@ -61,7 +45,7 @@ For information on how to customize the ChatInterface, peruse the gradio docs: h
61
  demo = gr.ChatInterface(
62
  respond,
63
  additional_inputs=[
64
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
65
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
66
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
67
  gr.Slider(
 
1
  import spaces
2
  import gradio as gr
3
+ from llama_cpp import Llama
4
+ from llama_cpp_agent import LlamaCppAgent
5
+ from llama_cpp_agent import MessagesFormatterType
6
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
7
 
8
  """
9
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
10
  """
11
  # client = InferenceClient("cognitivecomputations/dolphin-2.8-mistral-7b-v02")
12
+ llama_model = Llama(r"Meta-Llama-3-8B.Q5_K_M.gguf", n_batch=1024, n_threads=10, n_gpu_layers=33, n_ctx=8192, verbose=False)
13
+
14
+ provider = LlamaCppPythonProvider(llama_model)
15
+
16
 
17
  @spaces.GPU
18
  def respond(
 
23
  temperature,
24
  top_p,
25
  ):
26
+ agent = LlamaCppAgent(
27
+ provider,
28
+ system_prompt=system_message,
29
+ predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
30
+ debug_output=True
 
 
 
 
 
 
31
  )
 
32
 
33
+ settings = provider.get_provider_default_settings()
34
+
35
+ settings.max_tokens = max_tokens
36
+ settings.temperature = temperature
37
+ settings.top_p = top_p
38
+
39
+ agent_output = agent.get_chat_response(message, llm_sampling_settings=settings)
40
+ yield agent_output.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  """
43
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 
45
  demo = gr.ChatInterface(
46
  respond,
47
  additional_inputs=[
48
+ gr.Textbox(value="You are a helpful assistant.", label="System message"),
49
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
  gr.Slider(