pabloce committed
Commit 7418606 · verified · 1 Parent(s): 63c66b0

Update app.py

Files changed (1):
  1. app.py +54 -35
app.py CHANGED
@@ -15,7 +15,8 @@ from llama_index.llms.llama_cpp.llama_utils import (
 )
 from llama_index.core.memory import ChatMemoryBuffer
 
-subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', env={'CMAKE_ARGS': "-DLLAMA_CUDA=on"}, shell=True)
+subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
+subprocess.run('pip install llama-cpp-agent', shell=True)
 
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf", local_dir = "./models")
 
@@ -28,43 +29,61 @@ def respond(
     temperature,
     top_p,
 ):
-    stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
-    chat_template = '<s>[INST] ' + system_message
-    # for human, assistant in history:
-    #     chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
-    chat_template += ' ' + message + ' [/INST]'
+    llama_model = Llama(r"models/mistral-7b-instruct-v0.2.Q6_K.gguf", n_batch=1024, n_threads=0, n_gpu_layers=33, n_ctx=8192, verbose=False)
 
-    print(chat_template)
-
-    llm = LlamaCPP(
-        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
-        temperature=temperature,
-        max_new_tokens=max_tokens,
-        context_window=2048,
-        generate_kwargs={
-            "top_k": 50,
-            "top_p": top_p,
-            "repeat_penalty": 1.3
-        },
-        model_kwargs={
-            "n_threads": 0,
-            "n_gpu_layers": 33
-        },
-        messages_to_prompt=messages_to_prompt,
-        completion_to_prompt=completion_to_prompt,
-        verbose=True,
+    provider = LlamaCppPythonProvider(llama_model)
+
+    agent = LlamaCppAgent(
+        provider,
+        system_prompt=f"{system_message}",
+        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
+        debug_output=True
     )
-    # response = ""
+
+    settings = provider.get_provider_default_settings()
+    settings.stream = True
+    settings.max_tokens = max_tokens
+    settings.temperature = temperature
+    settings.top_p = top_p
+
+    yield agent.get_chat_response(message, llm_sampling_settings=settings, returns_streaming_generator=True)
+    # stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
+    # chat_template = '<s>[INST] ' + system_message
+    # # for human, assistant in history:
+    # #     chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
+    # chat_template += ' ' + message + ' [/INST]'
+
+    # print(chat_template)
+
+    # llm = LlamaCPP(
+    #     model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
+    #     temperature=temperature,
+    #     max_new_tokens=max_tokens,
+    #     context_window=2048,
+    #     generate_kwargs={
+    #         "top_k": 50,
+    #         "top_p": top_p,
+    #         "repeat_penalty": 1.3
+    #     },
+    #     model_kwargs={
+    #         "n_threads": 0,
+    #         "n_gpu_layers": 33
+    #     },
+    #     messages_to_prompt=messages_to_prompt,
+    #     completion_to_prompt=completion_to_prompt,
+    #     verbose=True,
+    # )
+    # # response = ""
+    # # for chunk in llm.stream_complete(message):
+    # #     print(chunk.delta, end="", flush=True)
+    # #     response += str(chunk.delta)
+    # #     yield response
+    # outputs = []
     # for chunk in llm.stream_complete(message):
-    #     print(chunk.delta, end="", flush=True)
-    #     response += str(chunk.delta)
-    #     yield response
-    outputs = []
-    for chunk in llm.stream_complete(message):
-        outputs.append(chunk.delta)
-        if chunk.delta in stop_tokens:
-            break
-        yield "".join(outputs)
+    #     outputs.append(chunk.delta)
+    #     if chunk.delta in stop_tokens:
+    #         break
+    #     yield "".join(outputs)
 
 demo = gr.ChatInterface(
     respond,
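
For reference, the added lines above can be read together as the sketch below. It only reassembles what the diff shows; the import paths for llama-cpp-python and llama-cpp-agent and the placement of the runtime pip installs are assumptions, since the commit does not show the updated import block.

# Sketch only, not part of the commit: the new code path reassembled from the + lines above.
import subprocess

# The Space installs a CUDA 12.4 wheel of llama-cpp-python plus llama-cpp-agent at startup,
# so the imports below run only after the install has finished.
subprocess.run(
    "pip install llama-cpp-python==0.2.75 "
    "--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124",
    shell=True,
)
subprocess.run("pip install llama-cpp-agent", shell=True)

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama                                        # assumed import path
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType   # assumed import path
from llama_cpp_agent.providers import LlamaCppPythonProvider       # assumed import path

# Download the quantized Mistral model once at startup.
hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q6_K.gguf",
    local_dir="./models",
)

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Load the GGUF model: all 33 layers offloaded to the GPU, 8k context window.
    llama_model = Llama(
        "models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_batch=1024, n_threads=0, n_gpu_layers=33, n_ctx=8192, verbose=False,
    )
    provider = LlamaCppPythonProvider(llama_model)
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
        debug_output=True,
    )

    # Map the ChatInterface sliders onto the provider's sampling settings.
    settings = provider.get_provider_default_settings()
    settings.stream = True
    settings.max_tokens = max_tokens
    settings.temperature = temperature
    settings.top_p = top_p

    # As committed, the streaming generator is yielded in a single step.
    yield agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        returns_streaming_generator=True,
    )

demo = gr.ChatInterface(respond)  # the full app passes additional inputs not shown in this hunk

Note that respond yields the token generator itself once; to stream partial text into the Gradio chat window, one would typically iterate over that generator and yield the growing string instead.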