Commit fe3defb
Daniel Marques committed
Parent(s): 416e7fd

feat: add websocket

Files changed:
- constants.py +2 -2
- load_models.py +0 -2
- main.py +1 -0
- prompt_template_utils.py +6 -1
constants.py CHANGED
@@ -37,8 +37,8 @@ MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
 
-N_GPU_LAYERS =
-N_BATCH =
+N_GPU_LAYERS = 100  # Llama-2-70B has 83 layers
+N_BATCH = CONTEXT_WINDOW_SIZE
 
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
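For context, with the new values in place the surrounding block of constants.py reads roughly as follows. This is a sketch, not the full file: the CONTEXT_WINDOW_SIZE value is not visible in this hunk and 4096 is an assumption.

# constants.py -- sketch of the block around this hunk (values partly assumed)
CONTEXT_WINDOW_SIZE = 4096            # assumed; defined earlier in the file
MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE  # int(CONTEXT_WINDOW_SIZE/4)

#### If you get a "not enough space in the buffer" error, reduce the values
#### below: start with half of the original values and keep halving until
#### the error stops appearing.
N_GPU_LAYERS = 100  # Llama-2-70B has 83 layers; a larger value simply offloads them all
N_BATCH = CONTEXT_WINDOW_SIZE

### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
# N_GPU_LAYERS = 20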
load_models.py CHANGED
@@ -58,8 +58,6 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
         "model_path": model_path,
         "n_ctx": CONTEXT_WINDOW_SIZE,
         "max_tokens": MAX_NEW_TOKENS,
-        "n_batch": MAX_NEW_TOKENS,
-
         # set this based on your GPU & CPU RAM
     }
     if device_type.lower() == "mps":
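Removing the "n_batch": MAX_NEW_TOKENS entry means the batch size is no longer pinned to the generation limit; it presumably falls back to the wrapper's default or to a device-specific setting elsewhere in the function. A minimal sketch of how this dict is likely consumed, assuming the kwargs feed LangChain's LlamaCpp wrapper and that GPU keys are added per device type (the device branches below are an assumption, they are not shown in this hunk):

# load_models.py -- sketch, assuming the kwargs feed LangChain's LlamaCpp wrapper
from langchain.llms import LlamaCpp

from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH


def build_llamacpp(model_path: str, device_type: str) -> LlamaCpp:
    # Base kwargs mirror the hunk above; "n_batch" is no longer forced to
    # MAX_NEW_TOKENS here.
    kwargs = {
        "model_path": model_path,
        "n_ctx": CONTEXT_WINDOW_SIZE,
        "max_tokens": MAX_NEW_TOKENS,
        # set this based on your GPU & CPU RAM
    }
    if device_type.lower() == "mps":
        kwargs["n_gpu_layers"] = 1              # assumed Metal branch
    if device_type.lower() == "cuda":
        kwargs["n_gpu_layers"] = N_GPU_LAYERS   # assumed CUDA branch
        kwargs["n_batch"] = N_BATCH             # picks up the new constants
    return LlamaCpp(**kwargs)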
main.py CHANGED
@@ -51,6 +51,7 @@ QA = RetrievalQA.from_chain_type(
     return_source_documents=SHOW_SOURCES,
     chain_type_kwargs={
         "prompt": prompt,
+        "memory": memory
     },
 )
 
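On the consumer side, the memory object now travels from get_prompt_template into the chain's chain_type_kwargs. A rough sketch of the wiring, assuming get_prompt_template returns a (prompt, memory) tuple and that the llm, retriever, and show_sources flag are created elsewhere in main.py (the "llama" template type and the build_qa_chain helper are illustrative, not part of the commit):

# main.py -- sketch of how the added "memory" entry is wired into the chain
from langchain.chains import RetrievalQA

from prompt_template_utils import get_prompt_template


def build_qa_chain(llm, retriever, show_sources: bool):
    # get_prompt_template() is assumed to return (prompt, memory); memory is
    # the ConversationBufferMemory backed by RedisChatMessageHistory.
    prompt, memory = get_prompt_template(promptTemplate_type="llama")
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",                  # assumed chain type
        retriever=retriever,
        return_source_documents=show_sources,
        chain_type_kwargs={
            "prompt": prompt,
            "memory": memory,                # the entry added by this commit
        },
    )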
prompt_template_utils.py CHANGED
@@ -6,6 +6,11 @@ This seems to have significant impact on the output of the LLM.
 
 from langchain.memory import ConversationBufferMemory
 from langchain.prompts import PromptTemplate
+from langchain.memory.chat_message_histories import RedisChatMessageHistory
+
+message_history = RedisChatMessageHistory(
+    url="redis://localhost:6379/1", ttl=600, session_id="my-session"
+)
 
 # this is specific to Llama-2.
 
@@ -84,7 +89,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
     )
     prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
 
-    memory = ConversationBufferMemory(input_key="question", memory_key="history")
+    memory = ConversationBufferMemory(input_key="question", memory_key="history", chat_memory=message_history)
 
     return (
         prompt,
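A minimal usage sketch of what the Redis-backed history provides, assuming a Redis server is reachable at the URL used above (the messages and the second ConversationBufferMemory are illustrative):

# Sketch: the buffer lives in Redis, so any process that builds the same
# memory object sees the same conversation.
from langchain.memory import ConversationBufferMemory
from langchain.memory.chat_message_histories import RedisChatMessageHistory

history = RedisChatMessageHistory(
    url="redis://localhost:6379/1", ttl=600, session_id="my-session"
)
history.add_user_message("What is in the ingested documents?")
history.add_ai_message("They describe the project architecture.")

memory = ConversationBufferMemory(
    input_key="question", memory_key="history", chat_memory=history
)
print(memory.load_memory_variables({})["history"])  # replayed from Redis

Note that session_id is fixed at import time in this commit, so every client shares one history; a websocket handler would typically derive a per-connection session id, and entries expire after the 600-second TTL.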