Spaces:

dkdaniz
/

katara

Paused

Daniel Marques committed on Oct 15, 2023

Commit

c77782f

1 Parent(s): cb776ef

feat: add history

Files changed (3) hide show

constants.py CHANGED Viewed

@@ -35,15 +35,14 @@ MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE  # int(CONTEXT_WINDOW_SIZE/4)
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
-N_GPU_LAYERS = 40  # Llama-2-70B has 83 layers
-N_BATCH = 1024
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
 # N_BATCH = 512
 # https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
 DOCUMENT_MAP = {
     ".txt": TextLoader,

 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
+N_GPU_LAYERS = 50  # Llama-2-70B has 83 layers
+N_BATCH = 2048
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
 # N_BATCH = 512
 # https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
 DOCUMENT_MAP = {
     ".txt": TextLoader,

main.py CHANGED Viewed

@@ -62,7 +62,7 @@ Question: {question}
 memory = ConversationBufferMemory(input_key="question", memory_key="history")
-QA_CHAIN_PROMPT = PromptTemplate.from_template(input_variables=["history", "context", "question"], template=template)
 QA = RetrievalQA.from_chain_type(
     llm=LLM,

 memory = ConversationBufferMemory(input_key="question", memory_key="history")
+QA_CHAIN_PROMPT = PromptTemplate(input_variables=["history", "context", "question"], template=template)
 QA = RetrievalQA.from_chain_type(
     llm=LLM,

run_localGPT.py CHANGED Viewed

@@ -79,7 +79,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
     # Create a pipeline for text generation
-    streamer = TextStreamer(tokenizer)
     pipe = pipeline(
         "text-generation",
@@ -91,7 +91,9 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
         top_k=40,
         repetition_penalty=1.0,
         generation_config=generation_config,
-        streamer=streamer
     )
     local_llm = HuggingFacePipeline(pipeline=pipe)

     # Create a pipeline for text generation
+    streamer = TextStreamer(tokenizer, skip_prompt=True)
     pipe = pipeline(
         "text-generation",
         top_k=40,
         repetition_penalty=1.0,
         generation_config=generation_config,
+        streamer=streamer,
+        num_return_sequences=1,
+        eos_token_id=tokenizer.eos_token_id
     )
     local_llm = HuggingFacePipeline(pipeline=pipe)