Daniel Marques committed
Commit 8fa0233 · 1 parent: 2ea73cf

fix: add streamer

Files changed:
- load_models.py  +7 -3
- main.py  +2 -3
load_models.py
CHANGED
@@ -22,7 +22,7 @@ torch.set_grad_enabled(False)
 from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH
 
 
-def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
+def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging, stream=False):
     """
     Load a GGUF/GGML quantized model using LlamaCpp.
 
@@ -56,13 +56,17 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
             "model_path": model_path,
             "n_ctx": CONTEXT_WINDOW_SIZE,
             "max_tokens": MAX_NEW_TOKENS,
-            "n_batch": N_BATCH,
+            "n_batch": N_BATCH,
+            # set this based on your GPU & CPU RAM
         }
         if device_type.lower() == "mps":
             kwargs["n_gpu_layers"] = 1
         if device_type.lower() == "cuda":
             kwargs["n_gpu_layers"] = N_GPU_LAYERS  # set this based on your GPU
 
+        # add stream
+        kwargs["stream"] = stream
+
         return LlamaCpp(**kwargs)
     except:
         if "ggml" in model_basename:
@@ -185,7 +189,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging, stre
 
     if model_basename is not None:
         if ".gguf" in model_basename.lower():
-            llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
+            llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING, stream)
             return llm
         elif ".ggml" in model_basename.lower():
             model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
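The change threads the stream flag from load_model down into the kwargs handed to LlamaCpp. As a rough illustration of what that flag does at the llama.cpp level, the sketch below uses llama-cpp-python directly rather than the LlamaCpp wrapper this file constructs; the model path, prompt, and numeric values are placeholders, not values taken from this Space.

# Illustration only: what stream=True means for a GGUF model loaded with
# llama-cpp-python. Path, prompt and numeric values below are hypothetical.
from llama_cpp import Llama

llm = Llama(
    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",  # hypothetical path
    n_ctx=4096,        # plays the role of CONTEXT_WINDOW_SIZE
    n_batch=512,       # plays the role of N_BATCH
    n_gpu_layers=32,   # N_GPU_LAYERS on CUDA, 1 on MPS
)

# With stream=True the call returns an iterator of partial completions
# instead of one final response dict.
for chunk in llm("Q: What is retrieval-augmented generation?\nA:",
                 max_tokens=256, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
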
main.py
CHANGED
@@ -42,7 +42,8 @@ DB = Chroma(
 
 RETRIEVER = DB.as_retriever()
 
-models = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME, stream=
+models = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME, stream=True)
+
 LLM = models[0]
 STREAMER = models[1]
 
@@ -164,8 +165,6 @@ async def predict(data: Predict):
     global QA
     user_prompt = data.prompt
     if user_prompt:
-        # print(f'User Prompt: {user_prompt}')
-        # Get the answer from the chain
         res = QA(user_prompt)
 
         print(res)
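main.py now asks for a streaming model and unpacks the result into LLM and STREAMER, but the diff does not show how STREAMER is consumed. One possible wiring, assuming the app is FastAPI (suggested by the async predict endpoint and the Predict model) and that STREAMER can be iterated for text chunks while the chain runs, is sketched below; the /stream route, token_source helper, and threading approach are illustrative and not part of this commit.

# Hypothetical consumer of the (LLM, STREAMER) pair unpacked above.
# Assumes STREAMER yields text chunks during generation, in the style of
# transformers.TextIteratorStreamer; QA, STREAMER and Predict are the
# objects already defined in main.py.
from threading import Thread
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/stream")
async def stream_answer(data: Predict):
    # Run the QA chain in the background so chunks can be read as they arrive.
    Thread(target=QA, args=(data.prompt,), daemon=True).start()

    def token_source():
        for chunk in STREAMER:
            yield chunk

    return StreamingResponse(token_source(), media_type="text/plain")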