grantjw committed on
Commit
59cc10d
·
verified ·
1 Parent(s): 3f1b4eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -46,7 +46,7 @@ def get_retriever(url):
46
  docs = text_splitter.split_documents(documents)
47
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
48
  db = DocArrayInMemorySearch.from_documents(docs, embeddings)
49
- print("at least we ar ehere?")
50
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10})
51
  return retriever
52
 
@@ -63,12 +63,18 @@ def create_chain(_retriever):
63
  # stream handler to make it appear as if the LLM is typing the
64
  # responses in real time.
65
  # callback_manager = CallbackManager([stream_handler])
 
 
66
 
 
 
 
 
67
  n_gpu_layers = 1 # Change this value based on your model and your GPU VRAM pool.
68
  n_batch = 1024 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
69
 
70
  llm = LlamaCpp(
71
- model_path="models/mistral-7b-instruct-v0.1.Q5_0.gguf",
72
  n_batch=n_batch,
73
  n_ctx=2048,
74
  max_tokens=2048,
 
46
  docs = text_splitter.split_documents(documents)
47
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
48
  db = DocArrayInMemorySearch.from_documents(docs, embeddings)
49
+ #print("at least we ar ehere?")
50
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10})
51
  return retriever
52
 
 
63
  # stream handler to make it appear as if the LLM is typing the
64
  # responses in real time.
65
  # callback_manager = CallbackManager([stream_handler])
66
+ (repo_id, model_file_name) = ("TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
67
+ "mistral-7b-instruct-v0.1.Q5_0.gguf")
68
 
69
+ model_path = hf_hub_download(repo_id=repo_id,
70
+ filename=model_file_name,
71
+ repo_type="model")
72
+
73
  n_gpu_layers = 1 # Change this value based on your model and your GPU VRAM pool.
74
  n_batch = 1024 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
75
 
76
  llm = LlamaCpp(
77
+ model_path=model_path,
78
  n_batch=n_batch,
79
  n_ctx=2048,
80
  max_tokens=2048,