gufett0 committed
Commit 1c8dd0f · 1 Parent(s): 708da42

added text iterator

Files changed (1): backend.py (+24 -17)
backend.py CHANGED
@@ -13,26 +13,13 @@ from llama_cpp import Llama
import spaces
from huggingface_hub import login

+ from transformers import TextIteratorStreamer
+ import threading


huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
login(huggingface_token)

- """hf_hub_download(
-     repo_id="google/gemma-2-2b-it-GGUF",
-     filename="2b_it_v2.gguf",
-     local_dir="./models",
-     token=huggingface_token
- )
-
- llm = Llama(
-     model_path=f"models/2b_it_v2.gguf",
-     flash_attn=True,
-     _gpu_layers=81,
-     n_batch=1024,
-     n_ctx=8192,
- )"""
-
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_id = "google/gemma-2-2b-it"
@@ -85,8 +72,28 @@ def handle_query(query_str, chathistory):
        ("user", qa_prompt_str),
    ]
    text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)
-
+
+     # Create the query engine
+     query_engine = index.as_query_engine(text_qa_template=text_qa_template)
+
    try:
+         # Setup the TextIteratorStreamer for streaming the response
+         streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+         # Create a thread to run the generation in the background
+         def generate_response():
+             query_engine.query(query_str, streamer=streamer)
+
+         generation_thread = threading.Thread(target=generate_response)
+         generation_thread.start()
+
+         # Stream tokens as they are generated
+         for new_text in streamer:
+             yield new_text
+     except Exception as e:
+         yield f"Error processing query: {str(e)}"
+
+     """ try:
        result = index.as_query_engine(text_qa_template=text_qa_template).query(query_str)
        response_text = result.response

@@ -95,7 +102,7 @@ def handle_query(query_str, chathistory):

        yield cleaned_result
    except Exception as e:
-         yield f"Error processing query: {str(e)}"
+         yield f"Error processing query: {str(e)}""""



 
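For reference, the pattern `TextIteratorStreamer` is designed for in Transformers runs `model.generate()` in a background thread while the main thread iterates the streamer; the diff above instead passes `streamer=` through `query_engine.query()`, and whether LlamaIndex forwards that keyword to the model is not shown here. Below is a minimal standalone sketch of the plain Transformers pattern, assuming the commit's `google/gemma-2-2b-it` checkpoint; the `stream_answer` helper, prompt handling, and generation parameters are illustrative assumptions, not code from backend.py:

```python
import threading

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "google/gemma-2-2b-it"  # same checkpoint as in the commit
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

def stream_answer(prompt: str):
    # Hypothetical helper, not part of backend.py.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # skip_prompt=True keeps the echoed input out of the streamed output.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # generate() blocks until completion, so it runs in a worker thread
    # while this generator yields decoded text chunks as they arrive.
    thread = threading.Thread(
        target=model.generate,
        kwargs={**inputs, "streamer": streamer, "max_new_tokens": 256},
    )
    thread.start()
    for new_text in streamer:
        yield new_text
    thread.join()
```

Recent versions of LlamaIndex also expose a native streaming path that avoids the manual thread: `index.as_query_engine(streaming=True, text_qa_template=...)` returns streaming responses whose `response_gen` can be iterated chunk by chunk.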