changed class interface with iterator
- backend.py +2 -37
- interface.py +4 -4
backend.py CHANGED
@@ -34,9 +34,7 @@ model.eval()
 
 # what models will be used by LlamaIndex:
 Settings.embed_model = InstructorEmbedding(model_name="hkunlp/instructor-base")
-
 Settings.llm = GemmaLLMInterface()
-#Settings.llm = GemmaLLMInterface(model_name=model_id)
 
 ############################---------------------------------
 
@@ -60,43 +58,8 @@ def build_index():
 def handle_query(query_str, chathistory) -> Iterator[str]:
 
     index = build_index()
-
-    qa_prompt_str = (
-        "Context information is below.\n"
-        "---------------------\n"
-        "{context_str}\n"
-        "---------------------\n"
-        "Given the context information and not prior knowledge, "
-        "answer the question: {query_str}\n"
-    )
-
-    # Text QA Prompt
-    chat_text_qa_msgs = [
-        (
-            "system",
-            "Sei un assistente italiano di nome Ossy che risponde solo alle domande o richieste pertinenti. ",
-        ),
-        ("user", qa_prompt_str),
-    ]
-    text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)
 
     try:
-        # Create a streaming query engine
-        """query_engine = index.as_query_engine(text_qa_template=text_qa_template, streaming=False, similarity_top_k=1)
-
-        # Execute the query
-        streaming_response = query_engine.query(query_str)
-
-        r = streaming_response.response
-        cleaned_result = r.replace("<end_of_turn>", "").strip()
-        yield cleaned_result"""
-
-        # Stream the response
-        """outputs = []
-        for text in streaming_response.response_gen:
-
-            outputs.append(str(text))
-            yield "".join(outputs)"""
 
         memory = ChatMemoryBuffer.from_defaults(token_limit=1500)
         chat_engine = index.as_chat_engine(
@@ -112,6 +75,8 @@ def handle_query(query_str, chathistory) -> Iterator[str]:
         response = chat_engine.stream_chat(query_str)
         #response = chat_engine.chat(query_str)
         for token in response.response_gen:
+            if not token.startswith("system:") and not token.startswith("user:"):
+
             outputs.append(str(token))
             print(f"Generated token: {token}")
             yield "".join(outputs)
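With this change, handle_query is a plain generator: it yields the accumulated answer after every streamed token and skips tokens that echo a "system:" or "user:" role prefix. A minimal usage sketch follows; the Gradio wiring and the module layout are assumptions for illustration, not part of this commit.

# Minimal usage sketch (assumption, not part of this commit): wiring the
# streaming generator into a Gradio ChatInterface. handle_query yields the
# accumulated answer after each token, so the UI can redraw it as it grows.
import gradio as gr

from backend import handle_query  # assumed module layout

demo = gr.ChatInterface(fn=handle_query)  # fn(message, history) -> Iterator[str]

if __name__ == "__main__":
    # Outside Gradio, the generator can also be drained directly:
    # for partial in handle_query("What do the indexed documents say?", []):
    #     print(partial)
    demo.launch()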
interface.py CHANGED
@@ -7,19 +7,19 @@ from transformers import TextIteratorStreamer
 from threading import Thread
 from pydantic import Field, field_validator
 
-# for transformers 2
+# for transformers 2 (__setattr__ is used to bypass the Pydantic check)
 class GemmaLLMInterface(CustomLLM):
     def __init__(self, model_id: str = "google/gemma-2-2b-it", **kwargs):
         super().__init__(**kwargs)
-        object.__setattr__(self, "model_id", model_id)
+        object.__setattr__(self, "model_id", model_id)
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             device_map="auto",
             torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
         )
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        object.__setattr__(self, "model", model)
-        object.__setattr__(self, "tokenizer", tokenizer)
+        object.__setattr__(self, "model", model)
+        object.__setattr__(self, "tokenizer", tokenizer)
         object.__setattr__(self, "context_window", 8192)
         object.__setattr__(self, "num_output", 2048)
 
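The object.__setattr__ calls exist because LlamaIndex's CustomLLM is a Pydantic model, and Pydantic rejects assignment of attributes that are not declared fields. A self-contained sketch of that pattern; the Demo class and its attributes are illustrative only, not the Space's code.

# Illustrative sketch of the __setattr__ pattern used above (not the Space's code).
# A Pydantic model refuses `self.x = ...` for attributes that are not declared
# fields; object.__setattr__ goes around Pydantic's __setattr__ and stores the
# value on the instance anyway.
from pydantic import BaseModel


class Demo(BaseModel):
    declared: int = 0

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # self.undeclared = "hello"   # raises: "Demo" object has no field "undeclared"
        object.__setattr__(self, "undeclared", "hello")  # bypasses the check


d = Demo()
print(d.undeclared)  # -> hello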