ccm committed on
Commit
d0b143b
·
verified ·
1 Parent(s): 80d6350

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -42
app.py CHANGED
@@ -1,8 +1,8 @@
1
  import gradio # Interface handling
2
- import spaces # For GPU
3
  import langchain_community.vectorstores # Vectorstore for publications
4
  import langchain_huggingface # Embeddings
5
- import transformers
6
 
7
  # The number of publications to retrieve for the prompt
8
  PUBLICATIONS_TO_RETRIEVE = 5
@@ -11,13 +11,10 @@ PUBLICATIONS_TO_RETRIEVE = 5
11
  RAG_TEMPLATE = """You are an AI assistant who enjoys helping users learn about research.
12
  Answer the USER_QUERY on additive manufacturing research using the RESEARCH_EXCERPTS.
13
  Provide a concise ANSWER based on these excerpts. Avoid listing references.
14
-
15
  ===== RESEARCH_EXCERPTS =====
16
  {research_excerpts}
17
-
18
  ===== USER_QUERY =====
19
  {query}
20
-
21
  ===== ANSWER =====
22
  """
23
 
@@ -31,22 +28,23 @@ publication_vectorstore = langchain_community.vectorstores.FAISS.load_local(
31
  ),
32
  allow_dangerous_deserialization=True,
33
  )
34
- #
35
- # # Create the callable LLM
36
- # llm = transformers.pipeline(
37
- # task="text-generation",
38
- # model="Qwen/Qwen2.5-7B-Instruct-AWQ",
39
- # device="cuda",
40
- # )
 
 
 
41
 
42
 
43
  def preprocess(query: str) -> str:
44
  """
45
  Generates a prompt based on the top k documents matching the query.
46
-
47
  Args:
48
  query (str): The user's query.
49
-
50
  Returns:
51
  str: The formatted prompt containing research excerpts and the user's query.
52
  """
@@ -67,47 +65,29 @@ def preprocess(query: str) -> str:
67
  return prompt
68
 
69
 
70
- import threading
71
-
72
-
73
  @spaces.GPU
74
  def reply(message: str, history: list[str]) -> str:
75
  """
76
  Generates a response to the user’s message.
77
-
78
  Args:
79
  message (str): The user's message or query.
80
  history (list[str]): The conversation history.
81
-
82
  Returns:
83
  str: The generated response from the language model.
84
  """
85
 
86
- tok = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct-AWQ")
87
- model = transformers.AutoModelForCausalLM.from_pretrained(
88
- "Qwen/Qwen2.5-7B-Instruct-AWQ"
89
- )
90
- inputs = tok([preprocess(message)], return_tensors="pt")
91
- streamer = transformers.TextIteratorStreamer(tok)
92
 
93
- generation_kwargs = dict(
94
- inputs, streamer=streamer, max_new_tokens=512, return_full_text=False
 
 
 
95
  )
96
- thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
97
- thread.start()
98
- generated_text = ""
99
- for new_text in streamer:
100
- generated_text += new_text
101
- yield generated_text
102
-
103
- # yield llm(
104
- # preprocess(message),
105
- # max_new_tokens=512,
106
- # return_full_text=False,
107
- # streamer=transformers.TextIteratorStreamer(
108
- # transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct-AWQ")
109
- # ),
110
- # )[0]["generated_text"]
111
 
112
 
113
  # Example Queries for Interface
 
1
  import gradio # Interface handling
2
+ import spaces # GPU
3
  import langchain_community.vectorstores # Vectorstore for publications
4
  import langchain_huggingface # Embeddings
5
+ import transformers # LLM
6
 
7
  # The number of publications to retrieve for the prompt
8
  PUBLICATIONS_TO_RETRIEVE = 5
 
11
  RAG_TEMPLATE = """You are an AI assistant who enjoys helping users learn about research.
12
  Answer the USER_QUERY on additive manufacturing research using the RESEARCH_EXCERPTS.
13
  Provide a concise ANSWER based on these excerpts. Avoid listing references.
 
14
  ===== RESEARCH_EXCERPTS =====
15
  {research_excerpts}
 
16
  ===== USER_QUERY =====
17
  {query}
 
18
  ===== ANSWER =====
19
  """
20
 
 
28
  ),
29
  allow_dangerous_deserialization=True,
30
  )
31
# Create the callable LLM.
# Load the tokenizer once and share it between the pipeline and the
# streamer: the original constructed a second tokenizer inline via
# AutoTokenizer.from_pretrained, loading the same files twice at startup.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct-AWQ"
)
llm = transformers.pipeline(
    task="text-generation",
    model="Qwen/Qwen2.5-7B-Instruct-AWQ",
    tokenizer=tokenizer,
    device="cuda",
    # NOTE(review): a module-level TextStreamer prints tokens to stdout and
    # is shared by all calls — fine for single-user Spaces logging; confirm
    # if concurrent requests are expected.
    streamer=transformers.TextStreamer(tokenizer),
)
41
 
42
 
43
  def preprocess(query: str) -> str:
44
  """
45
  Generates a prompt based on the top k documents matching the query.
 
46
  Args:
47
  query (str): The user's query.
 
48
  Returns:
49
  str: The formatted prompt containing research excerpts and the user's query.
50
  """
 
65
  return prompt
66
 
67
 
 
 
 
68
@spaces.GPU
def reply(message: str, history: list[str]) -> str:
    """
    Generates a response to the user’s message.

    Args:
        message (str): The user's message or query.
        history (list[str]): The conversation history.

    Returns:
        str: The generated response from the language model.
    """
    # Build the retrieval-augmented prompt for the user's message
    rag_prompt = preprocess(message)

    # Ask the language model for a completion of the prompt
    completion = llm(rag_prompt, max_new_tokens=512, return_full_text=False)

    # Pull out the generated text and trim any leftover "=" delimiter
    # characters from the prompt's ===== ANSWER ===== framing
    return completion[0]["generated_text"].strip("= ")
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
 
93
  # Example Queries for Interface