Spaces:

ariansyahdedy
/

chat

Build error

App Files Files Community

ariansyahdedy committed on Dec 23, 2024

Commit

e0c1af0

1 Parent(s): 9282958

Test Rag

Browse files

Files changed (11) hide show

.gitignore +1 -1
app/crud/process_file.py +1 -1
app/handlers/message_handler.py +5 -4
app/handlers/webhook_handler.py +2 -0
app/main.py +7 -0
app/search/hybrid_search.py +1 -1
app/search/rag_pipeline.py +8 -9
app/services/message.py +6 -3
app/utils/system_prompt.py +1 -0
document_logs_2024-12-20.txt +0 -2
document_logs_2024-12-21.txt +0 -0

.gitignore CHANGED Viewed

@@ -4,6 +4,6 @@ __pycache__
 user_media/
 toolkits/
 test*.py

 user_media/
 toolkits/
 test*.py
+document*.txt

app/crud/process_file.py CHANGED Viewed

@@ -130,7 +130,7 @@ async def process_uploaded_file(
     # Add to RAG system if rag_system is provided and load_only is False
     if rag_system:
         try:
-            rag_system.add_document(doc_id = f"{id}_{documents.title}", text = documents.text_content)
             print(f"doc_id: {id}_{documents.title}")
             print(f"content: {documents.text_content}")

     # Add to RAG system if rag_system is provided and load_only is False
     if rag_system:
         try:
+            rag_system.add_document(doc_id = f"{id}_{documents.title}", text = documents.text_content, meta_data = {"source": file_path})
             print(f"doc_id: {id}_{documents.title}")
             print(f"content: {documents.text_content}")

app/handlers/message_handler.py CHANGED Viewed

@@ -30,7 +30,7 @@ class MessageHandler:
         try:
             # Parse message
             message = MessageParser.parse(raw_message)
             if self.message_cache.exists(message.id):
                 self.logger.info(f"Duplicate message detected and skipped: {message.id}")
                 return {"status": "duplicate", "message_id": message.id}
@@ -40,18 +40,18 @@ class MessageHandler:
             self.chat_manager.initialize_chat(message.sender_id)
             # Process message with LLM
             result = await process_message_with_llm(
                 message.sender_id,
                 message.content,
                 self.chat_manager.get_chat_history(message.sender_id),
-                rag_sytem = rag_system,
                 whatsapp_token=whatsapp_token,
                 whatsapp_url=whatsapp_url,
                 **media_paths
             )
             # Append message to chat to keep track of conversation
             self.chat_manager.append_message(message.sender_id, "user", message.content)
             self.chat_manager.append_message(message.sender_id, "model", result)
@@ -61,6 +61,7 @@ class MessageHandler:
             return {"status": "success", "message_id": message.id, "result": result}
         except Exception as e:
             return {"status": "error", "message_id": raw_message.get("id"), "error": str(e)}
     async def _process_media(self, message: Message, whatsapp_token: str) -> Dict[str, Optional[str]]:

         try:
             # Parse message
             message = MessageParser.parse(raw_message)
+            self.logger.info(f"Parsed Message: {message}")
             if self.message_cache.exists(message.id):
                 self.logger.info(f"Duplicate message detected and skipped: {message.id}")
                 return {"status": "duplicate", "message_id": message.id}
             self.chat_manager.initialize_chat(message.sender_id)
             # Process message with LLM
             result = await process_message_with_llm(
                 message.sender_id,
                 message.content,
                 self.chat_manager.get_chat_history(message.sender_id),
+                rag_system = rag_system,
                 whatsapp_token=whatsapp_token,
                 whatsapp_url=whatsapp_url,
                 **media_paths
             )
+            self.logger.info(f"Result: {result}")
             # Append message to chat to keep track of conversation
             self.chat_manager.append_message(message.sender_id, "user", message.content)
             self.chat_manager.append_message(message.sender_id, "model", result)
             return {"status": "success", "message_id": message.id, "result": result}
         except Exception as e:
+            self.logger.error(f"Error processing message: {str(e)}")
             return {"status": "error", "message_id": raw_message.get("id"), "error": str(e)}
     async def _process_media(self, message: Message, whatsapp_token: str) -> Dict[str, Optional[str]]:

app/handlers/webhook_handler.py CHANGED Viewed

@@ -33,7 +33,9 @@ class WebhookHandler:
                 changes = entry.get("changes", [])
                 for change in changes:
                     messages = change.get("value", {}).get("messages", [])
                     for message in messages:
                         self.logger.info(f"Processing message: {message}")
                         response = await self.message_handler.handle(
                             raw_message=message,

                 changes = entry.get("changes", [])
                 for change in changes:
                     messages = change.get("value", {}).get("messages", [])
+                    self.logger.info(f"message length: {len(messages)}")
                     for message in messages:
                         self.logger.info(f"Processing message: {message}")
                         response = await self.message_handler.handle(
                             raw_message=message,

app/main.py CHANGED Viewed

@@ -57,6 +57,7 @@ async def setup_rag_system():
     embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Replace with your model if different
     rag_system = RAGSystem(embedding_model)
     return rag_system
 # Initialize FastAPI app
 @asynccontextmanager
@@ -123,6 +124,12 @@ async def webhook(request: Request):
         gemini_api = request.query_params.get("gemini_api")
         llm_model = request.query_params.get("cx_code")
         print(f"payload: {payload}")
         response = await webhook_handler.process_webhook(
             payload=payload,

     embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Replace with your model if different
     rag_system = RAGSystem(embedding_model)
     return rag_system
 # Initialize FastAPI app
 @asynccontextmanager
         gemini_api = request.query_params.get("gemini_api")
         llm_model = request.query_params.get("cx_code")
+          # Return HTTP 200 immediately
+        response = JSONResponse(
+            content={"status": "received"},
+            status_code=200
+        )
         print(f"payload: {payload}")
         response = await webhook_handler.process_webhook(
             payload=payload,

app/search/hybrid_search.py CHANGED Viewed

@@ -69,7 +69,7 @@ class Hybrid_search:
         # Apply threshold and get top_n results
         results = self._get_top_n_results(filtered_doc_ids, hybrid_scores, top_n, threshold)
-        # self.logger.info(f"Results before reranking: {results}")
         # If results exist, apply re-ranking
         # if results:

         # Apply threshold and get top_n results
         results = self._get_top_n_results(filtered_doc_ids, hybrid_scores, top_n, threshold)
+        self.logger.info(f"Results before reranking: {results}")
         # If results exist, apply re-ranking
         # if results:

app/search/rag_pipeline.py CHANGED Viewed

@@ -21,10 +21,9 @@ logger = logging.getLogger(__name__)
 from keybert import KeyBERT
 import asyncio
-async def extract_keywords_async(doc, threshold=0.4):
     kw_model = KeyBERT()
-    loop = asyncio.get_event_loop()
-    keywords = await loop.run_in_executor(None, kw_model.extract_keywords, doc, threshold)
     keywords = [key for key, _ in keywords]
     return keywords
@@ -41,11 +40,11 @@ class RAGSystem:
         self.faiss_wrapper = FAISS_search(embedding_model)
         self.hybrid_search = Hybrid_search(self.bm25_wrapper, self.faiss_wrapper)
-    def add_document(self, doc_id, text, metadata=None):
         self.token_counter.add_document(doc_id, text)
         self.doc_ids.append(doc_id)
         self.documents.append(text)
-        self.meta_data.append(metadata)
         self.bm25_wrapper.add_document(doc_id, text)
         self.faiss_wrapper.add_document(doc_id, text)
@@ -92,14 +91,14 @@ class RAGSystem:
                     doc = self.documents[index]
-                    # meta_data = self.meta_data[index]
                     # Extract the file name and page number
                     # file_name = meta_data['source'].split('/')[-1]  # Extracts 'POJK 31 - 2018.pdf'
                     # page_number = meta_data.get('page', 'unknown')
                     # url = meta_data['source']
                     # file_name = meta_data.get('source', 'unknown_source').split('/')[-1]  # Safe extraction
                     # page_number = meta_data.get('page', 'unknown')  # Default to 'unknown' if 'page' is missing
-                    # url = meta_data.get('source', 'unknown_url')  # Default URL fallback
                     # logger.info(f"file_name: {file_name}, page_number: {page_number}, url: {url}")
@@ -108,10 +107,10 @@ class RAGSystem:
                     # doc_name = f"{file_name}"
                     self.results.append(doc)
-                    retrieved_docs.append({"text": doc})
             return retrieved_docs
         else:
-            return [{"name": "No relevant documents found.", "text": None}]
     def get_total_tokens(self):
         return self.token_counter.get_total_tokens()

 from keybert import KeyBERT
 import asyncio
+def extract_keywords_async(doc, threshold=0.4, top_n = 5):
     kw_model = KeyBERT()
+    keywords = kw_model.extract_keywords(doc, threshold=threshold, top_n=top_n)
     keywords = [key for key, _ in keywords]
     return keywords
         self.faiss_wrapper = FAISS_search(embedding_model)
         self.hybrid_search = Hybrid_search(self.bm25_wrapper, self.faiss_wrapper)
+    def add_document(self, doc_id, text, meta_data=None):
         self.token_counter.add_document(doc_id, text)
         self.doc_ids.append(doc_id)
         self.documents.append(text)
+        self.meta_data.append(meta_data)
         self.bm25_wrapper.add_document(doc_id, text)
         self.faiss_wrapper.add_document(doc_id, text)
                     doc = self.documents[index]
+                    meta_data = self.meta_data[index]
                     # Extract the file name and page number
                     # file_name = meta_data['source'].split('/')[-1]  # Extracts 'POJK 31 - 2018.pdf'
                     # page_number = meta_data.get('page', 'unknown')
                     # url = meta_data['source']
                     # file_name = meta_data.get('source', 'unknown_source').split('/')[-1]  # Safe extraction
                     # page_number = meta_data.get('page', 'unknown')  # Default to 'unknown' if 'page' is missing
+                    url = meta_data.get('source', 'unknown_url')  # Default URL fallback
                     # logger.info(f"file_name: {file_name}, page_number: {page_number}, url: {url}")
                     # doc_name = f"{file_name}"
                     self.results.append(doc)
+                    retrieved_docs.append({"url":url, "text": doc})
             return retrieved_docs
         else:
+            return [{"url": "None.", "text": None}]
     def get_total_tokens(self):
         return self.token_counter.get_total_tokens()

app/services/message.py CHANGED Viewed

@@ -16,6 +16,7 @@ from app.utils.load_env import ACCESS_TOKEN, WHATSAPP_API_URL, GEMINI_API, OPENA
 from app.utils.system_prompt import system_prompt
 from app.services.search_engine import google_search
 # Load environment variables
 load_dotenv()
@@ -147,16 +148,18 @@ async def generate_response_from_gemini(
         chat = model.start_chat(history=history)
         if rag_system:
-            keywords = await rag_system.extract_keywords_async(content)
             # Implement RAG: Retrieve relevant documents
             retrieved_docs = await rag_system.adv_query(content, keywords=keywords, top_k=1)
             if retrieved_docs:
                 logger.info(f"Retrieved {len(retrieved_docs)} documents for context.")
                 # Format the retrieved documents as a context string
-                context = "\n\n".join([f"Content: {doc['text']}" for doc in retrieved_docs])
                 # Option 1: Append to history as a system message
-                history.append({"role": "system", "content": f"Relevant documents:\n{context}"})
                 # Reinitialize chat with updated history
                 chat = model.start_chat(history=history)

 from app.utils.system_prompt import system_prompt
 from app.services.search_engine import google_search
+from app.search.rag_pipeline import extract_keywords_async
 # Load environment variables
 load_dotenv()
         chat = model.start_chat(history=history)
         if rag_system:
+            keywords = extract_keywords_async(content)
+            logger.info(f"Extracted Keywords: {keywords}")
             # Implement RAG: Retrieve relevant documents
             retrieved_docs = await rag_system.adv_query(content, keywords=keywords, top_k=1)
             if retrieved_docs:
                 logger.info(f"Retrieved {len(retrieved_docs)} documents for context.")
                 # Format the retrieved documents as a context string
+                context = "\n\n".join([f"Source:{doc['url']}\nContent: {doc['text']}" for doc in retrieved_docs])
                 # Option 1: Append to history as a system message
+                history.append({"role": "user", "parts": f"Relevant documents:\n{context}"})
+                logger.info(f"History: {history}")
                 # Reinitialize chat with updated history
                 chat = model.start_chat(history=history)

app/utils/system_prompt.py CHANGED Viewed

@@ -14,6 +14,7 @@ Content Guidelines:
     Up-to-Date Information: Provide the most current and relevant data available about ongoing projects and city improvements. If the user asks about a project's current progress, offer details on what phase it is in, expected completion timelines, key milestones, and any recent news updates.
     Neutrality and Positivity: Always maintain a neutral stance. Do not criticize the local government or its officials. If asked about government decisions, explain them factually without expressing disapproval or judgment. Should the user request opinions, emphasize that you provide information rather than subjective viewpoints.
     Encouraging Verification: If the user requests official documentation or more detailed information, guide them to verified channels such as the city's official website, relevant department portals, or licensed information centers.
 Example Interactions:

     Up-to-Date Information: Provide the most current and relevant data available about ongoing projects and city improvements. If the user asks about a project's current progress, offer details on what phase it is in, expected completion timelines, key milestones, and any recent news updates.
     Neutrality and Positivity: Always maintain a neutral stance. Do not criticize the local government or its officials. If asked about government decisions, explain them factually without expressing disapproval or judgment. Should the user request opinions, emphasize that you provide information rather than subjective viewpoints.
     Encouraging Verification: If the user requests official documentation or more detailed information, guide them to verified channels such as the city's official website, relevant department portals, or licensed information centers.
+    Always Include Sources: When your response is based on information provided from external sources, include the source link explicitly. For example: "Informasi ini berasal dari www.indosource.com. Anda dapat mengunjungi tautan tersebut untuk detail lebih lanjut." Clearly attribute the link to maintain transparency. Make sure you don't include redundant links.
 Example Interactions:

document_logs_2024-12-20.txt DELETED Viewed

	@@ -1,2 +0,0 @@
1	- 2024-12-20 12:49:01,713 - INFO - ID: ea205193-4582-44bc-ab71-80176aac7aef, Snippet: [SSW](https://sswalfa.surabaya.go.id/home) [![](https://sswalfa.surabaya.go.id/assets/images/logo-
2	- 2024-12-20 16:59:00,859 - INFO - ID: 693547c8-b87c-4601-ae57-2ec8c2a32d06_Informasi Perizinan \| Surabaya Single Window, Snippet: [SSW](https://sswalfa.surabaya.go.id/home) [![](https://sswalfa.surabaya.go.id/assets/images/logo-

document_logs_2024-12-21.txt DELETED Viewed

File without changes