Spaces:
Build error
Build error
Commit
·
e0c1af0
1
Parent(s):
9282958
Test Rag
Browse files- .gitignore +1 -1
- app/crud/process_file.py +1 -1
- app/handlers/message_handler.py +5 -4
- app/handlers/webhook_handler.py +2 -0
- app/main.py +7 -0
- app/search/hybrid_search.py +1 -1
- app/search/rag_pipeline.py +8 -9
- app/services/message.py +6 -3
- app/utils/system_prompt.py +1 -0
- document_logs_2024-12-20.txt +0 -2
- document_logs_2024-12-21.txt +0 -0
.gitignore
CHANGED
@@ -4,6 +4,6 @@ __pycache__
|
|
4 |
user_media/
|
5 |
toolkits/
|
6 |
test*.py
|
7 |
-
|
8 |
|
9 |
|
|
|
4 |
user_media/
|
5 |
toolkits/
|
6 |
test*.py
|
7 |
+
document*.txt
|
8 |
|
9 |
|
app/crud/process_file.py
CHANGED
@@ -130,7 +130,7 @@ async def process_uploaded_file(
|
|
130 |
# Add to RAG system if rag_system is provided and load_only is False
|
131 |
if rag_system:
|
132 |
try:
|
133 |
-
rag_system.add_document(doc_id = f"{id}_{documents.title}", text = documents.text_content)
|
134 |
|
135 |
print(f"doc_id: {id}_{documents.title}")
|
136 |
print(f"content: {documents.text_content}")
|
|
|
130 |
# Add to RAG system if rag_system is provided and load_only is False
|
131 |
if rag_system:
|
132 |
try:
|
133 |
+
rag_system.add_document(doc_id = f"{id}_{documents.title}", text = documents.text_content, meta_data = {"source": file_path})
|
134 |
|
135 |
print(f"doc_id: {id}_{documents.title}")
|
136 |
print(f"content: {documents.text_content}")
|
app/handlers/message_handler.py
CHANGED
@@ -30,7 +30,7 @@ class MessageHandler:
|
|
30 |
try:
|
31 |
# Parse message
|
32 |
message = MessageParser.parse(raw_message)
|
33 |
-
|
34 |
if self.message_cache.exists(message.id):
|
35 |
self.logger.info(f"Duplicate message detected and skipped: {message.id}")
|
36 |
return {"status": "duplicate", "message_id": message.id}
|
@@ -40,18 +40,18 @@ class MessageHandler:
|
|
40 |
|
41 |
self.chat_manager.initialize_chat(message.sender_id)
|
42 |
|
43 |
-
|
44 |
# Process message with LLM
|
45 |
result = await process_message_with_llm(
|
46 |
message.sender_id,
|
47 |
message.content,
|
48 |
self.chat_manager.get_chat_history(message.sender_id),
|
49 |
-
|
50 |
whatsapp_token=whatsapp_token,
|
51 |
whatsapp_url=whatsapp_url,
|
52 |
**media_paths
|
53 |
)
|
54 |
-
|
55 |
# Append message to chat to keep track of conversation
|
56 |
self.chat_manager.append_message(message.sender_id, "user", message.content)
|
57 |
self.chat_manager.append_message(message.sender_id, "model", result)
|
@@ -61,6 +61,7 @@ class MessageHandler:
|
|
61 |
return {"status": "success", "message_id": message.id, "result": result}
|
62 |
|
63 |
except Exception as e:
|
|
|
64 |
return {"status": "error", "message_id": raw_message.get("id"), "error": str(e)}
|
65 |
|
66 |
async def _process_media(self, message: Message, whatsapp_token: str) -> Dict[str, Optional[str]]:
|
|
|
30 |
try:
|
31 |
# Parse message
|
32 |
message = MessageParser.parse(raw_message)
|
33 |
+
self.logger.info(f"Parsed Message: {message}")
|
34 |
if self.message_cache.exists(message.id):
|
35 |
self.logger.info(f"Duplicate message detected and skipped: {message.id}")
|
36 |
return {"status": "duplicate", "message_id": message.id}
|
|
|
40 |
|
41 |
self.chat_manager.initialize_chat(message.sender_id)
|
42 |
|
43 |
+
|
44 |
# Process message with LLM
|
45 |
result = await process_message_with_llm(
|
46 |
message.sender_id,
|
47 |
message.content,
|
48 |
self.chat_manager.get_chat_history(message.sender_id),
|
49 |
+
rag_system = rag_system,
|
50 |
whatsapp_token=whatsapp_token,
|
51 |
whatsapp_url=whatsapp_url,
|
52 |
**media_paths
|
53 |
)
|
54 |
+
self.logger.info(f"Result: {result}")
|
55 |
# Append message to chat to keep track of conversation
|
56 |
self.chat_manager.append_message(message.sender_id, "user", message.content)
|
57 |
self.chat_manager.append_message(message.sender_id, "model", result)
|
|
|
61 |
return {"status": "success", "message_id": message.id, "result": result}
|
62 |
|
63 |
except Exception as e:
|
64 |
+
self.logger.error(f"Error processing message: {str(e)}")
|
65 |
return {"status": "error", "message_id": raw_message.get("id"), "error": str(e)}
|
66 |
|
67 |
async def _process_media(self, message: Message, whatsapp_token: str) -> Dict[str, Optional[str]]:
|
app/handlers/webhook_handler.py
CHANGED
@@ -33,7 +33,9 @@ class WebhookHandler:
|
|
33 |
changes = entry.get("changes", [])
|
34 |
for change in changes:
|
35 |
messages = change.get("value", {}).get("messages", [])
|
|
|
36 |
for message in messages:
|
|
|
37 |
self.logger.info(f"Processing message: {message}")
|
38 |
response = await self.message_handler.handle(
|
39 |
raw_message=message,
|
|
|
33 |
changes = entry.get("changes", [])
|
34 |
for change in changes:
|
35 |
messages = change.get("value", {}).get("messages", [])
|
36 |
+
self.logger.info(f"message length: {len(messages)}")
|
37 |
for message in messages:
|
38 |
+
|
39 |
self.logger.info(f"Processing message: {message}")
|
40 |
response = await self.message_handler.handle(
|
41 |
raw_message=message,
|
app/main.py
CHANGED
@@ -57,6 +57,7 @@ async def setup_rag_system():
|
|
57 |
embedding_model = SentenceTransformer('all-MiniLM-L6-v2') # Replace with your model if different
|
58 |
rag_system = RAGSystem(embedding_model)
|
59 |
|
|
|
60 |
return rag_system
|
61 |
# Initialize FastAPI app
|
62 |
@asynccontextmanager
|
@@ -123,6 +124,12 @@ async def webhook(request: Request):
|
|
123 |
gemini_api = request.query_params.get("gemini_api")
|
124 |
llm_model = request.query_params.get("cx_code")
|
125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
print(f"payload: {payload}")
|
127 |
response = await webhook_handler.process_webhook(
|
128 |
payload=payload,
|
|
|
57 |
embedding_model = SentenceTransformer('all-MiniLM-L6-v2') # Replace with your model if different
|
58 |
rag_system = RAGSystem(embedding_model)
|
59 |
|
60 |
+
|
61 |
return rag_system
|
62 |
# Initialize FastAPI app
|
63 |
@asynccontextmanager
|
|
|
124 |
gemini_api = request.query_params.get("gemini_api")
|
125 |
llm_model = request.query_params.get("cx_code")
|
126 |
|
127 |
+
# Return HTTP 200 immediately
|
128 |
+
response = JSONResponse(
|
129 |
+
content={"status": "received"},
|
130 |
+
status_code=200
|
131 |
+
)
|
132 |
+
|
133 |
print(f"payload: {payload}")
|
134 |
response = await webhook_handler.process_webhook(
|
135 |
payload=payload,
|
app/search/hybrid_search.py
CHANGED
@@ -69,7 +69,7 @@ class Hybrid_search:
|
|
69 |
|
70 |
# Apply threshold and get top_n results
|
71 |
results = self._get_top_n_results(filtered_doc_ids, hybrid_scores, top_n, threshold)
|
72 |
-
|
73 |
|
74 |
# If results exist, apply re-ranking
|
75 |
# if results:
|
|
|
69 |
|
70 |
# Apply threshold and get top_n results
|
71 |
results = self._get_top_n_results(filtered_doc_ids, hybrid_scores, top_n, threshold)
|
72 |
+
self.logger.info(f"Results before reranking: {results}")
|
73 |
|
74 |
# If results exist, apply re-ranking
|
75 |
# if results:
|
app/search/rag_pipeline.py
CHANGED
@@ -21,10 +21,9 @@ logger = logging.getLogger(__name__)
|
|
21 |
from keybert import KeyBERT
|
22 |
import asyncio
|
23 |
|
24 |
-
|
25 |
kw_model = KeyBERT()
|
26 |
-
|
27 |
-
keywords = await loop.run_in_executor(None, kw_model.extract_keywords, doc, threshold)
|
28 |
keywords = [key for key, _ in keywords]
|
29 |
return keywords
|
30 |
|
@@ -41,11 +40,11 @@ class RAGSystem:
|
|
41 |
self.faiss_wrapper = FAISS_search(embedding_model)
|
42 |
self.hybrid_search = Hybrid_search(self.bm25_wrapper, self.faiss_wrapper)
|
43 |
|
44 |
-
def add_document(self, doc_id, text,
|
45 |
self.token_counter.add_document(doc_id, text)
|
46 |
self.doc_ids.append(doc_id)
|
47 |
self.documents.append(text)
|
48 |
-
self.meta_data.append(
|
49 |
self.bm25_wrapper.add_document(doc_id, text)
|
50 |
self.faiss_wrapper.add_document(doc_id, text)
|
51 |
|
@@ -92,14 +91,14 @@ class RAGSystem:
|
|
92 |
|
93 |
doc = self.documents[index]
|
94 |
|
95 |
-
|
96 |
# Extract the file name and page number
|
97 |
# file_name = meta_data['source'].split('/')[-1] # Extracts 'POJK 31 - 2018.pdf'
|
98 |
# page_number = meta_data.get('page', 'unknown')
|
99 |
# url = meta_data['source']
|
100 |
# file_name = meta_data.get('source', 'unknown_source').split('/')[-1] # Safe extraction
|
101 |
# page_number = meta_data.get('page', 'unknown') # Default to 'unknown' if 'page' is missing
|
102 |
-
|
103 |
|
104 |
# logger.info(f"file_name: {file_name}, page_number: {page_number}, url: {url}")
|
105 |
|
@@ -108,10 +107,10 @@ class RAGSystem:
|
|
108 |
# doc_name = f"{file_name}"
|
109 |
|
110 |
self.results.append(doc)
|
111 |
-
retrieved_docs.append({"text": doc})
|
112 |
return retrieved_docs
|
113 |
else:
|
114 |
-
return [{"
|
115 |
|
116 |
def get_total_tokens(self):
|
117 |
return self.token_counter.get_total_tokens()
|
|
|
21 |
from keybert import KeyBERT
|
22 |
import asyncio
|
23 |
|
24 |
+
def extract_keywords_async(doc, threshold=0.4, top_n = 5):
|
25 |
kw_model = KeyBERT()
|
26 |
+
keywords = kw_model.extract_keywords(doc, threshold=threshold, top_n=top_n)
|
|
|
27 |
keywords = [key for key, _ in keywords]
|
28 |
return keywords
|
29 |
|
|
|
40 |
self.faiss_wrapper = FAISS_search(embedding_model)
|
41 |
self.hybrid_search = Hybrid_search(self.bm25_wrapper, self.faiss_wrapper)
|
42 |
|
43 |
+
def add_document(self, doc_id, text, meta_data=None):
|
44 |
self.token_counter.add_document(doc_id, text)
|
45 |
self.doc_ids.append(doc_id)
|
46 |
self.documents.append(text)
|
47 |
+
self.meta_data.append(meta_data)
|
48 |
self.bm25_wrapper.add_document(doc_id, text)
|
49 |
self.faiss_wrapper.add_document(doc_id, text)
|
50 |
|
|
|
91 |
|
92 |
doc = self.documents[index]
|
93 |
|
94 |
+
meta_data = self.meta_data[index]
|
95 |
# Extract the file name and page number
|
96 |
# file_name = meta_data['source'].split('/')[-1] # Extracts 'POJK 31 - 2018.pdf'
|
97 |
# page_number = meta_data.get('page', 'unknown')
|
98 |
# url = meta_data['source']
|
99 |
# file_name = meta_data.get('source', 'unknown_source').split('/')[-1] # Safe extraction
|
100 |
# page_number = meta_data.get('page', 'unknown') # Default to 'unknown' if 'page' is missing
|
101 |
+
url = meta_data.get('source', 'unknown_url') # Default URL fallback
|
102 |
|
103 |
# logger.info(f"file_name: {file_name}, page_number: {page_number}, url: {url}")
|
104 |
|
|
|
107 |
# doc_name = f"{file_name}"
|
108 |
|
109 |
self.results.append(doc)
|
110 |
+
retrieved_docs.append({"url":url, "text": doc})
|
111 |
return retrieved_docs
|
112 |
else:
|
113 |
+
return [{"url": "None.", "text": None}]
|
114 |
|
115 |
def get_total_tokens(self):
|
116 |
return self.token_counter.get_total_tokens()
|
app/services/message.py
CHANGED
@@ -16,6 +16,7 @@ from app.utils.load_env import ACCESS_TOKEN, WHATSAPP_API_URL, GEMINI_API, OPENA
|
|
16 |
from app.utils.system_prompt import system_prompt
|
17 |
|
18 |
from app.services.search_engine import google_search
|
|
|
19 |
# Load environment variables
|
20 |
load_dotenv()
|
21 |
|
@@ -147,16 +148,18 @@ async def generate_response_from_gemini(
|
|
147 |
chat = model.start_chat(history=history)
|
148 |
|
149 |
if rag_system:
|
150 |
-
keywords =
|
|
|
151 |
# Implement RAG: Retrieve relevant documents
|
152 |
retrieved_docs = await rag_system.adv_query(content, keywords=keywords, top_k=1)
|
153 |
if retrieved_docs:
|
154 |
logger.info(f"Retrieved {len(retrieved_docs)} documents for context.")
|
155 |
# Format the retrieved documents as a context string
|
156 |
-
context = "\n\n".join([f"
|
157 |
# Option 1: Append to history as a system message
|
158 |
-
history.append({"role": "
|
159 |
|
|
|
160 |
# Reinitialize chat with updated history
|
161 |
chat = model.start_chat(history=history)
|
162 |
|
|
|
16 |
from app.utils.system_prompt import system_prompt
|
17 |
|
18 |
from app.services.search_engine import google_search
|
19 |
+
from app.search.rag_pipeline import extract_keywords_async
|
20 |
# Load environment variables
|
21 |
load_dotenv()
|
22 |
|
|
|
148 |
chat = model.start_chat(history=history)
|
149 |
|
150 |
if rag_system:
|
151 |
+
keywords = extract_keywords_async(content)
|
152 |
+
logger.info(f"Extracted Keywords: {keywords}")
|
153 |
# Implement RAG: Retrieve relevant documents
|
154 |
retrieved_docs = await rag_system.adv_query(content, keywords=keywords, top_k=1)
|
155 |
if retrieved_docs:
|
156 |
logger.info(f"Retrieved {len(retrieved_docs)} documents for context.")
|
157 |
# Format the retrieved documents as a context string
|
158 |
+
context = "\n\n".join([f"Source:{doc['url']}\nContent: {doc['text']}" for doc in retrieved_docs])
|
159 |
# Option 1: Append to history as a system message
|
160 |
+
history.append({"role": "user", "parts": f"Relevant documents:\n{context}"})
|
161 |
|
162 |
+
logger.info(f"History: {history}")
|
163 |
# Reinitialize chat with updated history
|
164 |
chat = model.start_chat(history=history)
|
165 |
|
app/utils/system_prompt.py
CHANGED
@@ -14,6 +14,7 @@ Content Guidelines:
|
|
14 |
Up-to-Date Information: Provide the most current and relevant data available about ongoing projects and city improvements. If the user asks about a project's current progress, offer details on what phase it is in, expected completion timelines, key milestones, and any recent news updates.
|
15 |
Neutrality and Positivity: Always maintain a neutral stance. Do not criticize the local government or its officials. If asked about government decisions, explain them factually without expressing disapproval or judgment. Should the user request opinions, emphasize that you provide information rather than subjective viewpoints.
|
16 |
Encouraging Verification: If the user requests official documentation or more detailed information, guide them to verified channels such as the city's official website, relevant department portals, or licensed information centers.
|
|
|
17 |
|
18 |
Example Interactions:
|
19 |
|
|
|
14 |
Up-to-Date Information: Provide the most current and relevant data available about ongoing projects and city improvements. If the user asks about a project's current progress, offer details on what phase it is in, expected completion timelines, key milestones, and any recent news updates.
|
15 |
Neutrality and Positivity: Always maintain a neutral stance. Do not criticize the local government or its officials. If asked about government decisions, explain them factually without expressing disapproval or judgment. Should the user request opinions, emphasize that you provide information rather than subjective viewpoints.
|
16 |
Encouraging Verification: If the user requests official documentation or more detailed information, guide them to verified channels such as the city's official website, relevant department portals, or licensed information centers.
|
17 |
+
Always Include Sources: When your response is based on information provided from external sources, include the source link explicitly. For example: "Informasi ini berasal dari www.indosource.com. Anda dapat mengunjungi tautan tersebut untuk detail lebih lanjut." Clearly attribute the link to maintain transparency. Make sure you don't include redundant links.
|
18 |
|
19 |
Example Interactions:
|
20 |
|
document_logs_2024-12-20.txt
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
2024-12-20 12:49:01,713 - INFO - ID: ea205193-4582-44bc-ab71-80176aac7aef, Snippet: [SSW](https://sswalfa.surabaya.go.id/home) [ [![](https://sswalfa.surabaya.go.id/assets/images/logo-
|
|
|
|
|
|
document_logs_2024-12-21.txt
DELETED
File without changes
|