ariansyahdedy committed
Commit e0c1af0 · 1 Parent(s): 9282958
.gitignore CHANGED
@@ -4,6 +4,6 @@ __pycache__
 user_media/
 toolkits/
 test*.py
-
+document*.txt
 
 
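The new entry is a glob; it covers the two document_logs_*.txt files removed at the bottom of this commit. A quick check — Python's fnmatch mirrors gitignore's basic `*` globbing for these names, though gitignore has extra rules:

from fnmatch import fnmatch

# The deleted log files below match the newly ignored pattern.
print(fnmatch("document_logs_2024-12-20.txt", "document*.txt"))  # True
print(fnmatch("document_logs_2024-12-21.txt", "document*.txt"))  # True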
app/crud/process_file.py CHANGED
@@ -130,7 +130,7 @@ async def process_uploaded_file(
     # Add to RAG system if rag_system is provided and load_only is False
     if rag_system:
         try:
-            rag_system.add_document(doc_id = f"{id}_{documents.title}", text = documents.text_content)
+            rag_system.add_document(doc_id = f"{id}_{documents.title}", text = documents.text_content, meta_data = {"source": file_path})
 
             print(f"doc_id: {id}_{documents.title}")
             print(f"content: {documents.text_content}")
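A minimal sketch of the new call shape, with invented values; only the doc_id, text, and meta_data keywords are confirmed by this diff. The stored "source" is what later feeds the url field returned by adv_query (see the rag_pipeline.py hunks below).

# Hypothetical call; rag_system is an initialized RAGSystem and the
# values stand in for the upload being processed.
rag_system.add_document(
    doc_id="42_annual_report",                       # f"{id}_{documents.title}"
    text="extracted text of the uploaded file ...",  # documents.text_content
    meta_data={"source": "user_media/annual_report.pdf"},  # {"source": file_path}
)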
app/handlers/message_handler.py CHANGED
@@ -30,7 +30,7 @@ class MessageHandler:
         try:
             # Parse message
             message = MessageParser.parse(raw_message)
-
+            self.logger.info(f"Parsed Message: {message}")
             if self.message_cache.exists(message.id):
                 self.logger.info(f"Duplicate message detected and skipped: {message.id}")
                 return {"status": "duplicate", "message_id": message.id}
@@ -40,18 +40,18 @@ class MessageHandler:
 
             self.chat_manager.initialize_chat(message.sender_id)
 
-
+
             # Process message with LLM
             result = await process_message_with_llm(
                 message.sender_id,
                 message.content,
                 self.chat_manager.get_chat_history(message.sender_id),
-                rag_sytem = rag_system,
+                rag_system = rag_system,
                 whatsapp_token=whatsapp_token,
                 whatsapp_url=whatsapp_url,
                 **media_paths
             )
-
+            self.logger.info(f"Result: {result}")
             # Append message to chat to keep track of conversation
             self.chat_manager.append_message(message.sender_id, "user", message.content)
             self.chat_manager.append_message(message.sender_id, "model", result)
@@ -61,6 +61,7 @@ class MessageHandler:
             return {"status": "success", "message_id": message.id, "result": result}
 
         except Exception as e:
+            self.logger.error(f"Error processing message: {str(e)}")
            return {"status": "error", "message_id": raw_message.get("id"), "error": str(e)}
 
     async def _process_media(self, message: Message, whatsapp_token: str) -> Dict[str, Optional[str]]:
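The -40,18 hunk corrects a misspelled keyword, rag_sytem → rag_system. For illustration, the parameter list that call site implies — this signature is inferred, not part of the commit:

# Inferred for illustration; only the keyword names used at the call site
# are confirmed by the diff. **media_paths carries optional media paths.
async def process_message_with_llm(sender_id, content, chat_history,
                                   rag_system=None, whatsapp_token=None,
                                   whatsapp_url=None, **media_paths):
    ...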
app/handlers/webhook_handler.py CHANGED
@@ -33,7 +33,9 @@ class WebhookHandler:
             changes = entry.get("changes", [])
             for change in changes:
                 messages = change.get("value", {}).get("messages", [])
+                self.logger.info(f"message length: {len(messages)}")
                 for message in messages:
+
                     self.logger.info(f"Processing message: {message}")
                     response = await self.message_handler.handle(
                         raw_message=message,
app/main.py CHANGED
@@ -57,6 +57,7 @@ async def setup_rag_system():
     embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Replace with your model if different
     rag_system = RAGSystem(embedding_model)
 
+
     return rag_system
 # Initialize FastAPI app
 @asynccontextmanager
@@ -123,6 +124,12 @@ async def webhook(request: Request):
     gemini_api = request.query_params.get("gemini_api")
     llm_model = request.query_params.get("cx_code")
 
+    # Return HTTP 200 immediately
+    response = JSONResponse(
+        content={"status": "received"},
+        status_code=200
+    )
+
     print(f"payload: {payload}")
     response = await webhook_handler.process_webhook(
         payload=payload,
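Per the second hunk's intent comment, the pre-built 200 response is reassigned by the process_webhook await just below it. A common way to actually acknowledge first and process afterwards is FastAPI's BackgroundTasks — a minimal sketch under that assumption, not the code in this commit:

from fastapi import BackgroundTasks, FastAPI, Request
from fastapi.responses import JSONResponse

app = FastAPI()

async def process_webhook(payload: dict) -> None:
    """Stand-in for webhook_handler.process_webhook (assumed API)."""
    ...

@app.post("/webhook")
async def webhook(request: Request, background_tasks: BackgroundTasks):
    payload = await request.json()
    # Queue the heavy work; FastAPI runs it after the response is sent.
    background_tasks.add_task(process_webhook, payload=payload)
    return JSONResponse(content={"status": "received"}, status_code=200)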
app/search/hybrid_search.py CHANGED
@@ -69,7 +69,7 @@ class Hybrid_search:
 
         # Apply threshold and get top_n results
         results = self._get_top_n_results(filtered_doc_ids, hybrid_scores, top_n, threshold)
-        # self.logger.info(f"Results before reranking: {results}")
+        self.logger.info(f"Results before reranking: {results}")
 
         # If results exist, apply re-ranking
         # if results:
app/search/rag_pipeline.py CHANGED
@@ -21,10 +21,9 @@ logger = logging.getLogger(__name__)
 from keybert import KeyBERT
 import asyncio
 
-async def extract_keywords_async(doc, threshold=0.4):
+def extract_keywords_async(doc, threshold=0.4, top_n = 5):
     kw_model = KeyBERT()
-    loop = asyncio.get_event_loop()
-    keywords = await loop.run_in_executor(None, kw_model.extract_keywords, doc, threshold)
+    keywords = kw_model.extract_keywords(doc, threshold=threshold, top_n=top_n)
     keywords = [key for key, _ in keywords]
     return keywords
 
@@ -41,11 +40,11 @@ class RAGSystem:
         self.faiss_wrapper = FAISS_search(embedding_model)
         self.hybrid_search = Hybrid_search(self.bm25_wrapper, self.faiss_wrapper)
 
-    def add_document(self, doc_id, text, metadata=None):
+    def add_document(self, doc_id, text, meta_data=None):
         self.token_counter.add_document(doc_id, text)
         self.doc_ids.append(doc_id)
         self.documents.append(text)
-        self.meta_data.append(metadata)
+        self.meta_data.append(meta_data)
         self.bm25_wrapper.add_document(doc_id, text)
         self.faiss_wrapper.add_document(doc_id, text)
 
@@ -92,14 +91,14 @@ class RAGSystem:
 
                 doc = self.documents[index]
 
-                # meta_data = self.meta_data[index]
+                meta_data = self.meta_data[index]
                 # Extract the file name and page number
                 # file_name = meta_data['source'].split('/')[-1]  # Extracts 'POJK 31 - 2018.pdf'
                 # page_number = meta_data.get('page', 'unknown')
                 # url = meta_data['source']
                 # file_name = meta_data.get('source', 'unknown_source').split('/')[-1]  # Safe extraction
                 # page_number = meta_data.get('page', 'unknown')  # Default to 'unknown' if 'page' is missing
-                # url = meta_data.get('source', 'unknown_url')  # Default URL fallback
+                url = meta_data.get('source', 'unknown_url')  # Default URL fallback
 
                 # logger.info(f"file_name: {file_name}, page_number: {page_number}, url: {url}")
@@ -108,10 +107,10 @@ class RAGSystem:
                 # doc_name = f"{file_name}"
 
                 self.results.append(doc)
-                retrieved_docs.append({"text": doc})
+                retrieved_docs.append({"url": url, "text": doc})
             return retrieved_docs
         else:
-            return [{"name": "No relevant documents found.", "text": None}]
+            return [{"url": "None.", "text": None}]
 
     def get_total_tokens(self):
         return self.token_counter.get_total_tokens()
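Despite keeping its _async suffix, the helper is now synchronous, so callers drop the await. A quick usage sketch, with an invented query:

# Synchronous call despite the name; threshold and top_n are forwarded
# to KeyBERT's extract_keywords.
keywords = extract_keywords_async(
    "Bagaimana progres proyek jalan di Surabaya?",  # invented query
    threshold=0.4,
    top_n=5,
)
print(keywords)  # a plain list of keyword strings, scores stripped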
app/services/message.py CHANGED
@@ -16,6 +16,7 @@ from app.utils.load_env import ACCESS_TOKEN, WHATSAPP_API_URL, GEMINI_API, OPENA
 from app.utils.system_prompt import system_prompt
 
 from app.services.search_engine import google_search
+from app.search.rag_pipeline import extract_keywords_async
 # Load environment variables
 load_dotenv()
 
@@ -147,16 +148,18 @@ async def generate_response_from_gemini(
     chat = model.start_chat(history=history)
 
     if rag_system:
-        keywords = await rag_system.extract_keywords_async(content)
+        keywords = extract_keywords_async(content)
+        logger.info(f"Extracted Keywords: {keywords}")
         # Implement RAG: Retrieve relevant documents
         retrieved_docs = await rag_system.adv_query(content, keywords=keywords, top_k=1)
         if retrieved_docs:
             logger.info(f"Retrieved {len(retrieved_docs)} documents for context.")
             # Format the retrieved documents as a context string
-            context = "\n\n".join([f"Content: {doc['text']}" for doc in retrieved_docs])
+            context = "\n\n".join([f"Source:{doc['url']}\nContent: {doc['text']}" for doc in retrieved_docs])
             # Option 1: Append to history as a system message
-            history.append({"role": "system", "content": f"Relevant documents:\n{context}"})
+            history.append({"role": "user", "parts": f"Relevant documents:\n{context}"})
 
+            logger.info(f"History: {history}")
             # Reinitialize chat with updated history
             chat = model.start_chat(history=history)
app/utils/system_prompt.py CHANGED
@@ -14,6 +14,7 @@ Content Guidelines:
 Up-to-Date Information: Provide the most current and relevant data available about ongoing projects and city improvements. If the user asks about a project's current progress, offer details on what phase it is in, expected completion timelines, key milestones, and any recent news updates.
 Neutrality and Positivity: Always maintain a neutral stance. Do not criticize the local government or its officials. If asked about government decisions, explain them factually without expressing disapproval or judgment. Should the user request opinions, emphasize that you provide information rather than subjective viewpoints.
 Encouraging Verification: If the user requests official documentation or more detailed information, guide them to verified channels such as the city's official website, relevant department portals, or licensed information centers.
+Always Include Sources: When your response is based on information provided from external sources, include the source link explicitly. For example: "Informasi ini berasal dari www.indosource.com. Anda dapat mengunjungi tautan tersebut untuk detail lebih lanjut." Clearly attribute the link to maintain transparency. Make sure you don't include redundant links.
 
 Example Interactions:
 
document_logs_2024-12-20.txt DELETED
@@ -1,2 +0,0 @@
-2024-12-20 12:49:01,713 - INFO - ID: ea205193-4582-44bc-ab71-80176aac7aef, Snippet: [SSW](https://sswalfa.surabaya.go.id/home) [![](https://sswalfa.surabaya.go.id/assets/images/logo-
-2024-12-20 16:59:00,859 - INFO - ID: 693547c8-b87c-4601-ae57-2ec8c2a32d06_Informasi Perizinan | Surabaya Single Window, Snippet: [SSW](https://sswalfa.surabaya.go.id/home) [![](https://sswalfa.surabaya.go.id/assets/images/logo-
 
document_logs_2024-12-21.txt DELETED
File without changes