Daniel Foley committed on
Commit
2b516d5
·
1 Parent(s): 5c86e88

reverted back to pre-parallel version

Browse files
Files changed (1) hide show
  1. RAG.py +18 -43
RAG.py CHANGED
@@ -12,10 +12,10 @@ from langchain_core.documents import Document
12
  from langchain_community.retrievers import BM25Retriever
13
  import requests
14
  from typing import Dict, Any, Optional, List, Tuple
 
15
  import logging
16
- import concurrent.futures
17
 
18
- def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[List[Document], List[float]]:
19
  start = time.time()
20
  # pinecone_api_key = os.getenv("PINECONE_API_KEY")
21
  # pc = Pinecone(api_key=pinecone_api_key)
@@ -39,6 +39,7 @@ def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[
39
 
40
  def safe_get_json(url: str) -> Optional[Dict]:
41
  """Safely fetch and parse JSON from a URL."""
 
42
  try:
43
  response = requests.get(url, timeout=10)
44
  response.raise_for_status()
@@ -63,52 +64,26 @@ def extract_text_from_json(json_data: Dict) -> str:
63
 
64
  return " ".join(text_parts) if text_parts else "No content available"
65
 
66
- def process_single_document(doc: Document) -> Optional[Document]:
67
- """Process a single document by fetching and extracting metadata."""
68
- if not doc.metadata.get('source'):
69
- return None
70
-
71
- url = f"https://www.digitalcommonwealth.org/search/{doc.metadata['source']}"
72
- json_data = safe_get_json(f"{url}.json")
73
-
74
- if json_data:
75
- text_content = extract_text_from_json(json_data)
76
- if text_content:
77
- return Document(
78
- page_content=text_content,
79
- metadata={
80
- "source": doc.metadata['source'],
81
- "field": doc.metadata['field'],
82
- "URL": url
83
- }
84
- )
85
- return None
86
-
87
- def rerank(documents: List[Document], query: str, max_workers: int = 1) -> List[Document]:
88
- """Ingest more metadata and rerank documents using BM25 with parallel processing."""
89
  start = time.time()
90
  if not documents:
91
  return []
92
 
93
- meta_start = time.time()
94
  full_docs = []
95
-
96
- # Process documents in parallel using ThreadPoolExecutor
97
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
98
- # Submit all document processing tasks
99
- future_to_doc = {
100
- executor.submit(process_single_document, doc): doc
101
- for doc in documents
102
- }
103
 
104
- # Collect results as they complete
105
- for future in concurrent.futures.as_completed(future_to_doc):
106
- processed_doc = future.result()
107
- if processed_doc:extract_text_from_json():
108
- full_docs.append(processed_doc)
109
-
110
  logging.info(f"Took {time.time()-meta_start} seconds to retrieve all metadata")
111
-
112
  # If no valid documents were processed, return empty list
113
  if not full_docs:
114
  return []
@@ -117,7 +92,7 @@ def rerank(documents: List[Document], query: str, max_workers: int = 1) -> List[
117
  reranker = BM25Retriever.from_documents(full_docs, k=min(10, len(full_docs)))
118
  reranked_docs = reranker.invoke(query)
119
  logging.info(f"Finished reranking: {time.time()-start}")
120
- return full_docs
121
 
122
  def parse_xml_and_query(query:str,xml_string:str) -> str:
123
  """parse xml and return rephrased query"""
@@ -203,7 +178,7 @@ def RAG(llm: Any, query: str,vectorstore:PineconeVectorStore, top: int = 10, k:
203
  First, reason about the answer between <REASONING></REASONING> headers,
204
  based on the context determine if there is sufficient material for answering the exact question,
205
  return either <VALID>YES</VALID> or <VALID>NO</VALID>
206
- then return a response between <RESPONSE></RESPONSE> headers, your response should be well formatted and an individual summary of each piece of relevant context:
207
  Here is an example
208
  <EXAMPLE>
209
  <QUERY>Are pineapples a good fuel for cars?</QUERY>
 
12
  from langchain_community.retrievers import BM25Retriever
13
  import requests
14
  from typing import Dict, Any, Optional, List, Tuple
15
+ import json
16
  import logging
 
17
 
18
+ def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 1000) -> Tuple[List[Document], List[float]]:
19
  start = time.time()
20
  # pinecone_api_key = os.getenv("PINECONE_API_KEY")
21
  # pc = Pinecone(api_key=pinecone_api_key)
 
39
 
40
  def safe_get_json(url: str) -> Optional[Dict]:
41
  """Safely fetch and parse JSON from a URL."""
42
+ print("Fetching JSON")
43
  try:
44
  response = requests.get(url, timeout=10)
45
  response.raise_for_status()
 
64
 
65
  return " ".join(text_parts) if text_parts else "No content available"
66
 
67
+ def rerank(documents: List[Document], query: str) -> List[Document]:
68
+ """Ingest more metadata. Rerank documents using BM25"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  start = time.time()
70
  if not documents:
71
  return []
72
 
 
73
  full_docs = []
74
+ meta_start = time.time()
75
+ for doc in documents:
76
+ if not doc.metadata.get('source'):
77
+ continue
78
+
79
+ url = f"https://www.digitalcommonwealth.org/search/{doc.metadata['source']}"
80
+ json_data = safe_get_json(f"{url}.json")
 
81
 
82
+ if json_data:
83
+ text_content = extract_text_from_json(json_data)
84
+ if text_content: # Only add documents with actual content
85
+ full_docs.append(Document(page_content=text_content, metadata={"source":doc.metadata['source'],"field":doc.metadata['field'],"URL":url}))
 
 
86
  logging.info(f"Took {time.time()-meta_start} seconds to retrieve all metadata")
 
87
  # If no valid documents were processed, return empty list
88
  if not full_docs:
89
  return []
 
92
  reranker = BM25Retriever.from_documents(full_docs, k=min(10, len(full_docs)))
93
  reranked_docs = reranker.invoke(query)
94
  logging.info(f"Finished reranking: {time.time()-start}")
95
+ return reranked_docs
96
 
97
  def parse_xml_and_query(query:str,xml_string:str) -> str:
98
  """parse xml and return rephrased query"""
 
178
  First, reason about the answer between <REASONING></REASONING> headers,
179
  based on the context determine if there is sufficient material for answering the exact question,
180
  return either <VALID>YES</VALID> or <VALID>NO</VALID>
181
+ then return a response between <RESPONSE></RESPONSE> headers:
182
  Here is an example
183
  <EXAMPLE>
184
  <QUERY>Are pineapples a good fuel for cars?</QUERY>