Daniel Foley committed on
Commit
8547e22
·
1 Parent(s): 48ae0fa

accidentally removed extract_text_from_json

Browse files
Files changed (1) hide show
  1. RAG.py +19 -1
RAG.py CHANGED
@@ -14,6 +14,7 @@ import requests
14
  from typing import Dict, Any, Optional, List, Tuple
15
  import logging
16
  import concurrent.futures
 
17
 
18
  def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[List[Document], List[float]]:
19
  start = time.time()
@@ -47,6 +48,22 @@ def safe_get_json(url: str) -> Optional[Dict]:
47
  logging.error(f"Error fetching from {url}: {str(e)}")
48
  return None
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def process_single_document(doc: Document) -> Optional[Document]:
51
  """Process a single document by fetching and extracting metadata."""
52
  if not doc.metadata.get('source'):
@@ -68,7 +85,7 @@ def process_single_document(doc: Document) -> Optional[Document]:
68
  )
69
  return None
70
 
71
- def rerank(documents: List[Document], query: str, max_workers: int = 2) -> List[Document]:
72
  """Ingest more metadata and rerank documents using BM25 with parallel processing."""
73
  start = time.time()
74
  if not documents:
@@ -103,6 +120,7 @@ def rerank(documents: List[Document], query: str, max_workers: int = 2) -> List[
103
  logging.info(f"Finished reranking: {time.time()-start}")
104
  return full_docs
105
 
 
106
  def parse_xml_and_query(query:str,xml_string:str) -> str:
107
  """parse xml and return rephrased query"""
108
  if not xml_string:
 
14
  from typing import Dict, Any, Optional, List, Tuple
15
  import logging
16
  import concurrent.futures
17
+ import json
18
 
19
  def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[List[Document], List[float]]:
20
  start = time.time()
 
48
  logging.error(f"Error fetching from {url}: {str(e)}")
49
  return None
50
 
51
def extract_text_from_json(json_data: Dict) -> str:
    """Extract searchable text content from a JSON API response.

    Concatenates a fixed set of descriptive metadata fields found under
    ``json_data['data']['attributes']`` into a single space-joined string.

    Args:
        json_data: Parsed JSON response. Expected shape (per the fields read
            below) is ``{"data": {"attributes": {...}}}``; missing or
            malformed levels are tolerated and treated as empty.

    Returns:
        The joined field values, ``""`` for a falsy/empty input, or the
        sentinel ``"No content available"`` when none of the expected
        fields carry a truthy value.
    """
    if not json_data:
        return ""

    # Navigate defensively: a response lacking 'data' or 'attributes'
    # previously raised KeyError here; treat it as "no content" instead.
    attributes = json_data.get('data', {}).get('attributes', {})

    # Metadata fields worth indexing (title, abstract, subjects, genres, dates).
    text_fields = ["title_info_primary_tsi", "abstract_tsi", "subject_geographic_sim",
                   "genre_basic_ssim", "genre_specific_ssim", "date_tsim"]

    text_parts = []
    for field in text_fields:
        value = attributes.get(field)
        # Truthy check skips None, "", empty lists — matching the original filter.
        if value:
            text_parts.append(str(value))

    return " ".join(text_parts) if text_parts else "No content available"
67
  def process_single_document(doc: Document) -> Optional[Document]:
68
  """Process a single document by fetching and extracting metadata."""
69
  if not doc.metadata.get('source'):
 
85
  )
86
  return None
87
 
88
+ def rerank(documents: List[Document], query: str, max_workers: int = 3) -> List[Document]:
89
  """Ingest more metadata and rerank documents using BM25 with parallel processing."""
90
  start = time.time()
91
  if not documents:
 
120
  logging.info(f"Finished reranking: {time.time()-start}")
121
  return full_docs
122
 
123
+
124
  def parse_xml_and_query(query:str,xml_string:str) -> str:
125
  """parse xml and return rephrased query"""
126
  if not xml_string: