Spaces:
Running
Running
Daniel Foley
committed on
Commit
·
8547e22
1
Parent(s):
48ae0fa
accidentally removed extract_text_from_json
Browse files
RAG.py
CHANGED
@@ -14,6 +14,7 @@ import requests
|
|
14 |
from typing import Dict, Any, Optional, List, Tuple
|
15 |
import logging
|
16 |
import concurrent.futures
|
|
|
17 |
|
18 |
def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[List[Document], List[float]]:
|
19 |
start = time.time()
|
@@ -47,6 +48,22 @@ def safe_get_json(url: str) -> Optional[Dict]:
|
|
47 |
logging.error(f"Error fetching from {url}: {str(e)}")
|
48 |
return None
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
def process_single_document(doc: Document) -> Optional[Document]:
|
51 |
"""Process a single document by fetching and extracting metadata."""
|
52 |
if not doc.metadata.get('source'):
|
@@ -68,7 +85,7 @@ def process_single_document(doc: Document) -> Optional[Document]:
|
|
68 |
)
|
69 |
return None
|
70 |
|
71 |
-
def rerank(documents: List[Document], query: str, max_workers: int =
|
72 |
"""Ingest more metadata and rerank documents using BM25 with parallel processing."""
|
73 |
start = time.time()
|
74 |
if not documents:
|
@@ -103,6 +120,7 @@ def rerank(documents: List[Document], query: str, max_workers: int = 2) -> List[
|
|
103 |
logging.info(f"Finished reranking: {time.time()-start}")
|
104 |
return full_docs
|
105 |
|
|
|
106 |
def parse_xml_and_query(query:str,xml_string:str) -> str:
|
107 |
"""parse xml and return rephrased query"""
|
108 |
if not xml_string:
|
|
|
14 |
from typing import Dict, Any, Optional, List, Tuple
|
15 |
import logging
|
16 |
import concurrent.futures
|
17 |
+
import json
|
18 |
|
19 |
def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[List[Document], List[float]]:
|
20 |
start = time.time()
|
|
|
48 |
logging.error(f"Error fetching from {url}: {str(e)}")
|
49 |
return None
|
50 |
|
51 |
+
def extract_text_from_json(json_data: Dict) -> str:
    """Extract text content from a JSON API response.

    Reads a fixed set of descriptive fields from the JSON:API-style
    envelope ``json_data['data']['attributes']`` and joins their values
    into one space-separated string (used as document text for BM25
    reranking).

    Args:
        json_data: Decoded JSON response; may be None/empty when the
            upstream fetch failed (see safe_get_json).

    Returns:
        Concatenated field values, "" for a falsy payload, or
        "No content available" when none of the fields are present.
    """
    if not json_data:
        return ""

    # Tolerate payloads missing the 'data'/'attributes' envelope instead
    # of raising KeyError; an empty dict yields the "no content" path.
    attributes = json_data.get('data', {}).get('attributes', {})

    # Direct text fields worth extracting; order is fixed so the joined
    # output is deterministic.
    text_fields = [
        "title_info_primary_tsi",
        "abstract_tsi",
        "subject_geographic_sim",
        "genre_basic_ssim",
        "genre_specific_ssim",
        "date_tsim",
    ]
    # Keep only fields that are present AND truthy, matching the original
    # `field in attrs and attrs[field]` check.
    text_parts = [str(attributes[field])
                  for field in text_fields if attributes.get(field)]

    return " ".join(text_parts) if text_parts else "No content available"
|
66 |
+
|
67 |
def process_single_document(doc: Document) -> Optional[Document]:
|
68 |
"""Process a single document by fetching and extracting metadata."""
|
69 |
if not doc.metadata.get('source'):
|
|
|
85 |
)
|
86 |
return None
|
87 |
|
88 |
+
def rerank(documents: List[Document], query: str, max_workers: int = 3) -> List[Document]:
|
89 |
"""Ingest more metadata and rerank documents using BM25 with parallel processing."""
|
90 |
start = time.time()
|
91 |
if not documents:
|
|
|
120 |
logging.info(f"Finished reranking: {time.time()-start}")
|
121 |
return full_docs
|
122 |
|
123 |
+
|
124 |
def parse_xml_and_query(query:str,xml_string:str) -> str:
|
125 |
"""parse xml and return rephrased query"""
|
126 |
if not xml_string:
|