File size: 10,287 Bytes
b296661
 
 
 
 
 
 
 
 
 
 
 
 
 
2b516d5
b296661
1d172f1
2b516d5
83b3fa4
16f0715
 
b296661
16f0715
 
 
b296661
 
 
 
 
 
07f86c6
 
 
b296661
 
ea5cbee
b296661
 
 
 
2b516d5
b296661
 
 
 
 
 
 
 
8547e22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b516d5
 
bfbadc3
b296661
 
 
48ae0fa
2b516d5
 
 
 
 
 
 
b296661
2b516d5
 
 
 
3e602d9
b296661
 
 
 
 
 
 
ea5cbee
2b516d5
8547e22
26a5c64
 
 
 
 
 
 
 
 
 
 
 
 
b296661
 
 
 
 
 
 
 
 
 
4ef2f39
b296661
 
 
ae94932
b296661
bfbadc3
b296661
a9e136f
 
 
d1935d1
 
 
 
 
 
 
 
 
 
26a5c64
d1935d1
26a5c64
d1935d1
26a5c64
d1935d1
26a5c64
d1935d1
26a5c64
d1935d1
26a5c64
d1935d1
26a5c64
d1935d1
 
 
 
 
 
ca4996e
26a5c64
d1935d1
b296661
 
 
 
b609620
b296661
 
 
 
 
 
 
1ba93a0
b296661
26a5c64
b296661
 
 
 
 
 
2b516d5
b296661
 
 
 
 
 
1d172f1
b296661
 
 
 
 
 
 
 
26a5c64
 
b296661
 
 
ca4996e
 
b296661
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import getpass
import os
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import re
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
import requests
from typing import Dict, Any, Optional, List, Tuple
import json
import logging

def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 1000) -> Tuple[List[Document], List[float]]:    
    start = time.time()
    # pinecone_api_key = os.getenv("PINECONE_API_KEY")
    # pc = Pinecone(api_key=pinecone_api_key)
    
    # index = pc.Index(index_name)
    # vector_store = PineconeVectorStore(index=index, embedding=embeddings)
    results = vectorstore.similarity_search_with_score(
        query,
        k=k,
    )
    documents = []
    scores = []
    for res, score in results:
        # check to make sure response isnt too long for context window of 4o-mini
        if len(res.page_content) > 4000:
            res.page_content = res.page_content[:4000]
        documents.append(res)
        scores.append(score)
    logging.info(f"Finished Retrieval: {time.time() - start}")
    return documents, scores

def safe_get_json(url: str) -> Optional[Dict]:
    """Safely fetch and parse JSON from a URL."""
    print("Fetching JSON")
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logging.error(f"Error fetching from {url}: {str(e)}")
        return None

def extract_text_from_json(json_data: Dict) -> str:
    """Extract text content from JSON response."""
    if not json_data:
        return ""
    
    text_parts = []
    
    # Handle direct text fields
    text_fields = ["title_info_primary_tsi","abstract_tsi","subject_geographic_sim","genre_basic_ssim","genre_specific_ssim","date_tsim"]
    for field in text_fields:
        if field in json_data['data']['attributes'] and json_data['data']['attributes'][field]:
            # print(json_data[field])
            text_parts.append(str(json_data['data']['attributes'][field]))
    
    return " ".join(text_parts) if text_parts else "No content available"

def rerank(documents: List[Document], query: str) -> List[Document]:
    """Ingest more metadata. Rerank documents using BM25"""
    start = time.time()
    if not documents:
        return []
    
    full_docs = []
    meta_start = time.time()
    for doc in documents:
        if not doc.metadata.get('source'):
            continue
            
        url = f"https://www.digitalcommonwealth.org/search/{doc.metadata['source']}"
        json_data = safe_get_json(f"{url}.json")
        
        if json_data:
            text_content = extract_text_from_json(json_data)
            if text_content:  # Only add documents with actual content
                full_docs.append(Document(page_content=text_content, metadata={"source":doc.metadata['source'],"field":doc.metadata['field'],"URL":url}))
    logging.info(f"Took {time.time()-meta_start} seconds to retrieve all metadata")
    # If no valid documents were processed, return empty list
    if not full_docs:
        return []
    
    # Create BM25 retriever with the processed documents
    reranker = BM25Retriever.from_documents(full_docs, k=min(10, len(full_docs)))
    reranked_docs = reranker.invoke(query)
    logging.info(f"Finished reranking: {time.time()-start}")
    return reranked_docs

def parse_xml_and_query(query:str,xml_string:str) -> str:
    """parse xml and return rephrased query"""
    if not xml_string:
        return "No response generated."
    
    pattern = r"<(\w+)>(.*?)</\1>"
    matches = re.findall(pattern, xml_string, re.DOTALL)
    parsed_response = dict(matches)
    if parsed_response.get('VALID') == 'NO':
        return query
    return parsed_response.get('STATEMENT', query)


def parse_xml_and_check(xml_string: str) -> str:
    """Parse XML-style tags and handle validation."""
    if not xml_string:
        return "No response generated."
    
    pattern = r"<(\w+)>(.*?)</\1>"
    matches = re.findall(pattern, xml_string, re.DOTALL)
    parsed_response = dict(matches)
    
    if parsed_response.get('VALID') == 'NO':
        return "Sorry, I was unable to find any documents for your query.\n\n Here are some documents I found that might be relevant."
    
    return parsed_response.get('RESPONSE', "No response found in the output")

def RAG(llm: Any, query: str,vectorstore:PineconeVectorStore, top: int = 10, k: int = 100) -> Tuple[str, List[Document]]:
    """Main RAG function with improved error handling and validation."""
    start = time.time()
    try:

        # Query alignment is commented our, however I have decided to leave it in for potential future use.

        # Retrieve initial documents using rephrased query -- not working as intended currently, maybe would be better for data with more words.
        # query_template = PromptTemplate.from_template(
        #     """
        #     Your job is to think about a query and then generate a statement that only includes information from the query that would answer the query.
        #     You will be provided with a query in <QUERY></QUERY> tags. 
        #     Then you will think about what kind of information the query is looking for between <REASONING></REASONING> tags.
        #     Then, based on the reasoning, you will generate a sample response to the query that only includes information from the query between <STATEMENT></STATEMENT> tags.
        #     Afterwards, you will determine and reason about whether or not the statement you generated only includes information from the original query and would answer the query between <DETERMINATION></DETERMINATION> tags.
        #     Finally, you will return a YES, or NO response between <VALID></VALID> tags based on whether or not you determined the statment to be valid.
        #     Let me provide you with an exmaple:

        #     <QUERY>I would really like to learn more about Bermudan geography<QUERY>

        #     <REASONING>This query is interested in geograph as it relates to Bermuda. Some things they might be interested in are Bermudan climate, towns, cities, and geography</REASONING>

        #     <STATEMENT>Bermuda's Climate is [blank]. Some of Bermuda's cities and towns are [blank]. Other points of interested about Bermuda's geography are [blank].</STATEMENT>

        #     <DETERMINATION>The query originally only mentions bermuda and geography. The answers do not provide any false information, instead replacing meaningful responses with a placeholder [blank]. If it had hallucinated, it would not be valid. Because the statements do not hallucinate anything, this is a valid statement.</DETERMINATION>
            
        #     <VALID>YES</VALID>

        #     Now it's your turn! Remember not to hallucinate:

        #     <QUERY>{query}</QUERY>
        #     """
        # )
        # query_prompt = query_template.invoke({"query":query})
        # query_response = llm.invoke(query_prompt)
        # new_query = parse_xml_and_query(query=query,xml_string=query_response.content)
        logging.info(f"\n---\nQUERY: {query}")

        retrieved, _ = retrieve(query=query, vectorstore=vectorstore, k=k)
        if not retrieved:
            return "No documents found for your query.", []
        
        # Rerank documents
        reranked = rerank(documents=retrieved, query=query)
        if not reranked:
            return "Unable to process the retrieved documents.", []
        
        # Prepare context from reranked documents
        context = "\n\n".join(doc.page_content for doc in reranked[:top] if doc.page_content)
        if not context.strip():
            return "No relevant content found in the documents.", []
        # change for the sake of another commit
        # Prepare prompt
        answer_template = PromptTemplate.from_template(
            """Pretend you are a professional librarian. Please Summarize The Following Context as though you had retrieved it for a patron:

            Context:{context}

            Make sure to answer in the following format

            First, reason about the answer between <REASONING></REASONING> headers,

            based on the context determine if there is sufficient material for answering the exact question,

            return either <VALID>YES</VALID> or <VALID>NO</VALID>

            then return a response between <RESPONSE></RESPONSE> headers:

            Here is an example

            <EXAMPLE>

            <QUERY>Are pineapples a good fuel for cars?</QUERY>

            <CONTEXT>Cars use gasoline for fuel. Some cars use electricity for fuel.Tesla stock has increased by 10 percent over the last quarter.</CONTEXT>

            <REASONING>Based on the context pineapples have not been explored as a fuel for cars. The context discusses gasoline, electricity, and tesla stock, therefore it is not relevant to the query about pineapples for fuel</REASONING>

            <VALID>NO</VALID>

            <RESPONSE>Pineapples are not a good fuel for cars, however with further research they might be</RESPONSE> 

            </EXAMPLE>

            Now it's your turn 

            <QUERY>

            {query}

            </QUERY>"""
        )
        
        # Generate response
        ans_prompt = answer_template.invoke({"context": context, "query": query})
        response = llm.invoke(ans_prompt)
        
        # Parse and return response
        parsed = parse_xml_and_check(response.content)
        logging.info(f"RESPONSE: {parsed}\nRETRIEVED: {reranked}")
        logging.info(f"RAG Finished: {time.time()-start}\n---\n")
        return parsed, reranked
        
    except Exception as e:
        logging.error(f"Error in RAG function: {str(e)}")
        return f"An error occurred while processing your query: {str(e)}", []