Spaces:
Running
Running
Daniel Foley
committed on
Commit
·
26a5c64
1
Parent(s):
89e7a42
Added query alignment to retrieval
Browse files
RAG.py
CHANGED
@@ -87,6 +87,19 @@ def rerank(documents: List[Document], query: str) -> List[Document]:
|
|
87 |
reranked_docs = reranker.invoke(query)
|
88 |
return reranked_docs
|
89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
def parse_xml_and_check(xml_string: str) -> str:
|
91 |
"""Parse XML-style tags and handle validation."""
|
92 |
if not xml_string:
|
@@ -105,12 +118,42 @@ def RAG(llm: Any, query: str, index_name: str, embeddings: Any, top: int = 10, k
|
|
105 |
"""Main RAG function with improved error handling and validation."""
|
106 |
try:
|
107 |
# Retrieve initial documents
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
if not retrieved:
|
110 |
return "No documents found for your query.", []
|
111 |
|
112 |
# Rerank documents
|
113 |
-
reranked = rerank(documents=retrieved, query=
|
114 |
if not reranked:
|
115 |
return "Unable to process the retrieved documents.", []
|
116 |
|
@@ -120,7 +163,7 @@ def RAG(llm: Any, query: str, index_name: str, embeddings: Any, top: int = 10, k
|
|
120 |
return "No relevant content found in the documents.", []
|
121 |
|
122 |
# Prepare prompt
|
123 |
-
|
124 |
"""Pretend you are a professional librarian. Please Summarize The Following Context as though you had retrieved it for a patron:
|
125 |
Context:{context}
|
126 |
Make sure to answer in the following format
|
@@ -143,9 +186,8 @@ def RAG(llm: Any, query: str, index_name: str, embeddings: Any, top: int = 10, k
|
|
143 |
)
|
144 |
|
145 |
# Generate response
|
146 |
-
|
147 |
-
|
148 |
-
response = llm.invoke(prompt)
|
149 |
|
150 |
# Parse and return response
|
151 |
parsed = parse_xml_and_check(response.content)
|
|
|
87 |
reranked_docs = reranker.invoke(query)
|
88 |
return reranked_docs
|
89 |
|
90 |
+
def parse_xml_and_query(query: str, xml_string: str) -> str:
    """Extract the rephrased query from an XML-tagged LLM response.

    The response is scanned for ``<TAG>...</TAG>`` pairs. The text inside
    ``<STATEMENT>`` is returned as the rephrased query unless the model
    marked its own output as invalid with ``<VALID>NO</VALID>``.

    Args:
        query: The user's original query. It is the fallback value, so
            callers must always pass it as the first argument.
        xml_string: Raw text of the LLM response; may be empty or ``None``.

    Returns:
        The stripped ``<STATEMENT>`` text, or ``query`` unchanged when the
        response is empty, has no usable ``<STATEMENT>``, or is flagged
        ``<VALID>NO</VALID>``.
    """
    # The return value is used as a search query, so an empty response must
    # fall back to the original query rather than to an error message.
    if not xml_string:
        return query

    pattern = r"<(\w+)>(.*?)</\1>"
    matches = re.findall(pattern, xml_string, re.DOTALL)
    # If a tag appears more than once, the last occurrence wins.
    parsed_response = dict(matches)

    # Models often pad tag contents with whitespace or vary the case, so
    # normalise the verdict before comparing.
    verdict = parsed_response.get("VALID", "").strip().upper()
    if verdict == "NO":
        return query

    # A missing or blank statement is useless as a query; keep the original.
    statement = parsed_response.get("STATEMENT", "").strip()
    return statement or query
|
101 |
+
|
102 |
+
|
103 |
def parse_xml_and_check(xml_string: str) -> str:
|
104 |
"""Parse XML-style tags and handle validation."""
|
105 |
if not xml_string:
|
|
|
118 |
"""Main RAG function with improved error handling and validation."""
|
119 |
try:
|
120 |
# Retrieve initial documents
|
121 |
+
query_template = PromptTemplate.from_template(
|
122 |
+
"""
|
123 |
+
Your job is to think about a query and then generate a statement that only includes information from the query that would answer the query.
|
124 |
+
You will be provided with a query in <QUERY></QUERY> tags.
|
125 |
+
Then you will think about what kind of information the query is looking for between <REASONING></REASONING> tags.
|
126 |
+
Then, based on the reasoning, you will generate a sample response to the query that only includes information from the query between <STATEMENT></STATEMENT> tags.
|
127 |
+
Afterwards, you will determine and reason about whether or not the statement you generated only includes information from the original query and would answer the query between <DETERMINATION></DETERMINATION> tags.
|
128 |
+
Finally, you will return a YES or NO response between <VALID></VALID> tags based on whether or not you determined the statement to be valid.
|
129 |
+
Let me provide you with an example:
|
130 |
+
|
131 |
+
<QUERY>I would really like to learn more about Bermudan geography</QUERY>
|
132 |
+
|
133 |
+
<REASONING>This query is interested in geography as it relates to Bermuda. Some things they might be interested in are Bermudan climate, towns, cities, and geography</REASONING>
|
134 |
+
|
135 |
+
<STATEMENT>Bermuda's Climate is [blank]. Some of Bermuda's cities and towns are [blank]. Other points of interest about Bermuda's geography are [blank].</STATEMENT>
|
136 |
+
|
137 |
+
<DETERMINATION>The query originally only mentions bermuda and geography. The answers do not provide any false information, instead replacing meaningful responses with a placeholder [blank]. If it had hallucinated, it would not be valid. Because the statements do not hallucinate anything, this is a valid statement.</DETERMINATION>
|
138 |
+
|
139 |
+
<VALID>YES</VALID>
|
140 |
+
|
141 |
+
Now it's your turn! Remember not to hallucinate:
|
142 |
+
|
143 |
+
<QUERY>{query}</QUERY>
|
144 |
+
"""
|
145 |
+
|
146 |
+
)
|
147 |
+
query_prompt = query_template.invoke({"query":query})
|
148 |
+
query_response = llm.invoke(query_prompt)
|
149 |
+
new_query = parse_xml_and_query(query_response.content)
|
150 |
+
|
151 |
+
retrieved, _ = retrieve(index_name=index_name, query=new_query, embeddings=embeddings, k=k)
|
152 |
if not retrieved:
|
153 |
return "No documents found for your query.", []
|
154 |
|
155 |
# Rerank documents
|
156 |
+
reranked = rerank(documents=retrieved, query=new_query)
|
157 |
if not reranked:
|
158 |
return "Unable to process the retrieved documents.", []
|
159 |
|
|
|
163 |
return "No relevant content found in the documents.", []
|
164 |
|
165 |
# Prepare prompt
|
166 |
+
answer_template = PromptTemplate.from_template(
|
167 |
"""Pretend you are a professional librarian. Please Summarize The Following Context as though you had retrieved it for a patron:
|
168 |
Context:{context}
|
169 |
Make sure to answer in the following format
|
|
|
186 |
)
|
187 |
|
188 |
# Generate response
|
189 |
+
ans_prompt = answer_template.invoke({"context": context, "query": query})
|
190 |
+
response = llm.invoke(ans_prompt)
|
|
|
191 |
|
192 |
# Parse and return response
|
193 |
parsed = parse_xml_and_check(response.content)
|