Daniel Foley committed on
Commit
26a5c64
·
1 Parent(s): 89e7a42

Added query alignment to retrieval

Browse files
Files changed (1) hide show
  1. RAG.py +48 -6
RAG.py CHANGED
@@ -87,6 +87,19 @@ def rerank(documents: List[Document], query: str) -> List[Document]:
87
  reranked_docs = reranker.invoke(query)
88
  return reranked_docs
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def parse_xml_and_check(xml_string: str) -> str:
91
  """Parse XML-style tags and handle validation."""
92
  if not xml_string:
@@ -105,12 +118,42 @@ def RAG(llm: Any, query: str, index_name: str, embeddings: Any, top: int = 10, k
105
  """Main RAG function with improved error handling and validation."""
106
  try:
107
  # Retrieve initial documents
108
- retrieved, _ = retrieve(index_name=index_name, query=query, embeddings=embeddings, k=k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  if not retrieved:
110
  return "No documents found for your query.", []
111
 
112
  # Rerank documents
113
- reranked = rerank(documents=retrieved, query=query)
114
  if not reranked:
115
  return "Unable to process the retrieved documents.", []
116
 
@@ -120,7 +163,7 @@ def RAG(llm: Any, query: str, index_name: str, embeddings: Any, top: int = 10, k
120
  return "No relevant content found in the documents.", []
121
 
122
  # Prepare prompt
123
- prompt_template = PromptTemplate.from_template(
124
  """Pretend you are a professional librarian. Please Summarize The Following Context as though you had retrieved it for a patron:
125
  Context:{context}
126
  Make sure to answer in the following format
@@ -143,9 +186,8 @@ def RAG(llm: Any, query: str, index_name: str, embeddings: Any, top: int = 10, k
143
  )
144
 
145
  # Generate response
146
- prompt = prompt_template.invoke({"context": context, "query": query})
147
- print(prompt)
148
- response = llm.invoke(prompt)
149
 
150
  # Parse and return response
151
  parsed = parse_xml_and_check(response.content)
 
87
  reranked_docs = reranker.invoke(query)
88
  return reranked_docs
89
 
90
def parse_xml_and_query(query: str, xml_string: str) -> str:
    """Parse XML-style tags from an LLM response and return the rephrased query.

    The LLM is expected to emit <STATEMENT>...</STATEMENT> and
    <VALID>YES|NO</VALID> tags. Falls back to the original query whenever
    the response is empty, marked invalid, or missing a statement, so the
    caller always receives usable search text for retrieval.

    Args:
        query: The user's original query, used as the fallback.
        xml_string: Raw LLM response text containing XML-style tags.

    Returns:
        The rephrased query statement, or the original query on failure.
    """
    # Bug fix: an empty LLM response previously returned the literal
    # "No response generated.", which the caller would then use as the
    # retrieval query. Fall back to the original query instead, matching
    # the VALID == 'NO' path below.
    if not xml_string:
        return query

    # Capture <TAG>content</TAG> pairs; DOTALL lets content span lines.
    pattern = r"<(\w+)>(.*?)</\1>"
    matches = re.findall(pattern, xml_string, re.DOTALL)
    parsed_response = dict(matches)

    # The model judged its own rephrasing invalid -> keep the original query.
    if parsed_response.get('VALID') == 'NO':
        return query
    return parsed_response.get('STATEMENT', query)
101
+
102
+
103
  def parse_xml_and_check(xml_string: str) -> str:
104
  """Parse XML-style tags and handle validation."""
105
  if not xml_string:
 
118
  """Main RAG function with improved error handling and validation."""
119
  try:
120
  # Retrieve initial documents
121
+ query_template = PromptTemplate.from_template(
122
+ """
123
+ Your job is to think about a query and then generate a statement that only includes information from the query that would answer the query.
124
+ You will be provided with a query in <QUERY></QUERY> tags.
125
+ Then you will think about what kind of information the query is looking for between <REASONING></REASONING> tags.
126
+ Then, based on the reasoning, you will generate a sample response to the query that only includes information from the query between <STATEMENT></STATEMENT> tags.
127
+ Afterwards, you will determine and reason about whether or not the statement you generated only includes information from the original query and would answer the query between <DETERMINATION></DETERMINATION> tags.
128
+ Finally, you will return a YES, or NO response between <VALID></VALID> tags based on whether or not you determined the statment to be valid.
129
+ Let me provide you with an exmaple:
130
+
131
+ <QUERY>I would really like to learn more about Bermudan geography<QUERY>
132
+
133
+ <REASONING>This query is interested in geograph as it relates to Bermuda. Some things they might be interested in are Bermudan climate, towns, cities, and geography</REASONING>
134
+
135
+ <STATEMENT>Bermuda's Climate is [blank]. Some of Bermuda's cities and towns are [blank]. Other points of interested about Bermuda's geography are [blank].</STATEMENT>
136
+
137
+ <DETERMINATION>The query originally only mentions bermuda and geography. The answers do not provide any false information, instead replacing meaningful responses with a placeholder [blank]. If it had hallucinated, it would not be valid. Because the statements do not hallucinate anything, this is a valid statement.</DETERMINATION>
138
+
139
+ <VALID>YES</VALID>
140
+
141
+ Now it's your turn! Remember not to hallucinate:
142
+
143
+ <QUERY>{query}</QUERY>
144
+ """
145
+
146
+ )
147
+ query_prompt = query_template.invoke({"query":query})
148
+ query_response = llm.invoke(query_prompt)
149
+ new_query = parse_xml_and_query(query_response.content)
150
+
151
+ retrieved, _ = retrieve(index_name=index_name, query=new_query, embeddings=embeddings, k=k)
152
  if not retrieved:
153
  return "No documents found for your query.", []
154
 
155
  # Rerank documents
156
+ reranked = rerank(documents=retrieved, query=new_query)
157
  if not reranked:
158
  return "Unable to process the retrieved documents.", []
159
 
 
163
  return "No relevant content found in the documents.", []
164
 
165
  # Prepare prompt
166
+ answer_template = PromptTemplate.from_template(
167
  """Pretend you are a professional librarian. Please Summarize The Following Context as though you had retrieved it for a patron:
168
  Context:{context}
169
  Make sure to answer in the following format
 
186
  )
187
 
188
  # Generate response
189
+ ans_prompt = answer_template.invoke({"context": context, "query": query})
190
+ response = llm.invoke(ans_prompt)
 
191
 
192
  # Parse and return response
193
  parsed = parse_xml_and_check(response.content)