Ahmad-Moiz committed
Commit 51e9885 · 1 Parent(s): ac77c53

Create app.py

Files changed (1)
  1. app.py +353 -0
app.py ADDED
@@ -0,0 +1,353 @@
+ import os
+ import json
+ import time
+ import pinecone
+ import pandas as pd
+ import altair as alt
+ import streamlit as st
+ from typing import List
+ from langchain.vectorstores import Pinecone
+ from langchain.llms import Anthropic
+ from langchain.chat_models import ChatOpenAI
+ from langchain.evaluation.qa import QAEvalChain
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain.retrievers.self_query.base import SelfQueryRetriever
+ from langchain.docstore.document import Document
+ from kor_retriever_lex import kor_retriever
+ from self_query_retriever_lex import metadata_field_info, document_content_description
+ from prompts import GRADE_DOCS_PROMPT, GRADE_ANSWER_PROMPT, GRADE_ANSWER_PROMPT_FAST, GRADE_ANSWER_PROMPT_BIAS_CHECK, GRADE_ANSWER_PROMPT_OPENAI, QA_CHAIN_PROMPT_LEX, QA_CHAIN_PROMPT_TRAVEL
+
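+ # Note: kor_retriever_lex, self_query_retriever_lex, and prompts are assumed to be
+ # repo-local helper modules shipped alongside app.py (they are not pip packages).
+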
+ # Keep dataframe in memory to accumulate experimental results
+ if "existing_df" not in st.session_state:
+     summary = pd.DataFrame(columns=['model',
+                                     'retriever',
+                                     'embedding',
+                                     'num_neighbors',
+                                     'Latency',
+                                     'Retrieval score',
+                                     'Answer score'])
+     st.session_state.existing_df = summary
+ else:
+     summary = st.session_state.existing_df
+
+ @st.cache_resource
+ def make_llm(model_version: str):
+     """
+     Make LLM from model version
+     @param model_version: model version ("gpt-3.5-turbo", "gpt-4", or "anthropic")
+     @return: LLM
+     """
+     if (model_version == "gpt-3.5-turbo") or (model_version == "gpt-4"):
+         chosen_model = ChatOpenAI(model_name=model_version, temperature=0)
+     elif model_version == "anthropic":
+         chosen_model = Anthropic(temperature=0)
+     else:
+         st.warning("Model version not recognized. Using gpt-3.5-turbo", icon="⚠")
+         chosen_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
+     return chosen_model
+
+ @st.cache_resource
+ def make_retriever(retriever_type, embedding_type, pc_api_key, pc_region, pc_index):
+     """
+     Make document retriever
+     @param retriever_type: retriever type
+     @param embedding_type: embedding type
+     @param pc_api_key: Pinecone API key
+     @param pc_region: Pinecone region
+     @param pc_index: Pinecone index name
+     @return: retriever (Pinecone vectorstore, SelfQueryRetriever, or Kor retriever)
+     """
+     st.info("Connecting to Pinecone ...")
+
+     # Retriever type (p is the module-level Pinecone vectorstore created in the main script below)
+     if retriever_type in ("Pinecone", "Pinecone w/ metadata filtering"):
+         return p
+     elif retriever_type == "Pinecone w/ self-querying":
+         return SelfQueryRetriever.from_llm(ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), p, document_content_description, metadata_field_info, verbose=True, k=10)
+     elif retriever_type == "Kor filtering":
+         return kor_retriever
+
+ def make_chain(llm):
+     """
+     Make retrieval QA chain
+     @param llm: LLM used for answer synthesis
+     @return: QA chain
+     """
+     qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=QA_CHAIN_PROMPT_LEX)
+     return qa_chain
+
+
+ def grade_model_answer(predicted_dataset: List, predictions: List, grade_answer_prompt: str) -> List:
+     """
+     Grades the model answer based on ground truth and model predictions.
+     @param predicted_dataset: A list of dictionaries containing ground truth questions and answers.
+     @param predictions: A list of dictionaries containing model predictions for the questions.
+     @param grade_answer_prompt: Grading prompt style: "Fast", "Descriptive", "Descriptive w/ bias check", or "OpenAI grading prompt".
+     @return: A list of grades for the answers.
+     """
+     # Grade the model answer
+     st.info("Grading model answer ...")
+     # Set the grading prompt based on the grade_answer_prompt parameter
+     if grade_answer_prompt == "Fast":
+         prompt = GRADE_ANSWER_PROMPT_FAST
+     elif grade_answer_prompt == "Descriptive w/ bias check":
+         prompt = GRADE_ANSWER_PROMPT_BIAS_CHECK
+     elif grade_answer_prompt == "OpenAI grading prompt":
+         prompt = GRADE_ANSWER_PROMPT_OPENAI
+     else:
+         prompt = GRADE_ANSWER_PROMPT
+
+     # Create an evaluation chain
+     eval_chain = QAEvalChain.from_llm(
+         llm=ChatOpenAI(model_name="gpt-4", temperature=0),
+         prompt=prompt
+     )
+
+     # Evaluate the predictions and ground truth using the evaluation chain
+     graded_outputs = eval_chain.evaluate(
+         predicted_dataset,
+         predictions,
+         question_key="question",
+         prediction_key="result"
+     )
+
+     return graded_outputs
+
+
+ def grade_model_retrieval(gt_dataset: List, predictions: List, grade_docs_prompt: str):
+     """
+     Grades the relevance of retrieved documents based on ground truth and model predictions.
+     @param gt_dataset: list of dictionaries containing ground truth questions and answers.
+     @param predictions: list of dictionaries containing the retrieved documents for the questions.
+     @param grade_docs_prompt: grading prompt style (currently unused; GRADE_DOCS_PROMPT is always applied).
+     @return: list of grades for the retrieved documents.
+     """
+     # Grade the docs retrieval
+     st.info("Grading relevance of retrieved docs ...")
+
+     # Grading prompt for document relevance (fixed to GRADE_DOCS_PROMPT)
+     prompt = GRADE_DOCS_PROMPT
+
+     # Create an evaluation chain
+     eval_chain = QAEvalChain.from_llm(
+         llm=ChatOpenAI(model_name="gpt-4", temperature=0),
+         prompt=prompt
+     )
+
+     # Evaluate the predictions and ground truth using the evaluation chain
+     graded_outputs = eval_chain.evaluate(
+         gt_dataset,
+         predictions,
+         question_key="question",
+         prediction_key="result"
+     )
+     return graded_outputs
+
+
+ def run_evaluation(chain, retriever, eval_set, grade_prompt, retriever_type, num_neighbors):
+     """
+     Runs evaluation on a model's performance on a given evaluation dataset.
+     @param chain: Model chain used for answering questions
+     @param retriever: Document retriever used for retrieving relevant documents
+     @param eval_set: List of dictionaries containing questions and corresponding ground truth answers
+     @param grade_prompt: Grading prompt style used when grading the model's answers
+     @param retriever_type: String specifying the type of retriever used
+     @param num_neighbors: Number of neighbors to retrieve using the retriever
+     @return: A tuple of four items:
+         - answers_grade: A list of grades for the model's answers.
+         - retrieval_grade: A list of grades for the model's document retrieval.
+         - latencies_list: A list of latencies in seconds for each question answered.
+         - predictions_list: A list of dictionaries containing the model's predicted answers and relevant documents for each question.
+     """
+     st.info("Running evaluation ...")
+     predictions_list = []
+     retrieved_docs = []
+     gt_dataset = []
+     latencies_list = []
+
+     for data in eval_set:
+
+         # Start latency timer (covers retrieval + answer generation)
+         start_time = time.time()
+
+         # Get docs
+         if retriever_type == "Pinecone w/ self-querying":
+             docs = retriever.get_relevant_documents(data["question"])
+
+         elif retriever_type == "Pinecone w/ metadata filtering":
+             ### Set metadata filter here (hard-coded example document id) ###
+             metadata_filter = {'id': "0252"}
+             docs = retriever.similarity_search(query=data["question"], k=num_neighbors, filter=metadata_filter)
+
+         elif retriever_type == "Kor filtering":
+             docs = retriever(p, data["question"])
+
+         else:
+             docs = retriever.similarity_search(query=data["question"], k=num_neighbors)
+
+         print("--DOCS--")
+         if not docs:
+             docs = [Document(page_content="I was unable to recover any information about the question!")]
+         print(docs)
+
+         # Get answer
+         answer = chain.run(input_documents=docs, question=data["question"])
+         predictions_list.append({"question": data["question"], "answer": data["answer"], "result": answer})
+         gt_dataset.append(data)
+         end_time = time.time()
+         elapsed_time = end_time - start_time
+         latencies_list.append(elapsed_time)
+
+         # Get doc text
+         retrieved_doc_text = ""
+         for i, doc in enumerate(docs):
+             retrieved_doc_text += "Doc %s: " % str(i + 1) + doc.page_content + " "
+         retrieved = {"question": data["question"], "answer": data["answer"], "result": retrieved_doc_text}
+         retrieved_docs.append(retrieved)
+
+     # Grade docs and answer
+     answers_grade = grade_model_answer(gt_dataset, predictions_list, grade_prompt)
+     retrieval_grade = grade_model_retrieval(gt_dataset, retrieved_docs, grade_prompt)
+     return answers_grade, retrieval_grade, latencies_list, predictions_list
+
+ # Auth
+ st.sidebar.image("img/diagnostic.jpg")
+
+ with st.sidebar.form("user_input"):
+
+     # API keys and Pinecone params
+     oai_api_key = st.text_input("OpenAI API Key:", type="password").strip()
+     pc_api_key = st.text_input("Pinecone API Key:", type="password").strip()
+     pc_region = st.text_input("Pinecone region:", type="password").strip()
+     pc_index = st.text_input("Pinecone index:", type="password").strip()
+
+     retriever_type = st.radio("Choose retriever",
+                               ("Pinecone",
+                                "Pinecone w/ self-querying",
+                                "Pinecone w/ metadata filtering",
+                                "Kor filtering"),
+                               index=0)
+
+     num_neighbors = st.select_slider("Choose # chunks to retrieve",
+                                      options=[3, 4, 5, 6, 7, 8])
+
+     embeddings = st.radio("Choose embeddings",
+                           ("HuggingFace",
+                            "OpenAI"),
+                           index=1)
+
+     model = st.radio("Choose model",
+                      ("gpt-3.5-turbo",
+                       "gpt-4"),
+                      index=0)
+
+     grade_prompt = st.radio("Grading style prompt",
+                             ("Fast",
+                              "Descriptive",
+                              "Descriptive w/ bias check",
+                              "OpenAI grading prompt"),
+                             index=3)
+
+     submitted = st.form_submit_button("Submit evaluation")
+
+ # App
+ st.header("VectorDB auto-evaluator")
+ st.info(
+     "`I am an evaluation tool for question-answering using an existing vectorDB (currently Pinecone is supported) and an eval set. "
+     "I will generate and grade an answer to each eval set question with the user-specified retrieval settings, such as metadata filtering or self-querying retrieval."
+     " Experiments with different configurations are logged. For an example eval set, see eval_sets/lex-pod-eval.json.`")
+
+ with st.form(key='file_inputs'):
+
+     uploaded_eval_set = st.file_uploader("Please upload eval set (.json): ",
+                                          type=['json'],
+                                          accept_multiple_files=False)
+
+     submitted = st.form_submit_button("Submit files")
+
+ # Connect to the existing Pinecone index and run the evaluation once all inputs are supplied
+ if uploaded_eval_set and pc_api_key and pc_region and pc_index:
+
+     # Set API key
+     os.environ["OPENAI_API_KEY"] = oai_api_key
+
+     # Set embeddings (must match your Pinecone DB)
+     if embeddings == "OpenAI":
+         embedding = OpenAIEmbeddings()
+     elif embeddings == "HuggingFace":
+         embedding = HuggingFaceEmbeddings()
+
+     # Set Pinecone
+     pinecone.init(api_key=str(pc_api_key), environment=str(pc_region))
+     p = Pinecone.from_existing_index(index_name=str(pc_index), embedding=embedding)
+
+     # Eval set
+     eval_set = json.loads(uploaded_eval_set.read())
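+     # The eval set is expected to be a JSON list of items with "question" and "answer"
+     # keys, e.g. [{"question": "...", "answer": "..."}] (illustrative sketch of the
+     # format; see eval_sets/lex-pod-eval.json for a real example).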
+
+     # Make LLM
+     llm = make_llm(model)
+
+     # Make retriever
+     retriever = make_retriever(retriever_type, embeddings, pc_api_key, pc_region, pc_index)
+
+     # Make chain
+     qa_chain = make_chain(llm)
+
+     # Grade model
+     graded_answers, graded_retrieval, latency, predictions = run_evaluation(qa_chain, retriever, eval_set, grade_prompt,
+                                                                             retriever_type, num_neighbors)
+
+     # Assemble outputs
+     d = pd.DataFrame(predictions)
+     d['answer score'] = [g['text'] for g in graded_answers]
+     d['docs score'] = [g['text'] for g in graded_retrieval]
+     d['latency'] = latency
+
+     # Summary statistics
+     mean_latency = d['latency'].mean()
+     correct_answer_count = len([text for text in d['answer score'] if "Incorrect" not in text])
+     correct_docs_count = len([text for text in d['docs score'] if "Incorrect" not in text])
+     percentage_answer = (correct_answer_count / len(graded_answers)) * 100
+     percentage_docs = (correct_docs_count / len(graded_retrieval)) * 100
+
+     st.subheader("Run Results")
+     st.info(
+         "`I will grade the chain based on: 1/ the relevance of the retrieved documents relative to the question and 2/ "
+         "the summarized answer relative to the ground truth answer. You can see (and change) the prompts used for "
+         "grading in prompts.py.`")
+     st.dataframe(data=d, use_container_width=True)
+
+     # Accumulate results
+     st.subheader("Aggregate Results")
+     st.info(
+         "`Retrieval and answer scores are the percentage of retrieved documents deemed relevant by the LLM grader ("
+         "relative to the question) and the percentage of summarized answers deemed relevant (relative to the ground truth "
+         "answer), respectively. The size of each point corresponds to the latency (in seconds) of retrieval + answer "
+         "summarization (larger circle = slower).`")
+     new_row = pd.DataFrame({'model': [model],
+                             'retriever': [retriever_type],
+                             'embedding': [embeddings],
+                             'num_neighbors': [num_neighbors],
+                             'Latency': [mean_latency],
+                             'Retrieval score': [percentage_docs],
+                             'Answer score': [percentage_answer]})
+     summary = pd.concat([summary, new_row], ignore_index=True)
+     st.dataframe(data=summary, use_container_width=True)
+     st.session_state.existing_df = summary
+
+     # Dataframe for visualization
+     show = summary.reset_index().copy()
+     show.columns = ['expt number', 'model', 'retriever', 'embedding', 'num_neighbors', 'Latency', 'Retrieval score', 'Answer score']
+     show['expt number'] = show['expt number'].apply(lambda x: "Expt #: " + str(x + 1))
+     c = alt.Chart(show).mark_circle().encode(x='Retrieval score',
+                                              y='Answer score',
+                                              size=alt.Size('Latency'),
+                                              color='expt number',
+                                              tooltip=['expt number', 'Retrieval score', 'Latency', 'Answer score'])
+     st.altair_chart(c, use_container_width=True, theme="streamlit")
+
+ else:
+     st.warning('Please specify a Pinecone index and add an eval set.', icon="⚠")
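+
+ # Usage sketch (assumes the dependencies implied by the imports above are installed,
+ # e.g. streamlit, langchain, pinecone-client, pandas, altair): launch with
+ #   streamlit run app.py
+ # then enter the API keys and Pinecone details in the sidebar and upload an eval set.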