Spaces: Configuration error
Ahmad-Moiz committed
Commit • 574c00d
Parent(s): 0b1e2e1
Update app.py

app.py CHANGED
@@ -53,7 +53,7 @@ def load_docs(files: List) -> str:
     @return: string of all docs concatenated
     """

-    st.info("
+    st.info("Reading doc ...")
     all_text = ""
     for file_path in files:
         file_extension = os.path.splitext(file_path.name)[1]
@@ -69,7 +69,7 @@ def load_docs(files: List) -> str:
             file_content = stringio.read()
             all_text += file_content
         else:
-            st.warning('Please provide txt or pdf.', icon="
+            st.warning('Please provide txt or pdf.', icon="⚠")
     return all_text


@@ -82,7 +82,7 @@ def generate_eval(text: str, num_questions: int, chunk: int):
     @param chunk: chunk size to draw question from in the doc
     @return: eval set as JSON list
     """
-    st.info("
+    st.info("Generating eval set ...")
     n = len(text)
     starting_indices = [random.randint(0, n - chunk) for _ in range(num_questions)]
     sub_sequences = [text[i:i + chunk] for i in starting_indices]
@@ -93,7 +93,7 @@ def generate_eval(text: str, num_questions: int, chunk: int):
             qa = chain.run(b)
             eval_set.append(qa)
         except:
-            st.warning('Error generating question %s.' % str(i + 1), icon="
+            st.warning('Error generating question %s.' % str(i + 1), icon="⚠")
     eval_set_full = list(itertools.chain.from_iterable(eval_set))
     return eval_set_full

@@ -108,7 +108,7 @@ def split_texts(text, chunk_size: int, overlap, split_method: str):
     @param split_method:
     @return: list of str splits
     """
-    st.info("
+    st.info("Splitting doc ...")
     if split_method == "RecursiveTextSplitter":
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                        chunk_overlap=overlap)
@@ -117,7 +117,7 @@ def split_texts(text, chunk_size: int, overlap, split_method: str):
                                               chunk_size=chunk_size,
                                               chunk_overlap=overlap)
     else:
-        st.warning("
+        st.warning("Split method not recognized. Using RecursiveCharacterTextSplitter", icon="⚠")
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                        chunk_overlap=overlap)

@@ -139,7 +139,7 @@ def make_llm(model_version: str):
     elif model_version == "flan-t5-xl":
         chosen_model = HuggingFaceHub(repo_id="google/flan-t5-xl",model_kwargs={"temperature":0,"max_length":64})
     else:
-        st.warning("
+        st.warning("Model version not recognized. Using gpt-3.5-turbo", icon="⚠")
         chosen_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
     return chosen_model

@@ -154,14 +154,14 @@ def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
     @param _llm: model
     @return: retriever
     """
-    st.info("
+    st.info("Making retriever ...")
     # Set embeddings
     if embedding_type == "OpenAI":
         embedding = OpenAIEmbeddings()
     elif embedding_type == "HuggingFace":
         embedding = HuggingFaceEmbeddings()
     else:
-        st.warning("
+        st.warning("Embedding type not recognized. Using OpenAI", icon="⚠")
         embedding = OpenAIEmbeddings()

     # Select retriever
@@ -169,8 +169,8 @@ def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
         try:
             vector_store = FAISS.from_texts(splits, embedding)
         except ValueError:
-            st.warning("
-            icon="
+            st.warning("Error using OpenAI embeddings (disallowed TikToken token in the text). Using HuggingFace.",
+                       icon="⚠")
             vector_store = FAISS.from_texts(splits, HuggingFaceEmbeddings())
         retriever_obj = vector_store.as_retriever(k=num_neighbors)
     elif retriever_type == "SVM":
@@ -185,7 +185,7 @@ def make_retriever(splits, retriever_type, embedding_type, num_neighbors, _llm):
         faiss_index = faiss.IndexFlatL2(d)
         retriever_obj = GPTFaissIndex.from_documents(documents, faiss_index=faiss_index, service_context=context)
     else:
-        st.warning("
+        st.warning("Retriever type not recognized. Using SVM", icon="⚠")
         retriever_obj = SVMRetriever.from_texts(splits, embedding)
     return retriever_obj

@@ -198,7 +198,7 @@ def make_chain(llm, retriever, retriever_type: str) -> RetrievalQA:
     @param retriever_type: retriever type
     @return: chain (or return retriever for Llama-Index)
     """
-    st.info("
+    st.info("Making chain ...")
     if retriever_type == "Llama-Index":
         qa = retriever
     else:
@@ -218,7 +218,7 @@ def grade_model_answer(predicted_dataset: List, predictions: List, grade_answer_
     @return: A list of scores for the distilled answers.
     """
     # Grade the distilled answer
-    st.info("
+    st.info("Grading model answer ...")
     # Set the grading prompt based on the grade_answer_prompt parameter
     if grade_answer_prompt == "Fast":
         prompt = GRADE_ANSWER_PROMPT_FAST
@@ -255,7 +255,7 @@ def grade_model_retrieval(gt_dataset: List, predictions: List, grade_docs_prompt
     @return: list of scores for the retrieved documents.
     """
     # Grade the docs retrieval
-    st.info("
+    st.info("Grading relevance of retrieved docs ...")

     # Set the grading prompt based on the grade_docs_prompt parameter
     prompt = GRADE_DOCS_PROMPT_FAST if grade_docs_prompt == "Fast" else GRADE_DOCS_PROMPT
@@ -291,7 +291,7 @@ def run_evaluation(chain, retriever, eval_set, grade_prompt, retriever_type, num
     - latencies_list: A list of latencies in seconds for each question answered.
     - predictions_list: A list of dictionaries containing the model's predicted answers and relevant documents for each question.
     """
-    st.info("
+    st.info("Running evaluation ...")
     predictions_list = []
     retrieved_docs = []
     gt_dataset = []
@@ -335,27 +335,27 @@ def run_evaluation(chain, retriever, eval_set, grade_prompt, retriever_type, num
 # Auth
 st.sidebar.image("img/diagnostic.jpg")

-oai_api_key = st.sidebar.text_input("
-ant_api_key = st.sidebar.text_input("
-hf_api_key = st.sidebar.text_input("
+oai_api_key = st.sidebar.text_input("OpenAI API Key:", type="password")
+ant_api_key = st.sidebar.text_input("(Optional) Anthropic API Key:", type="password")
+hf_api_key = st.sidebar.text_input("(Optional) HuggingFace API Token:", type="password")

 with st.sidebar.form("user_input"):

-    num_eval_questions = st.select_slider("
+    num_eval_questions = st.select_slider("Number of eval questions",
                                           options=[1, 5, 10, 15, 20], value=5)

-    chunk_chars = st.select_slider("
+    chunk_chars = st.select_slider("Choose chunk size for splitting",
                                    options=[500, 750, 1000, 1500, 2000], value=1000)

-    overlap = st.select_slider("
+    overlap = st.select_slider("Choose overlap for splitting",
                                options=[0, 50, 100, 150, 200], value=100)

-    split_method = st.radio("
+    split_method = st.radio("Split method",
                             ("RecursiveTextSplitter",
                              "CharacterTextSplitter"),
                             index=0)

-    model = st.radio("
+    model = st.radio("Choose model",
                      ("gpt-3.5-turbo",
                       "gpt-4",
                       "anthropic"),
@@ -363,22 +363,22 @@ with st.sidebar.form("user_input"):
                      #"flan-t5-xl"),
                      index=0)

-    retriever_type = st.radio("
+    retriever_type = st.radio("Choose retriever",
                               ("TF-IDF",
                                "SVM",
                                "Llama-Index",
                                "similarity-search"),
                               index=3)

-    num_neighbors = st.select_slider("
+    num_neighbors = st.select_slider("Choose # chunks to retrieve",
                                      options=[3, 4, 5, 6, 7, 8])

-    embeddings = st.radio("
+    embeddings = st.radio("Choose embeddings",
                           ("HuggingFace",
                            "OpenAI"),
                           index=1)

-    grade_prompt = st.radio("
+    grade_prompt = st.radio("Grading style prompt",
                             ("Fast",
                              "Descriptive",
                              "Descriptive w/ bias check",
@@ -387,21 +387,21 @@ with st.sidebar.form("user_input"):

     submitted = st.form_submit_button("Submit evaluation")

-st.sidebar.write("
+st.sidebar.write("By: [Sentient](https://twitter.com/sentient)")

 # App
-st.header("
+st.header("Auto-evaluator")
 st.info(
     "`I am an evaluation tool for question-answering built on LangChain. Given documents, I will auto-generate a question-answer eval "
     "set and evaluate using the selected chain settings. Experiments with different configurations are logged. "
     "Optionally, provide your own eval set (as a JSON, see docs/karpathy-pod-eval.json for an example). If you don't have acess to GPT-4 or Anthropic, you can use our free hosted app here: https://autoevaluator.langchain.com/`")

 with st.form(key='file_inputs'):
-    uploaded_file = st.file_uploader("
+    uploaded_file = st.file_uploader("Please upload a file to evaluate (.txt or .pdf): ",
                                      type=['pdf', 'txt'],
                                      accept_multiple_files=True)

-    uploaded_eval_set = st.file_uploader("
+    uploaded_eval_set = st.file_uploader("[Optional] Please upload eval set (.json): ",
                                          type=['json'],
                                          accept_multiple_files=False)

@@ -445,7 +445,7 @@ if uploaded_file and oai_api_key:
     percentage_answer = (correct_answer_count / len(graded_answers)) * 100
     percentage_docs = (correct_docs_count / len(graded_retrieval)) * 100

-    st.subheader("
+    st.subheader("Run Results")
     st.info(
         "`I will grade the chain based on: 1/ the relevance of the retrived documents relative to the question and 2/ "
         "the summarized answer relative to the ground truth answer. You can see (and change) to prompts used for "
@@ -453,7 +453,7 @@ if uploaded_file and oai_api_key:
     st.dataframe(data=d, use_container_width=True)

     # Accumulate results
-    st.subheader("
+    st.subheader("Aggregate Results")
     st.info(
         "`Retrieval and answer scores are percentage of retrived documents deemed relevant by the LLM grader ("
         "relative to the question) and percentage of summarized answers deemed relevant (relative to ground truth "