Update app.py
app.py CHANGED
@@ -17,8 +17,8 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 
-PDF_PATH = "
-CSV_PATH = "
+PDF_PATH = "Sample HI Policy.pdf"
+CSV_PATH = "RAG_Test_Questions.csv"
 
 st.set_page_config(page_title="PolicyGaido - Insurance Q&A", page_icon="π", layout="wide")
 st.title("Insurance Policy Q&A Assistant")
@@ -32,7 +32,7 @@ if "initialized" not in st.session_state:
 
 with st.sidebar:
     st.header("Configuration")
-    model_option = st.selectbox("Select Language Model", ["BERT-for-QA"
+    model_option = st.selectbox("Select Language Model", ["BERT-for-QA"])
     device = "cuda" if torch.cuda.is_available() else "cpu"
     st.caption(f"Running on: {device}")
 
@@ -110,10 +110,9 @@ def get_answer(question, model_name):
     qa_pipeline = pipeline("question-answering", model="deepset/bert-base-cased-squad2" if model_name == "BERT-for-QA" else "distilbert-base-cased-distilled-squad", tokenizer="deepset/bert-base-cased-squad2", device=0 if torch.cuda.is_available() else -1)
     result = qa_pipeline(question=question, context=context)
 
-    # Calculate semantic similarity between question and context as a relevance proxy
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': device})
     question_embedding = embeddings.embed_query(question)
-    context_embedding = embeddings.embed_query(context[:1000]) #
+    context_embedding = embeddings.embed_query(context[:1000]) # We are using the first 1000 chars only to avoid token limits
 
     # Compute cosine similarity
     similarity = np.dot(question_embedding, context_embedding) / (np.linalg.norm(question_embedding) * np.linalg.norm(context_embedding))
@@ -122,10 +121,8 @@ def get_answer(question, model_name):
     return result["answer"], docs, result["score"], relevance_score
 
 def evaluate_answer(answer, docs, confidence, relevance):
-    # Count potentially hallucinatory indicators
    hallucination_indicators = 0
 
-    # Check if answer contains content not found in supporting docs
    answer_found = False
    answer_words = set(answer.lower().split())
 
@@ -133,24 +130,22 @@ def evaluate_answer(answer, docs, confidence, relevance):
     for doc in docs:
         doc_content = doc.page_content.lower()
         overlap_count = sum(1 for word in answer_words if word in doc_content)
-        if overlap_count / len(answer_words) > 0.3:
+        if overlap_count / len(answer_words) > 0.3:
             answer_found = True
             break
 
-    if not answer_found and len(answer_words) > 3:
+    if not answer_found and len(answer_words) > 3:
         hallucination_indicators += 1
-
-    # Check for hedging language that might indicate uncertainty
+
     hedging_phrases = ["i think", "probably", "likely", "may", "might", "could be", "possibly", "perhaps"]
     if any(phrase in answer.lower() for phrase in hedging_phrases):
         hallucination_indicators += 1
 
-    # Return hallucination risk score (0-100)
     hallucination_risk = min(100, hallucination_indicators * 50)
 
     return {
         "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-        "confidence": confidence * 100,
+        "confidence": confidence * 100,
         "relevance": relevance,
         "hallucination_risk": hallucination_risk
     }
@@ -179,10 +174,10 @@ with col2:
     st.subheader("Answer")
     if "last_answer" in st.session_state:
         question, answer, docs, evaluation = st.session_state["last_answer"]
-        st.markdown(f"
-        st.markdown(f"
+        st.markdown(f"Question: {question}")
+        st.markdown(f"Answer: {answer}")
+
 
-        # Display evaluation metrics
         col_a, col_b, col_c = st.columns(3)
         with col_a:
             st.metric("Confidence", f"{evaluation['confidence']:.1f}%",
@@ -197,16 +192,16 @@ with col2:
 
         with st.expander("View Source Information"):
             for i, doc in enumerate(docs):
-                st.markdown(f"
+                st.markdown(f"Source {i+1}: {doc.page_content[:500]}...")
+
 
-# History and statistics section
 st.divider()
 st.subheader("Evaluation History")
 
 if st.session_state.evaluation_history:
     history_df = pd.DataFrame(st.session_state.evaluation_history)
 
-    #
+    # Displaying the summary statistics
     st.subheader("Performance Statistics")
     col1, col2, col3 = st.columns(3)
     with col1:
@@ -216,7 +211,7 @@ if st.session_state.evaluation_history:
     with col3:
        st.metric("Avg. Hallucination Risk", f"{history_df['hallucination_risk'].mean():.1f}%")
 
-    #
+    # Here we are showing the history table
     st.dataframe(history_df)
 else:
     st.info("No evaluation history available yet. Ask some questions to build history.")
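Note on the relevance score in get_answer: the diff computes it as the cosine similarity between the question embedding and an embedding of the first 1,000 characters of the retrieved context. The snippet below is a minimal standalone sketch of the same calculation, not code from the commit; it calls sentence-transformers directly instead of the app's HuggingFaceEmbeddings wrapper, and the 0-100 scaling is an assumption for illustration only.

# Illustrative sketch, not part of the commit: the relevance proxy from
# get_answer, computed with sentence-transformers directly. The model name and
# the 1,000-character truncation mirror the diff; the 0-100 scaling is assumed.
import numpy as np
from sentence_transformers import SentenceTransformer

def relevance_score(question: str, context: str, max_chars: int = 1000) -> float:
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    # Embed the question and a truncated slice of the retrieved context.
    q_vec, c_vec = model.encode([question, context[:max_chars]])
    # Cosine similarity, clamped at zero and rescaled to a 0-100 figure.
    cosine = float(np.dot(q_vec, c_vec) / (np.linalg.norm(q_vec) * np.linalg.norm(c_vec)))
    return max(0.0, cosine) * 100

How the app turns the raw similarity into relevance_score is outside the hunks shown, so the clamping and scaling above are design choices of the sketch, not the committed behaviour.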
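Note on evaluate_answer: the heuristic kept by this commit counts two indicators, each worth 50 points and capped at 100: the answer shares at most 30% of its words with every retrieved chunk (while being longer than three words), or it contains hedging phrases. The following is a standalone sketch of that scoring, illustrative rather than the committed code; the empty-answer guard is an addition, since the diff's version can divide by zero on an empty answer.

# Illustrative sketch of the hallucination-risk heuristic in evaluate_answer.
# Thresholds (30% overlap, >3 words, 50 points per indicator, 100 cap) come
# from the diff; the empty-answer guard is an addition for safety.
HEDGING_PHRASES = ["i think", "probably", "likely", "may", "might",
                   "could be", "possibly", "perhaps"]

def hallucination_risk(answer: str, doc_texts: list[str]) -> int:
    indicators = 0
    answer_words = set(answer.lower().split())

    # Indicator 1: the answer's words are not sufficiently grounded in any source chunk.
    grounded = False
    if answer_words:  # guard against division by zero (not in the committed code)
        for text in doc_texts:
            lowered = text.lower()
            overlap = sum(1 for word in answer_words if word in lowered)
            if overlap / len(answer_words) > 0.3:
                grounded = True
                break
    if not grounded and len(answer_words) > 3:
        indicators += 1

    # Indicator 2: hedging language suggests the model is unsure of its answer.
    if any(phrase in answer.lower() for phrase in HEDGING_PHRASES):
        indicators += 1

    return min(100, indicators * 50)

With this scoring, a grounded and unhedged answer scores 0, an answer that trips one check scores 50, and one that trips both scores 100.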
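The remaining functional change pins PDF_PATH and CSV_PATH to files bundled with the Space ("Sample HI Policy.pdf", "RAG_Test_Questions.csv"). The code that consumes those constants sits outside the hunks shown, so the sketch below is only a plausible reading of how PDF_PATH could feed a FAISS index, inferred from the imports visible at the top of the diff; PyPDFLoader, the chunk sizes, and reuse of the MiniLM embedding model are assumptions, not taken from app.py.

# Hypothetical sketch: one plausible way PDF_PATH feeds a FAISS retriever,
# inferred from the imports shown in the diff. PyPDFLoader, the chunk sizes,
# and reusing the MiniLM model for indexing are assumptions.
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

PDF_PATH = "Sample HI Policy.pdf"

def build_index(pdf_path: str = PDF_PATH) -> FAISS:
    pages = PyPDFLoader(pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(pages)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_documents(chunks, embeddings)

# Example use: fetch the chunks that get_answer would presumably receive as docs/context.
# docs = build_index().as_retriever(search_kwargs={"k": 3}).get_relevant_documents("What is the waiting period?")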