Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -15,7 +15,7 @@ from sklearn.preprocessing import normalize
|
|
15 |
from rank_bm25 import BM25Okapi
|
16 |
from gensim.models import Word2Vec
|
17 |
from typing import List, Optional, Tuple
|
18 |
-
import
|
19 |
|
20 |
|
21 |
logger = logging.getLogger(__name__)
|
@@ -152,7 +152,7 @@ class MistralRAGChatbot:
|
|
152 |
return np.array(response.data[0].embedding)
|
153 |
except Exception as e:
|
154 |
logging.error(f"Error fetching embedding: {e}")
|
155 |
-
return np.zeros((1024,))
|
156 |
|
157 |
def advanced_fusion_retrieval(self, user_query: str, docs: List[dict]) -> List[dict]:
|
158 |
query_embedding = self.create_embeddings([user_query])[0]
|
@@ -183,7 +183,7 @@ class MistralRAGChatbot:
|
|
183 |
return [{'text': self.texts[i], 'method': 'advanced_fusion', 'score': combined_scores[i], 'index': i} for i in sorted_indices[:5]]
|
184 |
|
185 |
def create_embeddings(self, text_list: List[str]) -> np.ndarray:
|
186 |
-
expected_dim = 1024
|
187 |
embeddings = []
|
188 |
for text in text_list:
|
189 |
word_vectors = [self.word2vec_model.wv[token] for token in text.split() if token in self.word2vec_model.wv]
|
@@ -195,7 +195,6 @@ class MistralRAGChatbot:
|
|
195 |
embeddings.append(avg_embedding)
|
196 |
return np.array(embeddings, dtype=np.float32)
|
197 |
|
198 |
-
|
199 |
async def generate_response_with_rag(
|
200 |
self,
|
201 |
user_query: str,
|
@@ -225,7 +224,6 @@ class MistralRAGChatbot:
|
|
225 |
response = "An error occurred while generating the response."
|
226 |
return response, [doc['text'] for doc in reranked_docs[:5]], reranked_docs[:5]
|
227 |
|
228 |
-
|
229 |
def retrieve_documents(
|
230 |
self,
|
231 |
user_query: str,
|
@@ -248,7 +246,7 @@ class MistralRAGChatbot:
|
|
248 |
def retrieve_with_annoy(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
    """Retrieve the nearest documents to the query embedding via the Annoy index.

    Args:
        user_query: Raw query text (unused here; kept so all retrievers share
            one signature — TODO confirm against `retrieve_documents` dispatch).
        query_embedding: Dense vector representation of the query.
        top_k: Maximum number of documents to return; capped at the corpus size.

    Returns:
        Tuple of (document indices, similarity scores in [0, 1], larger = closer).
    """
    n_results = min(top_k, len(self.texts))
    indices, distances = self.annoy_index.get_nns_by_vector(query_embedding, n_results, include_distances=True)
    # Normalize distances into similarity scores. Guard against max() on an
    # empty result list and against division by zero when every distance is
    # 0.0 (query duplicates an indexed vector) — the original raised in both cases.
    max_dist = max(distances) if distances else 0.0
    if max_dist > 0:
        scores = [1.0 - (dist / max_dist) for dist in distances]
    else:
        scores = [1.0] * len(distances)
    logging.debug(f"Annoy retrieval returned {len(indices)} documents.")
    return indices, scores
|
254 |
|
@@ -315,9 +313,9 @@ class MistralRAGChatbot:
|
|
315 |
return reranked_docs
|
316 |
|
317 |
def reciprocal_rank_fusion(self, user_query: str, docs: List[dict]) -> List[dict]:
|
318 |
-
k = 60
|
319 |
method_ranks = {}
|
320 |
-
fused_scores = {}
|
321 |
for doc in docs:
|
322 |
method = doc['method']
|
323 |
if method not in method_ranks:
|
@@ -328,9 +326,9 @@ class MistralRAGChatbot:
|
|
328 |
idx = doc['index']
|
329 |
if idx not in fused_scores:
|
330 |
fused_scores[idx] = sum(1 / (k + rank) for method_rank in method_ranks.values() for i, rank in method_rank.items() if i == idx)
|
331 |
-
reranked_docs = sorted(docs, key=lambda x: fused_scores.get(x['index'], 0), reverse=True)
|
332 |
for doc in reranked_docs:
|
333 |
-
doc['rrf_score'] = fused_scores.get(doc['index'], 0)
|
334 |
return reranked_docs
|
335 |
|
336 |
def weighted_score_fusion(self, user_query: str, docs: List[dict]) -> List[dict]:
|
@@ -405,17 +403,9 @@ def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path)
|
|
405 |
print("Vector database and Annoy index creation completed.")
|
406 |
|
407 |
|
408 |
-
|
409 |
-
import gradio as gr
|
410 |
-
|
411 |
def chatbot_interface(file, user_query, response_style, selected_retrieval_methods, selected_reranking_methods, chunk_size, overlap):
|
412 |
vector_db_path = "vector_db.pkl"
|
413 |
annoy_index_path = "vector_index.ann"
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
|
420 |
store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
|
421 |
|
@@ -451,7 +441,7 @@ iface = gr.Interface(
|
|
451 |
gr.File(label="Upload a PDF"),
|
452 |
gr.Textbox(lines=5, label="User Query"),
|
453 |
gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
|
454 |
-
gr.Dropdown(["annoy", "tfidf", "bm25", "euclidean", "jaccard"], label="Retrieval Methods", multiselect=True),
|
455 |
gr.Dropdown(["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"], label="Reranking Methods"),
|
456 |
gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
|
457 |
gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap")
|
|
|
15 |
from rank_bm25 import BM25Okapi
|
16 |
from gensim.models import Word2Vec
|
17 |
from typing import List, Optional, Tuple
|
18 |
+
import gradio as gr
|
19 |
|
20 |
|
21 |
logger = logging.getLogger(__name__)
|
|
|
152 |
return np.array(response.data[0].embedding)
|
153 |
except Exception as e:
|
154 |
logging.error(f"Error fetching embedding: {e}")
|
155 |
+
return np.zeros((1024,))
|
156 |
|
157 |
def advanced_fusion_retrieval(self, user_query: str, docs: List[dict]) -> List[dict]:
|
158 |
query_embedding = self.create_embeddings([user_query])[0]
|
|
|
183 |
return [{'text': self.texts[i], 'method': 'advanced_fusion', 'score': combined_scores[i], 'index': i} for i in sorted_indices[:5]]
|
184 |
|
185 |
def create_embeddings(self, text_list: List[str]) -> np.ndarray:
|
186 |
+
expected_dim = 1024
|
187 |
embeddings = []
|
188 |
for text in text_list:
|
189 |
word_vectors = [self.word2vec_model.wv[token] for token in text.split() if token in self.word2vec_model.wv]
|
|
|
195 |
embeddings.append(avg_embedding)
|
196 |
return np.array(embeddings, dtype=np.float32)
|
197 |
|
|
|
198 |
async def generate_response_with_rag(
|
199 |
self,
|
200 |
user_query: str,
|
|
|
224 |
response = "An error occurred while generating the response."
|
225 |
return response, [doc['text'] for doc in reranked_docs[:5]], reranked_docs[:5]
|
226 |
|
|
|
227 |
def retrieve_documents(
|
228 |
self,
|
229 |
user_query: str,
|
|
|
246 |
def retrieve_with_annoy(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
    """Retrieve the nearest documents to the query embedding via the Annoy index.

    Args:
        user_query: Raw query text (unused here; kept so all retrievers share
            one signature — TODO confirm against `retrieve_documents` dispatch).
        query_embedding: Dense vector representation of the query.
        top_k: Maximum number of documents to return; capped at the corpus size.

    Returns:
        Tuple of (document indices, similarity scores in [0, 1], larger = closer).
    """
    n_results = min(top_k, len(self.texts))
    indices, distances = self.annoy_index.get_nns_by_vector(query_embedding, n_results, include_distances=True)
    # Normalize distances into similarity scores. Guard against max() on an
    # empty result list and against division by zero when every distance is
    # 0.0 (query duplicates an indexed vector) — the original raised in both cases.
    max_dist = max(distances) if distances else 0.0
    if max_dist > 0:
        scores = [1.0 - (dist / max_dist) for dist in distances]
    else:
        scores = [1.0] * len(distances)
    logging.debug(f"Annoy retrieval returned {len(indices)} documents.")
    return indices, scores
|
252 |
|
|
|
313 |
return reranked_docs
|
314 |
|
315 |
def reciprocal_rank_fusion(self, user_query: str, docs: List[dict]) -> List[dict]:
|
316 |
+
k = 60
|
317 |
method_ranks = {}
|
318 |
+
fused_scores = {}
|
319 |
for doc in docs:
|
320 |
method = doc['method']
|
321 |
if method not in method_ranks:
|
|
|
326 |
idx = doc['index']
|
327 |
if idx not in fused_scores:
|
328 |
fused_scores[idx] = sum(1 / (k + rank) for method_rank in method_ranks.values() for i, rank in method_rank.items() if i == idx)
|
329 |
+
reranked_docs = sorted(docs, key=lambda x: fused_scores.get(x['index'], 0), reverse=True)
|
330 |
for doc in reranked_docs:
|
331 |
+
doc['rrf_score'] = fused_scores.get(doc['index'], 0)
|
332 |
return reranked_docs
|
333 |
|
334 |
def weighted_score_fusion(self, user_query: str, docs: List[dict]) -> List[dict]:
|
|
|
403 |
print("Vector database and Annoy index creation completed.")
|
404 |
|
405 |
|
|
|
|
|
|
|
406 |
def chatbot_interface(file, user_query, response_style, selected_retrieval_methods, selected_reranking_methods, chunk_size, overlap):
|
407 |
vector_db_path = "vector_db.pkl"
|
408 |
annoy_index_path = "vector_index.ann"
|
|
|
|
|
|
|
|
|
|
|
409 |
|
410 |
store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
|
411 |
|
|
|
441 |
gr.File(label="Upload a PDF"),
|
442 |
gr.Textbox(lines=5, label="User Query"),
|
443 |
gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
|
444 |
+
gr.Dropdown(["annoy", "tfidf", "bm25", "euclidean", "jaccard"], label="Retrieval Methods", multiselect=True),
|
445 |
gr.Dropdown(["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"], label="Reranking Methods"),
|
446 |
gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
|
447 |
gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap")
|