Update app.py

app.py CHANGED

@@ -185,16 +185,35 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q

    return results[:top_k], end_time - start_time, vector_store

-def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model):
-
+def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
+    stats = {
        "num_results": len(results),
-        "avg_content_length":
+        "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
        "search_time": search_time,
        "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
        "num_documents": len(vector_store.docstore._dict),
        "num_tokens": num_tokens,
-        "embedding_vocab_size": embedding_model.client.get_vocab_size() if hasattr(embedding_model, 'client') and hasattr(embedding_model.client, 'get_vocab_size') else "N/A"
+        "embedding_vocab_size": embedding_model.client.get_vocab_size() if hasattr(embedding_model, 'client') and hasattr(embedding_model.client, 'get_vocab_size') else "N/A",
+        "embedding_dimension": len(embedding_model.embed_query(query)),
+        "top_k": top_k,
    }
+
+    # Calculate diversity of results
+    if len(results) > 1:
+        embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
+        pairwise_similarities = cosine_similarity(embeddings)
+        stats["result_diversity"] = 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
+    else:
+        stats["result_diversity"] = "N/A"
+
+    # Calculate rank correlation between embedding similarity and result order
+    query_embedding = embedding_model.embed_query(query)
+    result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
+    similarities = [cosine_similarity([query_embedding], [emb])[0][0] for emb in result_embeddings]
+    rank_correlation, _ = spearmanr(similarities, range(len(similarities)))
+    stats["rank_correlation"] = rank_correlation
+
+    return stats

def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
    # Tokenize the texts

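The two statistics computed after the dict, result diversity and rank correlation, are easiest to sanity-check on toy vectors. Below is a minimal sketch of the same arithmetic; it assumes numpy, scikit-learn, and scipy, whose cosine_similarity and spearmanr the hunk itself expects to be imported elsewhere in app.py:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

# Three toy result embeddings standing in for embedding_model.embed_query() output.
embeddings = np.array([
    [1.0, 0.0, 0.0],
    [0.9, 0.1, 0.0],
    [0.0, 1.0, 0.0],
])

# Diversity = 1 - mean pairwise cosine similarity over the upper triangle:
# near 1 when results are spread out, near 0 when they are near-duplicates.
pairwise = cosine_similarity(embeddings)
diversity = 1 - np.mean(pairwise[np.triu_indices(len(embeddings), k=1)])

# Spearman correlation between query similarity and result position:
# close to -1 means the store already returns hits in descending similarity.
query_embedding = np.array([[1.0, 0.0, 0.0]])
similarities = cosine_similarity(query_embedding, embeddings)[0]
rank_correlation, _ = spearmanr(similarities, range(len(similarities)))

print(f"diversity={diversity:.3f}, rank_correlation={rank_correlation:.3f}")
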
@@ -236,7 +255,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):

    return tokenizer, optimized_texts

-def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang, use_custom_embedding, optimize_vocab, phonetic_weight):
+def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3):
    all_results = []
    all_stats = []
    settings = {

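With the defaults added above, callers only need the positional arguments; the trailing options fall back to lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3. A purely hypothetical call follows; every value below is a placeholder, not something taken from the app:

# Hypothetical invocation; argument values are illustrative placeholders,
# only their order follows the signature in the hunk above.
output = compare_embeddings(
    "sample.txt",               # file: placeholder path
    "Beispielanfrage",          # query
    ["HuggingFace"],            # model_types: placeholder
    ["all-MiniLM-L6-v2"],       # model_names: placeholder
    "recursive",                # split_strategy
    500, 50,                    # chunk_size, overlap_size
    None,                       # custom_separators
    "FAISS", "similarity", 5,   # vector_store_type, search_type, top_k
)
# Equivalent to passing lang='german', use_custom_embedding=False,
# optimize_vocab=False, phonetic_weight=0.3 explicitly.
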
@@ -273,6 +292,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
        tokenizer, optimized_chunks = optimize_vocabulary(chunks)
        chunks = optimized_chunks

+
        results, search_time, vector_store = search_embeddings(
            chunks,
            embedding_model,

@@ -284,7 +304,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
            phonetic_weight
        )

-        stats = calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model)
+        stats = calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k)
        stats["model"] = f"{model_type} - {model_name}"
        stats.update(settings)

@@ -309,6 +329,39 @@ def format_results(results, stats):
        formatted_results.append(result)
    return formatted_results

+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.manifold import TSNE
+
+def visualize_results(results_df, stats_df):
+    # Create a figure with subplots
+    fig, axs = plt.subplots(2, 2, figsize=(20, 20))
+
+    # 1. Bar plot of search times
+    sns.barplot(x='model', y='search_time', data=stats_df, ax=axs[0, 0])
+    axs[0, 0].set_title('Search Time by Model')
+    axs[0, 0].set_xticklabels(axs[0, 0].get_xticklabels(), rotation=45, ha='right')
+
+    # 2. Scatter plot of result diversity vs. rank correlation
+    sns.scatterplot(x='result_diversity', y='rank_correlation', hue='model', data=stats_df, ax=axs[0, 1])
+    axs[0, 1].set_title('Result Diversity vs. Rank Correlation')
+
+    # 3. Box plot of content lengths
+    sns.boxplot(x='model', y='content_length', data=results_df, ax=axs[1, 0])
+    axs[1, 0].set_title('Distribution of Result Content Lengths')
+    axs[1, 0].set_xticklabels(axs[1, 0].get_xticklabels(), rotation=45, ha='right')
+
+    # 4. t-SNE visualization of embeddings
+    embeddings = np.array(results_df['embedding'].tolist())
+    tsne = TSNE(n_components=2, random_state=42)
+    embeddings_2d = tsne.fit_transform(embeddings)
+
+    sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=results_df['model'], ax=axs[1, 1])
+    axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
+
+    plt.tight_layout()
+    return fig
+
def launch_interface(share=True):
    iface = gr.Interface(
        fn=compare_embeddings,

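The new visualize_results above reads specific columns: 'model', 'search_time', 'result_diversity', and 'rank_correlation' from the stats frame, and 'model', 'content_length', plus a per-row 'embedding' vector from the results frame. A smoke-test sketch with fabricated frames (all names and numbers invented; t-SNE's default perplexity of 30 needs more rows than that, hence n = 40):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 40  # must exceed TSNE's default perplexity of 30

results_df = pd.DataFrame({
    "model": ["model-A"] * (n // 2) + ["model-B"] * (n // 2),  # hypothetical names
    "content_length": rng.integers(80, 400, size=n),
    "embedding": list(rng.normal(size=(n, 64))),               # one vector per row
})
stats_df = pd.DataFrame({
    "model": ["model-A", "model-B"],
    "search_time": [0.012, 0.034],        # fabricated numbers
    "result_diversity": [0.41, 0.58],
    "rank_correlation": [-0.95, -0.71],
})

fig = visualize_results(results_df, stats_df)
fig.savefig("comparison.png")
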
@@ -331,7 +384,8 @@ def launch_interface(share=True):
        ],
        outputs=[
            gr.Dataframe(label="Results", interactive=False),
-            gr.Dataframe(label="Statistics", interactive=False)
+            gr.Dataframe(label="Statistics", interactive=False),
+            gr.Plot(label="Visualizations")
        ],
        title="Advanced Embedding Comparison Tool",
        description="Compare different embedding models and retrieval strategies with advanced preprocessing and phonetic matching"

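The added gr.Plot slot receives the matplotlib figure as a third return value from the wrapped function. A stripped-down sketch of that wiring, with a toy function standing in for the app's compare_embeddings:

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd

def toy_compare(query):
    # One return value per declared output component, in order.
    results = pd.DataFrame({"content": [query.upper()]})
    stats = pd.DataFrame({"model": ["toy"], "search_time": [0.01]})
    fig, ax = plt.subplots()
    ax.bar(["toy"], [0.01])
    ax.set_title("Search Time by Model")
    return results, stats, fig

demo = gr.Interface(
    fn=toy_compare,
    inputs=gr.Textbox(label="Query"),
    outputs=[
        gr.Dataframe(label="Results", interactive=False),
        gr.Dataframe(label="Statistics", interactive=False),
        gr.Plot(label="Visualizations"),
    ],
)

if __name__ == "__main__":
    demo.launch()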
|