Chris4K committed on
Commit
3a4f84d
1 Parent(s): 93121a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -7
app.py CHANGED
@@ -185,16 +185,35 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
185
 
186
  return results[:top_k], end_time - start_time, vector_store
187
 
188
- def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model):
189
- return {
190
  "num_results": len(results),
191
- "avg_content_length": sum(len(doc.page_content) for doc in results) / len(results) if results else 0,
192
  "search_time": search_time,
193
  "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
194
  "num_documents": len(vector_store.docstore._dict),
195
  "num_tokens": num_tokens,
196
- "embedding_vocab_size": embedding_model.client.get_vocab_size() if hasattr(embedding_model, 'client') and hasattr(embedding_model.client, 'get_vocab_size') else "N/A"
 
 
197
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
  def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
200
  # Tokenize the texts
@@ -236,7 +255,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
236
 
237
  return tokenizer, optimized_texts
238
 
239
- def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang, use_custom_embedding, optimize_vocab, phonetic_weight):
240
  all_results = []
241
  all_stats = []
242
  settings = {
@@ -273,6 +292,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
273
  tokenizer, optimized_chunks = optimize_vocabulary(chunks)
274
  chunks = optimized_chunks
275
 
 
276
  results, search_time, vector_store = search_embeddings(
277
  chunks,
278
  embedding_model,
@@ -284,7 +304,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
284
  phonetic_weight
285
  )
286
 
287
- stats = calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model)
288
  stats["model"] = f"{model_type} - {model_name}"
289
  stats.update(settings)
290
 
@@ -309,6 +329,39 @@ def format_results(results, stats):
309
  formatted_results.append(result)
310
  return formatted_results
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  def launch_interface(share=True):
313
  iface = gr.Interface(
314
  fn=compare_embeddings,
@@ -331,7 +384,8 @@ def launch_interface(share=True):
331
  ],
332
  outputs=[
333
  gr.Dataframe(label="Results", interactive=False),
334
- gr.Dataframe(label="Statistics", interactive=False)
 
335
  ],
336
  title="Advanced Embedding Comparison Tool",
337
  description="Compare different embedding models and retrieval strategies with advanced preprocessing and phonetic matching"
 
185
 
186
  return results[:top_k], end_time - start_time, vector_store
187
 
188
def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
    """Collect summary statistics for one completed embedding search.

    Args:
        results: Retrieved documents; each exposes ``page_content``.
        search_time: Wall-clock duration of the search, in seconds.
        vector_store: Store that was queried. FAISS-style attributes
            (``_index.ntotal``, ``docstore._dict``) are probed defensively.
        num_tokens: Token count of the processed source text.
        embedding_model: Object exposing ``embed_query(text) -> vector``.
        query: The search query string.
        top_k: Number of results that were requested.

    Returns:
        dict of statistic name -> value. Entries that cannot be computed
        for this store/model/result set are reported as the string "N/A".
    """
    # Embed the query and every result exactly once. (Previously the result
    # embeddings were computed twice — once for diversity, once for rank
    # correlation — and the query twice, for dimension and correlation.)
    query_embedding = np.asarray(embedding_model.embed_query(query), dtype=float)
    result_embeddings = np.asarray(
        [embedding_model.embed_query(doc.page_content) for doc in results],
        dtype=float,
    )

    stats = {
        "num_results": len(results),
        "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
        "search_time": search_time,
        "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
        "num_documents": len(vector_store.docstore._dict),
        "num_tokens": num_tokens,
        "embedding_vocab_size": embedding_model.client.get_vocab_size() if hasattr(embedding_model, 'client') and hasattr(embedding_model.client, 'get_vocab_size') else "N/A",
        "embedding_dimension": len(query_embedding),
        "top_k": top_k,
    }

    if len(results) > 1:
        # Row-normalize once, then pairwise cosine similarity is a single
        # matrix product. Diversity = 1 - mean similarity over the strict
        # upper triangle (each unordered pair counted once, diagonal excluded).
        norms = np.linalg.norm(result_embeddings, axis=1, keepdims=True)
        unit_results = result_embeddings / np.where(norms == 0, 1.0, norms)
        pairwise = unit_results @ unit_results.T
        stats["result_diversity"] = 1 - np.mean(pairwise[np.triu_indices(len(results), k=1)])

        # Spearman rank correlation between query-result similarity and the
        # returned order (index 0 = first result). A strongly negative value
        # means the store already ranks by embedding similarity.
        q_norm = np.linalg.norm(query_embedding)
        q_unit = query_embedding / (q_norm if q_norm else 1.0)
        similarities = unit_results @ q_unit
        rank_correlation, _ = spearmanr(similarities, range(len(similarities)))
        stats["rank_correlation"] = rank_correlation
    else:
        # Both metrics are undefined for 0 or 1 results; previously
        # spearmanr was still called and produced nan (or raised).
        stats["result_diversity"] = "N/A"
        stats["rank_correlation"] = "N/A"

    return stats
217
 
218
  def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
219
  # Tokenize the texts
 
255
 
256
  return tokenizer, optimized_texts
257
 
258
+ def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3):
259
  all_results = []
260
  all_stats = []
261
  settings = {
 
292
  tokenizer, optimized_chunks = optimize_vocabulary(chunks)
293
  chunks = optimized_chunks
294
 
295
+
296
  results, search_time, vector_store = search_embeddings(
297
  chunks,
298
  embedding_model,
 
304
  phonetic_weight
305
  )
306
 
307
+ stats = calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k)
308
  stats["model"] = f"{model_type} - {model_name}"
309
  stats.update(settings)
310
 
 
329
  formatted_results.append(result)
330
  return formatted_results
331
 
332
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

def visualize_results(results_df, stats_df):
    """Render a 2x2 dashboard comparing embedding models.

    Args:
        results_df: Per-result DataFrame with columns ``model``,
            ``content_length``, and ``embedding`` (list-like vectors).
        stats_df: Per-model DataFrame with columns ``model``,
            ``search_time``, ``result_diversity``, ``rank_correlation``.

    Returns:
        The matplotlib Figure containing all four subplots.
    """
    fig, axs = plt.subplots(2, 2, figsize=(20, 20))

    # 1. Search time per model.
    sns.barplot(x='model', y='search_time', data=stats_df, ax=axs[0, 0])
    axs[0, 0].set_title('Search Time by Model')
    axs[0, 0].set_xticklabels(axs[0, 0].get_xticklabels(), rotation=45, ha='right')

    # 2. Result diversity vs. rank correlation, one point per model.
    sns.scatterplot(x='result_diversity', y='rank_correlation', hue='model', data=stats_df, ax=axs[0, 1])
    axs[0, 1].set_title('Result Diversity vs. Rank Correlation')

    # 3. Distribution of result content lengths per model.
    sns.boxplot(x='model', y='content_length', data=results_df, ax=axs[1, 0])
    axs[1, 0].set_title('Distribution of Result Content Lengths')
    axs[1, 0].set_xticklabels(axs[1, 0].get_xticklabels(), rotation=45, ha='right')

    # 4. t-SNE projection of the result embeddings.
    embeddings = np.array(results_df['embedding'].tolist())
    # t-SNE requires perplexity < n_samples; the default of 30 raises a
    # ValueError for small result sets, so clamp it to the sample count.
    n_samples = len(embeddings)
    perplexity = max(1, min(30, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    embeddings_2d = tsne.fit_transform(embeddings)

    sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=results_df['model'], ax=axs[1, 1])
    axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')

    plt.tight_layout()
    return fig
364
+
365
  def launch_interface(share=True):
366
  iface = gr.Interface(
367
  fn=compare_embeddings,
 
384
  ],
385
  outputs=[
386
  gr.Dataframe(label="Results", interactive=False),
387
+ gr.Dataframe(label="Statistics", interactive=False),
388
+ gr.Plot(label="Visualizations")
389
  ],
390
  title="Advanced Embedding Comparison Tool",
391
  description="Compare different embedding models and retrieval strategies with advanced preprocessing and phonetic matching"