Chris4K committed on
Commit 7fad639 · verified · 1 Parent(s): 78e1a2e

Update app.py

Files changed (1)
  1. app.py +45 -304
app.py CHANGED
@@ -18,128 +18,7 @@ from langchain_text_splitters import (
  from typing import List, Dict, Any
  import pandas as pd
 
- nltk.download('punkt', quiet=True)
-
- FILES_DIR = './files'
-
- MODELS = {
-     'HuggingFace': {
-         'e5-base-de': "danielheinz/e5-base-sts-en-de",
-         'paraphrase-miniLM': "paraphrase-multilingual-MiniLM-L12-v2",
-         'paraphrase-mpnet': "paraphrase-multilingual-mpnet-base-v2",
-         'gte-large': "gte-large",
-         'gbert-base': "gbert-base"
-     },
-     'OpenAI': {
-         'text-embedding-ada-002': "text-embedding-ada-002"
-     },
-     'Cohere': {
-         'embed-multilingual-v2.0': "embed-multilingual-v2.0"
-     }
- }
-
- class FileHandler:
-     @staticmethod
-     def extract_text(file_path):
-         ext = os.path.splitext(file_path)[-1].lower()
-         if ext == '.pdf':
-             return FileHandler._extract_from_pdf(file_path)
-         elif ext == '.docx':
-             return FileHandler._extract_from_docx(file_path)
-         elif ext == '.txt':
-             return FileHandler._extract_from_txt(file_path)
-         else:
-             raise ValueError(f"Unsupported file type: {ext}")
-
-     @staticmethod
-     def _extract_from_pdf(file_path):
-         with pdfplumber.open(file_path) as pdf:
-             return ' '.join([page.extract_text() for page in pdf.pages])
-
-     @staticmethod
-     def _extract_from_docx(file_path):
-         doc = docx.Document(file_path)
-         return ' '.join([para.text for para in doc.paragraphs])
-
-     @staticmethod
-     def _extract_from_txt(file_path):
-         with open(file_path, 'r', encoding='utf-8') as f:
-             return f.read()
-
- def get_embedding_model(model_type, model_name):
-     if model_type == 'HuggingFace':
-         return HuggingFaceEmbeddings(model_name=MODELS[model_type][model_name])
-     elif model_type == 'OpenAI':
-         return OpenAIEmbeddings(model=MODELS[model_type][model_name])
-     elif model_type == 'Cohere':
-         return CohereEmbeddings(model=MODELS[model_type][model_name])
-     else:
-         raise ValueError(f"Unsupported model type: {model_type}")
-
- def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separators=None):
-     if split_strategy == 'token':
-         return TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
-     elif split_strategy == 'recursive':
-         return RecursiveCharacterTextSplitter(
-             chunk_size=chunk_size,
-             chunk_overlap=overlap_size,
-             separators=custom_separators or ["\n\n", "\n", " ", ""]
-         )
-     else:
-         raise ValueError(f"Unsupported split strategy: {split_strategy}")
-
- def get_vector_store(store_type, texts, embedding_model):
-     if store_type == 'FAISS':
-         return FAISS.from_texts(texts, embedding_model)
-     elif store_type == 'Chroma':
-         return Chroma.from_texts(texts, embedding_model)
-     else:
-         raise ValueError(f"Unsupported vector store type: {store_type}")
-
- def get_retriever(vector_store, search_type, search_kwargs=None):
-     if search_type == 'similarity':
-         return vector_store.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
-     elif search_type == 'mmr':
-         return vector_store.as_retriever(search_type="mmr", search_kwargs=search_kwargs)
-     else:
-         raise ValueError(f"Unsupported search type: {search_type}")
-
- def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators):
-     if file_path:
-         text = FileHandler.extract_text(file_path)
-     else:
-         text = ""
-         for file in os.listdir(FILES_DIR):
-             file_path = os.path.join(FILES_DIR, file)
-             text += FileHandler.extract_text(file_path)
-
-     text_splitter = get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separators)
-     chunks = text_splitter.split_text(text)
-
-     embedding_model = get_embedding_model(model_type, model_name)
-
-     return chunks, embedding_model, len(text.split())
-
- def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k):
-     vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
-     retriever = get_retriever(vector_store, search_type, {"k": top_k})
-
-     start_time = time.time()
-     results = retriever.get_relevant_documents(query)
-     end_time = time.time()
-
-     return results, end_time - start_time, vector_store
-
- def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model):
-     return {
-         "num_results": len(results),
-         "avg_content_length": sum(len(doc.page_content) for doc in results) / len(results) if results else 0,
-         "search_time": search_time,
-         "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
-         "num_documents": len(vector_store.docstore._dict),
-         "num_tokens": num_tokens,
-         "embedding_vocab_size": embedding_model.client.get_vocab_size() if hasattr(embedding_model, 'client') and hasattr(embedding_model.client, 'get_vocab_size') else "N/A"
-     }
+ # ... (previous code remains the same) ...
 
  def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k):
      all_results = []
@@ -191,8 +70,8 @@ def format_results(results, stats):
      formatted_results = []
      for doc in results:
          result = {
-             "Content": doc.page_content,
              "Model": stats["model"],
+             "Content": doc.page_content,
              **doc.metadata,
              **{k: v for k, v in stats.items() if k not in ["model"]}
          }
@@ -200,184 +79,46 @@ def format_results(results, stats):
      return formatted_results
 
  # Gradio interface
- iface = gr.Interface(
-     fn=compare_embeddings,
-     inputs=[
-         gr.File(label="Upload File (Optional)"),
-         gr.Textbox(label="Search Query"),
-         gr.CheckboxGroup(choices=list(MODELS.keys()), label="Embedding Model Types", value=["HuggingFace"]),
-         gr.CheckboxGroup(choices=[model for models in MODELS.values() for model in models], label="Embedding Models", value=["e5-base-de"]),
-         gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
-         gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
-         gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
-         gr.Textbox(label="Custom Split Separators (comma-separated, optional)"),
-         gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS"),
-         gr.Radio(choices=["similarity", "mmr"], label="Search Type", value="similarity"),
-         gr.Slider(1, 10, step=1, value=5, label="Top K")
-     ],
-     outputs=[
-         gr.Dataframe(label="Results"),
-         gr.Dataframe(label="Statistics")
-     ],
-     title="Embedding Comparison Tool",
-     description="Compare different embedding models and retrieval strategies",
-     examples=[
-         ["files/test.txt", "What is machine learning?", ["HuggingFace"], ["e5-base-de"], "recursive", 500, 50, "", "FAISS", "similarity", 5]
-     ],
-     flagging_mode="never"
- )
-
- # The code remains the same as in the previous artifact, so I'll omit it here for brevity.
- # The changes will be in the tutorial_md variable.
-
- tutorial_md = """
- # Embedding Comparison Tool Tutorial
-
- This tool allows you to compare different embedding models and retrieval strategies for document search. Before we dive into how to use the tool, let's cover some important concepts.
-
- ## What is RAG?
-
- RAG stands for Retrieval-Augmented Generation. It's a technique that combines the strengths of large language models with the ability to access and use external knowledge. RAG is particularly useful for:
-
- - Providing up-to-date information
- - Answering questions based on specific documents or data sources
- - Reducing hallucinations in AI responses
- - Customizing AI outputs for specific domains or use cases
-
- RAG is a good fit for applications that need accurate, context-specific information retrieval combined with natural language generation, such as chatbots, question-answering systems, and document analysis tools.
-
- ## Key Components of RAG
-
- ### 1. Document Loading
-
- This is the process of ingesting documents from various sources (PDFs, web pages, databases, etc.) into a format that can be processed by the RAG system. Efficient document loading is crucial for handling large volumes of data.
-
- ### 2. Document Splitting
-
- Large documents are often split into smaller chunks for more efficient processing and retrieval. The choice of splitting method can significantly impact the quality of retrieval results.
-
- ### 3. Vector Store and Embeddings
-
- Embeddings are dense vector representations of text that capture semantic meaning. A vector store is a database optimized for storing and querying these high-dimensional vectors. Together, they allow for efficient semantic search.
-
- ### 4. Retrieval
-
- This is the process of finding the most relevant documents or chunks based on a query. The quality of retrieval directly impacts the final output of the RAG system.
-
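To make the four components above concrete, here is a minimal load → split → embed → store → retrieve sketch. It mirrors the classes this app uses; the import paths, the `sample.txt` file, and the installed `faiss-cpu` package are assumptions for illustration, not part of the commit.

```python
# Minimal RAG retrieval pipeline sketch (illustrative; paths and imports assumed).
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# 1. Document loading (here: a plain text file)
with open("sample.txt", encoding="utf-8") as f:
    text = f.read()

# 2. Document splitting
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_text(text)

# 3. Embeddings + vector store
embeddings = HuggingFaceEmbeddings(model_name="danielheinz/e5-base-sts-en-de")
store = FAISS.from_texts(chunks, embeddings)

# 4. Retrieval
retriever = store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
docs = retriever.get_relevant_documents("What is machine learning?")
print([d.page_content[:80] for d in docs])
```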
- ## Why is this important?
-
- Understanding and optimizing each component of the RAG pipeline is crucial because:
-
- 1. It affects the accuracy and relevance of the information retrieved.
- 2. It impacts the speed and efficiency of the system.
- 3. It determines the scalability of your solution.
- 4. It influences the overall quality of the generated responses.
-
- ## Impact of Parameter Changes
-
- Changes in various parameters can have significant effects:
-
- - **Chunk Size**: Larger chunks provide more context but may reduce precision. Smaller chunks increase precision but may lose context.
- - **Overlap**: More overlap can help maintain context between chunks but increases computational load.
- - **Embedding Model**: Different models have varying performance across languages and domains.
- - **Vector Store**: Affects query speed and the types of searches you can perform.
- - **Retrieval Method**: Impacts the diversity and relevance of retrieved documents.
-
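The chunk size and overlap effects are easy to observe directly. A small sketch (assuming the same `RecursiveCharacterTextSplitter` the app uses) splits one text under two settings and compares the chunk counts:

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

text = "Retrieval-Augmented Generation combines retrieval with generation. " * 200

for chunk_size, overlap in [(1000, 0), (200, 50)]:
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = splitter.split_text(text)
    # Larger chunks -> fewer, more context-rich pieces; more overlap -> more chunks.
    print(f"chunk_size={chunk_size}, overlap={overlap}: {len(chunks)} chunks")
```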
- ## Detailed Parameter Explanations
-
- ### Embedding Model
-
- The embedding model translates text into numerical vectors. The choice of model affects:
-
- - **Language Coverage**: Some models are monolingual, others are multilingual.
- - **Domain Specificity**: Models can be general or trained on specific domains (e.g., legal, medical).
- - **Vector Dimensions**: Higher dimensions can capture more information but require more storage and computation.
-
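For example, a model's vector dimension can be checked by embedding a single query. A sketch, assuming `langchain_community` is installed (the model is downloaded on first use):

```python
from langchain_community.embeddings import HuggingFaceEmbeddings

emb = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
vec = emb.embed_query("Wie funktioniert maschinelles Lernen?")
print(len(vec))  # dimensionality of the embedding space (e.g. 384 for MiniLM-based models)
```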
- #### Vocabulary Size
-
- The vocabulary size refers to the number of unique tokens the model recognizes. It's important because:
-
- - It affects the model's ability to handle rare words or specialized terminology.
- - Larger vocabularies can lead to better performance but require more memory.
- - It impacts the model's performance across different languages (larger vocabularies are often better for multilingual models).
-
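A quick way to inspect a model's vocabulary size is through its tokenizer. This sketch assumes the `transformers` library is installed; the tool itself reports the value in its statistics when the embedding client exposes it:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("danielheinz/e5-base-sts-en-de")
print(tok.vocab_size)  # base vocabulary of the tokenizer
print(len(tok))        # vocabulary including any added special tokens
```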
- ### Split Strategy
-
- - **Token**: Splits based on a fixed number of tokens. Good for maintaining consistent chunk sizes.
- - **Recursive**: Splits based on content, trying to maintain semantic coherence. Better for preserving context.
-
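The two strategies behave quite differently on the same input. A small comparison sketch (assumes `langchain_text_splitters`; `TokenTextSplitter` additionally needs `tiktoken`):

```python
from langchain_text_splitters import TokenTextSplitter, RecursiveCharacterTextSplitter

text = "First paragraph about embeddings.\n\nSecond paragraph about retrieval and ranking."

token_chunks = TokenTextSplitter(chunk_size=10, chunk_overlap=0).split_text(text)
recursive_chunks = RecursiveCharacterTextSplitter(
    chunk_size=60, chunk_overlap=0, separators=["\n\n", "\n", " ", ""]
).split_text(text)

print(token_chunks)      # fixed token budget per chunk, may cut across paragraph boundaries
print(recursive_chunks)  # prefers paragraph/sentence boundaries, keeps semantic units together
```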
- ### Vector Store Type
-
- - **FAISS**: Fast, memory-efficient. Good for large-scale similarity search.
- - **Chroma**: Offers additional features like metadata filtering. Good for more complex querying needs.
-
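A sketch of the practical difference (assuming `faiss-cpu` and `chromadb` are installed): FAISS for plain in-memory similarity search, Chroma when you also want metadata filtering.

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS, Chroma

emb = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
texts = ["contract clause about liability", "blog post about embeddings"]
metadatas = [{"source": "legal"}, {"source": "blog"}]

faiss_store = FAISS.from_texts(texts, emb)  # fast in-memory similarity search
chroma_store = Chroma.from_texts(texts, emb, metadatas=metadatas)

# Chroma can restrict a query to documents whose metadata matches a filter.
hits = chroma_store.similarity_search("liability", k=1, filter={"source": "legal"})
print(hits[0].page_content)
```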
- ### Search Type
-
- - **Similarity**: Returns the most similar documents. Fast and straightforward.
- - **MMR (Maximum Marginal Relevance)**: Balances relevance with diversity in results. Useful for getting a broader perspective.
-
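Both search types are exposed through `as_retriever`; MMR takes extra knobs that control the relevance/diversity trade-off. A self-contained sketch (model name and example texts are placeholders):

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

emb = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
store = FAISS.from_texts(
    ["intro to machine learning", "deep learning basics", "a recipe for apple pie"], emb
)

# Plain similarity: the top-k nearest neighbours.
sim_retriever = store.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# MMR: fetch a larger candidate pool (fetch_k), then re-rank for diversity.
# lambda_mult=1.0 -> pure relevance, 0.0 -> maximum diversity.
mmr_retriever = store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2, "fetch_k": 10, "lambda_mult": 0.5},
)

print(sim_retriever.get_relevant_documents("machine learning"))
print(mmr_retriever.get_relevant_documents("machine learning"))
```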
- ## MTEB (Massive Text Embedding Benchmark)
-
- MTEB is a comprehensive benchmark for evaluating text embedding models across a wide range of tasks and languages. It's useful for:
-
- - Comparing the performance of different embedding models.
- - Understanding how models perform on specific tasks (e.g., classification, clustering, retrieval).
- - Selecting the best model for your specific use case.
-
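Beyond reading the leaderboard, a model can be evaluated locally with the `mteb` package. A sketch, assuming `mteb` and `sentence-transformers` are installed; the task name is just an example and the exact API may differ between `mteb` versions:

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
evaluation = MTEB(tasks=["Banking77Classification"])  # pick tasks relevant to your use case
results = evaluation.run(model, output_folder="mteb_results")
print(results)
```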
- ### Finding Embeddings on the MTEB Leaderboard
-
- To find suitable embeddings using the MTEB leaderboard (https://huggingface.co/spaces/mteb/leaderboard):
-
- 1. Look at the "Avg" column for overall performance across all tasks.
- 2. Check performance on specific task types relevant to your use case (e.g., Retrieval, Classification).
- 3. Consider the model size and inference speed for your deployment constraints.
- 4. Look at language-specific scores if you're working with non-English text.
- 5. Click on model names to get more details and links to the model pages on Hugging Face.
-
- When selecting a model, balance performance with practical considerations like model size, inference speed, and specific task performance relevant to your application.
-
- By understanding these concepts and parameters, you can make informed decisions when using the Embedding Comparison Tool and optimize your RAG system for your specific needs.
-
- ## Using the Embedding Comparison Tool
-
- Now that you understand the underlying concepts, here's how to use the tool:
-
- 1. **File Upload**: Optionally upload a file (PDF, DOCX, or TXT) or leave it empty to use the files in the `./files` directory.
-
- 2. **Search Query**: Enter the search query you want to use for retrieving relevant documents.
-
- 3. **Embedding Model Types**: Select one or more embedding model types (HuggingFace, OpenAI, Cohere).
-
- 4. **Embedding Models**: Choose specific models for each selected model type.
-
- 5. **Split Strategy**: Select either 'token' or 'recursive' for text splitting.
-
- 6. **Chunk Size**: Set the size of text chunks (100-1000).
-
- 7. **Overlap Size**: Set the overlap between chunks (0-100).
-
- 8. **Custom Split Separators**: Optionally enter custom separators for text splitting.
-
- 9. **Vector Store Type**: Choose between FAISS and Chroma for storing vectors.
-
- 10. **Search Type**: Select 'similarity' or 'mmr' (Maximum Marginal Relevance) search.
-
- 11. **Top K**: Set the number of top results to retrieve (1-10).
-
- After setting these parameters, click "Submit" to run the comparison. The results will be displayed in two tables:
-
- - **Results**: Shows the retrieved document contents and metadata for each model.
- - **Statistics**: Provides performance metrics and settings for each model.
-
- You can download the results as CSV files for further analysis.
-
- Experiment with different settings to find the best combination for your specific use case!
- """
-
- # The rest of the code remains the same
- iface = gr.TabbedInterface(
-     [iface, gr.Markdown(tutorial_md)],
-     ["Embedding Comparison", "Tutorial"]
- )
-
- iface.launch(share=True)
 
+ def launch_interface(share=True):
+     iface = gr.Interface(
+         fn=compare_embeddings,
+         inputs=[
+             gr.File(label="Upload File (Optional)"),
+             gr.Textbox(label="Search Query"),
+             gr.CheckboxGroup(choices=list(MODELS.keys()), label="Embedding Model Types", value=["HuggingFace"]),
+             gr.CheckboxGroup(choices=[model for models in MODELS.values() for model in models], label="Embedding Models", value=["e5-base-de"]),
+             gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
+             gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
+             gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
+             gr.Textbox(label="Custom Split Separators (comma-separated, optional)"),
+             gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS"),
+             gr.Radio(choices=["similarity", "mmr"], label="Search Type", value="similarity"),
+             gr.Slider(1, 10, step=1, value=5, label="Top K")
+         ],
+         outputs=[
+             gr.Dataframe(label="Results", interactive=False),
+             gr.Dataframe(label="Statistics", interactive=False)
+         ],
+         title="Embedding Comparison Tool",
+         description="Compare different embedding models and retrieval strategies",
+         examples=[
+             ["example.pdf", "What is machine learning?", ["HuggingFace"], ["e5-base-de"], "recursive", 500, 50, "", "FAISS", "similarity", 5]
+         ],
+         allow_flagging="never"
+     )
+
+     tutorial_md = """
+ # Embedding Comparison Tool Tutorial
+
+ ... (tutorial content remains the same) ...
+ """
+
+     iface = gr.TabbedInterface(
+         [iface, gr.Markdown(tutorial_md)],
+         ["Embedding Comparison", "Tutorial"]
+     )
+
+     iface.launch(share=share)
+
+ if __name__ == "__main__":
+     launch_interface()
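As a usage note on the new entry point: `share=True` asks Gradio to create a temporary public link, so for local-only runs the function can be called with sharing disabled. A hypothetical call, not part of the commit (importing `app` is safe because the launch is guarded by `if __name__ == "__main__"`):

```python
from app import launch_interface

launch_interface(share=False)  # serve only on localhost, no public Gradio link
```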