asoria HF staff committed on
Commit
ba1088f
1 Parent(s): aa1bdb0

Adding template for simple RAG

Browse files
Files changed (2) hide show
  1. app.py +0 -2
  2. utils/notebook_utils.py +142 -39
app.py CHANGED
@@ -17,7 +17,6 @@ import os
17
  # TODOS:
18
  # Validate dataset type for type before generating the notebook
19
  # Add template for training
20
- # Add template for RAG and embeddings
21
 
22
  load_dotenv()
23
 
@@ -169,7 +168,6 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
169
  )
170
  generated_text = ""
171
  # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
172
- viewer_lines = 0
173
  for cell in cells:
174
  generated_text += cell["source"] + "\n"
175
  yield generated_text, ""
 
17
  # TODOS:
18
  # Validate dataset type for type before generating the notebook
19
  # Add template for training
 
20
 
21
  load_dotenv()
22
 
 
168
  )
169
  generated_text = ""
170
  # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
 
171
  for cell in cells:
172
  generated_text += cell["source"] + "\n"
173
  yield generated_text, ""
utils/notebook_utils.py CHANGED
@@ -20,14 +20,6 @@ def replace_wildcards(
20
  return new_templates
21
 
22
 
23
- rag_cells = [
24
- {
25
- "cell_type": "markdown",
26
- "source": "# Retrieval-Augmented Generation (RAG) System Notebook",
27
- },
28
- {"cell_type": "code", "source": ""},
29
- ]
30
-
31
  embeggins_cells = [
32
  {
33
  "cell_type": "markdown",
@@ -92,7 +84,7 @@ text_list = df[column_to_generate_embeddings].tolist()
92
  "cell_type": "code",
93
  "source": """
94
  # Specify the embedding model you want to use
95
- model = SentenceTransformer('distiluse-base-multilingual-cased')
96
  """,
97
  },
98
  {
@@ -282,45 +274,156 @@ for column in df.select_dtypes(include=['int64', 'float64']).columns:
282
  ]
283
 
284
 
285
- def generate_embedding_system_prompt():
286
- """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
287
- Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
288
-
289
- The notebook should include:
290
-
291
- 1. Install necessary libraries with !pip install.
292
- 2. Import libraries.
293
- 3. Load the dataset as a DataFrame using the provided code.
294
- 4. Select the column to generate embeddings.
295
- 5. Remove duplicate data.
296
- 6. Convert the selected column to a list.
297
- 7. Load the sentence-transformers model.
298
- 8. Create a FAISS index.
299
- 9. Encode a query sample.
300
- 10. Search for similar documents using the FAISS index.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
- Ensure the notebook is well-organized with explanations for each step.
303
- The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
304
 
305
- The user will provide dataset information in the following format:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
 
307
- ## Columns and Data Types
 
 
 
 
 
 
 
 
 
308
 
309
- ## Sample Data
310
 
311
- ## Loading Data code
 
312
 
313
- Use the provided code to load the dataset; do not use any other method.
314
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
 
317
  def generate_rag_system_prompt():
318
- """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
319
- The dataset is provided as a pandas DataFrame.
320
-
321
- Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
322
-
323
- The RAG notebook should include:
324
 
325
  1. Install necessary libraries.
326
  2. Import libraries.
 
20
  return new_templates
21
 
22
 
 
 
 
 
 
 
 
 
23
  embeggins_cells = [
24
  {
25
  "cell_type": "markdown",
 
84
  "cell_type": "code",
85
  "source": """
86
  # Specify the embedding model you want to use
87
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
88
  """,
89
  },
90
  {
 
274
  ]
275
 
276
 
277
+ rag_cells = [
278
+ {
279
+ "cell_type": "markdown",
280
+ "source": """
281
+ ---
282
+ # **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**
283
+ ---
284
+ """,
285
+ },
286
+ {
287
+ "cell_type": "markdown",
288
+ "source": "## 1. Setup necessary libraries and load the dataset",
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "source": """
293
+ # Install and import necessary libraries.
294
+ !pip install pandas sentence-transformers faiss-cpu transformers torch
295
+ """,
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "source": """
300
+ import pandas as pd
301
+ from sentence_transformers import SentenceTransformer
302
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
303
+ import faiss
304
+ import torch
305
+ """,
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "source": """
310
+ # Load the dataset as a DataFrame
311
+ {first_code}
312
+ """,
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "source": """
317
+ # Specify the column name that contains the text data to generate embeddings
318
+ column_to_generate_embeddings = '{longest_col}'
319
+ """,
320
+ },
321
+ {
322
+ "cell_type": "markdown",
323
+ "source": "## 2. Loading embedding model and creating FAISS index",
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "source": """
328
+ # Remove duplicate entries based on the specified column
329
+ df = df.drop_duplicates(subset=column_to_generate_embeddings)
330
+ """,
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "source": """
335
+ # Convert the column data to a list of text entries
336
+ text_list = df[column_to_generate_embeddings].tolist()
337
+ """,
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "source": """
342
+ # Specify the embedding model you want to use
343
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
344
+ """,
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "source": """
349
+ vectors = model.encode(text_list)
350
+ vector_dimension = vectors.shape[1]
351
 
352
+ # Initialize the FAISS index with the appropriate dimension (384 for this model)
353
+ index = faiss.IndexFlatL2(vector_dimension)
354
 
355
+ # Encode the text list into embeddings and add them to the FAISS index
356
+ index.add(vectors)
357
+ """,
358
+ },
359
+ {
360
+ "cell_type": "markdown",
361
+ "source": "## 3. Perform a text search",
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "source": """
366
+ # Specify the text you want to search for in the list
367
+ text_to_search = text_list[0]
368
+ print(f"Text to search: {text_to_search}")
369
+ """,
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "source": """
374
+ # Generate the embedding for the search query
375
+ query_embedding = model.encode([text_to_search])
376
+ """,
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "source": """
381
+ # Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
382
+ D, I = index.search(query_embedding, k=10)
383
 
384
+ # Print the similar documents found
385
+ print(f"Similar documents: {[text_list[i] for i in I[0]]}")
386
+ """,
387
+ },
388
+ {"cell_type": "markdown", "source": "## 4. Load pipeline and perform inference"},
389
+ {
390
+ "cell_type": "code",
391
+ "source": """
392
+ # Adjust model name as needed
393
+ checkpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'
394
 
395
+ device = "cuda" if torch.cuda.is_available() else "cpu" # for GPU usage or "cpu" for CPU usage
396
 
397
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
398
+ model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
399
 
400
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
401
+ """,
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "source": """
406
+ # Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query
407
+ query = "How to prepare a cake?"
408
+ selected_elements = [text_list[i] for i in I[0].tolist()]
409
+ context = ','.join(selected_elements)
410
+ prompt = f"system: Answer user's question based on '{context}'. user: {query}"
411
+ """,
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "source": """
416
+ # Send the prompt to the pipeline and show the answer
417
+ output = generator(prompt)
418
+ print("Generated Summary:")
419
+ print(output[0]['generated_text'])
420
+ """,
421
+ },
422
+ ]
423
 
424
 
425
  def generate_rag_system_prompt():
426
+ """
 
 
 
 
 
427
 
428
  1. Install necessary libraries.
429
  2. Import libraries.