Spaces:

asoria
/

auto-notebook-creator

Running

App Files Community

asoria HF staff committed on Sep 3, 2024

Commit

ba1088f

•

1 Parent(s): aa1bdb0

Adding template for simple RAG

Browse files

Files changed (2) hide show

app.py +0 -2
utils/notebook_utils.py +142 -39

app.py CHANGED Viewed

@@ -17,7 +17,6 @@ import os
 # TODOS:
 # Validate dataset type for type before generating the notebook
 # Add template for training
-# Add template for RAG and embeddings
 load_dotenv()
@@ -169,7 +168,6 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     )
     generated_text = ""
     # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
-    viewer_lines = 0
     for cell in cells:
         generated_text += cell["source"] + "\n"
         yield generated_text, ""

 # TODOS:
 # Validate dataset type for type before generating the notebook
 # Add template for training
 load_dotenv()
     )
     generated_text = ""
     # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
     for cell in cells:
         generated_text += cell["source"] + "\n"
         yield generated_text, ""

utils/notebook_utils.py CHANGED Viewed

@@ -20,14 +20,6 @@ def replace_wildcards(
     return new_templates
-rag_cells = [
-    {
-        "cell_type": "markdown",
-        "source": "# Retrieval-Augmented Generation (RAG) System Notebook",
-    },
-    {"cell_type": "code", "source": ""},
-]
 embeggins_cells = [
     {
         "cell_type": "markdown",
@@ -92,7 +84,7 @@ text_list = df[column_to_generate_embeddings].tolist()
         "cell_type": "code",
         "source": """
 # Specify the embedding model you want to use
-model = SentenceTransformer('distiluse-base-multilingual-cased')
 """,
     },
     {
@@ -282,45 +274,156 @@ for column in df.select_dtypes(include=['int64', 'float64']).columns:
 ]
-def generate_embedding_system_prompt():
-    """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
-    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
-    The notebook should include:
-    1. Install necessary libraries with !pip install.
-    2. Import libraries.
-    3. Load the dataset as a DataFrame using the provided code.
-    4. Select the column to generate embeddings.
-    5. Remove duplicate data.
-    6. Convert the selected column to a list.
-    7. Load the sentence-transformers model.
-    8. Create a FAISS index.
-    9. Encode a query sample.
-    10. Search for similar documents using the FAISS index.
-    Ensure the notebook is well-organized with explanations for each step.
-    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
-    The user will provide dataset information in the following format:
-    ## Columns and Data Types
-    ## Sample Data
-    ## Loading Data code
-    Use the provided code to load the dataset; do not use any other method.
-    """
 def generate_rag_system_prompt():
-    """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
-    The dataset is provided as a pandas DataFrame.
-    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
-    The RAG notebook should include:
     1. Install necessary libraries.
     2. Import libraries.

     return new_templates
 embeggins_cells = [
     {
         "cell_type": "markdown",
         "cell_type": "code",
         "source": """
 # Specify the embedding model you want to use
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 """,
     },
     {
 ]
+rag_cells = [
+    {
+        "cell_type": "markdown",
+        "source": """
+---
+# **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**
+---
+""",
+    },
+    {
+        "cell_type": "markdown",
+        "source": "## 1. Setup necessary libraries and load the dataset",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Install and import necessary libraries.
+!pip install pandas sentence-transformers faiss-cpu transformers torch
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import faiss
+import torch
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Load the dataset as a DataFrame
+{first_code}
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Specify the column name that contains the text data to generate embeddings
+column_to_generate_embeddings = '{longest_col}'
+""",
+    },
+    {
+        "cell_type": "markdown",
+        "source": "## 2. Loading embedding model and creating FAISS index",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Remove duplicate entries based on the specified column
+df = df.drop_duplicates(subset=column_to_generate_embeddings)
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Convert the column data to a list of text entries
+text_list = df[column_to_generate_embeddings].tolist()
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Specify the embedding model you want to use
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+vectors = model.encode(text_list)
+vector_dimension = vectors.shape[1]
+# Initialize the FAISS index with the appropriate dimension (384 for this model)
+index = faiss.IndexFlatL2(vector_dimension)
+# Encode the text list into embeddings and add them to the FAISS index
+index.add(vectors)
+""",
+    },
+    {
+        "cell_type": "markdown",
+        "source": "## 3. Perform a text search",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Specify the text you want to search for in the list
+text_to_search = text_list[0]
+print(f"Text to search: {text_to_search}")
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Generate the embedding for the search query
+query_embedding = model.encode([text_to_search])
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
+D, I = index.search(query_embedding, k=10)
+# Print the similar documents found
+print(f"Similar documents: {[text_list[i] for i in I[0]]}")
+""",
+    },
+    {"cell_type": "markdown", "source": "## 4. Load pipeline and perform inference"},
+    {
+        "cell_type": "code",
+        "source": """
+# Adjust model name as needed
+checkpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'
+device = "cuda" if torch.cuda.is_available() else "cpu" # for GPU usage or "cpu" for CPU usage
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
+generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query
+query = "How to prepare a cake?"
+selected_elements = [text_list[i] for i in I[0].tolist()]
+context = ','.join(selected_elements)
+prompt = f"system: Answer user's question based on '{context}'. user: {query}"
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Send the prompt to the pipeline and show the answer
+output = generator(prompt)
+print("Generated Summary:")
+print(output[0]['generated_text'])
+""",
+    },
+]
 def generate_rag_system_prompt():
+    """
     1. Install necessary libraries.
     2. Import libraries.