AlexanderKazakov committed
Commit: eeafaaa
Parent(s): d7fdb42

small improvement for chunking; openai embeddings

Files changed:
- README initial.md (+1 -1)
- gradio_app/app.py (+8 -6)
- gradio_app/backend/embedders.py (+43 -0)
- gradio_app/backend/query_llm.py (+7 -9)
- gradio_app/backend/semantic_search.py (+2 -2)
- gradio_app/templates/context_html_template.j2 (+5 -65)
- prep_scripts/lancedb_setup.py (+12 -26)
- prep_scripts/markdown_to_text.py (+17 -14)
- settings.py (+10 -6)
README initial.md
CHANGED
@@ -13,7 +13,7 @@ Deliberately stripped down to leave some room for experimenting
 TODOs:
 - Experiment with chunking, see how it affects the results. When deciding how to chunk it helps to think about what kind of chunks you'd like to see as context to your queries.
 - Deliverables: Demonstrate how retrieved documents differ with different chunking strategies and how it affects the output.
-- Try out different embedding models (EMB_MODEL_NAME). Good models to start with are **sentence-transformers/all-MiniLM-L6-v2** - lightweight, **thenlper/gte-large** - relatively heavy but more powerful.
+- Try out different embedding models (EMBED_NAME). Good models to start with are **sentence-transformers/all-MiniLM-L6-v2** - lightweight, **thenlper/gte-large** - relatively heavy but more powerful.
 - Deliverables: Demonstrate how retrieved documents differ with different embedding models and how they affect the output. Provide an estimate of how the time to embed the chunks and DB ingestion time differs (happening in **prep_scrips/lancedb_setup.py**).
 - Add a re-ranker (cross-encoder) to the pipeline. Start with sentence-transformers pages on cross-encoders [1](https://www.sbert.net/examples/applications/cross-encoder/README.html) [2](https://www.sbert.net/examples/applications/retrieve_rerank/README.html), then pick a [pretrained cross-encoder](https://www.sbert.net/docs/pretrained-models/ce-msmarco.html), e.g. **cross-encoder/ms-marco-MiniLM-L-12-v2**. Don't forget to increase the number of *retrieved* documents when using re-ranker. The number of documents used as context should stay the same.
 - Deliverables: Demonstrate how retrieved documents differ after adding a re-ranker and how it affects the output. Provide an estimate of how latency changes.
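A note on the re-ranker TODO above: the `CrossEncoder` class from sentence-transformers is enough to prototype it. A minimal sketch (the `rerank` helper and its parameters are illustrative, not part of this repo):

```python
from sentence_transformers import CrossEncoder

# Pretrained cross-encoder suggested in the README
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

def rerank(query, documents, top_k_context=5):
    # Score every (query, document) pair; higher score means more relevant
    scores = reranker.predict([(query, doc) for doc in documents])
    ranked = sorted(zip(scores, documents), key=lambda p: p[0], reverse=True)
    return [doc for _, doc in ranked[:top_k_context]]
```

As the README says, retrieval should then fetch more candidates than before, while `top_k_context` keeps the number of documents passed to the LLM unchanged.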
gradio_app/app.py
CHANGED
@@ -9,6 +9,7 @@ import logging
 from time import perf_counter
 
 import gradio as gr
+import markdown
 from jinja2 import Environment, FileSystemLoader
 
 from gradio_app.backend.ChatGptInteractor import num_tokens_from_messages
@@ -31,10 +32,10 @@ context_html_template = env.get_template('context_html_template.j2')
 # Examples
 examples = [
     'What is BERT?',
-    'Tell me about …',
+    'Tell me about GPT',
+    'How to use accelerate in google colab?',
     'What is the capital of China?',
     'Why is the sky blue?',
-    'Who won the mens world cup in 2014?',
 ]
 
 
@@ -58,7 +59,7 @@ def bot(history, api_kind):
     # Retrieve documents relevant to query
    document_start = perf_counter()
 
-    query_vec = embedder.encode(query)
+    query_vec = embedder.embed(query)[0]
     documents = table.search(query_vec, vector_column_name=VECTOR_COLUMN_NAME).limit(top_k_rank).to_list()
     thresh_dist = max(thresh_dist, min(d['_distance'] for d in documents))
     documents = [d for d in documents if d['_distance'] <= thresh_dist]
@@ -69,10 +70,11 @@ def bot(history, api_kind):
 
     while len(documents) != 0:
         context = context_template.render(documents=documents)
-        context_html = context_html_template.render(documents=documents)
+        documents_html = [markdown.markdown(d) for d in documents]
+        context_html = context_html_template.render(documents=documents_html)
         messages = construct_openai_messages(context, history)
-        num_tokens = num_tokens_from_messages(messages, OPENAI_LLM_NAME)
-        if num_tokens + 512 < context_lengths[OPENAI_LLM_NAME]:
+        num_tokens = num_tokens_from_messages(messages, LLM_NAME)
+        if num_tokens + 512 < context_lengths[LLM_NAME]:
             break
         documents.pop()
     else:
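The new `documents_html` step relies on `markdown.markdown()` from the `markdown` package, which converts a Markdown string into an HTML fragment, so the context viewer shows formatted chunks instead of raw Markdown. A quick illustration:

```python
import markdown

html = markdown.markdown('# BERT\n\nBERT is a *bidirectional* transformer.')
print(html)
# <h1>BERT</h1>
# <p>BERT is a <em>bidirectional</em> transformer.</p>
```

The surrounding `while` loop then drops retrieved documents from the end until the assembled messages fit the model's context length with 512 tokens left for the reply.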
gradio_app/backend/embedders.py
ADDED
@@ -0,0 +1,43 @@
+import torch
+import openai
+from sentence_transformers import SentenceTransformer
+from abc import ABC, abstractmethod
+
+
+class Embedder(ABC):
+    @abstractmethod
+    def embed(self, texts):
+        pass
+
+
+class HfEmbedder(Embedder):
+    def __init__(self, model_name):
+        self.model = SentenceTransformer(model_name)
+        self.model.eval()
+
+    @torch.no_grad()
+    def embed(self, texts):
+        encoded = self.model.encode(texts, normalize_embeddings=True)
+        return [list(vec) for vec in encoded]
+
+
+class OpenAIEmbedder(Embedder):
+    def __init__(self, model_name):
+        self.model_name = model_name
+
+    def embed(self, texts):
+        responses = openai.Embedding.create(input=texts, engine=self.model_name)
+        return [response['embedding'] for response in responses['data']]
+
+
+class EmbedderFactory:
+    @staticmethod
+    def get_embedder(type):
+        if type == "sentence-transformers/all-MiniLM-L6-v2":
+            return HfEmbedder(type)
+        elif type == "text-embedding-ada-002":
+            return OpenAIEmbedder(type)
+        else:
+            raise ValueError(f"Unsupported embedder type: {type}")
+
+
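Both embedders expose the same interface, a list of texts in and a list of float vectors out, which is what lets the setup script and the app swap backends via settings. Note that `openai.Embedding.create(..., engine=...)` is the pre-1.0 `openai` client API, so this file assumes `openai<1.0`. A usage sketch:

```python
from gradio_app.backend.embedders import EmbedderFactory

embedder = EmbedderFactory.get_embedder("sentence-transformers/all-MiniLM-L6-v2")
vectors = embedder.embed(["What is BERT?", "Why is the sky blue?"])
print(len(vectors), len(vectors[0]))  # 2 384
```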
gradio_app/backend/query_llm.py
CHANGED
@@ -2,19 +2,17 @@ import gradio as gr
 
 from typing import Any, Dict, Generator, List
 
-from huggingface_hub import InferenceClient
-from transformers import AutoTokenizer
+# from huggingface_hub import InferenceClient
+# from transformers import AutoTokenizer
 from jinja2 import Environment, FileSystemLoader
 
 from settings import *
 from gradio_app.backend.ChatGptInteractor import *
 
 
-tokenizer = AutoTokenizer.from_pretrained(HF_LLM_NAME)
-
-
-
-hf_client = InferenceClient(HF_LLM_NAME, token=HF_TOKEN)
+# tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
+# HF_TOKEN = None
+# hf_client = InferenceClient(LLM_NAME, token=HF_TOKEN)
 
 
 def format_prompt(message: str, api_kind: str):
@@ -125,7 +123,7 @@ def construct_openai_messages(context, history):
 
 
 def generate_openai(messages):
-    cgi = ChatGptInteractor(model_name=OPENAI_LLM_NAME)
+    cgi = ChatGptInteractor(model_name=LLM_NAME)
     for part in cgi.chat_completion(messages, max_tokens=512, temperature=0, stream=True):
         yield cgi.get_stream_text(part)
 
@@ -162,7 +160,7 @@ def _generate_openai(prompt: str, history: str, temperature: float = 0.9, max_ne
 
     try:
         stream = openai.ChatCompletion.create(
-            model=OPENAI_LLM_NAME,
+            model=LLM_NAME,
             messages=formatted_prompt,
             **generate_kwargs,
             stream=True
gradio_app/backend/semantic_search.py
CHANGED
@@ -1,14 +1,14 @@
 import logging
 import lancedb
-from sentence_transformers import SentenceTransformer
 
+from gradio_app.backend.embedders import EmbedderFactory
 from settings import *
 
 
 # Setting up the logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-embedder = SentenceTransformer(EMB_MODEL_NAME)
+embedder = EmbedderFactory.get_embedder(EMBED_NAME)
 
 db = lancedb.connect(LANCEDB_DIRECTORY)
 table = db.open_table(LANCEDB_TABLE_NAME)
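With the embedder and table both created from settings, a retrieval round-trip mirrors what `bot()` in app.py does. A sketch (the query string and `limit(5)` are illustrative):

```python
from gradio_app.backend.semantic_search import table, embedder
from settings import VECTOR_COLUMN_NAME, DOCUMENT_PATH_COLUMN_NAME

query_vec = embedder.embed(['How to use accelerate in google colab?'])[0]
documents = table.search(query_vec, vector_column_name=VECTOR_COLUMN_NAME).limit(5).to_list()
for d in documents:
    # each hit carries the stored columns plus a '_distance' score
    print(d['_distance'], d[DOCUMENT_PATH_COLUMN_NAME])
```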
gradio_app/templates/context_html_template.j2
CHANGED
@@ -11,85 +11,25 @@
         font-family: "Source Sans Pro";
     }
 
-    .instructions > * {
-        color: #111 !important;
-    }
-
-    details.doc-box * {
-        color: #111 !important;
-    }
-
-    .dark {
-        background: #111;
-        color: white;
-    }
-
     .doc-box {
         padding: 10px;
         margin-top: 10px;
-        background-color: #…;
+        background-color: #374151;
         border-radius: 6px;
         color: #111 !important;
-        max-width: 700px;
-        box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
-    }
-
-    .doc-full {
-        margin: 10px 14px;
-        line-height: 1.6rem;
-    }
-
-    .instructions {
-        color: #111 !important;
-        background: #b7bdfd;
-        display: block;
-        border-radius: 6px;
-        padding: 6px 10px;
-        line-height: 1.6rem;
-        max-width: 700px;
-        box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
-    }
-
-    .query {
-        color: #111 !important;
-        background: #ffbcbc;
-        display: block;
-        border-radius: 6px;
-        padding: 6px 10px;
-        line-height: 1.6rem;
-        max-width: 700px;
         box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
     }
 </style>
 </head>
 <body>
 
     <h2>Context:</h2>
     {% for doc in documents %}
-    <details class="doc-box">
-        <summary>
-            <span class="doc-short">…</span>
-        </summary>
-        <div class="doc-full">{{ doc }}</div>
-    </details>
+    <div class="doc-box">
+        {{ doc }}
+    </div>
     {% endfor %}
-    </div>
 
-    <script>
-        document.addEventListener("DOMContentLoaded", function() {
-            const detailsElements = document.querySelectorAll('.doc-box');
-
-            detailsElements.forEach(detail => {
-                detail.addEventListener('toggle', function() {
-                    const docShort = this.querySelector('.doc-short');
-                    if (this.open) {
-                        docShort.style.display = 'none';
-                    } else {
-                        docShort.style.display = 'inline';
-                    }
-                });
-            });
-        });
-    </script>
 </body>
 </html>
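The simplified template now expects pre-rendered HTML strings (the `documents_html` list built in app.py above), since `{{ doc }}` is emitted into a plain `<div>`. Rendering it standalone would look like this sketch (template directory assumed from app.py's loader setup; Jinja's default `autoescape=False` is what lets the HTML pass through unescaped):

```python
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader('gradio_app/templates'))
context_html_template = env.get_template('context_html_template.j2')
html = context_html_template.render(documents=['<h1>BERT</h1><p>A transformer model.</p>'])
```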
prep_scripts/lancedb_setup.py
CHANGED
@@ -1,36 +1,29 @@
 import shutil
-import traceback
 
 import lancedb
-import torch
+import openai
 import pyarrow as pa
 import pandas as pd
 from pathlib import Path
 import tqdm
 import numpy as np
 
-from sentence_transformers import SentenceTransformer
-
+from gradio_app.backend.embedders import EmbedderFactory
 from markdown_to_text import *
 from settings import *
 
 
+with open('data/openaikey.txt') as f:
+    OPENAI_KEY = f.read().strip()
+openai.api_key = OPENAI_KEY
+
+
 shutil.rmtree(LANCEDB_DIRECTORY, ignore_errors=True)
 db = lancedb.connect(LANCEDB_DIRECTORY)
 batch_size = 32
 
-model = SentenceTransformer(EMB_MODEL_NAME)
-model.eval()
-
-if torch.backends.mps.is_available():
-    device = "mps"
-elif torch.cuda.is_available():
-    device = "cuda"
-else:
-    device = "cpu"
-
 schema = pa.schema([
-    pa.field(VECTOR_COLUMN_NAME, pa.list_(pa.float32(), emb_sizes[EMB_MODEL_NAME])),
+    pa.field(VECTOR_COLUMN_NAME, pa.list_(pa.float32(), emb_sizes[EMBED_NAME])),
     pa.field(TEXT_COLUMN_NAME, pa.string()),
     pa.field(DOCUMENT_PATH_COLUMN_NAME, pa.string()),
 ])
@@ -49,17 +42,18 @@ for file in files:
         print(f'Skipped {file_ext} extension: {file}')
         continue
 
-    doc_header = ' / '.join(split_path(file_path)) + ':\n\n'
     with open(file, encoding='utf-8') as f:
         f = f.read()
         f = remove_comments(f)
         f = split_markdown(f)
-        chunks.extend((doc_header + chunk, os.path.abspath(file)) for chunk in f)
+        chunks.extend((chunk, os.path.abspath(file)) for chunk in f)
 
 from matplotlib import pyplot as plt
 plt.hist([len(c) for c, d in chunks], bins=100)
 plt.show()
 
+embedder = EmbedderFactory.get_embedder(EMBED_NAME)
+
 for i in tqdm.tqdm(range(0, int(np.ceil(len(chunks) / batch_size)))):
     texts, doc_paths = [], []
     for text, doc_path in chunks[i * batch_size:(i + 1) * batch_size]:
@@ -67,9 +61,7 @@ for i in tqdm.tqdm(range(0, int(np.ceil(len(chunks) / batch_size)))):
         texts.append(text)
         doc_paths.append(doc_path)
 
-    encoded = model.encode(texts, normalize_embeddings=True)
-    encoded = [list(vec) for vec in encoded]
-
+    encoded = embedder.embed(texts)
     df = pd.DataFrame({
         VECTOR_COLUMN_NAME: encoded,
         TEXT_COLUMN_NAME: texts,
@@ -79,10 +71,4 @@ for i in tqdm.tqdm(range(0, int(np.ceil(len(chunks) / batch_size)))):
     tbl.add(df)
 
 
-# '''
-# create ivf-pd index https://lancedb.github.io/lancedb/ann_indexes/
-# with the size of the transformer docs, index is not really needed
-# but we'll do it for demonstration purposes
-# '''
-# tbl.create_index(num_partitions=256, num_sub_vectors=96, vector_column_name=VECTOR_COLUMN_NAME)
 
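The README's deliverable asks for embedding and ingestion timings, and this script is where both happen. A minimal way to capture them, reusing the names from the loop above (the `embed_time`/`ingest_time` split is illustrative):

```python
from time import perf_counter

embed_time = ingest_time = 0.0
for i in tqdm.tqdm(range(0, int(np.ceil(len(chunks) / batch_size)))):
    batch = chunks[i * batch_size:(i + 1) * batch_size]
    texts = [text for text, _ in batch]
    doc_paths = [doc_path for _, doc_path in batch]

    t0 = perf_counter()
    encoded = embedder.embed(texts)  # time spent in the embedding model/API
    embed_time += perf_counter() - t0

    t0 = perf_counter()
    tbl.add(pd.DataFrame({           # time spent writing into LanceDB
        VECTOR_COLUMN_NAME: encoded,
        TEXT_COLUMN_NAME: texts,
        DOCUMENT_PATH_COLUMN_NAME: doc_paths,
    }))
    ingest_time += perf_counter() - t0

print(f'embedding: {embed_time:.1f}s, ingestion: {ingest_time:.1f}s')
```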
prep_scripts/markdown_to_text.py
CHANGED
@@ -21,33 +21,26 @@ def remove_comments(md):
     return re.sub(r'<!--((.|\n)*)-->', '', md)
 
 
-header_pattern = re.compile(r'\n\s*\n(#{1,3})\s…')
+header_pattern = re.compile(r'\n\s*\n(#{1,3})\s(.*)\n\s*\n')
 
 
 def split_content(content):
+    text_chunk_size = context_lengths[EMBED_NAME] - 32
     _parts = content.split('\n\n')
     parts = []
     for p in _parts:
-        if len(p) < TEXT_CHUNK_SIZE:
+        if len(p) < text_chunk_size:
             parts.append(p)
         else:
             parts.extend(p.split('\n'))
 
     res = ['']
     for p in parts:
-        if len(res[-1]) + len(p) < TEXT_CHUNK_SIZE:
+        if len(res[-1]) + len(p) < text_chunk_size:
             res[-1] += p + '\n\n'
         else:
             res.append(p + '\n\n')
 
-    if (
-        len(res) >= 2 and
-        len(res[-1]) < TEXT_CHUNK_SIZE / 4 and
-        len(res[-2]) < TEXT_CHUNK_SIZE
-    ):
-        res[-2] += res[-1]
-        res.pop()
-
     return res
 
 
@@ -65,20 +58,30 @@ def split_markdown(md):
         chunk = ''
         for i in sorted(name_hierarchy):
             if len(name_hierarchy[i]) != 0:
-                …
+                j = i + 1
+                while j in name_hierarchy:
+                    if name_hierarchy[j].find(name_hierarchy[i]) != -1:
+                        break
+                    j += 1
+                else:
+                    chunk += f'{"#" * (i + 1)}{name_hierarchy[i]}\n\n'
 
         chunk += content
         chunk = chunk.strip()
         res.append(chunk)
 
-    …
+    # to find a header at the top of a file
+    md = f'\n\n{md}'
     headers = list(header_pattern.finditer(md))
+    # only first header can be first-level
+    headers = [h for i, h in enumerate(headers) if i == 0 or len(h.group(1)) > 1]
+
     name_hierarchy = {i: '' for i in (1, 2, 3)}
     res = []
     for i in range(len(headers)):
         header = headers[i]
         level = len(header.group(1))
-        name = header.group().strip()
+        name = header.group(2).strip()
         name_hierarchy[level] = name
         if i == 0 and header.start() != 0:
             construct_chunks(md[:header.start()])
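The prepended `\n\n` works because `header_pattern` requires a blank line on both sides of a header, so without it a header on the very first line of a file would never match. A quick check (example text is illustrative):

```python
import re

header_pattern = re.compile(r'\n\s*\n(#{1,3})\s(.*)\n\s*\n')

md = '# Quicktour\n\nRun your training script on any kind of device.'
md = f'\n\n{md}'  # to find a header at the top of a file
match = next(header_pattern.finditer(md))
print(len(match.group(1)), match.group(2))  # 1 Quicktour
```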
settings.py
CHANGED
@@ -1,22 +1,26 @@
 MARKDOWN_SOURCE_DIR = "data/transformers/docs/source/en/"
-EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
 LANCEDB_DIRECTORY = "data/lancedb"
 LANCEDB_TABLE_NAME = "table"
 VECTOR_COLUMN_NAME = "embedding"
 TEXT_COLUMN_NAME = "text"
 DOCUMENT_PATH_COLUMN_NAME = "document_path"
-HF_LLM_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
-OPENAI_LLM_NAME = "gpt-3.5-turbo"
 
-TEXT_CHUNK_SIZE = …
-
+# LLM_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
+LLM_NAME = "gpt-3.5-turbo"
+# EMBED_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+EMBED_NAME = "text-embedding-ada-002"
+
 
 emb_sizes = {
     "sentence-transformers/all-MiniLM-L6-v2": 384,
-    "thenlper/gte-large": …,
+    "thenlper/gte-large": 1024,
+    "text-embedding-ada-002": 1536,
 }
 
 context_lengths = {
     "mistralai/Mistral-7B-Instruct-v0.1": 4096,
     "gpt-3.5-turbo": 4096,
+    "sentence-transformers/all-MiniLM-L6-v2": 128,
+    "thenlper/gte-large": 512,
+    "text-embedding-ada-002": 8191,
 }
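The `emb_sizes` values are baked into the LanceDB schema as fixed vector lengths, so they must match what each model actually returns; the new `context_lengths` entries for embedding models feed `split_content`'s chunk-size budget (`context_lengths[EMBED_NAME] - 32`). A sanity-check sketch over the two names the factory currently dispatches:

```python
from gradio_app.backend.embedders import EmbedderFactory
from settings import emb_sizes

for name in ("sentence-transformers/all-MiniLM-L6-v2", "text-embedding-ada-002"):
    vec = EmbedderFactory.get_embedder(name).embed(["probe"])[0]
    assert len(vec) == emb_sizes[name], f'{name}: expected {emb_sizes[name]}, got {len(vec)}'
```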