JPBianchi committed on
Commit 30ffb9e
1 Parent(s): a0cb228

temp before HF pull

.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "name": "Python 3",
3
+ "image": "mcr.microsoft.com/devcontainers/python:3.10",
4
+
5
+ // Features to add to the dev container. More info: https://containers.dev/features.
6
+ //"features": {}
7
+ // Configure tool-specific properties.
8
+ "customizations": {
9
+ // Configure properties specific to VS Code.
10
+ "vscode": {
11
+ "settings": {"terminal.integrated.shell.linux": "/bin/bash"},
12
+ "extensions": [
13
+ "ms-toolsai.jupyter"
14
+ ]
15
+ }
16
+ },
17
+ "forwardPorts": [8501, 8888],
18
+ "portsAttributes": {
19
+ "8501": {
20
+ "label": "Streamlit App",
21
+ "onAutoForward": "openBrowser"
22
+ },
23
+ "8888": {
24
+ "label": "Jupyter Notebook",
25
+ "onAutoForward": "openBrowser"
26
+ }
27
+ },
28
+ "postCreateCommand": "pip install -r requirements.txt"
29
+ }
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/impact_theory_data.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,179 @@
1
+ # Large Files
2
+ # models/
3
+ eval_results/
4
+ # models/all-mpnet*
5
+ # models/finetuned-all-MiniLM*
6
+ # models/finetuned-WhereIsAI-UAE*
7
+ models/*
8
+ # !models/finetuned-all-mpnet-base-v2-300
9
+
10
+ data/*.parquet
11
+ .DS_Store
12
+ secrets.toml
13
+ TODO.md
14
+
15
+ assets/*
16
+ !assets/it_tom_bilyeu.png
17
+
18
+ # Byte-compiled / optimized / DLL files
19
+ __pycache__/
20
+ *.py[cod]
21
+ *$py.class
22
+ *copy*
23
+ # C extensions
24
+ *.so
25
+
26
+
27
+
28
+ # Distribution / packaging
29
+ .Python
30
+ build/
31
+ develop-eggs/
32
+ dist/
33
+ downloads/
34
+ eggs/
35
+ .eggs/
36
+ lib/
37
+ lib64/
38
+ parts/
39
+ sdist/
40
+ var/
41
+ wheels/
42
+ share/python-wheels/
43
+ *.egg-info/
44
+ .installed.cfg
45
+ *.egg
46
+ MANIFEST
47
+
48
+ # PyInstaller
49
+ # Usually these files are written by a python script from a template
50
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
51
+ *.manifest
52
+ *.spec
53
+
54
+ # Installer logs
55
+ pip-log.txt
56
+ pip-delete-this-directory.txt
57
+
58
+ # Unit test / coverage reports
59
+ htmlcov/
60
+ .tox/
61
+ .nox/
62
+ .coverage
63
+ .coverage.*
64
+ .cache
65
+ nosetests.xml
66
+ coverage.xml
67
+ *.cover
68
+ *.py,cover
69
+ .hypothesis/
70
+ .pytest_cache/
71
+ cover/
72
+
73
+ # Translations
74
+ *.mo
75
+ *.pot
76
+
77
+ # Django stuff:
78
+ *.log
79
+ local_settings.py
80
+ db.sqlite3
81
+ db.sqlite3-journal
82
+
83
+ # Flask stuff:
84
+ instance/
85
+ .webassets-cache
86
+
87
+ # Scrapy stuff:
88
+ .scrapy
89
+
90
+ # Sphinx documentation
91
+ docs/_build/
92
+
93
+ # PyBuilder
94
+ .pybuilder/
95
+ target/
96
+
97
+ # Jupyter Notebook
98
+ .ipynb_checkpoints
99
+
100
+ # IPython
101
+ profile_default/
102
+ ipython_config.py
103
+
104
+ # pyenv
105
+ # For a library or package, you might want to ignore these files since the code is
106
+ # intended to run in multiple environments; otherwise, check them in:
107
+ # .python-version
108
+
109
+ # pipenv
110
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
111
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
112
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
113
+ # install all needed dependencies.
114
+ #Pipfile.lock
115
+
116
+ # poetry
117
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
118
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
119
+ # commonly ignored for libraries.
120
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
121
+ #poetry.lock
122
+
123
+ # pdm
124
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
125
+ #pdm.lock
126
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
127
+ # in version control.
128
+ # https://pdm.fming.dev/#use-with-ide
129
+ .pdm.toml
130
+
131
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
132
+ __pypackages__/
133
+
134
+ # Celery stuff
135
+ celerybeat-schedule
136
+ celerybeat.pid
137
+
138
+ # SageMath parsed files
139
+ *.sage.py
140
+
141
+ # Environments
142
+ .env
143
+ .venv
144
+ env/
145
+ venv/
146
+ ENV/
147
+ env.bak/
148
+ venv.bak/
149
+
150
+ # Spyder project settings
151
+ .spyderproject
152
+ .spyproject
153
+
154
+ # Rope project settings
155
+ .ropeproject
156
+
157
+ # mkdocs documentation
158
+ /site
159
+
160
+ # mypy
161
+ .mypy_cache/
162
+ .dmypy.json
163
+ dmypy.json
164
+
165
+ # Pyre type checker
166
+ .pyre/
167
+
168
+ # pytype static type analyzer
169
+ .pytype/
170
+
171
+ # Cython debug symbols
172
+ cython_debug/
173
+
174
+ # PyCharm
175
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
176
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
177
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
178
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
179
+ #.idea/
.streamlit/config.toml ADDED
@@ -0,0 +1,11 @@
1
+ [theme]
2
+ base="dark"
3
+ primaryColor="purple" # border of textboxes !!??
4
+ #primaryColor="#2d59b3"
5
+
6
+ backgroundColor="#000000"
7
+ secondaryBackgroundColor= "#0e404d" # should be identical to blue in banner # "#2d59b3" light blue
8
+ textColor="#FFFFFF"
9
+ font="sans serif"
10
+
11
+
.vscode/launch.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "version": "0.2.0",
3
+ "configurations": [
4
+ {
5
+ "name": "Python: Current File",
6
+ "type": "python",
7
+ "request": "launch",
8
+ "program": "${file}",
9
+ "console": "integratedTerminal",
10
+ "justMyCode": false
11
+ }
12
+ ]
13
+ }
README.md CHANGED
@@ -10,3 +10,31 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+ ---
14
+
15
+ See the app @ [jpb-vectorsearch.streamlit.app](https://jpb-vectorsearch.streamlit.app/)
16
+
17
+ Note that the online app sometimes crashes, especially when computing the metrics.
18
+
19
+ <p align="left">
20
+ <img src="assets/screenshot_frontpage_with_finetune.png" width=800/>
21
+ </p>
22
+
23
+ <!-- <p align="center">
24
+ <img src="assets/screenshot_frontpage_online.png"/>
25
+ </p> -->
26
+
27
+ ## Activity on Modal backend during finetuning
28
+
29
+ <p align="left">
30
+ <img src="assets/modal_finetuning1.png" width=800/>
31
+ </p>
32
+
33
+ <p align="left">
34
+ <img src="assets/modal_finetuning2.png" width=800/>
35
+ </p>
36
+
37
+ <p align="left">
38
+ <img src="assets/modal_finetuning_activity.png" width=800/>
39
+ </p>
40
+
app.py ADDED
@@ -0,0 +1,755 @@
1
+ #%%
2
+ from tiktoken import get_encoding, encoding_for_model
3
+ from weaviate_interface import WeaviateClient, WhereFilter
4
+ from sentence_transformers import SentenceTransformer
5
+ from prompt_templates import question_answering_prompt_series, question_answering_system
6
+ from openai_interface import GPT_Turbo
7
+ from app_features import (convert_seconds, generate_prompt_series, search_result,
8
+ validate_token_threshold, load_content_cache, load_data,
9
+ expand_content)
10
+ from retrieval_evaluation import execute_evaluation, calc_hit_rate_scores
11
+ from llama_index.finetuning import EmbeddingQAFinetuneDataset
12
+ from weaviate_interface import WeaviateClient
13
+ from openai import BadRequestError
14
+ from reranker import ReRanker
15
+ from loguru import logger
16
+ import streamlit as st
17
+ from streamlit_option_menu import option_menu
18
+ import hydralit_components as hc
19
+ import sys
20
+ import json
21
+ import os, time, requests, re
22
+ from datetime import timedelta
23
+ import pathlib
24
+ import gdown
25
+ import tempfile
26
+ import base64
27
+ import shutil
28
+
29
+ def get_base64_of_bin_file(bin_file):
30
+ with open(bin_file, 'rb') as file:
31
+ data = file.read()
32
+ return base64.b64encode(data).decode()
33
+
34
+ from dotenv import load_dotenv, find_dotenv
35
+ load_dotenv(find_dotenv('env'), override=True)
36
+
37
+ # I use a key that I increment each time I want to change a text_input
38
+ if 'key' not in st.session_state:
39
+ st.session_state.key = 0
40
+ # key = st.session_state['key']
41
+
42
+ if not pathlib.Path('models').exists():
43
+ os.mkdir('models')
44
+
45
+ # I should cache these things but no time left
46
+
47
+ # I put a file local.txt in my desktop models folder to find out if it's running online
48
+ we_are_online = not pathlib.Path("models/local.txt").exists()
49
+ we_are_not_online = not we_are_online
50
+
51
+ golden_dataset = EmbeddingQAFinetuneDataset.from_json("data/golden_100.json")
52
+
53
+ # shutil.rmtree("models/models") # remove it - I wanted to clear the space on streamlit online
54
+
55
+ ## PAGE CONFIGURATION
56
+ st.set_page_config(page_title="Ask Impact Theory",
57
+ page_icon="assets/impact-theory-logo-only.png",
58
+ layout="wide",
59
+ initial_sidebar_state="collapsed",
60
+ menu_items={'Report a bug': "https://www.extremelycoolapp.com/bug"})
61
+
62
+
63
+ image = "https://is2-ssl.mzstatic.com/image/thumb/Music122/v4/bd/34/82/bd348260-314c-5898-26c0-bef2e0388ebe/source/1200x1200bb.png"
64
+
65
+
66
+ def add_bg_from_local(image_file):
67
+ bin_str = get_base64_of_bin_file(image_file)
68
+ page_bg_img = f'''
69
+ <style>
70
+ .stApp {{
71
+ background-image: url("data:image/png;base64,{bin_str}");
72
+ background-size: 100% auto;
73
+ background-repeat: no-repeat;
74
+ background-attachment: fixed;
75
+ }}
76
+ </style>
77
+ '''
78
+
79
+ st.markdown(page_bg_img, unsafe_allow_html=True)
80
+
81
+ # COMMENT: I tried to create a dropdown menu but it's harder than it looks, so I gave up
82
+ # https://discuss.streamlit.io/t/streamlit-option-menu-is-a-simple-streamlit-component-that-allows-users-to-select-a-single-item-from-a-list-of-options-in-a-menu/20514
83
+ # not great, but it works
84
+ # selected = option_menu("About", ["Improvements","This"], #"Main Menu", ["Home", 'Settings'],
85
+ # icons=['house', 'gear'],
86
+ # menu_icon="cast",
87
+ # default_index=1)
88
+
89
+ # # Custom HTML/CSS for the banner
90
+ # base64_img = get_base64_of_bin_file("assets/it_tom_bilyeu.png")
91
+ # banner_menu_html = f"""
92
+ # <div class="banner">
93
+ # <img src= "data:image/png;base64,{base64_img}" alt="Banner Image">
94
+ # </div>
95
+ # <style>
96
+ # .banner {{
97
+ # width: 100%;
98
+ # height: auto;
99
+ # overflow: hidden;
100
+ # display: flex;
101
+ # justify-content: center;
102
+ # }}
103
+ # .banner img {{
104
+ # width: 130%;
105
+ # height: auto;
106
+ # object-fit: contain;
107
+ # }}
108
+ # </style>
109
+ # """
110
+ # st.components.v1.html(banner_menu_html)
111
+
112
+
113
+ # specify the primary menu definition
114
+ # it gives a vertical menu inside a navigation bar !!!
115
+ # menu_data = [
116
+ # {'icon': "far fa-copy", 'label':"Left End"},
117
+ # {'id':'Copy','icon':"🐙",'label':"Copy"},
118
+ # {'icon': "far fa-chart-bar", 'label':"Chart"},#no tooltip message
119
+ # {'icon': "far fa-address-book", 'label':"Book"},
120
+ # {'id':' Crazy return value 💀','icon': "💀", 'label':"Calendar"},
121
+ # {'icon': "far fa-clone", 'label':"Component"},
122
+ # {'icon': "fas fa-tachometer-alt", 'label':"Dashboard",'ttip':"I'm the Dashboard tooltip!"}, #can add a tooltip message
123
+ # {'icon': "far fa-copy", 'label':"Right End"},
124
+ # ]
125
+ # # we can override any part of the primary colors of the menu
126
+ # over_theme = {'txc_inactive': '#FFFFFF','menu_background':'red','txc_active':'yellow','option_active':'blue'}
127
+ # # over_theme = {'txc_inactive': '#FFFFFF'}
128
+ # menu_id = hc.nav_bar(menu_definition=menu_data,
129
+ # home_name='Home',
130
+ # override_theme=over_theme)
131
+ #get the id of the menu item clicked
132
+ # st.info(f"{menu_id=}")
133
+ ## RERANKER
134
+ reranker = ReRanker('cross-encoder/ms-marco-MiniLM-L-6-v2')
135
+ ## ENCODING --> tiktoken library
136
+ model_ids = ['gpt-3.5-turbo-16k', 'gpt-3.5-turbo-0613']
137
+ model_nameGPT = model_ids[1]
138
+ encoding = encoding_for_model(model_nameGPT)
139
+ # = get_encoding('gpt-3.5-turbo-0613')
140
+ ##############
141
+ data_path = './data/impact_theory_data.json'
142
+ cache_path = 'data/impact_theory_cache.parquet'
143
+ data = load_data(data_path)
144
+ cache = None # load_content_cache(cache_path)
145
+
146
+ try:
147
+ # st.write("Loading secrets from secrets.toml")
148
+ Wapi_key = st.secrets['secrets']['WEAVIATE_API_KEY']
149
+ url = st.secrets['secrets']['WEAVIATE_ENDPOINT']
150
+ openai_api_key = st.secrets['secrets']['OPENAI_API_KEY']
151
+
152
+ hf_token = st.secrets['secrets']['LLAMA2_ENDPOINT_HF_TOKEN_chris']
153
+ hf_endpoint = st.secrets['secrets']['LLAMA2_ENDPOINT_UPLIMIT']
154
+ # st.write("Secrets loaded from secrets.toml")
155
+ # st.write("HF_TOKEN", hf_token)
156
+ except Exception:
157
+ st.write("Loading secrets from environment variables")
158
+ Wapi_key = os.environ['WEAVIATE_API_KEY']  # must match the name used when creating the Weaviate client below
159
+ url = os.environ['WEAVIATE_ENDPOINT']
160
+ openai_api_key = os.environ['OPENAI_API_KEY']
161
+
162
+ hf_token = os.environ['LLAMA2_ENDPOINT_HF_TOKEN_chris']
163
+ hf_endpoint = os.environ['LLAMA2_ENDPOINT_UPLIMIT']
164
+ #%%
165
+ # model_default = 'sentence-transformers/all-mpnet-base-v2'
166
+ model_default = 'models/finetuned-all-mpnet-base-v2-300' if we_are_not_online \
167
+ else 'sentence-transformers/all-mpnet-base-v2'
168
+
169
+ available_models = ['sentence-transformers/all-mpnet-base-v2',
170
+ 'sentence-transformers/all-MiniLM-L6-v2',
171
+ 'models/finetuned-all-mpnet-base-v2-300']
172
+
173
+ #%%
174
+ models_urls = {'models/finetuned-all-mpnet-base-v2-300': "https://drive.google.com/drive/folders/1asJ37-AUv5nytLtH6hp6_bVV3_cZOXfj"}
175
+
176
+ def download_model_from_Gdrive(model_name_or_path, model_full_path):
177
+ print("Downloading model from Google Drive")
178
+ st.write("Downloading model from Google Drive")
179
+ assert model_name_or_path in models_urls, f"Model {model_name_or_path} not found in models_urls"
180
+ url = models_urls[model_name_or_path]
181
+ gdown.download_folder(url, output=model_full_path, quiet=False, use_cookies=False)
182
+ print("Model downloaded and saved to models folder")
183
+ # st.write("Model downloaded")
184
+
185
+ def download_model(model_name_or_path, model_full_path):
186
+
187
+ if model_name_or_path.startswith("models/"):
188
+ download_model_from_Gdrive(model_name_or_path, model_full_path)
189
+ print(f"Model {model_full_path} downloaded")
190
+ models_urls[model_name_or_path] = model_full_path
191
+ # st.sidebar.write(f"Model {model_full_path} downloaded")
192
+
193
+ elif model_name_or_path.startswith("sentence-transformers/"):
194
+ st.sidebar.write(f"Downloading Sentence Transformer model {model_name_or_path}")
195
+ model = SentenceTransformer(model_name_or_path) # HF looks into its own models folder/path
196
+ models_urls[model_name_or_path] = model_full_path
197
+ # st.sidebar.write(f"Model {model_name_or_path} downloaded")
198
+ model.save(model_full_path)
199
+ # st.sidebar.write(f"Model {model_name_or_path} saved to {model_full_path}")
200
+
201
+ # if 'modelspath' not in st.session_state:
202
+ # st.session_state['modelspath'] = None
203
+ # if st.session_state.modelspath is None:
204
+ # # let's create a temp folder on the first run
205
+ # persistent_dir = pathlib.Path("path/to/persistent_dir")
206
+ # persistent_dir.mkdir(parents=True, exist_ok=True)
207
+ # with tempfile.TemporaryDirectory() as temp_dir:
208
+ # st.session_state.modelspath = temp_dir
209
+ # print(f"Temporary directory created at {temp_dir}")
210
+ # # the temp folder disappears with the context, but not the one we've created manually
211
+ # else:
212
+ # temp_dir = st.session_state.modelspath
213
+ # print(f"Temporary directory already exists at {temp_dir}")
214
+ # # st.write(os.listdir(temp_dir))
215
+
216
+ #%%
217
+ # for streamlit online, we must download the model from google drive
218
+ # because github LFS doesn't work on forked repos
219
+ def check_model(model_name_or_path):
220
+
221
+ model_path = pathlib.Path(model_name_or_path)
222
+ model_full_path = str(pathlib.Path("models") / model_path) # this creates a models folder inside /models
223
+ model_full_path = model_full_path.replace("sentence-transformers/", "models/") # all are saved in models folder
224
+
225
+ if pathlib.Path(model_full_path).exists():
226
+ # let's use the model that's already there
227
+ print(f"Model {model_full_path} already exists")
228
+
229
+
230
+ # but delete everything else in we are online because
231
+ # streamlit online has limited space (and will shut down the app if it's full)
232
+ if we_are_online:
233
+ # st.sidebar.write(f"Model {model_full_path} already exists")
234
+ # st.sidebar.write(f"Deleting other models")
235
+ dirs = os.listdir("models/models")
236
+ # we get only the folder name, not the full path
237
+ dirs.remove(model_full_path.split('/')[-1])
238
+ for p in dirs:
239
+ dirpath = pathlib.Path("models/models") / p
240
+ if dirpath.is_dir():
241
+ shutil.rmtree(dirpath)
242
+ else:
243
+
244
+ if we_are_online:
245
+ # space issues on streamlit online, let's not leave anything behind
246
+ # and re-download the model every time
247
+ print("Deleting models/models folder")
248
+ if pathlib.Path('models/models').exists():
249
+ shutil.rmtree("models/models") # make room, if other models are there
250
+ # st.sidebar.write(f"models/models folder deleted")
251
+
252
+ download_model(model_name_or_path, model_full_path)
253
+
254
+ return model_full_path
255
+
256
+ #%% instantiate Weaviate client
257
+ def get_weaviate_client(api_key, url, model_name_or_path, openai_api_key):
258
+ client = WeaviateClient(api_key, url,
259
+ model_name_or_path=model_name_or_path,
260
+ openai_api_key=openai_api_key)
261
+ client.display_properties.append('summary')
262
+ available_classes = sorted(client.show_classes())
263
+ # st.write(f"Available classes: {available_classes}")
264
+ # st.write(f"Available classes type: {type(available_classes)}")
265
+ logger.info(available_classes)
266
+ return client, available_classes
267
+
268
+
269
+ ##############
270
+ # data = load_data(data_path)
271
+ # guests list for sidebar
272
+ guest_list = sorted(list(set([d['guest'] for d in data])))
273
+
274
+ def main():
275
+
276
+ with st.sidebar:
277
+ # moved it to main area
278
+ # guest = st.selectbox('Select Guest',
279
+ # options=guest_list,
280
+ # index=None,
281
+ # placeholder='Select Guest')
282
+ _, center, _ = st.columns([3, 5, 3])
283
+ with center:
284
+ st.text("Search Lab")
285
+
286
+ _, center, _ = st.columns([2, 5, 3])
287
+ with center:
288
+ if we_are_online:
289
+ st.text("Running ONLINE")
290
+ st.text("(UNSTABLE)")
291
+ else:
292
+ st.text("Running OFFLINE")
293
+ st.write("----------")
294
+
295
+ alpha_input = st.slider(label='Alpha',min_value=0.00, max_value=1.00, value=0.40, step=0.05)
296
+ retrieval_limit = st.slider(label='Hybrid Search Results', min_value=10, max_value=300, value=10, step=10)
297
+
298
+ hybrid_filter = st.toggle('Filter Guest', True) # i.e. look only at guests' data
299
+
300
+ rerank = st.toggle('Use Reranker', True)
301
+ if rerank:
302
+ reranker_topk = st.slider(label='Reranker Top K',min_value=1, max_value=5, value=3, step=1)
303
+ else:
304
+ # needed to not fill the LLM with too many responses (> context size)
305
+ # we could make it dependent on the model
306
+ reranker_topk = 3
307
+
308
+ rag_it = st.toggle('RAG it', True)
309
+ if rag_it:
310
+ st.sidebar.write(f"Using LLM '{model_nameGPT}'")
311
+ llm_temperature = st.slider(label='LLM T˚', min_value=0.0, max_value=2.0, value=0.01, step=0.10 )
312
+
313
+ model_name_or_path = st.selectbox(label='Model Name:', options=available_models,
314
+ index=available_models.index(model_default),
315
+ placeholder='Select Model')
316
+
317
+ st.write("Experimental and time-limited (2 min)")
318
+ finetune_model = st.toggle('Finetune on Modal A100 GPU', False)
319
+ if finetune_model:
320
+ from finetune_backend import finetune
321
+ if 'finetuned' in model_name_or_path:
322
+ st.write("Model already finetuned")
323
+ elif model_name_or_path.startswith("models/"):
324
+ st.write("Sentence Transformers models only!")
325
+ else:
326
+ try:
327
+ if 'finetuned' in model_name_or_path:
328
+ st.write("Model already finetuned")
329
+ else:
330
+ model_path = finetune(model_name_or_path, savemodel=True, outpath='models')
331
+ if model_path is not None:
332
+ if model_name_or_path.split('/')[-1] not in model_path:
333
+ st.write(model_path) # a warning from finetuning in this case
334
+ elif model_path not in available_models:
335
+ # finetuning generated a model, let's add it
336
+ available_models.append(model_path)
337
+ st.write("Model saved!")
338
+ except Exception:
339
+ st.write("Model not found on HF or error")
340
+
341
+ model_name_or_path = check_model(model_name_or_path)
342
+ client, available_classes = get_weaviate_client(Wapi_key, url, model_name_or_path, openai_api_key)
343
+
344
+ start_class = 'Impact_theory_all_mpnet_base_v2_finetuned'
345
+
346
+ class_name = st.selectbox(
347
+ label='Class Name:',
348
+ options=available_classes,
349
+ index=available_classes.index(start_class),
350
+ placeholder='Select Class Name'
351
+ )
352
+
353
+ st.write("----------")
354
+
355
+ c1,c2 = st.columns([8,1])
356
+ with c1:
357
+ show_metrics = st.toggle('Show Metrics on Golden set', False)
358
+ if show_metrics:
359
+ # _, center, _ = st.columns([3, 5, 3])
360
+ # with center:
361
+ # st.text("Metrics")
362
+ with c2:
363
+ with st.spinner(''):
364
+ metrics = execute_evaluation(golden_dataset, class_name, client, alpha=alpha_input)
365
+ if show_metrics:
366
+ kw_hit_rate = metrics['kw_hit_rate']
367
+ kw_mrr = metrics['kw_mrr']
368
+ hybrid_hit_rate = metrics['hybrid_hit_rate']
369
+ vector_hit_rate = metrics['vector_hit_rate']
370
+ vector_mrr = metrics['vector_mrr']
371
+ total_misses = metrics['total_misses']
372
+
373
+ st.text(f"KW hit rate: {kw_hit_rate}")
374
+ st.text(f"Vector hit rate: {vector_hit_rate}")
375
+ st.text(f"Hybrid hit rate: {hybrid_hit_rate}")
376
+ st.text(f"Vector MRR: {vector_mrr}")
377
+ st.text(f"Total misses: {total_misses}")
378
+
379
+ st.write("----------")
380
+
381
+ st.title("Chat with the Impact Theory podcasts!")
382
+ # st.image('./assets/impact-theory-logo.png', width=400)
383
+ st.image('assets/it_tom_bilyeu.png', use_column_width=True)
384
+ # st.subheader(f"Chat with the Impact Theory podcast: ")
385
+ st.write('\n')
386
+ # st.stop()
387
+
388
+
389
+ st.write("\u21D0 Open the sidebar to change Search settings \n ") # https://home.unicode.org also 21E0, 21B0 B2 D0
390
+ guest = st.selectbox('Select A Guest',
391
+ options=guest_list,
392
+ index=None,
393
+ placeholder='Select Guest')
394
+
395
+
396
+ col1, col2 = st.columns([7,3])
397
+ with col1:
398
+ if guest is None:
399
+ msg = f'Select a guest before asking your question:'
400
+ else:
401
+ msg = f'Enter your question about {guest}:'
402
+
403
+ textbox = st.empty()
404
+ # best solution I found to be able to change the text inside a text_input box afterwards, using a key
405
+ query = textbox.text_input(msg,
406
+ value="",
407
+ placeholder="You can refer to the guest with pronoun or drop the question mark",
408
+ key=st.session_state.key)
409
+
410
+ # st.write(f"Guest = {guest}")
411
+ # st.write(f"key = {st.session_state.key}")
412
+
413
+ st.write('\n\n\n\n\n')
414
+
415
+ reworded_query = {'changed': False, 'status': 'error'} # at start, the query is empty
416
+ valid_response = [] # at start, the query is empty, so prevent the search
417
+
418
+ if query:
419
+
420
+ if guest is None:
421
+ st.session_state.key += 1
422
+ query = textbox.text_input(msg,
423
+ value="",
424
+ placeholder="YOU MUST SELECT A GUEST BEFORE ASKING A QUESTION",
425
+ key=st.session_state.key)
426
+ # st.write(f"key = {st.session_state.key}")
427
+ st.stop()
428
+ else:
429
+ # st.write(f'It looks like you selected {guest} as a filter (It is ignored for now).')
430
+
431
+ with col2:
432
+ # let's add a nice pulse bar while generating the response
433
+ with hc.HyLoader('', hc.Loaders.pulse_bars, primary_color= 'red', height=50): #"#0e404d" for image green
434
+ # with st.spinner('Generating Response...'):
435
+
436
+ with col1:
437
+
438
+ # let's use Llama2 here
439
+ reworded_query = reword_query(query, guest,
440
+ model_name='llama2-13b-chat')
441
+ query = reworded_query['rewritten_question']
442
+
443
+ # we can arrive here only if a guest was selected
444
+ where_filter = WhereFilter(path=['guest'], operator='Equal', valueText=guest).todict() \
445
+ if hybrid_filter else None
446
+
447
+ hybrid_response = client.hybrid_search(query,
448
+ class_name,
449
+ # properties=['content'], #['title', 'summary', 'content'],
450
+ alpha=alpha_input,
451
+ display_properties=client.display_properties,
452
+ where_filter=where_filter,
453
+ limit=retrieval_limit)
454
+ response = hybrid_response
455
+
456
+ if rerank:
457
+ # rerank results with cross encoder
458
+ ranked_response = reranker.rerank(response, query,
459
+ apply_sigmoid=True, # score between 0 and 1
460
+ top_k=reranker_topk)
461
+ logger.info(ranked_response)
462
+ expanded_response = expand_content(ranked_response, cache,
463
+ content_key='doc_id',
464
+ create_new_list=True)
465
+
466
+ response = expanded_response
467
+
468
+ # make sure token count < threshold
469
+ token_threshold = 8000 if model_nameGPT == model_ids[0] else 3500
470
+ valid_response = validate_token_threshold(response,
471
+ question_answering_prompt_series,
472
+ query=query,
473
+ tokenizer= encoding,# variable from ENCODING,
474
+ token_threshold=token_threshold,
475
+ verbose=True)
476
+ # st.write(f"Number of results: {len(valid_response)}")
477
+
478
+
479
+ # I jump out of col1 to get all page width, so need to retest query
480
+ if query is not None and reworded_query['status'] != 'error':
481
+ show_query = st.toggle('Show rewritten query', False)
482
+ if show_query: # or reworded_query['changed']:
483
+ st.write(f"Rewritten query: {query}")
484
+
485
+ # creates container for LLM response to position it above search results
486
+ chat_container, response_box = [], st.empty()
487
+ # # RAG time !! execute chat call to LLM
488
+ if rag_it:
489
+ # st.subheader("Response from Impact Theory (context)")
490
+ # will appear under the answer, moved it into the response box
491
+
492
+ # generate LLM prompt
493
+ prompt = generate_prompt_series(query=query, results=valid_response)
494
+
495
+
496
+ GPTllm = GPT_Turbo(model=model_nameGPT,
497
+ api_key=st.secrets['secrets']['OPENAI_API_KEY'])
498
+ try:
499
+ # inserts chat stream from LLM
500
+ for resp in GPTllm.get_chat_completion(prompt=prompt,
501
+ temperature=llm_temperature,
502
+ max_tokens=350,
503
+ show_response=True,
504
+ stream=True):
505
+
506
+ with response_box:
507
+ content = resp.choices[0].delta.content
508
+ if content:
509
+ chat_container.append(content)
510
+ result = "".join(chat_container).strip()
511
+ response_box.markdown(f"### Response from Impact Theory (RAG):\n\n{result}")
512
+ except BadRequestError as e:
513
+ logger.info('Making request with smaller context')
514
+
515
+ valid_response = validate_token_threshold(response,
516
+ question_answering_prompt_series,
517
+ query=query,
518
+ tokenizer=encoding,
519
+ token_threshold=3500,
520
+ verbose=True)
521
+ # if reranker is off, we may receive a LOT of responses
522
+ # so we must reduce the context size manually
523
+ if not rerank:
524
+ valid_response = valid_response[:reranker_topk]
525
+
526
+ prompt = generate_prompt_series(query=query, results=valid_response)
527
+ for resp in GPTllm.get_chat_completion(prompt=prompt,
528
+ temperature=llm_temperature,
529
+ max_tokens=350, # expand for more verbose answers
530
+ show_response=True,
531
+ stream=True):
532
+ try:
533
+ # inserts chat stream from LLM
534
+ with response_box:
535
+ content = resp.choices[0].delta.content
536
+ if content:
537
+ chat_container.append(content)
538
+ result = "".join(chat_container).strip()
539
+ response_box.markdown(f"### Response from Impact Theory (RAG):\n\n{result}")
540
+ except Exception as e:
541
+ print(e)
542
+
543
+ st.markdown("----")
544
+ st.subheader("Search Results")
545
+
546
+ for i, hit in enumerate(valid_response):
547
+ col1, col2 = st.columns([7, 3], gap='large')
548
+ image = hit['thumbnail_url'] # get thumbnail_url
549
+ episode_url = hit['episode_url'] # get episode_url
550
+ title = hit["title"] # get title
551
+ show_length = hit["length"] # get length
552
+ time_string = str(timedelta(seconds=show_length)) # convert show_length to readable time string
553
+
554
+ with col1:
555
+ st.write(search_result(i=i,
556
+ url=episode_url,
557
+ guest=hit['guest'],
558
+ title=title,
559
+ content='',
560
+ length=time_string),
561
+ unsafe_allow_html=True)
562
+ st.write('\n\n')
563
+
564
+ with col2:
565
+ #st.write(f"<a href={episode_url} <img src={image} width='200'></a>",
566
+ # unsafe_allow_html=True)
567
+ #st.markdown(f"[![{title}]({image})]({episode_url})")
568
+ # st.markdown(f'<a href="{episode_url}">'
569
+ # f'<img src={image} '
570
+ # f'caption={title.split("|")[0]} width=200, use_column_width=False />'
571
+ # f'</a>',
572
+ # unsafe_allow_html=True)
573
+
574
+ st.image(image, caption=title.split('|')[0], width=200, use_column_width=False)
575
+ # let's use all width for the content
576
+ st.write(hit['content'])
577
+
578
+
579
+ def get_answer(query, valid_response, GPTllm):
580
+
581
+ # generate LLM prompt
582
+ prompt = generate_prompt_series(query=query,
583
+ results=valid_response)
584
+
585
+ return GPTllm.get_chat_completion(prompt=prompt,
586
+ system_message='answer this question based on the podcast material',
587
+ temperature=0,
588
+ max_tokens=500,
589
+ stream=False,
590
+ show_response=False)
591
+
592
+ def reword_query(query, guest, model_name='llama2-13b-chat', response_processing=True):
593
+ """ Asks LLM to rewrite the query when the guest name is missing.
594
+
595
+ Args:
596
+ query (str): user query
597
+ guest (str): guest name
598
+ model_name (str, optional): name of a LLM model to be used
599
+ """
600
+
601
+ # tags = {'llama2-13b-chat': {'start': '<s>', 'end': '</s>', 'instruction': '[INST]', 'system': '[SYS]'},
602
+ # 'gpt-3.5-turbo-0613': {'start': '<|startoftext|>', 'end': '', 'instruction': "```", 'system': ```}}
603
+
604
+ prompt_fields = {
605
+ "you_are":f"You are an expert in linguistics and semantics, analyzing the question asked by a user to a vector search system, \
606
+ and making sure that the question is well formulated and that the system can understand it.",
607
+
608
+ "your_task":f"Your task is to detect if the name of the guest ({guest}) is mentioned in the user's question, \
609
+ and if that is not the case, rewrite the question using the guest name, \
610
+ without changing the meaning of the question. \
611
+ Most of the time, the user will have used a pronoun to designate the guest, in which case, \
612
+ simply replace the pronoun with the guest name.",
613
+
614
+ "question":f"If the user mentions the guest name, ie {query}, just return his question as is. \
615
+ If the user does not mention the guest name, rewrite the question using the guest name.",
616
+
617
+ "final_instruction":f"Only return the requested rewritten question or the original, WITHOUT ANY COMMENT OR REPHRASING. \
618
+ Your answer must be as close as possible to the original question, \
619
+ and exactly identical, word for word, if the user mentions the guest name, i.e. {guest}.",
620
+ }
621
+
622
+ # prompt created by chatGPT :-)
623
+ # and Llama still outputs the original question and precedes the answer with 'rewritten question'
624
+ prompt_fields2 = {
625
+ "you_are": (
626
+ "You are an expert in linguistics and semantics. Your role is to analyze questions asked to a vector search system."
627
+ ),
628
+ "your_task": (
629
+ f"Detect if the guest's FULL name, {guest}, is mentioned in the user's question. "
630
+ "If not, rewrite the question by replacing pronouns or indirect references with the guest's name." \
631
+ " If yes, return the original question as is, without any change at all, not even punctuation,"
632
+ " except a question mark that you MUST add if it's missing."
633
+ ),
634
+ "question": (
635
+ f"Original question: '{query}'. "
636
+ "Rewrite this question to include the guest's FULL name if it's not already mentioned. "
637
+ "The only thing you can and MUST add is a question mark if it's missing."
638
+ ),
639
+ "final_instruction": (
640
+ "Create a rewritten question or keep the original question as is. "
641
+ "Do not include any labels, titles, or additional text before or after the question. "
642
+ "The only thing you can and MUST add is a question mark if it's missing. "
643
+ "Return a JSON object, with the key 'original_question' for the original question, \
644
+ and 'rewritten_question' for the rewritten question \
645
+ and 'changed' being True if you changed the answer, otherwise False."
646
+ ),
647
+ }
648
+
649
+
650
+ if model_name == 'llama2-13b-chat':
651
+ # special tags are used:
652
+ # `<s>` - start prompt tag
653
+ # `[INST], [/INST]` - Opening and closing model instruction tags
654
+ # `<<SYS>>, <</SYS>>` - Opening and closing system prompt tags
655
+ llama_prompt = """
656
+ <s>[INST] <<SYS>>
657
+ {you_are}
658
+ <</SYS>>
659
+ {your_task}\n
660
+
661
+ ```
662
+ \n\n
663
+ Question: {question}\n
664
+ {final_instruction} [/INST]
665
+
666
+ Answer:
667
+ """
668
+ prompt = llama_prompt.format(**prompt_fields2)
669
+
670
+ hf_token = st.secrets['secrets']['LLAMA2_ENDPOINT_HF_TOKEN_chris']
671
+ # hf_token = st.secrets['secrets']['LLAMA2_ENDPOINT_HF_TOKEN']
672
+
673
+ hf_endpoint = st.secrets['secrets']['LLAMA2_ENDPOINT_UPLIMIT']
674
+
675
+ headers = {"Authorization": f"Bearer {hf_token}",
676
+ "Content-Type": "application/json",}
677
+
678
+ json_body = {
679
+ "inputs": prompt,
680
+ "parameters": {"max_new_tokens":400,
681
+ "repetition_penalty": 1.0,
682
+ "temperature":0.01}
683
+ }
684
+
685
+ response = requests.request("POST", hf_endpoint, headers=headers, data=json.dumps(json_body))
686
+ response = json.loads(response.content.decode("utf-8"))
687
+ # ^ will not process the badly formatted generated text, so we do it ourselves
688
+
689
+ if isinstance(response, dict) and 'error' in response:
690
+ print("Found error")
691
+ print(response)
692
+ # return {'error': response['error'], 'rewritten_question': query, 'changed': False, 'status': 'error'}
693
+ # I test this here otherwise it gets in col 2 or 1, which are too narrow
694
+ # if reworded_query['status'] == 'error':
695
+ # st.write(f"Error in LLM response: 'error':{reworded_query['error']}")
696
+ # st.write("The LLM could not connect to the server. Please try again later.")
697
+ # st.stop()
698
+ return reword_query(query, guest, model_name='gpt-3.5-turbo-0613')
699
+
700
+ if response_processing:
701
+ if isinstance(response, list) and isinstance(response[0], dict) and 'generated_text' in response[0]:
702
+ print("Found generated text")
703
+ response0 = response[0]['generated_text']
704
+ pattern = r'\"(\w+)\":\s*(\".*?\"|\w+)'
705
+
706
+ matches = re.findall(pattern, response0)
707
+ # let's build a dictionary
708
+ result = {key: json.loads(value) if value.startswith("\"") else value for key, value in matches}
709
+ return result | {'status': 'success'}
710
+ else:
711
+ print("Found no answer")
712
+ return reword_query(query, guest, model_name='gpt-3.5-turbo-0613')
713
+ # return {'original_question': query, 'rewritten_question': query, 'changed': False, 'status': 'no properly formatted answer' }
714
+ else:
715
+ return response
716
+ # return response
717
+ # assert 'error' not in response, f"Error in LLM response: {response['error']}"
718
+ # assert 'generated_text' in response[0], f"Error in LLM response: {response}, no 'generated_text' field"
719
+ # # let's extract the rewritten question
720
+ # return response[0]['generated_text'] .split("Rewritten question: '")[-1][:-1]
721
+
722
+ else:
723
+ # assume openai
724
+ model_ids = ['gpt-3.5-turbo-16k', 'gpt-3.5-turbo-0613']
725
+ model_name = model_ids[1]
726
+ GPTllm = GPT_Turbo(model=model_name,
727
+ api_key=st.secrets['secrets']['OPENAI_API_KEY'])
728
+
729
+ openai_prompt = """
730
+ {your_task}\n
731
+ ```
732
+ \n\n
733
+ Question: {question}\n
734
+ {final_instruction}
735
+
736
+ Answer:
737
+ """
738
+ prompt = openai_prompt.format(**prompt_fields)
739
+
740
+ try:
741
+ resp = GPTllm.get_chat_completion(prompt=prompt,
742
+ system_message=prompt_fields['you_are'],
743
+ temperature=0.01,
744
+ max_tokens=1500, # it's a question...
745
+ show_response=True,
746
+ stream=False)
747
+ return {'rewritten_question': resp.choices[0].message.content,
748
+ 'changed': True, 'status': 'success'}
749
+ except Exception:
750
+ return {'rewritten_question': query, 'changed': False, 'status': 'not success'}
751
+
752
+
753
+ if __name__ == '__main__':
754
+ main()
755
+ # %%
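
One non-obvious pattern in app.py is the "increment the widget key" trick mentioned in the comment near the top ("I use a key that I increment each time I want to change a text_input"): bumping the key stored in session state forces Streamlit to render a fresh, empty `text_input` in the same placeholder. A stripped-down sketch of the idea outside the app (widget labels are illustrative only):

```python
# Minimal sketch of the key-increment trick used in app.py to reset a text_input.
import streamlit as st

if 'key' not in st.session_state:
    st.session_state.key = 0

box = st.empty()
query = box.text_input("Ask a question", value="", key=st.session_state.key)

if query and st.button("Clear question"):
    # a new key makes Streamlit treat this as a brand-new, empty widget
    st.session_state.key += 1
    query = box.text_input("Ask a question", value="", key=st.session_state.key)
```
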
backend.py ADDED
@@ -0,0 +1,185 @@
1
+ import modal
2
+
3
+ from typing import List, Dict, Tuple, Union, Callable
4
+ # from preprocessing import FileIO
5
+
6
+ # assets = modal.Mount.from_local_dir(
7
+ # "./data",
8
+ # # condition=lambda pth: not ".venv" in pth,
9
+ # remote_path="./data",
10
+ # )
11
+
12
+
13
+ stub = modal.Stub("vector-search-project")
14
+ vector_search = modal.Image.debian_slim().pip_install(
15
+ "sentence_transformers==2.2.2", "llama_index==0.9.6.post1", "angle_emb==0.1.5"
16
+ )
17
+
18
+ stub.volume = modal.Volume.new()
19
+
20
+
21
+ @stub.function(image=vector_search,
22
+ gpu="A100",
23
+ timeout=600,
24
+ volumes={"/root/models": stub.volume}
25
+ # secrets are available in the environment with os.environ["SECRET_NAME"]
26
+ # secret=modal.Secret.from_name("my-huggingface-secret")
27
+ )
28
+ def encode_content_splits(content_splits,
29
+ model=None, # path or name of model
30
+ **kwargs
31
+ ):
32
+ """ kwargs provided in case encode method has extra arguments """
33
+ from sentence_transformers import SentenceTransformer
34
+
35
+ import os, time
36
+ models_list = os.listdir('/root/models')
37
+ print("Models:", models_list)
38
+
39
+ if isinstance(model, str) and model[-1] == "/":
40
+ model = model[:-1]
41
+
42
+ if isinstance(model, str):
43
+ model = model.split('/')[-1]
44
+
45
+ if isinstance(model, str) and model in models_list:
46
+
47
+ if "UAE-Large-V1-300" in model:
48
+ print("Loading finetuned UAE-Large-V1-300 model from Modal Volume")
49
+
50
+ from angle_emb import AnglE
51
+ model = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1',
52
+ pretrained_model_path=os.path.join('/root/models', model),
53
+ pooling_strategy='cls').cuda()
54
+ kwargs['to_numpy'] = True
55
+
56
+ # this model doesn't accept list of lists
57
+ if isinstance(content_splits[0], list):
58
+ content_splits = [chunk for episode in content_splits for chunk in episode]
59
+
60
+ else:
61
+ print(f"Loading model {model} from Modal volume")
62
+ model = SentenceTransformer(os.path.join('/root/models', model))
63
+
64
+ elif isinstance(model, str):
65
+ if model in models_list:
66
+ print(f"Loading model {model} from Modal volume")
67
+ model = SentenceTransformer(os.path.join('/root/models', model))
68
+ else:
69
+ print(f"Model {model} not found in Modal volume, loading from HuggingFace")
70
+ model = SentenceTransformer(model)
71
+
72
+ else:
73
+ print(f"Using model provided as argument")
74
+ if 'save' in kwargs:
75
+ if isinstance(kwargs['save'], str) and kwargs['save'][-1] == '/':
76
+ kwargs['save'] = kwargs['save'][:-1]
77
+ kwargs['save'] = kwargs['save'].split('/')[-1]
78
+ fname = os.path.join('/root/models', kwargs['save'])
79
+ print(f"Saving model in {fname}")
80
+ # model.save(fname)
81
+ print(f"Model saved in {fname}")
82
+ kwargs.pop('save')
83
+
84
+ print("Starting encoding")
85
+ start = time.perf_counter()
86
+
87
+ emb = [list(zip(episode, model.encode(episode, **kwargs))) for episode in content_splits]
88
+ end = time.perf_counter() - start
89
+ print(f"GPU processing lasted {end:.2f} seconds")
90
+ print("Encoding finished")
91
+
92
+ return emb
93
+
94
+
95
+ @stub.function(image=vector_search, gpu="A100", timeout=120,
96
+ mounts=[modal.Mount.from_local_dir("./data",
97
+ remote_path="/root/data",
98
+ condition=lambda pth: ".json" in pth)],
99
+ volumes={"/root/models": stub.volume}
100
+ )
101
+ def finetune(training_path='./data/training_data_300.json',
102
+ valid_path='./data/validation_data_100.json',
103
+ model_id=None):
104
+
105
+ import os
106
+ print("Data:", os.listdir('/root/data'))
107
+ print("Models:", os.listdir('/root/models'))
108
+
109
+ if model_id is None:
110
+ print("No model ID provided")
111
+ return None
112
+ elif isinstance(model_id, str) and model_id[-1] == "/":
113
+ model_id = model_id[:-1]
114
+
115
+
116
+ from llama_index.finetuning import EmbeddingQAFinetuneDataset
117
+
118
+ training_set = EmbeddingQAFinetuneDataset.from_json(training_path)
119
+ valid_set = EmbeddingQAFinetuneDataset.from_json(valid_path)
120
+ print("Datasets loaded")
121
+
122
+ num_training_examples = len(training_set.queries)
123
+ print(f"Training examples: {num_training_examples}")
124
+
125
+ from llama_index.finetuning import SentenceTransformersFinetuneEngine
126
+
127
+ print(f"Model Name is {model_id}")
128
+ model_ext = model_id.split('/')[1]
129
+
130
+ ft_model_name = f'finetuned-{model_ext}-{num_training_examples}'
131
+ model_outpath = os.path.join("/root/models", ft_model_name)
132
+
133
+ print(f'Model ID: {model_id}')
134
+ print(f'Model Outpath: {model_outpath}')
135
+
136
+ finetune_engine = SentenceTransformersFinetuneEngine(
137
+ training_set,
138
+ batch_size=32,
139
+ model_id=model_id,
140
+ model_output_path=model_outpath,
141
+ val_dataset=valid_set,
142
+ epochs=10
143
+ )
144
+ import io, os, zipfile, glob, time
145
+ try:
146
+ start = time.perf_counter()
147
+ finetune_engine.finetune()
148
+ end = time.perf_counter() - start
149
+ print(f"GPU processing lasted {end:.2f} seconds")
150
+
151
+ print(os.listdir('/root/models'))
152
+ stub.volume.commit() # Persist changes, i.e. the finetuned model
153
+
154
+ # TODO SHARE THE MODEL ON HUGGINGFACE
155
+ # https://huggingface.co/docs/transformers/v4.15.0/model_sharing
156
+
157
+ folder_to_zip = model_outpath
158
+ # Zip the contents of the folder at 'folder_path' and return a BytesIO object.
159
+ bytes_buffer = io.BytesIO()
160
+
161
+ with zipfile.ZipFile(bytes_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
162
+ for file_path in glob.glob(folder_to_zip + "/**", recursive=True):
163
+ print(f"Processed file {file_path}")
164
+ zip_file.write(file_path, os.path.relpath(file_path, start=folder_to_zip))
165
+
166
+ # Move the pointer to the start of the BytesIO buffer before returning
167
+ bytes_buffer.seek(0)
168
+ # You can now return this zipped_folder object, write it to a file, send it over a network, etc.
169
+ # Replace with the path to the folder you want to zip
170
+ zippedio = bytes_buffer
171
+
172
+ return zippedio
173
+ except Exception:
174
+ return "Finetuning failed"
175
+
176
+
177
+ @stub.local_entrypoint()
178
+ def test_method(content_splits=[["a"]]):
179
+ output = encode_content_splits.remote(content_splits)
180
+ return output
181
+
182
+ # deploy it with
183
+ # modal token set --token-id ak-xxxxxx --token-secret as-xxxxx # given when we create a new token
184
+ # modal deploy podcast/1/backend.py
185
+ # View Deployment: https://modal.com/apps/jpbianchi/falcon_hackaton-project <<< use this project name
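
The comments above cover deployment; for completeness, here is a sketch of calling the deployed `encode_content_splits` function from a local process, using the same `modal.Function.lookup` / `.remote` pattern that finetune_backend.py uses for `finetune` (the model name and sample chunks are placeholders):

```python
# Sketch: invoke the deployed A100 encoder from a local script.
import modal

encode = modal.Function.lookup("vector-search-project", "encode_content_splits")

content_splits = [["first chunk of episode one", "second chunk of episode one"],
                  ["first chunk of episode two"]]

# Runs remotely on the GPU container defined above and returns, per episode,
# a list of (chunk, embedding) tuples.
embeddings = encode.remote(content_splits,
                           model="sentence-transformers/all-mpnet-base-v2")
```
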
class_templates.py ADDED
@@ -0,0 +1,46 @@
1
+ impact_theory_class_properties = [
2
+ {'name': 'title',
3
+ 'dataType': ['text'],
4
+ 'indexFilterable': True,
5
+ 'indexSearchable': True},
6
+ {'name': 'video_id',
7
+ 'dataType': ['text'],
8
+ 'indexFilterable': True,
9
+ 'indexSearchable': False},
10
+ {'name': 'length',
11
+ 'dataType': ['int'],
12
+ 'indexFilterable': True,
13
+ 'indexSearchable': False},
14
+ {'name': 'thumbnail_url',
15
+ 'dataType': ['text'],
16
+ 'indexFilterable': False,
17
+ 'indexSearchable': False},
18
+ {'name': 'views',
19
+ 'dataType': ['int'],
20
+ 'indexFilterable': True,
21
+ 'indexSearchable': False},
22
+ {'name': 'episode_url',
23
+ 'dataType': ['text'],
24
+ 'indexFilterable': False,
25
+ 'indexSearchable': False},
26
+ {'name': 'doc_id',
27
+ 'dataType': ['text'],
28
+ 'indexFilterable': True,
29
+ 'indexSearchable': False},
30
+ {'name': 'guest',
31
+ 'dataType': ['text'],
32
+ 'indexFilterable': True,
33
+ 'indexSearchable': True},
34
+ {'name': 'summary',
35
+ 'dataType': ['text'],
36
+ 'indexFilterable': False,
37
+ 'indexSearchable': True},
38
+ {'name': 'content',
39
+ 'dataType': ['text'],
40
+ 'indexFilterable': False,
41
+ 'indexSearchable': True},
42
+ ]
43
+ # {'name': 'publish_date',
44
+ # 'dataType': ['date'],
45
+ # 'indexFilterable': True,
46
+ # 'indexSearchable': False},
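
The repo's `WeaviateClient` wrapper is not part of this commit, so as a hedged illustration only, this is roughly how `impact_theory_class_properties` could be registered with the plain `weaviate-client` v3 API (the class name matches `start_class` in app.py; the endpoint and key come from the same environment variables app.py reads):

```python
# Illustrative sketch, not the repo's own WeaviateClient wrapper.
import os
import weaviate
from class_templates import impact_theory_class_properties

client = weaviate.Client(
    url=os.environ["WEAVIATE_ENDPOINT"],
    auth_client_secret=weaviate.AuthApiKey(os.environ["WEAVIATE_API_KEY"]),
)

class_obj = {
    "class": "Impact_theory_all_mpnet_base_v2_finetuned",  # class name used in app.py
    "properties": impact_theory_class_properties,
    "vectorizer": "none",  # vectors are supplied by the SentenceTransformer models
}

client.schema.create_class(class_obj)  # raises if the class already exists
```
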
finetune_backend.py ADDED
@@ -0,0 +1,55 @@
1
+ #%%
2
+ import os, time, io, zipfile
3
+ from preprocessing import FileIO
4
+ import shutil
5
+ import modal
6
+ from llama_index.finetuning import EmbeddingQAFinetuneDataset
7
+
8
+ from dotenv import load_dotenv, find_dotenv
9
+ env = load_dotenv(find_dotenv('env'), override=True)
10
+
11
+ #%%
12
+ training_path = 'data/training_data_300.json'
13
+ valid_path = 'data/validation_data_100.json'
14
+
15
+ training_set = EmbeddingQAFinetuneDataset.from_json(training_path)
16
+ valid_set = EmbeddingQAFinetuneDataset.from_json(valid_path)
17
+
18
+ def finetune(model='all-mpnet-base-v2', savemodel=False, outpath='.'):
19
+ """ Finetunes a model on Modal GPU A100.
20
+ The model is saved in /root/models on a Modal volume
21
+ and can be stored locally.
22
+
23
+ Args:
24
+ model (str): the Sentence Transformer model name
25
+ savemodel (bool, optional): whether to save the model or not.
26
+
27
+ Returns:
28
+ path of the saved model (when saved)
29
+ """
30
+ f = modal.Function.lookup("vector-search-project", "finetune")
31
+ model = model.replace('/','')
32
+
33
+ if 'sentence-transformers' not in model:
34
+ model = f"sentence-transformers/{model}"
35
+
36
+ fullpath = os.path.join(outpath, f"finetuned-{model}-300")
37
+
38
+ if os.path.exists(fullpath):
39
+ msg = "Model already exists!"
40
+ print(msg)
41
+ return msg
42
+
43
+ start = time.perf_counter()
44
+ finetuned_model = f.remote(training_path, valid_path, model_id=model)
45
+
46
+ end = time.perf_counter() - start
47
+ print(f"Finetuning with GPU lasted {end:.2f} seconds")
48
+
49
+ if savemodel:
50
+
51
+ with open(fullpath, 'wb') as file:
52
+ # Write the contents of the BytesIO object to a new file
53
+ file.write(finetuned_model.getbuffer())
54
+ print(f"Model saved in {fullpath}")
55
+ return fullpath
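
Note that backend.py's `finetune` returns the finetuned model folder as an in-memory zip (`BytesIO`), while the `savemodel` branch above writes the raw buffer to a single file at `fullpath`. A sketch of unpacking that buffer into a directory that `SentenceTransformer` can load directly (function and directory names are illustrative, not part of the commit):

```python
# Sketch only: extract the zipped model folder returned by the Modal finetune job.
import io
import zipfile
from pathlib import Path

def save_finetuned_model(zipped: io.BytesIO, outdir: str) -> str:
    """Unpack the zip returned by the remote job into `outdir` and return the path."""
    out = Path(outdir)
    out.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zipped) as zf:
        zf.extractall(out)  # recreates config.json, model weights, etc.
    return str(out)

# Hypothetical usage:
# path = save_finetuned_model(finetuned_model, "models/finetuned-all-mpnet-base-v2-300")
# model = SentenceTransformer(path)
```
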
helpers.py ADDED
@@ -0,0 +1,112 @@
1
+ from typing import List, Tuple, Dict, Any
2
+ import time
3
+ from tqdm.notebook import tqdm
4
+ from rich import print
5
+
6
+ from retrieval_evaluation import calc_hit_rate_scores, calc_mrr_scores, record_results, add_params
7
+ from llama_index.finetuning import EmbeddingQAFinetuneDataset
8
+ from weaviate_interface import WeaviateClient
9
+
10
+
11
+ def retrieval_evaluation(dataset: EmbeddingQAFinetuneDataset,
12
+ class_name: str,
13
+ retriever: WeaviateClient,
14
+ retrieve_limit: int=5,
15
+ chunk_size: int=256,
16
+ hnsw_config_keys: List[str]=['maxConnections', 'efConstruction', 'ef'],
17
+ display_properties: List[str]=['doc_id', 'guest', 'content'],
18
+ dir_outpath: str='./eval_results',
19
+ include_miss_info: bool=False,
20
+ user_def_params: Dict[str,Any]=None
21
+ ) -> Dict[str, str|int|float]:
22
+ '''
23
+ Given a dataset and a retriever, evaluate the performance of the retriever. Returns a dict of kw and vector
24
+ hit rates and mrr scores. If include_miss_info is True, will also return a list of kw and vector responses
25
+ and their associated queries that did not return a hit, for deeper analysis. Text file with results output
26
+ is automatically saved in the dir_outpath directory.
27
+
28
+ Args:
29
+ -----
30
+ dataset: EmbeddingQAFinetuneDataset
31
+ Dataset to be used for evaluation
32
+ class_name: str
33
+ Name of Class on Weaviate host to be used for retrieval
34
+ retriever: WeaviateClient
35
+ WeaviateClient object to be used for retrieval
36
+ retrieve_limit: int=5
37
+ Number of documents to retrieve from Weaviate host
38
+ chunk_size: int=256
39
+ Number of tokens used to chunk text. This value is purely for results
40
+ recording purposes and does not affect results.
41
+ display_properties: List[str]=['doc_id', 'content']
42
+ List of properties to be returned from Weaviate host for display in response
43
+ dir_outpath: str='./eval_results'
44
+ Directory path for saving results. Directory will be created if it does not
45
+ already exist.
46
+ include_miss_info: bool=False
47
+ Option to include queries and their associated kw and vector response values
48
+ for queries that are "total misses"
49
+ user_def_params : dict=None
50
+ Option for user to pass in a dictionary of user-defined parameters and their values.
51
+ '''
52
+
53
+ results_dict = {'n':retrieve_limit,
54
+ 'Retriever': retriever.model_name_or_path,
55
+ 'chunk_size': chunk_size,
56
+ 'kw_hit_rate': 0,
57
+ 'kw_mrr': 0,
58
+ 'vector_hit_rate': 0,
59
+ 'vector_mrr': 0,
60
+ 'total_misses': 0,
61
+ 'total_questions':0
62
+ }
63
+ #add hnsw configs and user defined params (if any)
64
+ results_dict = add_params(retriever, class_name, results_dict, user_def_params, hnsw_config_keys)
65
+
66
+ start = time.perf_counter()
67
+ miss_info = []
68
+ for query_id, q in tqdm(dataset.queries.items(), 'Queries'):
69
+ results_dict['total_questions'] += 1
70
+ hit = False
71
+ #make Keyword, Vector, and Hybrid calls to Weaviate host
72
+ try:
73
+ kw_response = retriever.keyword_search(request=q, class_name=class_name, limit=retrieve_limit, display_properties=display_properties)
74
+ vector_response = retriever.vector_search(request=q, class_name=class_name, limit=retrieve_limit, display_properties=display_properties)
75
+
76
+ #collect doc_ids and position of doc_ids to check for document matches
77
+ kw_doc_ids = {result['doc_id']:i for i, result in enumerate(kw_response, 1)}
78
+ vector_doc_ids = {result['doc_id']:i for i, result in enumerate(vector_response, 1)}
79
+
80
+ #extract doc_id for scoring purposes
81
+ doc_id = dataset.relevant_docs[query_id][0]
82
+
83
+ #increment hit_rate counters and mrr scores
84
+ if doc_id in kw_doc_ids:
85
+ results_dict['kw_hit_rate'] += 1
86
+ results_dict['kw_mrr'] += 1/kw_doc_ids[doc_id]
87
+ hit = True
88
+ if doc_id in vector_doc_ids:
89
+ results_dict['vector_hit_rate'] += 1
90
+ results_dict['vector_mrr'] += 1/vector_doc_ids[doc_id]
91
+ hit = True
92
+
93
+ # if no hits, let's capture that
94
+ if not hit:
95
+ results_dict['total_misses'] += 1
96
+ miss_info.append({'query': q, 'kw_response': kw_response, 'vector_response': vector_response})
97
+ except Exception as e:
98
+ print(e)
99
+ continue
100
+
101
+
102
+ #use raw counts to calculate final scores
103
+ calc_hit_rate_scores(results_dict)
104
+ calc_mrr_scores(results_dict)
105
+
106
+ end = time.perf_counter() - start
107
+ print(f'Total Processing Time: {round(end/60, 2)} minutes')
108
+ record_results(results_dict, chunk_size, dir_outpath=dir_outpath, as_text=True)
109
+
110
+ if include_miss_info:
111
+ return results_dict, miss_info
112
+ return results_dict
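
`calc_hit_rate_scores` and `calc_mrr_scores` are imported from retrieval_evaluation, which is not included in this commit. Based on how the counters are accumulated above (one increment per hit, one reciprocal-rank term per hit), a plausible minimal sketch of what they do is simply a division by the number of questions:

```python
# Plausible sketch of the scoring helpers; the real versions live in retrieval_evaluation.py.
from typing import Any, Dict

def calc_hit_rate_scores(results: Dict[str, Any]) -> None:
    """Convert raw hit counts into hit rates, in place."""
    n = max(results['total_questions'], 1)
    for key in ('kw_hit_rate', 'vector_hit_rate'):
        results[key] = round(results[key] / n, 2)

def calc_mrr_scores(results: Dict[str, Any]) -> None:
    """Convert accumulated reciprocal ranks into mean reciprocal rank, in place."""
    n = max(results['total_questions'], 1)
    for key in ('kw_mrr', 'vector_mrr'):
        results[key] = round(results[key] / n, 2)
```
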
llama_test.ipynb ADDED
@@ -0,0 +1,80 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Note: you may need to restart the kernel to use updated packages.\n",
13
+ "Note: you may need to restart the kernel to use updated packages.\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "%pip install huggingface_hub --q\n",
19
+ "%pip install ipywidgets --q"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 3,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "from transformers.pipelines.text_generation import TextGenerationPipeline\n",
29
+ "from transformers import AutoConfig\n",
30
+ "import transformers"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "data": {
40
+ "application/vnd.jupyter.widget-view+json": {
41
+ "model_id": "f9c842f1bd7146e5a4e4d517450531ee",
42
+ "version_major": 2,
43
+ "version_minor": 0
44
+ },
45
+ "text/plain": [
46
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
47
+ ]
48
+ },
49
+ "metadata": {},
50
+ "output_type": "display_data"
51
+ }
52
+ ],
53
+ "source": [
54
+ "from huggingface_hub import notebook_login\n",
55
+ "notebook_login() #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"
56
+ ]
57
+ }
58
+ ],
59
+ "metadata": {
60
+ "kernelspec": {
61
+ "display_name": "venv",
62
+ "language": "python",
63
+ "name": "python3"
64
+ },
65
+ "language_info": {
66
+ "codemirror_mode": {
67
+ "name": "ipython",
68
+ "version": 3
69
+ },
70
+ "file_extension": ".py",
71
+ "mimetype": "text/x-python",
72
+ "name": "python",
73
+ "nbconvert_exporter": "python",
74
+ "pygments_lexer": "ipython3",
75
+ "version": "3.11.5"
76
+ }
77
+ },
78
+ "nbformat": 4,
79
+ "nbformat_minor": 2
80
+ }
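Hard-coding a Hub token in a notebook cell is an easy way to leak credentials; the token can instead be read from the environment. A minimal sketch, assuming the `HF_TOKEN` variable from the `.env` file described in readme2.md:

```
# Log in to the Hugging Face Hub without pasting a token into the notebook.
import os
from dotenv import load_dotenv, find_dotenv
from huggingface_hub import login

load_dotenv(find_dotenv('.env'), override=True)
login(token=os.environ['HF_TOKEN'])  # assumes HF_TOKEN is set in .env
```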
openai_interface.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from openai import OpenAI
3
+ from typing import List, Any, Tuple
4
+ from tqdm import tqdm
5
+ import streamlit as st
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+
8
+ from dotenv import load_dotenv, find_dotenv
9
+ load_dotenv(find_dotenv('env'), override=True)
10
+
11
+ try:
12
+ api_key = st.secrets['secrets']['OPENAI_API_KEY']
13
+ except Exception:
14
+ api_key = os.environ['OPENAI_API_KEY']
15
+ class GPT_Turbo:
16
+
17
+ def __init__(self, model: str="gpt-3.5-turbo-0613", api_key: str=api_key):
18
+ self.model = model
19
+ self.client = OpenAI(api_key=api_key)
20
+
21
+ def get_chat_completion(self,
22
+ prompt: str,
23
+ system_message: str='You are a helpful assistant.',
24
+ temperature: int=0,
25
+ max_tokens: int=500,
26
+ stream: bool=False,
27
+ show_response: bool=False
28
+ ) -> str:
29
+ messages = [
30
+ {'role': 'system', 'content': system_message},
31
+ {'role': 'user', 'content': prompt}
32
+ ]
33
+
34
+ response = self.client.chat.completions.create( model=self.model,
35
+ messages=messages,
36
+ temperature=temperature,
37
+ max_tokens=max_tokens,
38
+ stream=stream)
39
+ if show_response:
40
+ return response
41
+ return response.choices[0].message.content
42
+
43
+ def multi_thread_request(self,
44
+ filepath: str,
45
+ prompt: str,
46
+ content: List[str],
47
+ temperature: int=0
48
+ ) -> List[Any]:
49
+
50
+ data = []
51
+ with ThreadPoolExecutor(max_workers=2*os.cpu_count()) as exec:
52
+ futures = [exec.submit(self.get_chat_completion, f'{prompt} ```{c}```', 'You are a helpful assistant.', temperature, 500, False) for c in content]
53
+ with open(filepath, 'a') as f:
54
+ for future in as_completed(futures):
55
+ result = future.result()
56
+ if len(data) % 10 == 0:
57
+ print(f'{len(data)} of {len(content)} completed.')
58
+ if result:
59
+ data.append(result)
60
+ self.write_to_file(file_handle=f, data=result)
61
+ return [res for res in data if res]
62
+
63
+ def generate_question_context_pairs(self,
64
+ context_tuple: Tuple[str, str],
65
+ num_questions_per_chunk: int=2,
66
+ max_words_per_question: int=10
67
+ ) -> List[str]:
68
+
69
+ doc_id, context = context_tuple
70
+ prompt = f'Context information is included below enclosed in triple backticks. Given the context information and not prior knowledge, generate questions based on the below query.\n\nYou are an end user querying for information about your favorite podcast. \
71
+ Your task is to set up {num_questions_per_chunk} questions that can be answered using only the given context. The questions should be diverse in nature across the document and be no longer than {max_words_per_question} words. \
72
+ Restrict the questions to the context information provided.\n\
73
+ ```{context}```\n\n'
74
+
75
+ response = self.get_chat_completion(prompt=prompt, temperature=0, max_tokens=500, show_response=True)
76
+ questions = response.choices[0].message.content
77
+ return (doc_id, questions)
78
+
79
+ def batch_generate_question_context_pairs(self,
80
+ context_tuple_list: List[Tuple[str, str]],
81
+ num_questions_per_chunk: int=2,
82
+ max_words_per_question: int=10
83
+ ) -> List[Tuple[str, str]]:
84
+ data = []
85
+ progress = tqdm(unit="Generated Questions", total=len(context_tuple_list))
86
+ with ThreadPoolExecutor(max_workers=2*os.cpu_count()) as exec:
87
+ futures = [exec.submit(self.generate_question_context_pairs, context_tuple, num_questions_per_chunk, max_words_per_question) for context_tuple in context_tuple_list]
88
+ for future in as_completed(futures):
89
+ result = future.result()
90
+ if result:
91
+ data.append(result)
92
+ progress.update(1)
93
+ return data
94
+
95
+ def get_embedding(self):
96
+ pass
97
+
98
+ def write_to_file(self, file_handle, data: str) -> None:
99
+ file_handle.write(data)
100
+ file_handle.write('\n')
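For reference, a minimal usage sketch of the `GPT_Turbo` wrapper defined above; the question text is illustrative and the call assumes `OPENAI_API_KEY` is available via Streamlit secrets or the environment, as handled at the top of the module:

```
from openai_interface import GPT_Turbo

gpt = GPT_Turbo()  # defaults to gpt-3.5-turbo-0613

answer = gpt.get_chat_completion(
    prompt='In one sentence, what is Retrieval Augmented Generation?',
    temperature=0,
    max_tokens=100,
)
print(answer)
```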
preprocessing.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ from typing import List, Union, Dict
5
+ from loguru import logger
6
7
+ import pathlib
8
+
9
+
10
+ ## Set of helper functions that support data preprocessing
11
+ class FileIO:
12
+ '''
13
+ Convenience class for saving and loading data in parquet and
14
+ json formats to/from disk.
15
+ '''
16
+
17
+ def save_as_parquet(self,
18
+ file_path: str,
19
+ data: Union[List[dict], pd.DataFrame],
20
+ overwrite: bool=False) -> None:
21
+ '''
22
+ Saves DataFrame to disk as a parquet file. Removes the index.
23
+
24
+ Args:
25
+ -----
26
+ file_path : str
27
+ Output path to save file, if not included "parquet" will be appended
28
+ as file extension.
29
+ data : Union[List[dict], pd.DataFrame]
30
+ Data to save as parquet file. If data is a list of dicts, it will be
31
+ converted to a DataFrame before saving.
32
+ overwrite : bool
33
+ Overwrite existing file if True, otherwise raise FileExistsError.
34
+ '''
35
+ if isinstance(data, list):
36
+ data = self._convert_toDataFrame(data)
37
+ if not file_path.endswith('parquet'):
38
+ file_path = self._rename_file_extension(file_path, 'parquet')
39
+ self._check_file_path(file_path, overwrite=overwrite)
40
+ data.to_parquet(file_path, index=False)
41
+ logger.info(f'DataFrame saved as parquet file here: {file_path}')
42
+
43
+ def _convert_toDataFrame(self, data: List[dict]) -> pd.DataFrame:
44
+ return pd.DataFrame().from_dict(data)
45
+
46
+ def _rename_file_extension(self, file_path: str, extension: str):
47
+ '''
48
+ Renames file with appropriate extension if file_path
49
+ does not already have correct extension.
50
+ '''
51
+ prefix = os.path.splitext(file_path)[0]
52
+ file_path = prefix + '.' + extension
53
+ return file_path
54
+
55
+ def _check_file_path(self, file_path: str, overwrite: bool) -> None:
56
+ '''
57
+ Checks for existence of file and overwrite permissions.
58
+ '''
59
+ if os.path.exists(file_path) and not overwrite:
60
+ raise FileExistsError(f'File by name {file_path} already exists, try using another file name or set overwrite to True.')
61
+ elif os.path.exists(file_path):
62
+ os.remove(file_path)
63
+ else:
64
+ file_name = os.path.basename(file_path)
65
+ dir_structure = file_path.replace(file_name, '')
66
+ pathlib.Path(dir_structure).mkdir(parents=True, exist_ok=True)
67
+
68
+ def load_parquet(self, file_path: str, verbose: bool=True) -> List[dict]:
69
+ '''
70
+ Loads parquet from disk, converts to pandas DataFrame as intermediate
71
+ step and outputs a list of dicts (docs).
72
+ '''
73
+ df = pd.read_parquet(file_path)
74
+ vector_labels = ['content_vector', 'image_vector', 'content_embedding']
75
+ for label in vector_labels:
76
+ if label in df.columns:
77
+ df[label] = df[label].apply(lambda x: x.tolist())
78
+ if verbose:
79
+ memory_usage = round(df.memory_usage().sum()/(1024*1024),2)
80
+ print(f'Shape of data: {df.values.shape}')
81
+ print(f'Memory Usage: {memory_usage}+ MB')
82
+ list_of_dicts = df.to_dict('records')
83
+ return list_of_dicts
84
+
85
+ def load_json(self, file_path: str):
86
+ '''
87
+ Loads json file from disk.
88
+ '''
89
+ with open(file_path) as f:
90
+ data = json.load(f)
91
+ return data
92
+
93
+ def save_as_json(self,
94
+ file_path: str,
95
+ data: Union[List[dict], dict],
96
+ indent: int=4,
97
+ overwrite: bool=False
98
+ ) -> None:
99
+ '''
100
+ Saves data to disk as a json file. Data can be a list of dicts or a single dict.
101
+ '''
102
+ if not file_path.endswith('json'):
103
+ file_path = self._rename_file_extension(file_path, 'json')
104
+ self._check_file_path(file_path, overwrite=overwrite)
105
+ with open(file_path, 'w') as f:
106
+ json.dump(data, f, indent=indent)
107
+ logger.info(f'Data saved as json file here: {file_path}')
108
+
109
+ class Utilities:
110
+
111
+ def create_video_url(self, video_id: str, playlist_id: str):
112
+ '''
113
+ Creates a hyperlink to a video episode given a video_id and playlist_id.
114
+
115
+ Args:
116
+ -----
117
+ video_id : str
118
+ Video id of the episode from YouTube
119
+ playlist_id : str
120
+ Playlist id of the episode from YouTube
121
+ '''
122
+ return f'https://www.youtube.com/watch?v={video_id}&list={playlist_id}'
123
+
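A short round-trip sketch for the `FileIO` helper above; the file path and toy records are illustrative only:

```
from preprocessing import FileIO

docs = [{'doc_id': 'abc_1', 'content': 'first chunk of transcript text'},
        {'doc_id': 'abc_2', 'content': 'second chunk of transcript text'}]

io = FileIO()
io.save_as_parquet(file_path='./data/toy_chunks.parquet', data=docs, overwrite=True)

reloaded = io.load_parquet('./data/toy_chunks.parquet')  # -> list of dicts
assert reloaded[0]['doc_id'] == 'abc_1'
```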
prompt_templates.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ question_answering_system = '''
2
+ You are the host of the show Impact Theory, and your name is Tom Bilyeu. The description of your show is as follows:
3
+ If you’re looking to thrive in uncertain times, achieve unprecedented goals, and improve the most meaningful aspects of your life, then Impact Theory is the show for you. Hosted by Tom Bilyeu, a voracious learner and hyper-successful entrepreneur, the show investigates and analyzes the most useful topics with the world’s most sought-after guests.
4
+ Bilyeu attacks each episode with a clear desire to further evolve the holistic skillset that allowed him to co-found the billion-dollar company Quest Nutrition, generate over half a billion organic views on his content, build a thriving marriage of over 20 years, and quantifiably improve the lives of over 10,000 people through his school, Impact Theory University.
5
+ Bilyeu’s insatiable hunger for knowledge gives the show urgency, relevance, and depth while leaving listeners with the knowledge, tools, and empowerment to take control of their lives and develop true personal power.
6
+ '''
7
+
8
+ question_answering_prompt_single = '''
9
+ Use the below context enclosed in triple back ticks to answer the question. If the context does not provide enough information to answer the question, then use any knowledge you have to answer the question.\n
10
+ ```{context}```\n
11
+ Question:\n
12
+ {question}.\n
13
+ Answer:
14
+ '''
15
+
16
+ question_answering_prompt_series = '''
17
+ Your task is to synthesize and reason over a series of transcripts of an interview between Tom Bilyeu and his guest(s).
18
+ After your synthesis, use the series of transcripts to answer the below question. The series will be in the following format:\n
19
+ ```
20
+ Show Summary: <summary>
21
+ Show Guest: <guest>
22
+ Transcript: <transcript>
23
+ ```\n\n
24
+ Start Series:
25
+ ```
26
+ {series}
27
+ ```
28
+ Question:\n
29
+ {question}\n
30
+ Answer the question and provide reasoning if necessary to explain the answer.\n
31
+ If the context does not provide enough information to answer the question, then \n
32
+ state that you cannot answer the question with the provided context.\n
33
+
34
+ Answer:
35
+ '''
36
+
37
+ context_block = '''
38
+ Show Summary: {summary}
39
+ Show Guest: {guest}
40
+ Transcript: {transcript}
41
+ '''
42
+
43
+ qa_generation_prompt = '''
44
+ Impact Theory episode summary and episode guest are below:
45
+
46
+ ---------------------
47
+ Summary: {summary}
48
+ ---------------------
49
+ Guest: {guest}
50
+ ---------------------
51
+ Given the Summary and Guest of the episode as context \
52
+ use the following randomly selected transcript section \
53
+ of the episode and not prior knowledge, generate questions that can \
54
+ be answered by the transcript section:
55
+
56
+ ---------------------
57
+ Transcript: {transcript}
58
+ ---------------------
59
+
60
+ Your task is to create {num_questions_per_chunk} questions that can \
61
+ only be answered given the previous context and transcript details. \
62
+ The question should randomly start with How, Why, or What.
63
+ '''
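The templates above are plain Python format strings; a quick sketch of how they compose, with placeholder summary, guest, and transcript text:

```
from prompt_templates import question_answering_prompt_series, context_block

chunks = [
    {'summary': 'Episode on discipline.', 'guest': 'Guest Name', 'content': '...transcript text...'},
    {'summary': 'Episode on discipline.', 'guest': 'Guest Name', 'content': '...more transcript...'},
]

series = '\n'.join(context_block.format(summary=c['summary'],
                                        guest=c['guest'],
                                        transcript=c['content'])
                   for c in chunks)

prompt = question_answering_prompt_series.format(
    series=series,
    question='What does the guest say about discipline?')
print(prompt)
```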
prompt_templates_luis.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ question_answering_system = '''
2
+ You are the host of the show Impact Theory, and your name is Tom Bilyeu. The description of your show is as follows:
3
+ If you’re looking to thrive in uncertain times, achieve unprecedented goals, and improve the most meaningful aspects of your life, then Impact Theory is the show for you. Hosted by Tom Bilyeu, a voracious learner and hyper-successful entrepreneur, the show investigates and analyzes the most useful topics with the world’s most sought-after guests.
4
+ Bilyeu attacks each episode with a clear desire to further evolve the holistic skillset that allowed him to co-found the billion-dollar company Quest Nutrition, generate over half a billion organic views on his content, build a thriving marriage of over 20 years, and quantifiably improve the lives of over 10,000 people through his school, Impact Theory University.
5
+ Bilyeu’s insatiable hunger for knowledge gives the show urgency, relevance, and depth while leaving listeners with the knowledge, tools, and empowerment to take control of their lives and develop true personal power.
6
+ '''
7
+
8
+ question_answering_prompt_single = '''
9
+ Use the below context enclosed in triple back ticks to answer the question. If the context does not provide enough information to answer the question, then use any knowledge you have to answer the question.\n
10
+ ```{context}```\n
11
+ Question:\n
12
+ {question}.\n
13
+ Answer:
14
+ '''
15
+
16
+ question_answering_prompt_series = '''
17
+ Your task is to synthesize and reason over a series of transcripts of an interview between Tom Bilyeu and his guest(s).
18
+ After your synthesis, use the series of transcripts to answer the below question. The series will be in the following format:\n
19
+ ```
20
+ Show Summary: <summary>
21
+ Show Guest: <guest>
22
+ Transcript: <transcript>
23
+ ```\n\n
24
+ Start Series:
25
+ ```
26
+ {series}
27
+ ```
28
+ Question:\n
29
+ {question}\n
30
+ Answer the question and provide reasoning if necessary to explain the answer.\n
31
+ If the context does not provide enough information to answer the question, then \n
32
+ state that you cannot answer the question with the provided context.\n
33
+
34
+ Answer:
35
+ '''
36
+
37
+ context_block = '''
38
+ Show Summary: {summary}
39
+ Show Guest: {guest}
40
+ Transcript: {transcript}
41
+ '''
42
+
43
+ qa_generation_prompt = '''
44
+ Impact Theory episode summary and episode guest are below:
45
+
46
+ ---------------------
47
+ Summary: {summary}
48
+ ---------------------
49
+ Guest: {guest}
50
+ ---------------------
51
+ Given the Summary and Guest of the episode as context \
52
+ use the following randomly selected transcript section \
53
+ of the episode and not prior knowledge, generate questions that can \
54
+ be answered by the transcript section:
55
+
56
+ ---------------------
57
+ Transcript: {transcript}
58
+ ---------------------
59
+
60
+ Your task is to create {num_questions_per_chunk} questions that can \
61
+ only be answered given the previous context and transcript details. \
62
+ The question should randomly start with How, Why, or What.
63
+ '''
readme2.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Welcome to Vector Search Applications with LLMs
2
+ This is the course repository for Vector Search Applications with LLMs taught by [Chris Sanchez](https://www.linkedin.com/in/excellenceisahabit/) with assistance from [Matias Weber](https://www.linkedin.com/in/matiasweber/).
3
+ The course is designed to teach search and discovery industry best practices, culminating in a demo Retrieval Augmented Generation (RAG) application. Along the way students will learn all of the components of a RAG system, including data preprocessing, embedding creation, vector database selection, indexing, retrieval systems, reranking, retrieval evaluation, question answering through an LLM, and UI implementation through Streamlit.
4
+
5
+ # Prerequisites - Technical Experience
6
+ Students are expected to have the following technical skills prior to enrolling. Students who do not meet these prerequisites will likely have an overly challenging learning experience:
7
+ - Minimum of 1 year of experience coding in Python. Skill sets should include object-oriented programming, dictionary and list comprehensions, lambda functions, setting up virtual environments, and comfort with git version control.
8
+ - Professional or academic experience working with search engines.
9
+ - Ability to comfortably navigate the command line, including familiarity with Docker.
10
+ - Nice to have but not strictly required:
11
+ - experience fine-tuning a ML model
12
+ - familiarity with the Streamlit API
13
+ - familiarity with making inference calls to a Generative LLM (OpenAI or Llama-2)
14
+ # Prerequisites - Administrative
15
+ 1. Students will need access to their own compute environment, whether locally or remote. There are no hard requirements for RAM or CPU processing power, but in general the more punch the better.
16
+ 2. Students will need accounts with the following organizations:
17
+ - Either an [OpenAI](https://openai.com) account **(RECOMMENDED)** or a [HuggingFace](https://huggingface.co/join) account. Students have the option of either using a paid LLM service (OpenAI) or using the open source `meta-llama/Llama-2-7b-chat-hf` model. Students choosing the latter option will first need to [register with Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) to request access to the Llama-2 model.
18
+ - An account with [weaviate.io](https://weaviate.io). The current iteration of this course will use Weaviate as a sparse and dense vector database. Weaviate offers free cloud instance cluster resources for 21 days (as of November 2023). **Students are advised to NOT CREATE** a Weaviate cloud cluster until the course officially starts.
19
+ - A standard [Github](https://github.com/) account in order to fork this repo, clone a copy, and submit commits to the fork as needed throughout the course.
20
+
21
+ # Setup
22
+ 1. Fork this course repo (see upper right hand corner of the repo web page).
23
+ <img src="assets/forkbutton.png" alt="fork button" width="300" height="auto">
24
+ 2. Clone a copy of the forked repo into the dev environment of your choice. Navigate into the cloned `vectorsearch-applications` directory.
25
+ 3. Create a python virtual environment using your library of choice. Here's an example using [`conda`](https://docs.conda.io/projects/miniconda/en/latest/):
26
+ ```
27
+ conda create --name impactenv -y python=3.10
28
+ ```
29
+ 4. Once the environment is created, activate the environment and install dependencies.
30
+ ```
31
+ conda activate impactenv
32
+
33
+ pip install -r requirements.txt
34
+ ```
35
+ 5. Last but not least create a `.env` text file in your cloned repo. At a minimum, add the following environment variables:
36
+ ```
37
+ OPENAI_API_KEY= "your OpenAI account API Key"
38
+ HF_TOKEN= "your HuggingFace account token" <--- Optional: not required if using OpenAI
39
+ WEAVIATE_API_KEY= "your Weaviate cluster API Key" <--- you will get this on Day One of the course
40
+ WEAVIATE_ENDPOINT= "your Weaviate cluster endpoint" <--- you will get this on Day One of the course
41
+ ```
42
+ 6. If you've made it this far, you are ready to start the course. Enjoy the process!
43
+ <img src="assets/getsome.jpg" alt="jocko" width="500" height="auto">
requirements.txt ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4==4.12.2
2
+ datasets==2.14.3
3
+ huggingface-hub==0.16.4
4
+ ipython==8.14.0
5
+ ipywidgets==8.1.1
6
+ jedi==0.19.0
7
+ jupyter-events==0.7.0
8
+ jupyter-lsp==2.2.0
9
+ jupyter_client==8.3.0
10
+ jupyter_core==5.3.1
11
+ jupyter_server==2.7.0
12
+ jupyter_server_terminals==0.4.4
13
+ jupyterlab==4.0.4
14
+ jupyterlab-pygments==0.2.2
15
+ jupyterlab-widgets==3.0.9
16
+ jupyterlab_server==2.24.0
17
+ langchain==0.0.310
18
+ langcodes==3.3.0
19
+ langsmith==0.0.43
20
+ llama-hub==0.0.47post1
21
+ llama-index==0.9.6.post1
22
+ loguru==0.7.0
23
+ matplotlib==3.7.2
24
+ matplotlib-inline==0.1.6
25
+ numpy==1.24.4
26
+ openai==1.3.5
27
+ pandas==2.0.3
28
+ protobuf==4.23.4
29
+ pyarrow==12.0.1
30
+ python-dotenv==1.0.0
31
+ rank-bm25==0.2.2
32
+ requests==2.31.0
33
+ requests-oauthlib==1.3.1
34
+ rich==13.7.0
35
+ sentence-transformers==2.2.2
36
+ streamlit==1.28.2
37
+ tiktoken==0.5.1
38
+ tokenizers==0.13.3
39
+ torch==2.0.1
40
+ tqdm==4.66.1
41
+ transformers==4.33.1
42
+ weaviate-client==3.25.3
43
+ polars>=0.19
44
+ plotly
45
+ angle-emb==0.1.5 # for UAE-Large-V1 model
46
+ streamlit-option-menu==0.3.6
47
+ hydralit_components==1.0.10
48
+ pathlib
49
+ gdown
50
+ modal
reranker.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import CrossEncoder
2
+ from torch.nn import Sigmoid
3
+ from typing import List, Union
4
+ import numpy as np
5
+ from loguru import logger
6
+
7
+ class ReRanker(CrossEncoder):
8
+ '''
9
+ Cross-Encoder models achieve higher performance than Bi-Encoders,
10
+ however, they do not scale well to large datasets. The lack of scalability
11
+ is due to the underlying cross-attention mechanism, which is computationally
12
+ expensive. Thus a Bi-Encoder is best used for 1st-stage document retrieval and
13
+ a Cross-Encoder is used to re-rank the retrieved documents.
14
+
15
+ https://www.sbert.net/examples/applications/cross-encoder/README.html
16
+ '''
17
+
18
+ def __init__(self,
19
+ model_name: str='cross-encoder/ms-marco-MiniLM-L-6-v2',
20
+ **kwargs
21
+ ):
22
+ super().__init__(model_name=model_name,
23
+ **kwargs)
24
+ self.model_name = model_name
25
+ self.score_field = 'cross_score'
26
+ self.activation_fct = Sigmoid()
27
+
28
+ def _cross_encoder_score(self,
29
+ results: List[dict],
30
+ query: str,
31
+ hit_field: str='content',
32
+ apply_sigmoid: bool=True,
33
+ return_scores: bool=False
34
+ ) -> Union[np.array, None]:
35
+ '''
36
+ Given a list of hits from a Retriever:
37
+ 1. Scores hits by passing query and results through CrossEncoder model.
38
+ 2. Adds cross-score key to results dictionary.
39
+ 3. If desired returns np.array of Cross Encoder scores.
40
+ '''
41
+ activation_fct = self.activation_fct if apply_sigmoid else None
42
+ #build query/content list
43
+ cross_inp = [[query, hit[hit_field]] for hit in results]
44
+ #get scores
45
+ cross_scores = self.predict(cross_inp, activation_fct=activation_fct)
46
+ for i, result in enumerate(results):
47
+ result[self.score_field]=cross_scores[i]
48
+
49
+ if return_scores: return cross_scores
50
+
51
+ def rerank(self,
52
+ results: List[dict],
53
+ query: str,
54
+ top_k: int=10,
55
+ apply_sigmoid: bool=True,
56
+ threshold: float=None
57
+ ) -> List[dict]:
58
+ '''
59
+ Given a list of hits from a Retriever:
60
+ 1. Scores hits by passing query and results through CrossEncoder model.
61
+ 2. Adds cross_score key to results dictionary.
62
+ 3. Returns reranked results limited by either a threshold value or top_k.
63
+
64
+ Args:
65
+ -----
66
+ results : List[dict]
67
+ List of results from the Weaviate client
68
+ query : str
69
+ User query
70
+ top_k : int=10
71
+ Number of results to return
72
+ apply_sigmoid : bool=True
73
+ Whether to apply sigmoid activation to cross-encoder scores. If False,
74
+ returns raw cross-encoder scores (logits).
75
+ threshold : float=None
76
+ Minimum cross-encoder score to return. If no hits are above threshold,
77
+ returns top_k hits.
78
+ '''
79
+ # Sort results by the cross-encoder scores
80
+ self._cross_encoder_score(results=results, query=query, apply_sigmoid=apply_sigmoid)
81
+
82
+ sorted_hits = sorted(results, key=lambda x: x[self.score_field], reverse=True)
83
+ if threshold is not None:
84
+ filtered_hits = [hit for hit in sorted_hits if hit[self.score_field] >= threshold]
85
+ if not any(filtered_hits):
86
+ logger.warning(f'No hits above threshold {threshold}. Returning top {top_k} hits.')
87
+ return sorted_hits[:top_k]
88
+ return filtered_hits
89
+ return sorted_hits[:top_k]
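A minimal two-stage retrieval-plus-rerank sketch using the classes in this repo; the class name, query, and credentials are placeholders for values from your own Weaviate cluster:

```
import os
from weaviate_interface import WeaviateClient
from reranker import ReRanker

client = WeaviateClient(api_key=os.environ['WEAVIATE_API_KEY'],
                        endpoint=os.environ['WEAVIATE_ENDPOINT'])
reranker = ReRanker()  # cross-encoder/ms-marco-MiniLM-L-6-v2 by default

query = 'How does the guest define personal power?'
hits = client.hybrid_search(request=query, class_name='Impact_theory_minilm_256', limit=100)

# Re-score the first-stage hits with the cross-encoder and keep the top 5.
top_hits = reranker.rerank(results=hits, query=query, top_k=5)
for hit in top_hits:
    print(round(float(hit['cross_score']), 3), hit['doc_id'])
```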
retrieval_evaluation.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #external files
2
+ from openai_interface import GPT_Turbo
3
+ from weaviate_interface import WeaviateClient
4
+ from llama_index.finetuning import EmbeddingQAFinetuneDataset
5
+ from prompt_templates import qa_generation_prompt
6
+ from reranker import ReRanker
7
+
8
+ #standard library imports
9
+ import json
10
+ import time
11
+ import uuid
12
+ import os
13
+ import re
14
+ import random
15
+ from datetime import datetime
16
+ from typing import List, Dict, Tuple, Union, Literal
17
+
18
+ #misc
19
+ from tqdm import tqdm
20
+
21
+
22
+ class QueryContextGenerator:
23
+ '''
24
+ Class designed for the generation of query/context pairs using a
25
+ Generative LLM. The LLM is used to generate questions from a given
26
+ corpus of text. The query/context pairs can be used to fine-tune
27
+ an embedding model using a MultipleNegativesRankingLoss loss function
28
+ or can be used to create evaluation datasets for retrieval models.
29
+ '''
30
+ def __init__(self, openai_key: str, model_id: str='gpt-3.5-turbo-0613'):
31
+ self.llm = GPT_Turbo(model=model_id, api_key=openai_key)
32
+
33
+ def clean_validate_data(self,
34
+ data: List[dict],
35
+ valid_fields: List[str]=['content', 'summary', 'guest', 'doc_id'],
36
+ total_chars: int=950
37
+ ) -> List[dict]:
38
+ '''
39
+ Strip original data chunks so they only contain valid_fields.
40
+ Remove any chunks less than total_chars in size. Prevents LLM
41
+ from asking questions from sparse content.
42
+ '''
43
+ clean_docs = [{k:v for k,v in d.items() if k in valid_fields} for d in data]
44
+ valid_docs = [d for d in clean_docs if len(d['content']) > total_chars]
45
+ return valid_docs
46
+
47
+ def train_val_split(self,
48
+ data: List[dict],
49
+ n_train_questions: int,
50
+ n_val_questions: int,
51
+ n_questions_per_chunk: int=2,
52
+ total_chars: int=950):
53
+ '''
54
+ Splits corpus into training and validation sets. Training and
55
+ validation samples are randomly selected from the corpus. total_chars
56
+ parameter is set based on pre-analysis of average doc length in the
57
+ training corpus.
58
+ '''
59
+ clean_data = self.clean_validate_data(data, total_chars=total_chars)
60
+ random.shuffle(clean_data)
61
+ train_index = n_train_questions//n_questions_per_chunk
62
+ valid_index = n_val_questions//n_questions_per_chunk
63
+ end_index = valid_index + train_index
64
+ if end_index > len(clean_data):
65
+ raise ValueError('Cannot create dataset with desired number of questions, try using a larger dataset')
66
+ train_data = clean_data[:train_index]
67
+ valid_data = clean_data[train_index:end_index]
68
+ print(f'Length Training Data: {len(train_data)}')
69
+ print(f'Length Validation Data: {len(valid_data)}')
70
+ return train_data, valid_data
71
+
72
+ def generate_qa_embedding_pairs(
73
+ self,
74
+ data: List[dict],
75
+ generate_prompt_tmpl: str=None,
76
+ num_questions_per_chunk: int = 2,
77
+ ) -> EmbeddingQAFinetuneDataset:
78
+ """
79
+ Generate query/context pairs from a list of documents. The query/context pairs
80
+ can be used for fine-tuning an embedding model using a MultipleNegativesRankingLoss
81
+ or can be used to create an evaluation dataset for retrieval models.
82
+
83
+ This function was adapted for this course from the llama_index.finetuning.common module:
84
+ https://github.com/run-llama/llama_index/blob/main/llama_index/finetuning/embeddings/common.py
85
+ """
86
+ generate_prompt_tmpl = qa_generation_prompt if not generate_prompt_tmpl else generate_prompt_tmpl
87
+ queries = {}
88
+ relevant_docs = {}
89
+ corpus = {chunk['doc_id'] : chunk['content'] for chunk in data}
90
+ for chunk in tqdm(data):
91
+ summary = chunk['summary']
92
+ guest = chunk['guest']
93
+ transcript = chunk['content']
94
+ node_id = chunk['doc_id']
95
+ query = generate_prompt_tmpl.format(summary=summary,
96
+ guest=guest,
97
+ transcript=transcript,
98
+ num_questions_per_chunk=num_questions_per_chunk)
99
+ try:
100
+ response = self.llm.get_chat_completion(prompt=query, temperature=0.1, max_tokens=100)
101
+ except Exception as e:
102
+ print(e)
103
+ continue
104
+ result = str(response).strip().split("\n")
105
+ questions = [
106
+ re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
107
+ ]
108
+ questions = [question for question in questions if len(question) > 0]
109
+
110
+ for question in questions:
111
+ question_id = str(uuid.uuid4())
112
+ queries[question_id] = question
113
+ relevant_docs[question_id] = [node_id]
114
+
115
+ # construct dataset
116
+ return EmbeddingQAFinetuneDataset(
117
+ queries=queries, corpus=corpus, relevant_docs=relevant_docs
118
+ )
119
+
120
+ def execute_evaluation(dataset: EmbeddingQAFinetuneDataset,
121
+ class_name: str,
122
+ retriever: WeaviateClient,
123
+ reranker: ReRanker=None,
124
+ alpha: float=0.5,
125
+ retrieve_limit: int=100,
126
+ top_k: int=5,
127
+ chunk_size: int=256,
128
+ hnsw_config_keys: List[str]=['maxConnections', 'efConstruction', 'ef'],
129
+ search_type: Literal['kw', 'vector', 'hybrid', 'all']='all',
130
+ display_properties: List[str]=['doc_id', 'content'],
131
+ dir_outpath: str='./eval_results',
132
+ include_miss_info: bool=False,
133
+ user_def_params: dict=None
134
+ ) -> Union[dict, Tuple[dict, List[dict]]]:
135
+ '''
136
+ Given a dataset, a retriever, and a reranker, evaluate the performance of the retriever and reranker.
137
+ Returns a dict of kw, vector, and hybrid hit rates and MRR scores. If include_miss_info is True, will
138
+ also return a list of kw and vector responses and their associated queries that did not return a hit.
139
+
140
+ Args:
141
+ -----
142
+ dataset: EmbeddingQAFinetuneDataset
143
+ Dataset to be used for evaluation
144
+ class_name: str
145
+ Name of Class on Weaviate host to be used for retrieval
146
+ retriever: WeaviateClient
147
+ WeaviateClient object to be used for retrieval
148
+ reranker: ReRanker
149
+ ReRanker model to be used for results reranking
150
+ alpha: float=0.5
151
+ Weighting factor for BM25 and Vector search.
152
+ alpha can be any number from 0 to 1, defaulting to 0.5:
153
+ alpha = 0 executes a pure keyword search method (BM25)
154
+ alpha = 0.5 weighs the BM25 and vector methods evenly
155
+ alpha = 1 executes a pure vector search method
156
+ retrieve_limit: int=100
157
+ Number of documents to retrieve from Weaviate host
158
+ top_k: int=5
159
+ Number of top results to evaluate
160
+ chunk_size: int=256
161
+ Number of tokens used to chunk text
162
+ hnsw_config_keys: List[str]=['maxConnections', 'efConstruction', 'ef']
163
+ List of keys to be used for retrieving HNSW Index parameters from Weaviate host
164
+ search_type: Literal['kw', 'vector', 'hybrid', 'all']='all'
165
+ Type of search to be evaluated. Options are 'kw', 'vector', 'hybrid', or 'all'
166
+ display_properties: List[str]=['doc_id', 'content']
167
+ List of properties to be returned from Weaviate host for display in response
168
+ dir_outpath: str='./eval_results'
169
+ Directory path for saving results. Directory will be created if it does not
170
+ already exist.
171
+ include_miss_info: bool=False
172
+ Option to include queries and their associated search response values
173
+ for queries that are "total misses"
174
+ user_def_params : dict=None
175
+ Option for user to pass in a dictionary of user-defined parameters and their values.
176
+ Will be automatically added to the results_dict if correct type is passed.
177
+ '''
178
+
179
+ reranker_name = reranker.model_name if reranker else "None"
180
+
181
+ results_dict = {'n':retrieve_limit,
182
+ 'top_k': top_k,
183
+ 'alpha': alpha,
184
+ 'Retriever': retriever.model_name_or_path,
185
+ 'Ranker': reranker_name,
186
+ 'chunk_size': chunk_size,
187
+ 'kw_hit_rate': 0,
188
+ 'kw_mrr': 0,
189
+ 'vector_hit_rate': 0,
190
+ 'vector_mrr': 0,
191
+ 'hybrid_hit_rate':0,
192
+ 'hybrid_mrr': 0,
193
+ 'total_misses': 0,
194
+ 'total_questions':0
195
+ }
196
+ #add extra params to results_dict
197
+ results_dict = add_params(retriever, class_name, results_dict, user_def_params, hnsw_config_keys)
198
+
199
+ start = time.perf_counter()
200
+ miss_info = []
201
+ for query_id, q in tqdm(dataset.queries.items(), 'Queries'):
202
+ results_dict['total_questions'] += 1
203
+ hit = False
204
+ #make Keyword, Vector, and Hybrid calls to Weaviate host
205
+ try:
206
+ kw_response = retriever.keyword_search(request=q, class_name=class_name, limit=retrieve_limit, display_properties=display_properties)
207
+ vector_response = retriever.vector_search(request=q, class_name=class_name, limit=retrieve_limit, display_properties=display_properties)
208
+ hybrid_response = retriever.hybrid_search(request=q, class_name=class_name, alpha=alpha, limit=retrieve_limit, display_properties=display_properties)
209
+ #rerank returned responses if reranker is provided
210
+ if reranker:
211
+ kw_response = reranker.rerank(kw_response, q, top_k=top_k)
212
+ vector_response = reranker.rerank(vector_response, q, top_k=top_k)
213
+ hybrid_response = reranker.rerank(hybrid_response, q, top_k=top_k)
214
+
215
+ #collect doc_ids to check for document matches (include only results_top_k)
216
+ kw_doc_ids = {result['doc_id']:i for i, result in enumerate(kw_response[:top_k], 1)}
217
+ vector_doc_ids = {result['doc_id']:i for i, result in enumerate(vector_response[:top_k], 1)}
218
+ hybrid_doc_ids = {result['doc_id']:i for i, result in enumerate(hybrid_response[:top_k], 1)}
219
+
220
+ #extract doc_id for scoring purposes
221
+ doc_id = dataset.relevant_docs[query_id][0]
222
+
223
+ #increment hit_rate counters and mrr scores
224
+ if doc_id in kw_doc_ids:
225
+ results_dict['kw_hit_rate'] += 1
226
+ results_dict['kw_mrr'] += 1/kw_doc_ids[doc_id]
227
+ hit = True
228
+ if doc_id in vector_doc_ids:
229
+ results_dict['vector_hit_rate'] += 1
230
+ results_dict['vector_mrr'] += 1/vector_doc_ids[doc_id]
231
+ hit = True
232
+ if doc_id in hybrid_doc_ids:
233
+ results_dict['hybrid_hit_rate'] += 1
234
+ results_dict['hybrid_mrr'] += 1/hybrid_doc_ids[doc_id]
235
+ hit = True
236
+ # if no hits, let's capture that
237
+ if not hit:
238
+ results_dict['total_misses'] += 1
239
+ miss_info.append({'query': q,
240
+ 'answer': dataset.corpus[doc_id],
241
+ 'doc_id': doc_id,
242
+ 'kw_response': kw_response,
243
+ 'vector_response': vector_response,
244
+ 'hybrid_response': hybrid_response})
245
+ except Exception as e:
246
+ print(e)
247
+ continue
248
+
249
+ #use raw counts to calculate final scores
250
+ calc_hit_rate_scores(results_dict, search_type=search_type)
251
+ calc_mrr_scores(results_dict, search_type=search_type)
252
+
253
+ end = time.perf_counter() - start
254
+ print(f'Total Processing Time: {round(end/60, 2)} minutes')
255
+ record_results(results_dict, chunk_size, dir_outpath=dir_outpath, as_text=True)
256
+
257
+ if include_miss_info:
258
+ return results_dict, miss_info
259
+ return results_dict
260
+
261
+ def calc_hit_rate_scores(results_dict: Dict[str, Union[str, int]],
262
+ search_type: Union[Literal['kw', 'vector', 'hybrid', 'all'], List[str]]=['kw', 'vector']
263
+ ) -> None:
264
+ if isinstance(search_type, str):
265
+ search_type = ['kw', 'vector', 'hybrid'] if search_type == 'all' else [search_type]
266
+ for prefix in search_type:
267
+ results_dict[f'{prefix}_hit_rate'] = round(results_dict[f'{prefix}_hit_rate']/results_dict['total_questions'],2)
268
+
269
+ def calc_mrr_scores(results_dict: Dict[str, Union[str, int]],
270
+ search_type: Union[Literal['kw', 'vector', 'hybrid', 'all'], List[str]]=['kw', 'vector']
271
+ ) -> None:
272
+ if isinstance(search_type, str):
273
+ search_type = ['kw', 'vector', 'hybrid'] if search_type == 'all' else [search_type]
274
+ for prefix in search_type:
275
+ results_dict[f'{prefix}_mrr'] = round(results_dict[f'{prefix}_mrr']/results_dict['total_questions'],2)
276
+
277
+ def create_dir(dir_path: str) -> None:
278
+ '''
279
+ Checks if directory exists, and creates new directory
280
+ if it does not exist
281
+ '''
282
+ if not os.path.exists(dir_path):
283
+ os.makedirs(dir_path)
284
+
285
+ def record_results(results_dict: Dict[str, Union[str, int]],
286
+ chunk_size: int,
287
+ dir_outpath: str='./eval_results',
288
+ as_text: bool=False
289
+ ) -> None:
290
+ '''
291
+ Write results to output file in either txt or json format
292
+
293
+ Args:
294
+ -----
295
+ results_dict: Dict[str, Union[str, int]]
296
+ Dictionary containing results of evaluation
297
+ chunk_size: int
298
+ Size of text chunks in tokens
299
+ dir_outpath: str
300
+ Path to output directory. Directory only, filename is hardcoded
301
+ as part of this function.
302
+ as_text: bool
303
+ If True, write results as text file. If False, write as json file.
304
+ '''
305
+ create_dir(dir_outpath)
306
+ time_marker = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
307
+ ext = 'txt' if as_text else 'json'
308
+ path = os.path.join(dir_outpath, f'retrieval_eval_{chunk_size}_{time_marker}.{ext}')
309
+ if as_text:
310
+ with open(path, 'a') as f:
311
+ f.write(f"{results_dict}\n")
312
+ else:
313
+ with open(path, 'w') as f:
314
+ json.dump(results_dict, f, indent=4)
315
+
316
+ def add_params(client: WeaviateClient,
317
+ class_name: str,
318
+ results_dict: dict,
319
+ param_options: dict,
320
+ hnsw_config_keys: List[str]
321
+ ) -> dict:
322
+ hnsw_params = {k:v for k,v in client.show_class_config(class_name)['vectorIndexConfig'].items() if k in hnsw_config_keys}
323
+ if hnsw_params:
324
+ results_dict = {**results_dict, **hnsw_params}
325
+ if param_options and isinstance(param_options, dict):
326
+ results_dict = {**results_dict, **param_options}
327
+ return results_dict
328
+
329
+
330
+
331
+
332
+
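Putting the pieces above together, a hedged end-to-end sketch of an evaluation run; the dataset path, class name, and credentials are placeholders:

```
import os
from llama_index.finetuning import EmbeddingQAFinetuneDataset
from weaviate_interface import WeaviateClient
from reranker import ReRanker
from retrieval_evaluation import execute_evaluation

dataset = EmbeddingQAFinetuneDataset.from_json('./data/golden_100.json')  # placeholder path
retriever = WeaviateClient(api_key=os.environ['WEAVIATE_API_KEY'],
                           endpoint=os.environ['WEAVIATE_ENDPOINT'])

results = execute_evaluation(dataset=dataset,
                             class_name='Impact_theory_minilm_256',       # placeholder class
                             retriever=retriever,
                             reranker=ReRanker(),
                             retrieve_limit=100,
                             top_k=5)
print(results['kw_hit_rate'], results['vector_hit_rate'], results['hybrid_hit_rate'])
```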
unitesting_utils.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import urllib.request
3
+
4
+ def load_impact_theory_data():
5
+ '''
6
+ Loads impact_theory_data.json data by trying three options:
7
+ 1. Assumes user is in Google Colab environment and loads file from content dir.
8
+ 2. If 1st option doesn't work, assumes user is in course repo and loads from data dir.
9
+ 3. If 2nd option doesn't work, assumes user does not have direct access to data so
10
+ downloads data direct from course repo.
11
+ '''
12
+ try:
13
+ path = '/content/impact_theory_data.json'
14
+ with open(path) as f:
15
+ data = json.load(f)
16
+ return data
17
+ except Exception:
18
+ print(f"Data not available at {path}")
19
+ try:
20
+ path = './data/impact_theory_data.json'
21
+ with open(path) as f:
22
+ data = json.load(f)
23
+ print(f'OK, data available at {path}')
24
+ return data
25
+ except Exception:
26
+ print(f'Data not available at {path}, downloading from source')
27
+ try:
28
+ with urllib.request.urlopen("https://raw.githubusercontent.com/americanthinker/vectorsearch-applications/main/data/impact_theory_data.json") as url:
29
+ data = json.load(url)
30
+ return data
31
+ except Exception:
32
+ print('Data cannot be loaded from source, please move data file to one of these paths to run this test:\n\
33
+ 1. "/content/impact_theory_data.json" --> if you are in Google Colab\n\
34
+ 2. "./data/impact_theory_data.json" --> if you are in a local environment\n')
utilities/install_kernel.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ echo "Installing Jupyter kernel named $1 with display name $2"
4
+ ipython kernel install --name "$1" --user --display-name "$2"
weaviate_interface.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from weaviate import Client, AuthApiKey
2
+ from dataclasses import dataclass
3
+ from openai import OpenAI
4
+ from sentence_transformers import SentenceTransformer
5
+ from typing import List, Union, Callable
6
+ from torch import cuda
7
+ from tqdm import tqdm
8
+ import time
9
+
10
+ class WeaviateClient(Client):
11
+ '''
12
+ A python native Weaviate Client class that encapsulates Weaviate functionalities
13
+ in one object. Several convenience methods are added for ease of use.
14
+
15
+ Args
16
+ ----
17
+ api_key: str
18
+ The API key for the Weaviate Cloud Service (WCS) instance.
19
+ https://console.weaviate.cloud/dashboard
20
+
21
+ endpoint: str
22
+ The url endpoint for the Weaviate Cloud Service instance.
23
+
24
+ model_name_or_path: str='sentence-transformers/all-MiniLM-L6-v2'
25
+ The name or path of the SentenceTransformer model to use for vector search.
26
+ Will also support OpenAI text-embedding-ada-002 model. This param enables
27
+ the use of most leading models on MTEB Leaderboard:
28
+ https://huggingface.co/spaces/mteb/leaderboard
29
+ openai_api_key: str=None
30
+ The API key for the OpenAI API. Only required if using OpenAI text-embedding-ada-002 model.
31
+ '''
32
+ def __init__(self,
33
+ api_key: str,
34
+ endpoint: str,
35
+ model_name_or_path: str='sentence-transformers/all-MiniLM-L6-v2',
36
+ openai_api_key: str=None,
37
+ **kwargs
38
+ ):
39
+ auth_config = AuthApiKey(api_key=api_key)
40
+ super().__init__(auth_client_secret=auth_config,
41
+ url=endpoint,
42
+ **kwargs)
43
+ self.model_name_or_path = model_name_or_path
44
+ self.openai_model = False
45
+ if self.model_name_or_path == 'text-embedding-ada-002':
46
+ if not openai_api_key:
47
+ raise ValueError(f'OpenAI API key must be provided to use this model: {self.model_name_or_path}')
48
+ self.model = OpenAI(api_key=openai_api_key)
49
+ self.openai_model = True
50
+ else:
51
+ self.model = SentenceTransformer(self.model_name_or_path) if self.model_name_or_path else None
52
+
53
+ self.display_properties = ['title', 'video_id', 'length', 'thumbnail_url', 'views', 'episode_url', \
54
+ 'doc_id', 'guest', 'content'] # 'playlist_id', 'channel_id', 'author'
55
+
56
+ def show_classes(self) -> Union[List[dict], str]:
57
+ '''
58
+ Shows all available classes (indexes) on the Weaviate instance.
59
+ '''
60
+ classes = self.cluster.get_nodes_status()[0]['shards']
61
+ if classes:
62
+ return [d['class'] for d in classes]
63
+ else:
64
+ return "No classes found on cluster."
65
+
66
+ def show_class_info(self) -> Union[List[dict], str]:
67
+ '''
68
+ Shows all information related to the classes (indexes) on the Weaviate instance.
69
+ '''
70
+ classes = self.cluster.get_nodes_status()[0]['shards']
71
+ if classes:
72
+ return [d for d in classes]
73
+ else:
74
+ return "No classes found on cluster."
75
+
76
+ def show_class_properties(self, class_name: str) -> Union[dict, str]:
77
+ '''
78
+ Shows all properties of a class (index) on the Weaviate instance.
79
+ '''
80
+ classes = self.schema.get()
81
+ if classes:
82
+ all_classes = classes['classes']
83
+ for d in all_classes:
84
+ if d['class'] == class_name:
85
+ return d['properties']
86
+ return f'Class "{class_name}" not found on host'
87
+ return f'No Classes found on host'
88
+
89
+ def show_class_config(self, class_name: str) -> Union[dict, str]:
90
+ '''
91
+ Shows all configuration of a class (index) on the Weaviate instance.
92
+ '''
93
+ classes = self.schema.get()
94
+ if classes:
95
+ all_classes = classes['classes']
96
+ for d in all_classes:
97
+ if d['class'] == class_name:
98
+ return d
99
+ return f'Class "{class_name}" not found on host'
100
+ return f'No Classes found on host'
101
+
102
+ def delete_class(self, class_name: str) -> str:
103
+ '''
104
+ Deletes a class (index) on the Weaviate instance, if it exists.
105
+ '''
106
+ available = self._check_class_availability(class_name)
107
+ if isinstance(available, bool):
108
+ if available:
109
+ self.schema.delete_class(class_name)
110
+ not_deleted = self._check_class_availability(class_name)
111
+ if isinstance(not_deleted, bool):
112
+ if not_deleted:
113
+ return f'Class "{class_name}" was not deleted. Try again.'
114
+ else:
115
+ return f'Class "{class_name}" deleted'
116
+ return f'Class "{class_name}" deleted and there are no longer any classes on host'
117
+ return f'Class "{class_name}" not found on host'
118
+ return available
119
+
120
+ def _check_class_availability(self, class_name: str) -> Union[bool, str]:
121
+ '''
122
+ Checks if a class (index) exists on the Weaviate instance.
123
+ '''
124
+ classes = self.schema.get()
125
+ if classes:
126
+ all_classes = classes['classes']
127
+ for d in all_classes:
128
+ if d['class'] == class_name:
129
+ return True
130
+ return False
131
+ else:
132
+ return f'No Classes found on host'
133
+
134
+ def format_response(self,
135
+ response: dict,
136
+ class_name: str
137
+ ) -> List[dict]:
138
+ '''
139
+ Formats json response from Weaviate into a list of dictionaries.
140
+ Expands _additional fields if present into top-level dictionary.
141
+ '''
142
+ if response.get('errors'):
143
+ return response['errors'][0]['message']
144
+ results = []
145
+ hits = response['data']['Get'][class_name]
146
+ for d in hits:
147
+ temp = {k:v for k,v in d.items() if k != '_additional'}
148
+ if d.get('_additional'):
149
+ for key in d['_additional']:
150
+ temp[key] = d['_additional'][key]
151
+ results.append(temp)
152
+ return results
153
+
154
+ def update_ef_value(self, class_name: str, ef_value: int) -> str:
155
+ '''
156
+ Updates ef_value for a class (index) on the Weaviate instance.
157
+ '''
158
+ self.schema.update_config(class_name=class_name, config={'vectorIndexConfig': {'ef': ef_value}})
159
+ print(f'ef_value updated to {ef_value} for class {class_name}')
160
+ return self.show_class_config(class_name)['vectorIndexConfig']
161
+
162
+ def keyword_search(self,
163
+ request: str,
164
+ class_name: str,
165
+ properties: List[str]=['content'],
166
+ limit: int=10,
167
+ where_filter: dict=None,
168
+ display_properties: List[str]=None,
169
+ return_raw: bool=False) -> Union[dict, List[dict]]:
170
+ '''
171
+ Executes Keyword (BM25) search.
172
+
173
+ Args
174
+ ----
175
+ query: str
176
+ User query.
177
+ class_name: str
178
+ Class (index) to search.
179
+ properties: List[str]
180
+ List of properties to search across.
181
+ limit: int=10
182
+ Number of results to return.
183
+ display_properties: List[str]=None
184
+ List of properties to return in response.
185
+ If None, returns all properties.
186
+ return_raw: bool=False
187
+ If True, returns raw response from Weaviate.
188
+ '''
189
+ display_properties = display_properties if display_properties else self.display_properties
190
+ response = (self.query
191
+ .get(class_name, display_properties)
192
+ .with_bm25(query=request, properties=properties)
193
+ .with_additional(['score', "id"])
194
+ .with_limit(limit)
195
+ )
196
+ response = response.with_where(where_filter).do() if where_filter else response.do()
197
+ if return_raw:
198
+ return response
199
+ else:
200
+ return self.format_response(response, class_name)
201
+
202
+ def vector_search(self,
203
+ request: str,
204
+ class_name: str,
205
+ limit: int=10,
206
+ where_filter: dict=None,
207
+ display_properties: List[str]=None,
208
+ return_raw: bool=False,
209
+ device: str='cuda:0' if cuda.is_available() else 'cpu'
210
+ ) -> Union[dict, List[dict]]:
211
+ '''
212
+ Executes vector search using embedding model defined on instantiation
213
+ of WeaviateClient instance.
214
+
215
+ Args
216
+ ----
217
+ query: str
218
+ User query.
219
+ class_name: str
220
+ Class (index) to search.
221
+ limit: int=10
222
+ Number of results to return.
223
+ display_properties: List[str]=None
224
+ List of properties to return in response.
225
+ If None, returns all properties.
226
+ return_raw: bool=False
227
+ If True, returns raw response from Weaviate.
228
+ '''
229
+ display_properties = display_properties if display_properties else self.display_properties
230
+ query_vector = self._create_query_vector(request, device=device)
231
+ response = (
232
+ self.query
233
+ .get(class_name, display_properties)
234
+ .with_near_vector({"vector": query_vector})
235
+ .with_limit(limit)
236
+ .with_additional(['distance'])
237
+ )
238
+ response = response.with_where(where_filter).do() if where_filter else response.do()
239
+ if return_raw:
240
+ return response
241
+ else:
242
+ return self.format_response(response, class_name)
243
+
244
+ def _create_query_vector(self, query: str, device: str) -> List[float]:
245
+ '''
246
+ Creates embedding vector from text query.
247
+ '''
248
+ return self.get_openai_embedding(query) if self.openai_model else self.model.encode(query, device=device).tolist()
249
+
250
+ def get_openai_embedding(self, query: str) -> List[float]:
251
+ '''
252
+ Gets embedding from OpenAI API for query.
253
+ '''
254
+ embedding = self.model.embeddings.create(input=query, model='text-embedding-ada-002').model_dump()
255
+ if embedding:
256
+ return embedding['data'][0]['embedding']
257
+ else:
258
+ raise ValueError(f'No embedding found for query: {query}')
259
+
260
+ def hybrid_search(self,
261
+ request: str,
262
+ class_name: str,
263
+ properties: List[str]=['content'],
264
+ alpha: float=0.5,
265
+ limit: int=10,
266
+ where_filter: dict=None,
267
+ display_properties: List[str]=None,
268
+ return_raw: bool=False,
269
+ device: str='cuda:0' if cuda.is_available() else 'cpu'
270
+ ) -> Union[dict, List[dict]]:
271
+ '''
272
+ Executes Hybrid (BM25 + Vector) search.
273
+
274
+ Args
275
+ ----
276
+ query: str
277
+ User query.
278
+ class_name: str
279
+ Class (index) to search.
280
+ properties: List[str]
281
+ List of properties to search across (using BM25)
282
+ alpha: float=0.5
283
+ Weighting factor for BM25 and Vector search.
284
+ alpha can be any number from 0 to 1, defaulting to 0.5:
285
+ alpha = 0 executes a pure keyword search method (BM25)
286
+ alpha = 0.5 weighs the BM25 and vector methods evenly
287
+ alpha = 1 executes a pure vector search method
288
+ limit: int=10
289
+ Number of results to return.
290
+ display_properties: List[str]=None
291
+ List of properties to return in response.
292
+ If None, returns all properties.
293
+ return_raw: bool=False
294
+ If True, returns raw response from Weaviate.
295
+ '''
296
+ display_properties = display_properties if display_properties else self.display_properties
297
+ query_vector = self._create_query_vector(request, device=device)
298
+ response = (
299
+ self.query
300
+ .get(class_name, display_properties)
301
+ .with_hybrid(query=request,
302
+ alpha=alpha,
303
+ vector=query_vector,
304
+ properties=properties,
305
+ fusion_type='relativeScoreFusion') #hard coded option for now
306
+ .with_additional(["score", "explainScore"])
307
+ .with_limit(limit)
308
+ )
309
+
310
+ response = response.with_where(where_filter).do() if where_filter else response.do()
311
+ if return_raw:
312
+ return response
313
+ else:
314
+ return self.format_response(response, class_name)
315
+
316
+
317
+ class WeaviateIndexer:
318
+
319
+ def __init__(self,
320
+ client: WeaviateClient,
321
+ batch_size: int=150,
322
+ num_workers: int=4,
323
+ dynamic: bool=True,
324
+ creation_time: int=5,
325
+ timeout_retries: int=3,
326
+ connection_error_retries: int=3,
327
+ callback: Callable=None,
328
+ ):
329
+ '''
330
+ Class designed to batch index documents into Weaviate. Instantiating
331
+ this class will automatically configure the Weaviate batch client.
332
+ '''
333
+ self._client = client
334
+ self._callback = callback if callback else self._default_callback
335
+
336
+ self._client.batch.configure(batch_size=batch_size,
337
+ num_workers=num_workers,
338
+ dynamic=dynamic,
339
+ creation_time=creation_time,
340
+ timeout_retries=timeout_retries,
341
+ connection_error_retries=connection_error_retries,
342
+ callback=self._callback
343
+ )
344
+
345
+ def _default_callback(self, results: dict):
346
+ """
347
+ Check batch results for errors.
348
+
349
+ Parameters
350
+ ----------
351
+ results : dict
352
+ The Weaviate batch creation return value.
353
+ """
354
+
355
+ if results is not None:
356
+ for result in results:
357
+ if "result" in result and "errors" in result["result"]:
358
+ if "error" in result["result"]["errors"]:
359
+ print(result["result"])
360
+
361
+ def batch_index_data(self,
362
+ data: List[dict],
363
+ class_name: str,
364
+ vector_property: str='content_embedding'
365
+ ) -> None:
366
+ '''
367
+ Batch function for fast indexing of data onto Weaviate cluster.
368
+ This method assumes that self._client.batch is already configured.
369
+ '''
370
+ start = time.perf_counter()
371
+ with self._client.batch as batch:
372
+ for d in tqdm(data):
373
+
374
+ #define single document
375
+ properties = {k:v for k,v in d.items() if k != vector_property}
376
+ try:
377
+ #add data object to batch
378
+ batch.add_data_object(
379
+ data_object=properties,
380
+ class_name=class_name,
381
+ vector=d[vector_property]
382
+ )
383
+ except Exception as e:
384
+ print(e)
385
+ continue
386
+
387
+ end = time.perf_counter() - start
388
+
389
+ print(f'Batch job completed in {round(end/60, 2)} minutes.')
390
+ class_info = self._client.show_class_info()
391
+ for i, c in enumerate(class_info):
392
+ if c['class'] == class_name:
393
+ print(class_info[i])
394
+ self._client.batch.shutdown()
395
+
396
+ @dataclass
397
+ class WhereFilter:
398
+
399
+ '''
400
+ Simplified interface for constructing a WhereFilter object.
401
+
402
+ Args
403
+ ----
404
+ path: List[str]
405
+ List of properties to filter on.
406
+ operator: str
407
+ Operator to use for filtering. Options: ['And', 'Or', 'Equal', 'NotEqual',
408
+ 'GreaterThan', 'GreaterThanEqual', 'LessThan', 'LessThanEqual', 'Like',
409
+ 'WithinGeoRange', 'IsNull', 'ContainsAny', 'ContainsAll']
410
+ value[dataType]: Union[int, bool, str, float, datetime]
411
+ Value to filter on. The dataType suffix must match the data type of the
412
+ property being filtered on. At least and only one value type must be provided.
413
+ '''
414
+ path: List[str]
415
+ operator: str
416
+ valueInt: int=None
417
+ valueBoolean: bool=None
418
+ valueText: str=None
419
+ valueNumber: float=None
420
+ valueDate: str=None
421
+
422
+ def __post_init__(self):
423
+ operators = ['And', 'Or', 'Equal', 'NotEqual','GreaterThan', 'GreaterThanEqual', 'LessThan',\
424
+ 'LessThanEqual', 'Like', 'WithinGeoRange', 'IsNull', 'ContainsAny', 'ContainsAll']
425
+ if self.operator not in operators:
426
+ raise ValueError(f'operator must be one of: {operators}, got {self.operator}')
427
+ values = [self.valueInt, self.valueBoolean, self.valueText, self.valueNumber, self.valueDate]
428
+ if all(v is None for v in values):
429
+ raise ValueError('At least one value must be provided.')
430
+ if len([v for v in values if v is not None]) > 1:
431
+ raise ValueError('At most one value can be provided.')
432
+
433
+ def todict(self):
434
+ return {k:v for k,v in self.__dict__.items() if v is not None}
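Finally, a hedged sketch of a filtered keyword search through the classes above; the guest name, class name, and credentials are placeholders:

```
import os
from weaviate_interface import WeaviateClient, WhereFilter

client = WeaviateClient(api_key=os.environ['WEAVIATE_API_KEY'],
                        endpoint=os.environ['WEAVIATE_ENDPOINT'])

# Restrict the BM25 search to a single guest using the WhereFilter dataclass.
guest_filter = WhereFilter(path=['guest'], operator='Equal', valueText='Guest Name').todict()

hits = client.keyword_search(request='What is the role of consciousness?',
                             class_name='Impact_theory_minilm_256',  # placeholder class
                             where_filter=guest_filter,
                             limit=5)
for hit in hits:
    print(hit['guest'], '-', hit['title'])
```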