In [2]:
import json, os

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter
from llama_index.schema import MetadataMode

In [4]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.finetuning import SentenceTransformersFinetuneEngine

In [19]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [20]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path

In [5]:
from llama_index.llms import OpenAI

In [6]:
# Record the installed llama_index version next to the results so the run can
# be reproduced with the same library release.
import llama_index
print(llama_index.__version__)

0.9.39


In [7]:
# Source PDFs: one document feeds the training split, a different one the
# validation split.
TRAIN_FILES = [
    "../raw_documents/HI_Knowledge_Base.pdf",
]
VAL_FILES = [
    "../raw_documents/HI Chapter Summary Version 1.3.pdf",
]

# JSON files in which the generated question/answer datasets are cached.
TRAIN_CORPUS_FPATH = "../data/train_corpus.json"
VAL_CORPUS_FPATH = "../data/val_corpus.json"

In [8]:
def load_corpus(files, verbose=False):
    """Read the given files and split them into nodes.

    Args:
        files: Paths handed to ``SimpleDirectoryReader(input_files=...)``.
        verbose: When true, print progress messages and let the splitter
            show its progress bar.

    Returns:
        The nodes returned by ``SentenceSplitter.get_nodes_from_documents``.
    """
    if verbose:
        print(f"Loading files {files}")

    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # Default-configured splitter; the progress bar follows the verbose flag.
    splitter = SentenceSplitter()
    parsed_nodes = splitter.get_nodes_from_documents(
        documents,
        show_progress=verbose,
    )
    if verbose:
        print(f"Parsed {len(parsed_nodes)} nodes")

    return parsed_nodes

In [9]:
def _load_or_generate_dataset(files, dataset_fpath):
    """Return the QA dataset cached at ``dataset_fpath``, generating it if missing.

    Each dataset is cached independently, so a missing file triggers LLM
    generation for that split only and never overwrites the other split's
    existing cache.

    Args:
        files: Source documents passed to ``load_corpus``.
        dataset_fpath: JSON path used both as cache location and save target.

    Returns:
        The dataset loaded via ``EmbeddingQAFinetuneDataset.from_json`` or
        freshly produced by ``generate_qa_embedding_pairs``.
    """
    if os.path.exists(dataset_fpath):
        return EmbeddingQAFinetuneDataset.from_json(dataset_fpath)

    # Create the target directory *before* the LLM calls so the generated
    # pairs cannot be lost to a failing save afterwards.
    os.makedirs(os.path.dirname(dataset_fpath) or ".", exist_ok=True)

    nodes = load_corpus(files, verbose=True)
    dataset = generate_qa_embedding_pairs(
        llm=OpenAI(model="gpt-3.5-turbo-1106"), nodes=nodes
    )
    dataset.save_json(dataset_fpath)
    return dataset


train_dataset = _load_or_generate_dataset(TRAIN_FILES, TRAIN_CORPUS_FPATH)
val_dataset = _load_or_generate_dataset(VAL_FILES, VAL_CORPUS_FPATH)

In [11]:
# Configure fine-tuning of the bge-small base model on the generated training
# pairs. The fine-tuned weights go to the local "test_model" directory, which
# later cells load by that name; the validation dataset is handed to the
# engine as well.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en-v1.5",
    model_output_path="test_model",
    batch_size=5,
    val_dataset=val_dataset
)

In [12]:
# Run the fine-tuning job (the recorded run shows 2 epochs of 183 iterations).
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/183 [00:00<?, ?it/s]

Iteration:   0%|          | 0/183 [00:00<?, ?it/s]

In [13]:
# Fetch the fine-tuned model from the engine as an embedding object.
embed_model = finetune_engine.get_finetuned_model()

In [14]:
# Show the repr to confirm the wrapper points at the "test_model" directory.
embed_model

HuggingFaceEmbedding(model_name='test_model', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x3c7fadca0>, tokenizer_name='test_model', max_length=512, pooling=<Pooling.CLS: 'cls'>, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

In [15]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Compute per-query retrieval hits for an embedding model.

    The corpus of ``dataset`` is indexed with ``embed_model``; each query is
    then run through a top-``top_k`` retriever and counted as a hit when any
    of its relevant documents is among the retrieved nodes.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (id -> query string) and ``relevant_docs`` (query id -> sequence
            of corpus ids).
        embed_model: Forwarded to
            ``ServiceContext.from_defaults(embed_model=...)``.
        top_k: Number of nodes retrieved per query.
        verbose: Currently unused; accepted so existing callers keep working.

    Returns:
        A list with one dict per query, with the keys ``"is_hit"``,
        ``"retrieved"`` (retrieved node ids), ``"expected"`` and ``"query"``
        (the query id, not the query text). ``"expected"`` is the first
        relevant document id, or ``None`` when the query has none; for
        datasets with exactly one relevant document per query it is that
        document.
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, service_context=service_context, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]

        # A query may list several relevant documents: retrieving any one of
        # them counts as a hit, and an empty list is a miss rather than an
        # IndexError.
        expected_ids = relevant_docs[query_id]
        retrieved_lookup = set(retrieved_ids)
        is_hit = any(doc_id in retrieved_lookup for doc_id in expected_ids)
        expected_id = expected_ids[0] if expected_ids else None

        eval_result = {
            "is_hit": is_hit,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }
        eval_results.append(eval_result)
    return eval_results

In [16]:
def evaluate_st(
    dataset,
    model_id,
    name,
    output_path="results/",
):
    """Score a SentenceTransformer model with ``InformationRetrievalEvaluator``.

    Args:
        dataset: Object exposing ``queries``, ``corpus`` and
            ``relevant_docs``, passed to the evaluator in that order.
        model_id: Identifier or path given to ``SentenceTransformer``.
        name: Evaluator name, forwarded as ``name=``.
        output_path: Directory handed to the evaluator as ``output_path``;
            created (including parents) if it does not exist. Defaults to
            ``"results/"``.

    Returns:
        Whatever the evaluator call returns; in this notebook's recorded
        runs that is a single float score.
    """
    evaluator = InformationRetrievalEvaluator(
        dataset.queries, dataset.corpus, dataset.relevant_docs, name=name
    )
    model = SentenceTransformer(model_id)
    # The directory must exist before it is handed to the evaluator.
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

### Using OpenAI Ada embedding

In [21]:
# Baseline: OpenAIEmbedding with its default settings on the validation set.
ada = OpenAIEmbedding()
ada_val_results = evaluate(dataset=val_dataset, embed_model=ada)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/200 [00:00<?, ?it/s]

In [22]:
# One row per validation query, one column per key of the result dicts.
df_ada = pd.DataFrame(data=ada_val_results)

In [24]:
# Preview the first five per-query results.
df_ada.head(5)

Unnamed: 0,is_hit,retrieved,expected,query
0,False,"[5b9cd986-33dc-46f1-abae-e4e1dc9e3629, c3c1804...",6a756f03-638d-480d-8222-1a6bf3790e3c,011d84b2-0c26-4c5c-89d1-2a85498f30e0
1,True,"[6a756f03-638d-480d-8222-1a6bf3790e3c, c3c1804...",6a756f03-638d-480d-8222-1a6bf3790e3c,70c5ddd7-eb86-4a41-af70-a23d2392f48d
2,True,"[c83dbd8a-7e62-445e-8c12-a8ad604ff65e, 2177824...",c83dbd8a-7e62-445e-8c12-a8ad604ff65e,a8f4290a-1281-4272-aab9-bf089954a45e
3,True,"[c83dbd8a-7e62-445e-8c12-a8ad604ff65e, 2177824...",c83dbd8a-7e62-445e-8c12-a8ad604ff65e,c1ef991a-1cc6-4dbf-b179-2df688c84301
4,True,"[21778248-2ed9-4147-bdb0-a60337a1a599, c83dbd8...",21778248-2ed9-4147-bdb0-a60337a1a599,1ce25e78-c1e1-487e-9455-9418baa0b60c


In [27]:
# Hit rate = share of queries whose expected document was retrieved;
# shown together with the number of evaluated queries.
hit_rate_ada = df_ada.loc[:, "is_hit"].mean()
hit_rate_ada, df_ada.shape[0]

(0.955, 200)

### Using BAAI bge-small model without fine-tuning

In [26]:
# Off-the-shelf bge-small model, given as a "local:"-prefixed string that
# evaluate() forwards unchanged to ServiceContext.from_defaults.
bge = "local:BAAI/bge-small-en-v1.5"
bge_val_results = evaluate(dataset=val_dataset, embed_model=bge)

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [28]:
# One row per validation query for the base bge model.
df_bge = pd.DataFrame(data=bge_val_results)

In [29]:
# Preview the first five per-query results.
df_bge.head(5)

Unnamed: 0,is_hit,retrieved,expected,query
0,False,"[69a5696d-0c0e-482a-b6a9-f7b87f19945f, fa650c7...",6a756f03-638d-480d-8222-1a6bf3790e3c,011d84b2-0c26-4c5c-89d1-2a85498f30e0
1,True,"[6a756f03-638d-480d-8222-1a6bf3790e3c, d89a649...",6a756f03-638d-480d-8222-1a6bf3790e3c,70c5ddd7-eb86-4a41-af70-a23d2392f48d
2,True,"[c83dbd8a-7e62-445e-8c12-a8ad604ff65e, 2177824...",c83dbd8a-7e62-445e-8c12-a8ad604ff65e,a8f4290a-1281-4272-aab9-bf089954a45e
3,True,"[c83dbd8a-7e62-445e-8c12-a8ad604ff65e, ad2e3eb...",c83dbd8a-7e62-445e-8c12-a8ad604ff65e,c1ef991a-1cc6-4dbf-b179-2df688c84301
4,True,"[21778248-2ed9-4147-bdb0-a60337a1a599, c83dbd8...",21778248-2ed9-4147-bdb0-a60337a1a599,1ce25e78-c1e1-487e-9455-9418baa0b60c


In [30]:
# Hit rate of the base bge model, plus the number of evaluated queries.
hit_rate_bge = df_bge.loc[:, "is_hit"].mean()
hit_rate_bge, df_bge.shape[0]

(0.915, 200)

In [31]:
# Sentence-transformers IR metrics for the base model (written under name "bge").
evaluate_st(dataset=val_dataset, model_id="BAAI/bge-small-en-v1.5", name="bge")

0.7955697668171072

### Using BAAI bge-small model with fine-tuning

In [32]:
# "test_model" is the directory the fine-tuning engine was told to write to.
finetuned = "local:test_model"
val_results_finetuned = evaluate(dataset=val_dataset, embed_model=finetuned)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [33]:
# One row per validation query for the fine-tuned model.
df_finetuned = pd.DataFrame(data=val_results_finetuned)

In [34]:
# Hit rate of the fine-tuned model on the validation queries.
hit_rate_finetuned = df_finetuned.loc[:, "is_hit"].mean()
hit_rate_finetuned

0.97

In [35]:
# Sentence-transformers IR metrics for the fine-tuned model (name "finetuned").
evaluate_st(dataset=val_dataset, model_id="test_model", name="finetuned")

0.8573385846534823

### Summary

In [36]:
# Tag each per-query result frame with the model that produced it, so the
# frames can be stacked and grouped by model.
# NOTE(review): this adds the column in place to frames displayed earlier.
df_ada["model"] = "ada"
df_bge["model"] = "bge"
df_finetuned["model"] = "fine_tuned"

In [37]:
# Stack the per-model results and report the hit rate of each model.
df_all = pd.concat([df_ada, df_bge, df_finetuned])
# Select the metric column explicitly: the first positional argument of
# GroupBy.mean is ``numeric_only``, so ``mean("is_hit")`` only produced this
# table because the string happened to be truthy.
df_all.groupby("model")[["is_hit"]].mean()

Unnamed: 0_level_0,is_hit
model,Unnamed: 1_level_1
ada,0.955
bge,0.915
fine_tuned,0.97


In [38]:
# Load the evaluator result CSVs from the "results/" directory; presumably
# these were written by the evaluate_st runs named "bge" and "finetuned"
# (verify that those cells ran first).
df_st_bge = pd.read_csv(
    "results/Information-Retrieval_evaluation_bge_results.csv"
)
df_st_finetuned = pd.read_csv(
    "results/Information-Retrieval_evaluation_finetuned_results.csv"
)

In [39]:
# Label each metrics frame, then stack them into one table indexed by model.
df_st_bge["model"] = "bge"
df_st_finetuned["model"] = "fine_tuned"
df_st_all = pd.concat([df_st_bge, df_st_finetuned]).set_index("model")
df_st_all

Unnamed: 0_level_0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Recall@1,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bge,-1,-1,0.705,0.865,0.92,0.96,0.705,0.705,0.288333,0.865,...,0.705,0.288333,0.865,0.184,0.92,0.096,0.96,0.792935,0.833595,0.79557
fine_tuned,-1,-1,0.79,0.9,0.97,0.98,0.79,0.79,0.3,0.9,...,0.79,0.3,0.9,0.194,0.97,0.098,0.98,0.856264,0.886738,0.857339
