In [None]:
import json, os

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter
from llama_index.schema import MetadataMode

In [None]:
from llama_index.finetuning import (
 generate_qa_embedding_pairs,
 EmbeddingQAFinetuneDataset,
)
from llama_index.finetuning import SentenceTransformersFinetuneEngine

In [None]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path

In [None]:
from llama_index.llms import OpenAI

In [None]:
# Print the installed llama_index version so the run environment is recorded.
import llama_index
print(llama_index.__version__)

In [None]:
# Source PDFs: one file list is loaded for the training QA pairs and one for
# the validation QA pairs (both are passed to load_corpus()).
TRAIN_FILES = ["../raw_documents/HI_Knowledge_Base.pdf"]
VAL_FILES = ["../raw_documents/HI Chapter Summary Version 1.3.pdf"]

# JSON cache locations for the generated QA datasets; they are written with
# save_json() and read back with EmbeddingQAFinetuneDataset.from_json().
TRAIN_CORPUS_FPATH = "../data/train_corpus.json"
VAL_CORPUS_FPATH = "../data/val_corpus.json"

In [None]:
def load_corpus(files, verbose=False):
    """Read the given files and split them into nodes.

    Args:
        files: File paths handed to ``SimpleDirectoryReader(input_files=...)``.
        verbose: When true, print progress messages and enable the splitter's
            progress display.

    Returns:
        The nodes produced by ``SentenceSplitter.get_nodes_from_documents``.
    """

    def _report(message):
        # Progress messages are emitted only in verbose mode.
        if verbose:
            print(message)

    _report(f"Loading files {files}")
    documents = SimpleDirectoryReader(input_files=files).load_data()
    _report(f"Loaded {len(documents)} docs")

    splitter = SentenceSplitter()
    chunks = splitter.get_nodes_from_documents(documents, show_progress=verbose)
    _report(f"Parsed {len(chunks)} nodes")

    return chunks

In [None]:
# Build the QA datasets with the LLM only when a cached JSON copy is missing;
# if either cache file is absent, both datasets are regenerated and re-saved.
if not (os.path.exists(TRAIN_CORPUS_FPATH) and os.path.exists(VAL_CORPUS_FPATH)):
    # Create the cache directories up front so the save step cannot fail on a
    # missing folder after the (paid) LLM generation has already run.
    os.makedirs(os.path.dirname(TRAIN_CORPUS_FPATH) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(VAL_CORPUS_FPATH) or ".", exist_ok=True)

    train_nodes = load_corpus(TRAIN_FILES, verbose=True)
    val_nodes = load_corpus(VAL_FILES, verbose=True)

    train_dataset = generate_qa_embedding_pairs(
        llm=OpenAI(model="gpt-3.5-turbo-1106"), nodes=train_nodes
    )
    val_dataset = generate_qa_embedding_pairs(
        llm=OpenAI(model="gpt-3.5-turbo-1106"), nodes=val_nodes
    )

    train_dataset.save_json(TRAIN_CORPUS_FPATH)
    val_dataset.save_json(VAL_CORPUS_FPATH)

else:
    # Both caches exist: reuse them instead of calling the LLM again.
    train_dataset = EmbeddingQAFinetuneDataset.from_json(TRAIN_CORPUS_FPATH)
    val_dataset = EmbeddingQAFinetuneDataset.from_json(VAL_CORPUS_FPATH)

In [None]:
# Configure the fine-tuning engine with the training QA dataset; the
# validation dataset is handed to the engine as well.
finetune_engine = SentenceTransformersFinetuneEngine(
 train_dataset,
 # Same model id that is evaluated without fine-tuning further down.
 model_id="BAAI/bge-small-en-v1.5",
 # Later cells load the fine-tuned model from this same path.
 model_output_path="../models/fine-tuned-embeddings",
 batch_size=5,
 val_dataset=val_dataset
)

In [None]:
# Run fine-tuning with the engine configured in the previous cell.
finetune_engine.finetune()

In [None]:
# Fetch the fine-tuned embedding model object from the engine.
embed_model = finetune_engine.get_finetuned_model()

In [None]:
# Display the fine-tuned model object as the cell output.
embed_model

In [None]:
# Load the saved model with sentence-transformers from the engine's output path.
# NOTE(review): `fine_tuned_embed_model` is not referenced by any later cell
# visible here - confirm whether this cell is still needed.
fine_tuned_embed_model = SentenceTransformer("../models/fine-tuned-embeddings")

In [None]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Compute per-query top-k retrieval hits for an embedding model.

    Every corpus entry is wrapped in a ``TextNode`` and indexed in a
    ``VectorStoreIndex`` built with ``embed_model``; each query is then run
    through a retriever over that index.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (query id -> query text) and ``relevant_docs`` (query id ->
            sequence of relevant corpus ids).
        embed_model: Passed unchanged to ``ServiceContext.from_defaults``.
        top_k: Number of nodes requested from the retriever per query.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        A list with one dict per query, with keys ``is_hit``, ``retrieved``
        (retrieved node ids), ``expected`` (first relevant id) and ``query``
        (the query id).

    Raises:
        IndexError: If a query has an empty list of relevant documents.
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, service_context=service_context, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_ids = relevant_docs[query_id]
        # "expected" keeps reporting the first relevant id, as before.
        expected_id = expected_ids[0]
        # A query is a hit when ANY of its relevant documents is retrieved;
        # with a single relevant document this equals the first-id check.
        is_hit = any(doc_id in retrieved_ids for doc_id in expected_ids)

        eval_results.append(
            {
                "is_hit": is_hit,
                "retrieved": retrieved_ids,
                "expected": expected_id,
                "query": query_id,
            }
        )
    return eval_results

In [None]:
def evaluate_st(
    dataset,
    model_id,
    name,
    output_path="../results/",
):
    """Evaluate a sentence-transformers model with InformationRetrievalEvaluator.

    Args:
        dataset: Object exposing ``corpus``, ``queries`` and ``relevant_docs``,
            which are passed straight to the evaluator.
        model_id: Model name or path given to ``SentenceTransformer``.
        name: Evaluator name, forwarded as ``name=``.
        output_path: Directory handed to the evaluator as ``output_path``;
            created if missing. Defaults to the previously hard-coded
            ``"../results/"`` so existing calls behave the same.

    Returns:
        Whatever the evaluator call returns for the model.
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(
        queries, corpus, relevant_docs, name=name
    )
    model = SentenceTransformer(model_id)
    # The directory must exist before the evaluator is asked to write into it.
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

### Using OpenAI Ada embedding

In [None]:
# Baseline: run the retrieval evaluation on the validation set with an
# OpenAIEmbedding constructed with its defaults.
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)

In [None]:
# One row per validation query, with columns is_hit / retrieved / expected / query.
df_ada = pd.DataFrame(ada_val_results)

In [None]:
# Preview the first five result rows.
df_ada[:5]

In [None]:
# Mean of the boolean is_hit column = fraction of queries counted as hits;
# shown together with the number of evaluated queries.
hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada, len(df_ada)

### Using BAAI bge-small model without fine-tuning

In [None]:
# The model is given as a string; evaluate() forwards it unchanged to
# ServiceContext.from_defaults(embed_model=...).
# NOTE(review): presumably the "local:" prefix selects a locally run model - confirm.
bge = "local:BAAI/bge-small-en-v1.5"
bge_val_results = evaluate(val_dataset, bge)

In [None]:
# One row per validation query, with columns is_hit / retrieved / expected / query.
df_bge = pd.DataFrame(bge_val_results)

In [None]:
# Preview the first five result rows.
df_bge[:5]

In [None]:
# Mean of the boolean is_hit column = fraction of queries counted as hits;
# shown together with the number of evaluated queries.
hit_rate_bge = df_bge["is_hit"].mean()
hit_rate_bge, len(df_bge)

In [None]:
# Run the sentence-transformers evaluator on the base model under the name
# "bge"; the summary cells later read a "..._bge_results.csv" file.
evaluate_st(val_dataset, "BAAI/bge-small-en-v1.5", name="bge")

### Using BAAI bge-small model with fine-tuning

In [None]:
# Same evaluation as for the base model, pointed at the fine-tuning output
# directory instead of the hub model id.
finetuned = "local:../models/fine-tuned-embeddings"
val_results_finetuned = evaluate(val_dataset, finetuned)

In [None]:
# One row per validation query, with columns is_hit / retrieved / expected / query.
df_finetuned = pd.DataFrame(val_results_finetuned)

In [None]:
# Mean of the boolean is_hit column = fraction of queries counted as hits.
hit_rate_finetuned = df_finetuned["is_hit"].mean()
hit_rate_finetuned

In [None]:
# Run the sentence-transformers evaluator on the fine-tuned model under the
# name "finetuned"; the summary cells later read a "..._finetuned_results.csv" file.
evaluate_st(val_dataset, "../models/fine-tuned-embeddings", name="finetuned")

### Summary

In [None]:
# Tag every result frame with a model label so the frames can be stacked and
# grouped. NOTE: this adds the column to the existing frames in place.
df_ada["model"] = "ada"
df_bge["model"] = "bge"
df_finetuned["model"] = "fine_tuned"

In [None]:
# Stack the per-model results and report each model's hit rate.
df_all = pd.concat([df_ada, df_bge, df_finetuned])
# Select the column explicitly: the first positional argument of
# GroupBy.mean() is `numeric_only`, not a column name, so mean("is_hit")
# only worked because a non-empty string is truthy.
df_all.groupby("model")[["is_hit"]].mean()

In [None]:
# Load the sentence-transformers evaluation CSVs from ../results/.
# NOTE(review): presumably written by the evaluate_st() calls with
# name="bge" and name="finetuned" - those cells must have run first.
df_st_bge = pd.read_csv(
 "../results/Information-Retrieval_evaluation_bge_results.csv"
)
df_st_finetuned = pd.read_csv(
 "../results/Information-Retrieval_evaluation_finetuned_results.csv"
)

In [None]:
# Label each evaluator result frame (column added in place), stack them, and
# index by model so both models' metrics can be compared in one table.
df_st_bge["model"] = "bge"
df_st_finetuned["model"] = "fine_tuned"
df_st_all = pd.concat([df_st_bge, df_st_finetuned])
df_st_all = df_st_all.set_index("model")
df_st_all