In [37]:
import math
import numpy as np
from pathlib import Path
from typing import List, Union, Any
from tqdm import tqdm
from sentence_transformers import CrossEncoder
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import CrossEncoder

In [31]:
class AverageInstructEmbeddings(HuggingFaceInstructEmbeddings):
 max_length: int = None
 def __init__(self, max_length: int = 512, **kwargs: Any):
 super().__init__(**kwargs)
 self.max_length = max_length
 if self.max_length < 0:
 print('max_length is not specified, using model default max_seq_length')

 def embed_documents(self, texts: List[str]) -> List[List[float]]:
 all_embeddings = []
 for text in tqdm(texts, desc="Embedding documents"):
 if len(text) > self.max_length and self.max_length > -1:
 n_chunks = math.ceil(len(text)/self.max_length)
 chunks = [
 text[i*self.max_length:(i+1)*self.max_length]
 for i in range(n_chunks)
 ]
 instruction_pairs = [[self.embed_instruction, chunk] for chunk in chunks]
 chunk_embeddings = self.client.encode(instruction_pairs)
 avg_embedding = np.mean(chunk_embeddings, axis=0)
 all_embeddings.append(avg_embedding.tolist())
 else:
 instruction_pairs = [[self.embed_instruction, text]]
 embeddings = self.client.encode(instruction_pairs)
 all_embeddings.append(embeddings[0].tolist())

 return all_embeddings


class BenchDataST:
 def __init__(self, path: str, percentage: float = 0.005, chunk_size: int = 512, chunk_overlap: int = 100):
 self.path = path
 self.percentage = percentage
 self.docs = []
 self.metadata = []
 self.load()
 self.text_splitter = CharacterTextSplitter(separator="", chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 self.docs_processed = self.text_splitter.create_documents(self.docs, self.metadata)

 def load(self):
 for p in Path(self.path).iterdir():
 if not p.is_dir():
 with open(p) as f:
 source = f.readline().strip().replace('source: ', '')
 self.docs.append(f.read())
 self.metadata.append({"source": source})
 self.docs = self.docs[:int(len(self.docs) * self.percentage)]
 self.metadata = self.metadata[:int(len(self.metadata) * self.percentage)]

 def __len__(self):
 return len(self.docs)

 def __getitem__(self, idx):
 return self.docs[idx], self.metadata[idx]

 def __iter__(self):
 for doc, metadata in zip(self.docs, self.metadata):
 yield doc, metadata

 def __repr__(self):
 return f'BenchDataST({len(self)} docs) at {self.path} with {self.percentage} percentage \nSources: {self.metadata} \nChunks: {self.text_splitter}'
 

class BenchmarkST:
 def __init__(self, data: BenchDataST, baseline_model: Union[HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings, AverageInstructEmbeddings], embedding_models: List[Union[HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings, AverageInstructEmbeddings]]):
 self.data = data
 self.baseline_model = baseline_model
 self.embedding_models = embedding_models
 self.baseline_index, self.indexes = self.build_indexes()

 def build_indexes(self):
 indexes = []
 for model in [self.baseline_model] + self.embedding_models:
 print(f"Building index for {model}")
 index = FAISS.from_documents(self.data.docs_processed, model)
 indexes.append(index)
 return indexes[0], indexes[1:]
 
 def add_index(self, index: FAISS):
 self.indexes.append(index)
 
 def evaluate(self, query: str, k: int = 3):
 baseline_results = self.baseline_index.similarity_search_with_score(query, k=k)
 results = []
 for index in self.indexes:
 results.append(index.similarity_search_with_score(query, k=k))
 return baseline_results, results

In [48]:
data = BenchDataST(
 path="./datasets/huggingface_docs/",
 percentage=0.005,
 chunk_size=512,
 chunk_overlap=100
)

baseline_embedding_model = AverageInstructEmbeddings(
 model_name="hkunlp/instructor-base",
 embed_instruction="Represent this piece of text for searching relevant information:",
 query_instruction="Query the most relevant piece of information from the Hugging Face documentation",
 max_length=512,
)

embedding_model = HuggingFaceEmbeddings(
 model_name="intfloat/e5-large-v2",
)

cross_encoder = HuggingFaceEmbeddings(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")

benchmark = BenchmarkST(
 data=data,
 baseline_model=baseline_embedding_model,
 embedding_models=[cross_encoder]
)

load INSTRUCTOR_Transformer
max_seq_length 512


No sentence-transformers model found with name /Users/michalwilinski/.cache/torch/sentence_transformers/cross-encoder_ms-marco-MiniLM-L-12-v2. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/michalwilinski/.cache/torch/sentence_transformers/cross-encoder_ms-marco-MiniLM-L-12-v2 were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Building index for client=INSTRUCTOR(
 (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: T5EncoderModel 
 (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
 (2): Dense({'in_features': 768, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
 (3): Normalize()
) model_name='hkunlp/instructor-base' cache_folder=None model_kwargs={} encode_kwargs={} embed_instruction='Represent this piece of text for searching relevant information:' query_instruction='Query the most relevant piece of information from the Hugging Face documentation' max_length=512


Embedding documents: 100%|██████████| 278/278 [00:19<00:00, 14.11it/s]


Building index for client=SentenceTransformer(
 (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
 (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
) model_name='cross-encoder/ms-marco-MiniLM-L-12-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False


In [54]:
query = "textual inversion"
k = 100
baseline_results, results = benchmark.evaluate(query=query, k=k)
print("Baseline results:")
[print(doc.metadata,score) for (doc,score) in baseline_results]
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")
cross_encoder_results = cross_encoder.predict([(query, doc.page_content) for doc in data.docs_processed])
# rerank results
cross_encoder_results = sorted(zip(data.docs_processed, cross_encoder_results), key=lambda x: x[1], reverse=True)
print("Cross encoder results:")
final_results = cross_encoder_results[:3]
[print(doc.metadata, score) for (doc,score) in final_results]
print("bye")

Baseline results:
{'source': 'https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx'} 0.23610792
{'source': 'https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx'} 0.24087097
{'source': 'https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx'} 0.24181677
{'source': 'https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx'} 0.24541612
{'source': 'https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx'} 0.24639006
{'source': 'https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx'} 0.24780047
{'source': 'https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx'} 0.2535807
{'source': 'https://github.com/huggingface/optimum/blob/main/docs/source/exporters/onnx/usage_guides/export_a_model.mdx'} 0.25887597
{'source': 'https://github.com/huggingface/course/blob/main/chapters/en/chapter6/4.mdx'} 0.27293646
{'source': 'https://github.com/huggingface/course/

In [55]:
print(final_results[0][0].page_content)

es where the space character is not used (like Chinese or Japanese).

The other main feature of SentencePiece is *reversible tokenization*: since there is no special treatment of spaces, decoding the tokens is done simply by concatenating them and replacing the `_`s with spaces -- this results in the normalized text. As we saw earlier, the BERT tokenizer removes repeating spaces, so its tokenization is not reversible.

## Algorithm overview[[algorithm-overview]]

In the following sections, we'll dive into t
