In [None]:
import math
from pathlib import Path
from datetime import datetime
from typing import Any

import numpy as np
from tqdm import tqdm
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from huggingface_hub import HfApi, snapshot_download

## Index building

In [None]:
def collect_docs(directory: str, docs: list[str], metadata: list[Any]):
    """Read every file directly inside ``directory`` into ``docs``/``metadata``.

    Each file is expected to start with a header line of the form
    ``source: <origin>``; everything after that first line is the document body.

    Args:
        directory: Folder to scan. The scan is not recursive; sub-directories
            are skipped.
        docs: Output list, extended in place with one body string per file.
        metadata: Output list, extended in place with ``{"source": <origin>}``
            per file, aligned index-for-index with ``docs``.

    Returns:
        None. Results are delivered through the two list arguments.
    """
    # iterdir() yields entries in arbitrary order; sorting keeps the document
    # order (and therefore the resulting index) identical between runs.
    for path in sorted(Path(directory).iterdir()):
        if path.is_dir():
            continue
        # Decode explicitly as UTF-8 rather than the platform's locale default.
        with open(path, encoding="utf-8") as f:
            # The first line is the source of the text. Only the leading
            # marker is removed, so a source that itself contains "source: "
            # is kept intact.
            source = f.readline().strip().removeprefix('source: ')
            docs.append(f.read())
        metadata.append({"source": source})

In [None]:
# Folders whose files are read into the index by collect_docs.
DIRECTORIES = [
    "./datasets/huggingface_docs/",
    "./datasets/huggingface_audio_transcribed/",
]

# Both accumulators are filled in place, one entry per file, in lockstep.
docs, metadata = [], []
for directory in DIRECTORIES:
    collect_docs(directory, docs, metadata)

print(f'number of documents: {len(docs)}')

In [None]:
# Chunking configuration. Sizes are measured in characters (length_function=len).
# NOTE(review): the original remark here said that with split_chunk_size > 512 the
# model only processes the first 512 characters of a chunk. Presumably this refers
# to the embedding model's input limit - confirm whether that limit is counted in
# characters or in tokens.
split_chunk_size = 800
chunk_overlap = 200
text_splitter = CharacterTextSplitter(
 # empty separator: presumably the splitter then cuts purely by length - TODO confirm
 separator="",
 chunk_size=split_chunk_size,
 chunk_overlap=chunk_overlap,
 length_function=len,
)
# NOTE(review): `docs` is rebound here from the raw strings to the splitter's output,
# so re-running this cell on its own would feed the already-split result back in.
docs = text_splitter.create_documents(docs, metadata)
print(f'number of chunks: {len(docs)}')

In [None]:
# Embedding checkpoint and the instruction strings passed to the embedder:
# one instruction for documents, one for queries.
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the Hugging Face library documentation"
query_instruction = "Query the most relevant piece of information from the Hugging Face documentation"

# NOTE(review): `embedding_model` is assigned again further down, to an
# AverageInstructEmbeddings built from the same three arguments plus max_length,
# before any visible cell uses it. This instance is therefore replaced unused -
# consider dropping it or giving it a distinct name.
embedding_model = HuggingFaceInstructEmbeddings(
 model_name=model_name,
 embed_instruction=embed_instruction,
 query_instruction=query_instruction,
)

In [None]:
class AverageInstructEmbeddings(HuggingFaceInstructEmbeddings):
    """Instructor embedder that averages over pieces of over-long documents.

    A document longer than ``max_length`` characters is cut into consecutive
    ``max_length``-character pieces; each piece is encoded together with
    ``embed_instruction`` and the element-wise mean of the piece embeddings is
    used as the document embedding. Documents within the limit are encoded in
    a single call. Only ``embed_documents`` is overridden here.
    """

    # Character budget per encoded piece; zero or negative disables splitting.
    max_length: int = 512

    def __init__(self, max_length: int = 512, **kwargs: Any):
        """Create the embedder.

        Args:
            max_length: Maximum number of characters per encoded piece. Zero
                or a negative value turns splitting off entirely.
            **kwargs: Forwarded unchanged to ``HuggingFaceInstructEmbeddings``.
        """
        super().__init__(**kwargs)
        self.max_length = max_length
        # 0 is treated like a negative value: a zero-width piece size cannot
        # split anything and previously caused a division by zero.
        if self.max_length <= 0:
            print('max_length is not specified, using model default max_seq_length')

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` one by one, averaging over pieces when needed.

        Args:
            texts: Documents to embed.

        Returns:
            One embedding (list of floats) per input text, in input order.
        """
        # NOTE(review): the limit is counted in characters, while the message
        # printed by __init__ mentions the model's max_seq_length - presumably
        # a token limit; confirm the two are meant to line up.
        limit = self.max_length
        all_embeddings = []
        for text in tqdm(texts, desc="Embedding documents"):
            if limit > 0 and len(text) > limit:
                # Consecutive, non-overlapping slices; the last one may be shorter.
                chunks = [text[start:start + limit] for start in range(0, len(text), limit)]
                instruction_pairs = [[self.embed_instruction, chunk] for chunk in chunks]
                chunk_embeddings = self.client.encode(instruction_pairs)
                # Unweighted mean: every piece counts equally regardless of its length.
                avg_embedding = np.mean(chunk_embeddings, axis=0)
                all_embeddings.append(avg_embedding.tolist())
            else:
                instruction_pairs = [[self.embed_instruction, text]]
                embeddings = self.client.encode(instruction_pairs)
                all_embeddings.append(embeddings[0].tolist())

        return all_embeddings


# Maximum number of characters fed to the model per call.
# Chunks longer than this (chunks are up to split_chunk_size characters) are cut into
# N pieces by AverageInstructEmbeddings and the piece embeddings are averaged.
max_length = 512
embedding_model = AverageInstructEmbeddings( 
 model_name=model_name,
 embed_instruction=embed_instruction,
 query_instruction=query_instruction,
 max_length=max_length,
)

In [None]:
# Smoke test: embed only the first 10 chunks to check the embedder end to end.
embeddings = embedding_model.embed_documents(texts=[d.page_content for d in docs[:10]])

In [None]:
# Build the FAISS index from all chunks, using embedding_model as the embedder.
index = FAISS.from_documents(docs, embedding_model)

## Index uploading

In [None]:
# The name records the model, both chunking parameters, the embedding max length
# and today's date. Any '/' (e.g. from the model id) becomes '_' so the name can
# be used as a single folder / repo name component.
todays_date = datetime.now().strftime('%d_%b_%Y')
index_name = (
    f'index-{model_name}-{split_chunk_size}-{chunk_overlap}-m{max_length}-{todays_date}'
    .replace('/', '_')
)

In [None]:
# Write the index to ../indexes/<index_name>/ so it can be reloaded and uploaded below.
index.save_local(f'../indexes/{index_name}/')

In [None]:
# Round-trip check: reload the index that was just saved and run a sample query.
index = FAISS.load_local(f'../indexes/{index_name}/', embedding_model)
# NOTE(review): `docs` is rebound here from the index chunks to the 5 search hits;
# the printing cell below reads this name, so it is kept as is.
docs = index.similarity_search(query='how to create a pipeline object?', k=5)
# A notebook cell only echoes its LAST expression, so the text of the top hit is
# printed explicitly; its metadata is shown as the cell output.
print(docs[0].page_content)
docs[0].metadata

In [None]:
# Print every retrieved document: its 1-based position, the character length of
# its text, the text itself, and its metadata.
for i, doc in enumerate(docs, start=1):
 print(f"\n{'='*100}\n")
 print(f"Document {i} of {len(docs)}")
 print("Page Content:")
 print(f"\n{'-'*100}\n")
 print(f'length of a chunk: {len(doc.page_content)}')
 print(doc.page_content, '\n')
 print(doc.metadata)

In [None]:
# Publish the saved index folder as a public dataset repo on the Hugging Face Hub.
# exist_ok=True lets the cell be re-run when the repo was already created.
api = HfApi()
upload_repo_id = f'KonradSzafer/{index_name}'
api.create_repo(
    repo_id=upload_repo_id,
    repo_type='dataset',
    private=False,
    exist_ok=True,
)
api.upload_folder(
    folder_path=f'../indexes/{index_name}',
    repo_id=upload_repo_id,
    repo_type='dataset',
)

## Index inference

In [None]:
# Hub dataset repo holding a previously uploaded index (plain string: nothing to interpolate).
# NOTE(review): this name has one fewer numeric field than the `index_name` scheme
# used for uploads above (which includes both chunk size and overlap) - presumably
# it comes from an earlier run; confirm it is the intended index.
index_repo_id = 'KonradSzafer/index-hkunlp_instructor-large-512-m512-11_Jan_2024'

# Download only the index files (*.faiss, *.pkl) into ../indexes/run/.
snapshot_download(
    repo_id=index_repo_id,
    allow_patterns=['*.faiss', '*.pkl'],
    repo_type='dataset',
    local_dir='../indexes/run/',
)

In [None]:
# SECURITY NOTE(review): the folder loaded here was downloaded from a remote repo and
# includes a *.pkl file; presumably load_local unpickles it, which can execute
# arbitrary code - only load indexes from repos you trust.
index = FAISS.load_local('../indexes/run/', embedding_model)
docs = index.similarity_search(query='how to create a pipeline object?', k=5)
# A notebook cell only echoes its LAST expression, so the metadata of the top hit
# is printed explicitly; its text is shown as the cell output.
print(docs[0].metadata)
docs[0].page_content