{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import math\n", "from pathlib import Path\n", "from datetime import datetime\n", "from typing import Any\n", "\n", "import numpy as np\n", "from tqdm import tqdm\n", "from langchain.chains import RetrievalQA\n", "from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings\n", "from langchain.document_loaders import TextLoader\n", "from langchain.indexes import VectorstoreIndexCreator\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.vectorstores import FAISS\n", "from huggingface_hub import HfApi, snapshot_download" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Index building" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def collect_docs(directory: str, docs: list[str], metadata: list[Any]):\n", " for p in Path(directory).iterdir():\n", " if not p.is_dir():\n", " with open(p) as f:\n", " # the first line is the source of the text\n", " source = f.readline().strip().replace('source: ', '')\n", " docs.append(f.read())\n", " metadata.append({\"source\": source})\n", " # break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "DIRECTORIES = [\n", " \"./datasets/huggingface_docs/\",\n", " \"./datasets/huggingface_audio_transcribed/\"\n", "]\n", "\n", "docs = []\n", "metadata = []\n", "for directory in DIRECTORIES:\n", " collect_docs(directory, docs, metadata)\n", "\n", "print(f'number of documents: {len(docs)}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# if split_chunk_size > 512 model is processing first 512 characters of the chunk\n", "split_chunk_size = 800\n", "chunk_overlap = 200\n", "text_splitter = CharacterTextSplitter(\n", " separator=\"\",\n", " chunk_size=split_chunk_size,\n", " chunk_overlap=chunk_overlap,\n", " length_function=len,\n", ")\n", "docs = text_splitter.create_documents(docs, metadata)\n", "print(f'number of chunks: {len(docs)}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model_name = \"hkunlp/instructor-large\"\n", "embed_instruction = \"Represent the Hugging Face library documentation\"\n", "query_instruction = \"Query the most relevant piece of information from the Hugging Face documentation\"\n", "\n", "embedding_model = HuggingFaceInstructEmbeddings(\n", " model_name=model_name,\n", " embed_instruction=embed_instruction,\n", " query_instruction=query_instruction,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class AverageInstructEmbeddings(HuggingFaceInstructEmbeddings):\n", " max_length: int = None\n", "\n", " def __init__(self, max_length: int = 512, **kwargs: Any):\n", " super().__init__(**kwargs)\n", " self.max_length = max_length\n", " if self.max_length < 0:\n", " print('max_length is not specified, using model default max_seq_length')\n", "\n", " def embed_documents(self, texts: list[str]) -> list[list[float]]:\n", " all_embeddings = []\n", " for text in tqdm(texts, desc=\"Embedding documents\"):\n", " if len(text) > self.max_length and self.max_length > -1:\n", " n_chunks = math.ceil(len(text)/self.max_length)\n", " chunks = [\n", " text[i*self.max_length:(i+1)*self.max_length]\n", " for i in range(n_chunks)\n", " ]\n", " instruction_pairs = [[self.embed_instruction, chunk] for chunk in chunks]\n", " 
"                chunk_embeddings = self.client.encode(instruction_pairs)\n", "                avg_embedding = np.mean(chunk_embeddings, axis=0)\n", "                all_embeddings.append(avg_embedding.tolist())\n", "            else:\n", "                instruction_pairs = [[self.embed_instruction, text]]\n", "                embeddings = self.client.encode(instruction_pairs)\n", "                all_embeddings.append(embeddings[0].tolist())\n", "\n", "        return all_embeddings\n", "\n", "\n", "# maximum number of characters fed to the model at once;\n", "# chunks longer than this are split into pieces and their embeddings are averaged\n", "max_length = 512\n", "embedding_model = AverageInstructEmbeddings(\n", "    model_name=model_name,\n", "    embed_instruction=embed_instruction,\n", "    query_instruction=query_instruction,\n", "    max_length=max_length,\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sanity check: embed the first 10 chunks\n", "embeddings = embedding_model.embed_documents(texts=[d.page_content for d in docs[:10]])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "index = FAISS.from_documents(docs, embedding_model)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Index uploading" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "todays_date = datetime.now().strftime('%d_%b_%Y')\n", "index_name = f'index-{model_name}-{split_chunk_size}-{chunk_overlap}-m{max_length}-{todays_date}'\n", "index_name = index_name.replace('/', '_')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "index.save_local(f'../indexes/{index_name}/')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "index = FAISS.load_local(f'../indexes/{index_name}/', embedding_model)\n", "docs = index.similarity_search(query='how to create a pipeline object?', k=5)\n", "docs[0].page_content\n", "docs[0].metadata" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i, doc in enumerate(docs, start=1):\n", "    print(f\"\\n{'='*100}\\n\")\n", "    print(f\"Document {i} of {len(docs)}\")\n", "    print(\"Page Content:\")\n", "    print(f\"\\n{'-'*100}\\n\")\n", "    print(f'length of a chunk: {len(doc.page_content)}')\n", "    print(doc.page_content, '\\n')\n", "    print(doc.metadata)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "api = HfApi()\n", "api.create_repo(\n", "    repo_id=f'KonradSzafer/{index_name}',\n", "    repo_type='dataset',\n", "    private=False,\n", "    exist_ok=True\n", ")\n", "api.upload_folder(\n", "    folder_path=f'../indexes/{index_name}',\n", "    repo_id=f'KonradSzafer/{index_name}',\n", "    repo_type='dataset',\n", ")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Index inference" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "index_repo_id = 'KonradSzafer/index-hkunlp_instructor-large-512-m512-11_Jan_2024'\n", "\n", "snapshot_download(\n", "    repo_id=index_repo_id,\n", "    allow_patterns=['*.faiss', '*.pkl'],\n", "    repo_type='dataset',\n", "    local_dir='../indexes/run/'\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "index = FAISS.load_local('../indexes/run/', embedding_model)\n", "docs = index.similarity_search(query='how to create a pipeline object?', k=5)\n", "docs[0].metadata\n", "docs[0].page_content" ] } ], "metadata": { "kernelspec": { "display_name": "hf_qa_bot", "language": "python", "name": "python3" },
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }