{ "cells": [ { "cell_type": "markdown", "id": "8acae3ed-2953-45a3-aba9-0327b6ae3679", "metadata": {}, "source": [ "### ChromaDB method - create vectorstore based on Chroma" ] }, { "cell_type": "code", "execution_count": 1, "id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97", "metadata": {}, "outputs": [], "source": [ "import sys, os, shutil\n", "sys.path.insert(0, \"../\")\n", "\n", "from preprocess_raw_documents import split_content\n", "\n", "import chromadb\n", "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", "from llama_index.vector_stores.chroma.base import ChromaVectorStore\n", "from llama_index.core import StorageContext\n", "from llama_index.core import ServiceContext\n", "from llama_index.core import Document\n", "\n", "from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding\n", "from llama_index.core import Settings\n", "\n", "import nest_asyncio\n", "nest_asyncio.apply()\n", "\n", "import time" ] }, { "cell_type": "code", "execution_count": 2, "id": "978152ce-4d87-44b5-b521-dbaff60b32b0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "199it [00:00, 8821.71it/s]\n", "200it [00:00, 12584.17it/s]\n" ] } ], "source": [ "split_content(filepath=\"../raw_documents/answers.txt\", \n", " separator=\"\\n\\n\", \n", " tmp_folder=\"../raw_documents/answers_temp\")\n", "\n", "split_content(filepath=\"../raw_documents/qna.txt\", \n", " separator=\"\\n\\n\\n\", \n", " tmp_folder=\"../raw_documents/qna_temp\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "d925371b-8777-4f5b-a7f2-ec3f228ef266", "metadata": {}, "outputs": [], "source": [ "answers_temp_files = []\n", "folder_path = \"../raw_documents/answers_temp\"\n", "for f in os.listdir(folder_path):\n", " fpath = os.path.join(folder_path, f)\n", " answers_temp_files.append(fpath)\n", " \n", "qna_temp_files = []\n", "folder_path = \"../raw_documents/qna_temp\"\n", "for f in os.listdir(folder_path):\n", " fpath = 
os.path.join(folder_path, f)\n", " qna_temp_files.append(fpath)" ] }, { "cell_type": "code", "execution_count": null, "id": "e876a26b-822d-44d6-a3dd-ccdcc04933cf", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb", "metadata": {}, "outputs": [], "source": [ "# load some documents\n", "documents = SimpleDirectoryReader(input_files=[\n", " \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n", " \"../raw_documents/conversation_examples.txt\",\n", " \"../raw_documents/HI_Knowledge_Base.pdf\",\n", " ] + answers_temp_files + qna_temp_files ).load_data()\n", "document = Document(text=\"\\n\\n\".join([doc.text for doc in documents]))" ] }, { "cell_type": "code", "execution_count": 8, "id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d", "metadata": {}, "outputs": [], "source": [ "# initialize client, setting path to save data\n", "db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed", "metadata": {}, "outputs": [], "source": [ "# create collection\n", "chroma_collection = db.get_or_create_collection(\"quickstart\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672", "metadata": {}, "outputs": [], "source": [ "# assign chroma as the vector_store to the context\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" ] }, { "cell_type": "code", "execution_count": null, "id": "eb5edab2-30db-4bf7-96b5-4005d3161988", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 11, "id": "0946b6ce-96ab-44de-ad75-e424a8429f67", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LLM is explicitly disabled. 
Using MockLLM.\n" ] } ], "source": [ "# Embedding-only run: the LLM is switched off while the vector store is built.\n", "Settings.llm = None\n", "Settings.chunk_size = 1024\n", "Settings.chunk_overlap = 50\n", "Settings.embed_model = \"local:../models/fine-tuned-embeddings-advanced\"" ] },
 { "cell_type": "code", "execution_count": 12, "id": "b8c73a2c-1129-406a-8046-085afcaf9cbb", "metadata": {}, "outputs": [], "source": [ "# chunk the loaded documents with the node parser configured on Settings\n", "nodes = Settings.node_parser.get_nodes_from_documents(documents)" ] },
 { "cell_type": "code", "execution_count": 13, "id": "75f1c76f-d3e5-4b69-818c-98865adb1457", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6814" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(nodes)" ] },
 { "cell_type": "code", "execution_count": null, "id": "adfe688f-95c0-477c-a9de-e9e77541a1d7", "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "code", "execution_count": 14, "id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4", "metadata": {}, "outputs": [], "source": [ "# storage context backed by the Chroma vector store\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)" ] },
 { "cell_type": "code", "execution_count": 15, "id": "6a764113-ad7e-4674-aa57-ebbf405902a8", "metadata": {}, "outputs": [], "source": [ "# register the parsed nodes in the storage context's docstore\n", "storage_context.docstore.add_documents(nodes)" ] },
 { "cell_type": "code", "execution_count": null, "id": "38e7c88d-6c45-4275-8293-d09b4b85a7cf", "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "code", "execution_count": 16, "id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05", "metadata": {}, "outputs": [], "source": [ "# wall-clock start of the indexing step (read again two cells below)\n", "start_time = time.time()" ] },
 { "cell_type": "code", "execution_count": 17, "id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb", "metadata": {}, "outputs": [], "source": [ "vector_index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)" ] },
 { "cell_type": "code", "execution_count": 18, "id": "d3bd848d-9985-4a3d-bdc4-ec340cc69ef3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Indexing time: 2.3 mins\n" ] } ], "source": [ "# seconds elapsed since start_time, reported in minutes\n", "elapsed_minutes = (time.time() - start_time) / 60\n", "print(f\"Indexing time: {elapsed_minutes:.1f} mins\")" ] },
 { "cell_type": "code", "execution_count": 19, "id": "f16cca33-71fb-437d-a033-671b9fd44054", "metadata": {}, "outputs": [], "source": [ "vector_query_engine = vector_index.as_query_engine()" ] },
 { "cell_type": "code", "execution_count": 20, "id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "Response(response='Context information is below.\\n---------------------\\nfile_path: ../raw_documents/answers_temp/answers_050.txt\\n\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".\\n\\nfile_path: ../raw_documents/qna_temp/qna_050.txt\\n\\nC1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3’s M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. 
The answer is \"Individual Savings\".\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: Healthcare System in Singapore consists of?\\nAnswer: ', source_nodes=[NodeWithScore(node=TextNode(id_='536fef67-6a3f-4054-a94a-cc9143599510', embedding=None, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={: RelatedNodeInfo(node_id='2b0f7dad-c532-4abd-8c42-f53383a4fc76', node_type=, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='5b1d1dc729a663e4ccfacc0f18adf0f6644a2a7d2991490fd962d1550c83f2ff'), : RelatedNodeInfo(node_id='6d93c092-b4cc-4b5b-b379-080d777d3908', node_type=, metadata={'file_path': '../raw_documents/answers_temp/answers_044.txt', 'file_name': 'answers_044.txt', 'file_type': 'text/plain', 'file_size': 164, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='caeb59043b8daa56ed472941882947570abff951f64aa0498672aba5921fac1d'), : RelatedNodeInfo(node_id='859a9958-6f5d-4581-95d0-39edfc950ef5', node_type=, metadata={}, hash='8416454b2fbad3e6122c5151d2b3d1eadf0afde3514ba09374c71e96baf712bc')}, text='Question: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=130, 
text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4159636550867191), NodeWithScore(node=TextNode(id_='472000ae-a0aa-4464-a200-72fe67a3fbde', embedding=None, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={: RelatedNodeInfo(node_id='506fb715-d3b0-4ca7-b7ca-011a1e1a1f0d', node_type=, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='7461ffa12ff6729003131976b82995b7254ab10f8dc7d79c65988ec9e3b7b631'), : RelatedNodeInfo(node_id='d8232b90-d641-4966-b98f-4ca0821db773', node_type=, metadata={'file_path': '../raw_documents/qna_temp/qna_044.txt', 'file_name': 'qna_044.txt', 'file_type': 'text/plain', 'file_size': 383, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='cbeb00c29c6130548466697a862fee43ab2be92d84158cc0b69c2f5c7bbe68b1'), : RelatedNodeInfo(node_id='e772e623-cf91-41cd-a516-50acb894eb54', node_type=, metadata={}, hash='a7583b0fd46f98d0118c712632277d81f417b779f8bcc100ab2558dae6317cde')}, text='C1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3’s M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. 
The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=295, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4126648577998099)], metadata={'536fef67-6a3f-4054-a94a-cc9143599510': {'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, '472000ae-a0aa-4464-a200-72fe67a3fbde': {'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}})" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Retrieval sanity check. The LLM is disabled in this section, so the response text\n", "# is the assembled prompt; the retrieved chunks are listed under source_nodes.\n", "response = vector_query_engine.query(\n", "    \"Healthcare System in Singapore consists of?\"\n", ")\n", "response" ] },
 { "cell_type": "code", "execution_count": null, "id": "aa4b9906-5f75-4003-9f4c-5cfcc7ab1eaf", "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "code", "execution_count": 21, "id": "1bb75b04-6a62-43a4-8728-d2e52e49f1c0", "metadata": {}, "outputs": [], "source": [ "# remove the temp folder produced by split_content for answers.txt\n", "answers_tmp_dir = \"../raw_documents/answers_temp\"\n", "if os.path.exists(answers_tmp_dir):\n", "    shutil.rmtree(answers_tmp_dir)" ] },
 { "cell_type": "code", "execution_count": 22, "id": "0ed920fb-6456-49ac-8b63-08bd86b5b39c", "metadata": {}, "outputs": [], "source": [ "# remove the temp folder produced by split_content for qna.txt\n", "qna_tmp_dir = \"../raw_documents/qna_temp\"\n", "if os.path.exists(qna_tmp_dir):\n", "    shutil.rmtree(qna_tmp_dir)" ] },
 { "cell_type": "code", "execution_count": null, "id": "131d907a-0677-4ad8-b3f7-6fc9b9c5d0a5", "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "code", "execution_count": null, "id": "08fb2be5-3a44-4bb8-a9fc-61d7f03b7a35", "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "markdown", "id": "a7fc01f6-4738-415b-a96b-afd6cf8d789a", "metadata": {}, "source": [ 
"### ChromaDB method - load vectorstore based on Chroma" ] }, { "cell_type": "code", "execution_count": 1, "id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5", "metadata": {}, "outputs": [], "source": [ "import chromadb\n", "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", "from llama_index.vector_stores.chroma.base import ChromaVectorStore\n", "from llama_index.core import StorageContext\n", "from llama_index.core import ServiceContext\n", "from llama_index.core import Document\n", "from llama_index.core import Settings\n", "\n", "from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding\n", "from llama_index.llms.openai import OpenAI\n", "from llama_index.core.memory import ChatMemoryBuffer\n", "\n", "import time" ] }, { "cell_type": "code", "execution_count": null, "id": "72dd0ece-c72d-428a-89b4-9494d948c845", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "id": "d38dc953-b923-4128-86a1-c8c6f69af0ed", "metadata": {}, "outputs": [], "source": [ "fine_tuned_path = \"local:../models/fine-tuned-embeddings-advanced\"" ] }, { "cell_type": "code", "execution_count": 3, "id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e", "metadata": {}, "outputs": [], "source": [ "llm = OpenAI(model=\"gpt-4-0125-preview\", temperature=0.0)" ] }, { "cell_type": "code", "execution_count": 4, "id": "0583e9b0-d977-488c-8331-46dfa749924c", "metadata": {}, "outputs": [], "source": [ "Settings.llm = llm\n", "Settings.embed_model = fine_tuned_path" ] }, { "cell_type": "code", "execution_count": null, "id": "f994f440-f647-48b4-a517-46a79f7561e5", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 5, "id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba", "metadata": {}, "outputs": [], "source": [ "db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "1b385644-b46e-4d13-88fa-9f4af39db405", "metadata": {}, "outputs": [], 
"source": [ "chroma_collection = db.get_or_create_collection(\"quickstart\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2", "metadata": {}, "outputs": [], "source": [ "# assign chroma as the vector_store to the context\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)" ] }, { "cell_type": "code", "execution_count": 8, "id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae", "metadata": {}, "outputs": [], "source": [ "# create your index\n", "index = VectorStoreIndex.from_vector_store(\n", " vector_store=vector_store,\n", " storage_context=storage_context\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "73ba6d06-ba69-4b5e-962a-9cf7d2dc4d94", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 9, "id": "1a506940-c2b4-4d14-ad93-fd451331c582", "metadata": {}, "outputs": [], "source": [ "system_content = (\"You are a helpful study assistant. \"\n", " \"You do not respond as 'User' or pretend to be 'User'. 
\"\n", " \"You only respond once as 'Assistant'.\"\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "id": "3f592848-8536-4b4d-b34a-adc32d043432", "metadata": {}, "outputs": [], "source": [ "memory = ChatMemoryBuffer.from_defaults(token_limit=100_000)" ] }, { "cell_type": "code", "execution_count": 11, "id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252", "metadata": {}, "outputs": [], "source": [ "chat_engine = index.as_chat_engine(\n", " chat_mode=\"context\",\n", " memory=memory,\n", " system_prompt=system_content\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "id": "c3106dff-dd6f-47a9-9454-1e61775e7539", "metadata": {}, "outputs": [], "source": [ "hi_engine = index.as_query_engine(\n", " memory=memory,\n", " system_prompt=system_content,\n", " similarity_top_k=10,\n", " streaming=True\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "53a38081-4a79-44bc-bfa3-5d8653804328", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 24, "id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612", "metadata": {}, "outputs": [], "source": [ "prompt = \"\"\"\n", "Question: Which is not a government healthcare philosophy? \n", "A. To nurture a healthy nation by promoting good health.\n", "B. To rely on competition to improve service and raise efficiency\n", "C. To intervene directly whenever necessary\n", "D. To provide for the care of employees\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 26, "id": "a1c83dff-50d1-47b1-b7e9-4fc5cd08e1e8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "D. 
To provide for the care of employees\n" ] } ], "source": [ "# hi_engine is built with streaming=True: print the tokens as they arrive instead of\n", "# blocking on the complete answer\n", "stream_res = hi_engine.query(prompt)\n", "stream_res.print_response_stream()" ] },
 { "cell_type": "code", "execution_count": null, "id": "cedd3512-548d-4455-80fd-c6a8b2c0cd00", "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "code", "execution_count": null, "id": "ec53dfcf-d4c0-4d10-a24e-be2004a83656", "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "code", "execution_count": 14, "id": "78abaf95-e52d-445c-9d8e-bc51efb20f06", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "The correct answer is \"Deductibles apply for all treatments\".\n" ] } ], "source": [ "# NOTE(review): the saved output of this cell (execution count 14) predates the current\n", "# `prompt` cell (execution count 24) and answers a different question - re-run to refresh.\n", "chat_res = chat_engine.chat(prompt)\n", "print(chat_res.response)" ] },
 { "cell_type": "code", "execution_count": null, "id": "1e62303c-3a00-448f-ad93-15cb6cee1f24", "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "code", "execution_count": null, "id": "301e8270-783d-4942-a05f-9683ca96fbda", "metadata": {}, "outputs": [], "source": [] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" } }, "nbformat": 4, "nbformat_minor": 5 }