{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# BIM metadata anomaly check with Vectara summaries\n",
    "\n",
    "This notebook sends BIM document metadata (filename, description, discipline) to a Vectara retriever with summarization enabled and reads back the generated summary to judge whether the metadata looks anomalous.\n",
    "\n",
    "Required environment variables (loaded from `.env` via `load_dotenv()`): `VECTARA_CUSTOMER_ID`, `VECTARA_CORPUS_ID`, `VECTARA_API_KEY`, `TOGETHER_API_KEY`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using deprecated Together LLM. Please use langchain_together instead.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "from langchain_community.vectorstores import Vectara\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.prompts import ChatPromptTemplate, PromptTemplate\n",
    "from langchain_core.runnables import RunnablePassthrough\n",
    "\n",
    "try:\n",
    "    from langchain_together import Together\n",
    "except ImportError:\n",
    "    # Only fall back when the dedicated package is missing; any other error should surface.\n",
    "    print('Using deprecated Together LLM. Please use langchain_together instead.')\n",
    "    from langchain_community.llms import Together\n",
    "\n",
    "MODEL_NAME = \"mistralai/Mixtral-8x7B-Instruct-v0.1\"\n",
    "\n",
    "# Load credentials from a local .env file into os.environ.\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail fast with a KeyError if any credential is missing from the environment.\n",
    "vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']\n",
    "vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']\n",
    "vectara_api_key = os.environ['VECTARA_API_KEY']\n",
    "\n",
    "# NOTE(review): `embeddings` is not referenced by any later cell in this notebook.\n",
    "embeddings = HuggingFaceEmbeddings(model_name=\"intfloat/multilingual-e5-large\")\n",
    "\n",
    "vectara = Vectara(vectara_customer_id=vectara_customer_id,\n",
    "                  vectara_corpus_id=vectara_corpus_id,\n",
    "                  vectara_api_key=vectara_api_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `model` is not used by the cells below; the summaries shown later\n",
    "# come from the Vectara retriever.\n",
    "model = Together(model=MODEL_NAME,\n",
    "                 temperature=0.7,\n",
    "                 max_tokens=256,\n",
    "                 top_k=50,\n",
    "                 together_api_key=os.environ[\"TOGETHER_API_KEY\"]\n",
    "                 )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Retriever and helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_sources(documents):\n",
    "    \"\"\"Return every retrieved document except the last one.\n",
    "\n",
    "    NOTE(review): presumably the entries before the last are the source\n",
    "    passages, with the generated summary in the final slot -- confirm.\n",
    "    \"\"\"\n",
    "    return documents[:-1]\n",
    "\n",
    "\n",
    "def get_summary(documents):\n",
    "    \"\"\"Return the text (`page_content`) of the last retrieved document.\n",
    "\n",
    "    NOTE(review): assumes the retriever places the generated summary last\n",
    "    when `summary_config` is enabled -- confirm against the Vectara integration.\n",
    "    Raises IndexError when `documents` is empty.\n",
    "    \"\"\"\n",
    "    return documents[-1].page_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summarization is enabled with an English (\"eng\") response and max_results=3.\n",
    "summary_config = {\"is_enabled\": True, \"max_results\": 3, \"response_lang\": \"eng\"}\n",
    "retriever = vectara.as_retriever(\n",
    "    search_kwargs={\"k\": 3, \"summary_config\": summary_config}\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample metadata and prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each sample is \"filename, description, discipline\".\n",
    "# `wrong_disc_meta` pairs an electrical description with the S - Sanitaer discipline.\n",
    "wrong_disc_meta = 'electrical_doc.pdf, Electrical wiring scheme and specifications for a generator room, S - Sanitaer'\n",
    "good_meta = 'ISB-020-U3-W-S-01-B18003-001-020.pdf, Schieber / Hawle / Schieber 4000 + Handrad 7800 DN100 Schutzraum, S - Sanitaer'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "template = \"\"\"\n",
    "passage: You are a helpful assistant that understands BIM building documents and engineering disciplines.\n",
    "passage: You will analyze BIM document metadata composed of filename, description, and discipline.\n",
    "passage: The metadata is written in German.\n",
    "passage: metadata: {metadata}\n",
    "query: Does the filename match other filenames within the same discipline?\n",
    "query: Does the description match the engineering discipline?\n",
    "query: How different is the metadata to your curated information?\n",
    "query: Highlight any discrepancies and comment on whether or not the metadata is anomalous.\n",
    "\"\"\"\n",
    "\n",
    "prompt = PromptTemplate(template=template, input_variables=[\"metadata\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check 1: consistent metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\npassage: You are a helpful assistant that understands BIM building documents and engineering disciplines.\\npassage: You will analyze BIM document metadata composed of filename, description, and discipline.\\npassage: The metadata is written in German.\\npassage: metadata: ISB-020-U3-W-S-01-B18003-001-020.pdf, Schieber / Hawle / Schieber 4000 + Handrad 7800 DN100 Schutzraum, S - Sanitaer\\nquery: Does the filename match other filenames within the same discipline?\\nquery: Does the description match the engineering discipline?\\nquery: How different is the metadata to your curated information?\\nquery: Highlight any discrepancies and comment on whether or not the metadata is anomalous.\\n'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "formatted_prompt = prompt.format(metadata=good_meta)\n",
    "formatted_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Based on the provided BIM document metadata in German, the filename \"ISB-020-U3-W-S-01-B18003-001-020.pdf\" belongs to the discipline S - Sanitaer [2]. Comparing it to other filenames within the same discipline, there are similar filenames like \"ISB-020-U3-W-S-01-B17012-011-000\" and \"ISB-020-U3-W-S-01-B19009-001-020\" [3]. The description \"Schieber / Hawle / Schieber 4000 + Handrad 7800 DN100 Schutzraum\" corresponds to the engineering discipline S - Sanitaer [2]. The metadata displays a specific naming convention and content related to sanitary engineering, aligning with the discipline indicated [2]. No significant discrepancies were found in the metadata analyzed, suggesting that the provided metadata is consistent and not anomalous within the context of BIM building documents and engineering disciplines.'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pipe the retriever into get_summary so only the summary text is returned.\n",
    "ans = (retriever | get_summary).invoke(formatted_prompt)\n",
    "ans"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# metadata matches!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check 2: mismatched discipline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\npassage: You are a helpful assistant that understands BIM building documents and engineering disciplines.\\npassage: You will analyze BIM document metadata composed of filename, description, and discipline.\\npassage: The metadata is written in German.\\npassage: metadata: electrical_doc.pdf, Electrical wiring scheme and specifications for a generator room, S - Sanitaer\\nquery: Does the filename match other filenames within the same discipline?\\nquery: Does the description match the engineering discipline?\\nquery: How different is the metadata to your curated information?\\nquery: Highlight any discrepancies and comment on whether or not the metadata is anomalous.\\n'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "formatted_prompt = prompt.format(metadata=wrong_disc_meta)\n",
    "formatted_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Based on the provided search results, the filename \"electrical_doc.pdf\" matches other filenames within the same discipline of E - Elektroanlagen [7]. However, the description \"Electrical wiring scheme and specifications for a generator room\" aligns more with the discipline of electrical engineering rather than \"S - Sanitaer\" [7]. The metadata presents discrepancies as the description does not directly correspond to the discipline mentioned, indicating a mismatch [7]. This inconsistency suggests that the metadata could be considered anomalous due to the mismatch between the description and the specified discipline [7]. The metadata exhibits a clear discrepancy between the content of the file and the discipline it is categorized under, raising a question about the accuracy of the metadata classification.'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same chain as Check 1, now run on the mismatched sample.\n",
    "ans = (retriever | get_summary).invoke(formatted_prompt)\n",
    "ans"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Anomaly detected!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}