In [1]:
import os 
from dotenv import load_dotenv

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prefer the dedicated `langchain_together` package and fall back to the
# deprecated community wrapper only when that package is not installed.
try:
    from langchain_together import Together
except ImportError:
    # Catch ImportError only: a bare `except:` would also swallow
    # KeyboardInterrupt and genuine errors raised while importing the package,
    # silently routing them to the deprecated implementation.
    print('Using deprecated Together LLM. Please use langchain_together instead.')
    from langchain_community.llms import Together

# Identifier of the hosted model handed to `Together(...)` further down.
MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Load environment variables (API keys, Vectara ids) from a local .env file.
# Kept as the last expression of the cell so the notebook shows its return value.
load_dotenv()

Using deprecated Together LLM. Please use langchain_together instead.


True

In [2]:
# Vectara connection settings are read straight from the environment; a missing
# variable raises KeyError here instead of failing later inside the client.
vectara_customer_id = os.environ["VECTARA_CUSTOMER_ID"]
vectara_corpus_id = os.environ["VECTARA_CORPUS_ID"]
vectara_api_key = os.environ["VECTARA_API_KEY"]

# NOTE(review): `embeddings` is not passed to the Vectara client below and no
# cell visible here reads it -- confirm whether this model load is still needed.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")

# Collect the client arguments in one mapping, then build the vector store.
vectara_settings = {
    "vectara_customer_id": vectara_customer_id,
    "vectara_corpus_id": vectara_corpus_id,
    "vectara_api_key": vectara_api_key,
}
vectara = Vectara(**vectara_settings)


In [21]:
# Settings for the Together-hosted LLM, gathered in one mapping so the
# sampling parameters are easy to scan and adjust.
together_settings = {
    "model": MODEL_NAME,
    "temperature": 0.7,
    "max_tokens": 256,
    "top_k": 50,
    # Required: a missing TOGETHER_API_KEY raises KeyError immediately.
    "together_api_key": os.environ["TOGETHER_API_KEY"],
}
model = Together(**together_settings)

 warn_deprecated(


In [6]:
def get_sources(documents):
    """Return every retrieved document except the final one.

    The last element is the one ``get_summary`` reads, so it is excluded here
    and only the preceding entries are returned.

    Args:
        documents: Sliceable sequence of retrieved documents.

    Returns:
        A slice of ``documents`` without its last element (empty when the
        input has zero or one element).
    """
    sources = documents[:-1]
    return sources

def get_summary(documents):
    """Return the text of the last retrieved document.

    The retriever in this notebook is configured with a summary, and this
    helper reads that summary from the final element of the result list.

    Args:
        documents: Indexable sequence whose last element exposes a
            ``page_content`` attribute.

    Returns:
        The ``page_content`` of the last element.

    Raises:
        IndexError: If ``documents`` is empty. The exception type matches what
            plain indexing would raise, but the message names the real cause.
    """
    try:
        summary_doc = documents[-1]
    except IndexError as exc:
        raise IndexError(
            "get_summary() received an empty document list; "
            "expected the summary as the last retrieved document"
        ) from exc
    return summary_doc.page_content

In [26]:
# Import from `langchain_core.prompts`, the same package the notebook's first
# cell already uses for ChatPromptTemplate, instead of the `langchain.prompts`
# path, so this cell does not need the `langchain` meta-package.
from langchain_core.prompts import PromptTemplate

# Summary settings forwarded to Vectara through the retriever's search kwargs.
# NOTE(review): `max_results` here and `k` below are both 3 -- confirm they are
# meant to stay in sync before changing either.
summary_config = {"is_enabled": True, "max_results": 3, "response_lang": "eng"}
retriever = vectara.as_retriever(
    search_kwargs={"k": 3, "summary_config": summary_config}
)

In [19]:
# Sample metadata records in the form "<filename>, <description>, <discipline>".
# Built from their three fields so the structure is visible at a glance.

# Electrical description filed under the sanitary discipline.
wrong_disc_meta = ", ".join((
    "electrical_doc.pdf",
    "Electrical wiring scheme and specifications for a generator room",
    "S - Sanitaer",
))

# Sanitary component filed under the sanitary discipline.
good_meta = ", ".join((
    "ISB-020-U3-W-S-01-B18003-001-020.pdf",
    "Schieber / Hawle / Schieber 4000 + Handrad 7800 DN100 Schutzraum",
    "S - Sanitaer",
))

In [65]:
# Prompt sent as the query text to the Vectara retriever. `{metadata}` is the
# only placeholder and is filled with one "<filename>, <description>,
# <discipline>" record.
# NOTE(review): the "passage:" / "query:" line prefixes look like the E5
# embedding convention -- confirm they are intended for a Vectara query.
# The final query line previously read "Highligh ... wether"; the spelling is
# corrected because this text is sent verbatim to retrieval and summarisation.
template = """
passage: You are a helpful assistant that understands BIM building documents and engineering disciplines.
passage: You will analyze BIM document metadata composed of filename, description, and discipline.
passage: The metadata is written in German.
passage: metadata: {metadata}
query: Does the filename match other filenames within the same discipline?
query: Does the description match the engineering discipline?
query: How different is the metadata to your curated information?
query: Highlight any discrepancies and comment on whether or not the metadata is anomalous.
"""

prompt = PromptTemplate(template=template, input_variables=["metadata"])


In [66]:
# Render the prompt for the `good_meta` sample by filling the {metadata} placeholder.
formatted_prompt = prompt.format(metadata=good_meta)
# Bare expression so the notebook echoes the rendered prompt text.
formatted_prompt

'\npassage: You are a helpful assistant that understands BIM building documents and engineering disciplines.\npassage: You will analyze BIM document metadata composed of filename, description, and discipline.\npassage: The metadata is written in German.\npassage: metadata: ISB-020-U3-W-S-01-B18003-001-020.pdf, Schieber / Hawle / Schieber 4000 + Handrad 7800 DN100 Schutzraum, S - Sanitaer\nquery: Does the filename match other filenames within the same discipline?\nquery: Does the description match the engineering discipline?\nquery: How different is the metadata to your curated information?\nquery: Highligh any discrepancies and comment on wether or not the metadata is anomalous.\n'

In [67]:
# Pipe the retriever's documents into get_summary, then run the chain on the
# prompt rendered in the previous cell.
summary_chain = retriever | get_summary
ans = summary_chain.invoke(formatted_prompt)
# Bare expression so the notebook displays the returned summary text.
ans

'Based on the provided BIM document metadata in German, the filename "ISB-020-U3-W-S-01-B18003-001-020.pdf" belongs to the discipline S - Sanitaer [2]. Comparing it to other filenames within the same discipline, there are similar filenames like "ISB-020-U3-W-S-01-B17012-011-000" and "ISB-020-U3-W-S-01-B19009-001-020" [3]. The description "Schieber / Hawle / Schieber 4000 + Handrad 7800 DN100 Schutzraum" corresponds to the engineering discipline S - Sanitaer [2]. The metadata displays a specific naming convention and content related to sanitary engineering, aligning with the discipline indicated [2]. No significant discrepancies were found in the metadata analyzed, suggesting that the provided metadata is consistent and not anomalous within the context of BIM building documents and engineering disciplines.'

# Metadata matches!

In [68]:
# Render the prompt for the `wrong_disc_meta` sample by filling the {metadata} placeholder.
formatted_prompt = prompt.format(metadata=wrong_disc_meta)
# Bare expression so the notebook echoes the rendered prompt text.
formatted_prompt

'\npassage: You are a helpful assistant that understands BIM building documents and engineering disciplines.\npassage: You will analyze BIM document metadata composed of filename, description, and discipline.\npassage: The metadata is written in German.\npassage: metadata: electrical_doc.pdf, Electrical wiring scheme and specifications for a generator room, S - Sanitaer\nquery: Does the filename match other filenames within the same discipline?\nquery: Does the description match the engineering discipline?\nquery: How different is the metadata to your curated information?\nquery: Highligh any discrepancies and comment on wether or not the metadata is anomalous.\n'

In [69]:
# Pipe the retriever's documents into get_summary, then run the chain on the
# prompt rendered in the previous cell.
summary_chain = retriever | get_summary
ans = summary_chain.invoke(formatted_prompt)
# Bare expression so the notebook displays the returned summary text.
ans

'Based on the provided search results, the filename "electrical_doc.pdf" matches other filenames within the same discipline of E - Elektroanlagen [7]. However, the description "Electrical wiring scheme and specifications for a generator room" aligns more with the discipline of electrical engineering rather than "S - Sanitaer" [7]. The metadata presents discrepancies as the description does not directly correspond to the discipline mentioned, indicating a mismatch [7]. This inconsistency suggests that the metadata could be considered anomalous due to the mismatch between the description and the specified discipline [7]. The metadata exhibits a clear discrepancy between the content of the file and the discipline it is categorized under, raising a question about the accuracy of the metadata classification.'

# Anomaly detected!