"""Generate title/summary/discipline metadata for a BIM document using an LLM.

Run as a script with the path to a ``.txt`` or ``.pdf`` file; the decoded JSON
answer of the model is printed to stdout.
"""

import argparse
import json
import os
import sys

import openai
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_text_splitters import RecursiveCharacterTextSplitter

from backend.schema import Metadata, BimDiscipline

# Populate os.environ from a local .env file before the lookups below.
load_dotenv()

# Required settings: a missing variable raises KeyError at import time.
vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
vectara_api_key = os.environ['VECTARA_API_KEY']

vectorstore = Vectara(vectara_customer_id=vectara_customer_id,
                      vectara_corpus_id=vectara_corpus_id,
                      vectara_api_key=vectara_api_key)

# Prompt prefix; extract_metadata() appends the document text and the closing
# double quote that terminates `context="`.
# NOTE(review): this is a plain string, not an f-string, so the
# `{[d.value for d in BimDiscipline]}` placeholder is sent to the model
# literally. The hard-coded list on the first line of the prompt is what
# actually tells the model the allowed disciplines -- confirm whether
# interpolation was intended.
prompt_template = """
BimDiscipline = ['plumbing', 'network', 'heating', 'electrical', 'ventilation', 'architecture']
You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline.

Document:
context="
"""


def ingest(file_path: str) -> list:
    """Load a ``.txt`` or ``.pdf`` file and split it into chunks.

    Args:
        file_path: Path of the document to load.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` with a
        chunk size of 1000 and no overlap.

    Raises:
        NotImplementedError: If the file extension is neither ``.txt`` nor
            ``.pdf`` (compared case-insensitively).
    """
    # splitext only looks at the final path component, so dots in directory
    # names and extensionless files literally named "pdf"/"txt" are not
    # mistaken for a supported type.
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        loader = UnstructuredPDFLoader(file_path)
    elif ext == '.txt':
        loader = TextLoader(file_path)
    else:
        raise NotImplementedError('Only .txt or .pdf files are supported')

    # transform locally
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=[
            "\n\n",
            "\n",
            " ",
            ",",
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            # "\u200B",  # Zero-width space (Asian languages)
            # "\u3002",  # Ideographic full stop (Asian languages)
            "",
        ],
    )
    return text_splitter.split_documents(documents)


def _parse_json_response(content):
    """Decode the model reply, tolerating text wrapped around the JSON object.

    Raises:
        ValueError: If ``content`` is ``None`` (no reply text), or -- as
            ``json.JSONDecodeError`` -- if no valid JSON can be decoded.
    """
    if content is None:
        raise ValueError("The LLM returned no content; expected a JSON object")
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Replies are sometimes wrapped in a Markdown code fence or
        # surrounded by prose; retry on the outermost {...} span before
        # giving up.
        start = content.find("{")
        end = content.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(content[start:end + 1])


def extract_metadata(docs):
    """Ask the LLM for title, summary and discipline of the given chunks.

    Args:
        docs: Iterable of chunks exposing a ``page_content`` string, such as
            the return value of :func:`ingest`.

    Returns:
        The decoded JSON value of the model's answer.

    Raises:
        KeyError: If ``TOGETHER_API_KEY`` is not set in the environment.
        ValueError: If the reply is missing or is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    # plain text: concatenate all chunks, dropping blank lines and ".." runs
    context = "".join(
        doc.page_content.replace('\n\n', '').replace('..', '') for doc in docs
    )
    # The trailing double quote closes the `context="` opened by the template.
    prompt = f'{prompt_template}{context}"'

    # Create client (OpenAI-compatible endpoint hosted by Together)
    client = openai.OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=os.environ["TOGETHER_API_KEY"],
    )

    # Call the LLM; JSON output is requested through the prompts only.
    chat_completion = client.chat.completions.create(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that responsds in JSON format",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )

    return _parse_json_response(chat_completion.choices[0].message.content)


def main() -> None:
    """Command-line entry point: print the metadata of one document as JSON."""
    parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
    parser.add_argument("document", metavar="FILEPATH", type=str,
                        help="Path to the BIM document")
    args = parser.parse_args()

    # isfile() is already False for missing paths, so one check suffices.
    if not os.path.isfile(args.document):
        print("File '{}' not found or not accessible.".format(args.document),
              file=sys.stderr)
        sys.exit(-1)

    docs = ingest(args.document)
    metadata = extract_metadata(docs)
    print(json.dumps(metadata, indent=2))


if __name__ == "__main__":
    main()