import os
import argparse
import json
import sys

import openai
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Vectara
from langchain.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter


load_dotenv()

MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"

vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
vectara_api_key = os.environ['VECTARA_API_KEY']

# Note: Vectara embeds documents server-side; this local embedding model is
# initialized here but not passed to the vector store below.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")

vectara = Vectara(vectara_customer_id=vectara_customer_id,
                      vectara_corpus_id=vectara_corpus_id,
                      vectara_api_key=vectara_api_key)


# Retrieve the top 3 matching passages from Vectara plus a generated English summary.
summary_config = {"is_enabled": True, "max_results": 3, "response_lang": "eng"}
retriever = vectara.as_retriever(
    search_kwargs={"k": 3, "summary_config": summary_config}
)

template = """
passage: You are a helpful assistant that understands BIM building documents.
passage: You will analyze BIM document metadata composed of filename, description, and engineering discipline.
passage: The metadata is written in German.
passage: Filename: {filename}, Description: {description}, Engineering discipline: {discipline}.
query: Does the filename match other filenames within the same discipline?
query: Does the description match the engineering discipline?
query: How different is the metadata to your curated information?
query: Highlight any discrepancies and comment on whether or not the metadata is anomalous.
"""

prompt = PromptTemplate(template=template, input_variables=['filename', 'description', 'discipline'])


def get_sources(documents):
    # With summaries enabled, Vectara returns the matched passages first and the
    # generated summary as the last document.
    return documents[:-1]

def get_summary(documents):
    return documents[-1].page_content

def ingest(file_path):
    # Load the PDF with PyPDFLoader, falling back to UnstructuredPDFLoader on failure.
    try:
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        print('Loaded with PyPDFLoader')
    except Exception as e:
        print(f'PyPDFLoader failed: {e}')
        loader = UnstructuredPDFLoader(file_path)
        documents = loader.load()
        print('Loaded with UnstructuredPDFLoader')

    # Split locally into chunks small enough for the LLM context window.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=[
            "\n\n",
            "\n",
            " ",
            ",",
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            # "\u200B",  # Zero-width space (Asian languages)
            # "\u3002",  # Ideographic full stop (Asian languages)
            "",
        ],
    )
    return text_splitter.split_documents(documents)


def generate_metadata(docs):
    prompt_template = """
    BimDiscipline = ['plumbing', 'network', 'heating', 'electrical', 'ventilation', 'architecture']

    You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the filename, a short description, and the engineering discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."

    Analyze the provided document, which could be in either German or English. Extract the filename, its description, and infer the engineering discipline it belongs to. Document:
    context="
    """     
    # Build a plain-text context: the source file path plus the concatenated page contents.
    filepath = docs[0].metadata['source']
    context = "".join(
        [doc.page_content.replace('\n\n', '').replace('..', '') for doc in docs])

    prompt = f'{prompt_template}{context}"\nFilepath:{filepath}'

    #print(prompt)
    
    # Create client
    client = openai.OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=os.environ["TOGETHER_API_KEY"],
        #api_key=userdata.get('TOGETHER_API_KEY'),    
    )

    # Call the LLM with the JSON schema
    chat_completion = client.chat.completions.create(
        model=MODEL_NAME,        
        messages=[
            {
                "role": "system",
                "content": f"You are a helpful assistant that responsds in JSON format"                
            },
            {
                "role": "user",
                "content": prompt                                
            }
        ]
    )
    return chat_completion.choices[0].message.content

    #return json.loads(chat_completion.choices[0].message.content)    


def analyze_metadata(filename, description, discipline):
    formatted_prompt = prompt.format(filename=filename, description=description, discipline=discipline)
    return (retriever | get_summary).invoke(formatted_prompt)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
    parser.add_argument("document", metavar="FILEPATH", type=str,
                        help="Path to the BIM document")

    args = parser.parse_args()

    if not os.path.exists(args.document) or not os.path.isfile(args.document):
        print("File '{}' not found or not accessible.".format(args.document))
        sys.exit(-1)

    docs = ingest(args.document)
    metadata = generate_metadata(docs)
    print(metadata)
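
    # Example follow-up (sketch, not part of the original flow): generate_metadata is
    # expected to return a JSON string, so it could be parsed and handed to
    # analyze_metadata for a Vectara-backed plausibility check. The field names
    # "filename", "description", and "discipline" are assumptions about the model's
    # output schema, not something this script enforces.
    # parsed = json.loads(metadata)
    # report = analyze_metadata(parsed["filename"], parsed["description"], parsed["discipline"])
    # print(report)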