Carlos Salgado committed on
Commit
d9ef11d
1 Parent(s): a0ad413

implement missing features in app.py, add retriever with summarization to scripts

Browse files
Files changed (2) hide show
  1. app.py +42 -21
  2. scripts.py +48 -2
app.py CHANGED
@@ -3,29 +3,50 @@ import os
3
  import streamlit as st
4
  import tempfile
5
 
6
- from scripts import generate_metadata, ingest, MODEL_NAME
7
 
8
 
9
- st.title('DocVerifyRAG')
10
- st.write('Anomaly detection for BIM document metadata')
11
 
12
- uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf","txt"])
13
 
14
- if uploaded_file is not None:
 
 
 
 
15
  try:
16
- with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp:
17
- tmp.write(uploaded_file.read())
18
- file_path = tmp.name
19
- st.write(f'Created temporary file {file_path}')
20
-
21
- docs = ingest(file_path)
22
- st.write('## Querying Together.ai API')
23
- metadata = generate_metadata(docs)
24
- st.write(f'## Metadata Generated by {MODEL_NAME}')
25
- st.write(metadata)
26
-
27
- # Clean up the temporary file
28
- os.remove(file_path)
29
-
30
- except Exception as e:
31
- st.error(f'Error: {e}')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import streamlit as st
4
  import tempfile
5
 
6
+ from scripts import analyze_metadata, generate_metadata, ingest, MODEL_NAME
7
 
8
 
9
+ st.title('# DocVerifyRAG')
10
+ st.write('## Anomaly detection for BIM document metadata')
11
 
12
+ st.write('### Enter your file metadata in the following schema:')
13
 
14
+ user_input = st.text_input(
15
+ label='Filename, Description, Discipline',
16
+ value="", placeholder=str)
17
+
18
+ if st.button('Submit'):
19
  try:
20
+ filename, description, discipline = user_input.split(',')
21
+
22
+ st.write('## Analyzing with Vectara + together.ai')
23
+ analysis = analyze_metadata(filename, description, discipline)
24
+
25
+ st.write(analysis)
26
+
27
+ st.write('## Generate metadata?')
28
+ st.write('### Upload the file that corresponds to the submitted metadata')
29
+
30
+ uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf","txt"])
31
+
32
+ if uploaded_file is not None:
33
+ try:
34
+ with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp:
35
+ tmp.write(uploaded_file.read())
36
+ file_path = tmp.name
37
+ st.write(f'Created temporary file {file_path}')
38
+
39
+ docs = ingest(file_path)
40
+ st.write('## Querying Together.ai API')
41
+ metadata = generate_metadata(docs)
42
+ st.write(f'## Metadata Generated by {MODEL_NAME}')
43
+ st.write(metadata)
44
+
45
+ # Clean up the temporary file
46
+ os.remove(file_path)
47
+
48
+ except Exception as e:
49
+ st.error(f'Error: {e}')
50
+ except ValueError:
51
+ st.error('Please enter 3 comma separated values')
52
+
scripts.py CHANGED
@@ -7,13 +7,54 @@ import sys
7
  from dotenv import load_dotenv
8
  from langchain_community.document_loaders import TextLoader
9
  from langchain_community.document_loaders import UnstructuredPDFLoader
10
- from langchain_community.embeddings.fake import FakeEmbeddings
 
 
 
 
 
11
  from langchain_text_splitters import RecursiveCharacterTextSplitter
12
 
 
13
  load_dotenv()
14
 
15
  MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def ingest(file_path):
19
  extension = os.path.splitext(file_path)[1].lower()
@@ -52,7 +93,7 @@ def generate_metadata(docs):
52
 
53
  You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the filename, a short description, and the engineering discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
54
 
55
- Analyze the provided document, which could be in either German or English. Extract the filename, its description, and infer the engineering discipline it belongs to. Document:
56
  context="
57
  """
58
  # plain text
@@ -89,6 +130,11 @@ def generate_metadata(docs):
89
  return json.loads(chat_completion.choices[0].message.content)
90
 
91
 
 
 
 
 
 
92
  if __name__ == "__main__":
93
  parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
94
  parser.add_argument("document", metavar="FILEPATH", type=str,
 
7
  from dotenv import load_dotenv
8
  from langchain_community.document_loaders import TextLoader
9
  from langchain_community.document_loaders import UnstructuredPDFLoader
10
+ from langchain_community.embeddings import HuggingFaceEmbeddings
11
+ from langchain_community.vectorstores import Vectara
12
+ from langchain_core.output_parsers import StrOutputParser
13
+ from langchain_core.prompts import ChatPromptTemplate
14
+ from langchain_core.runnables import RunnablePassthrough
15
+ from langchain.prompts import PromptTemplate
16
  from langchain_text_splitters import RecursiveCharacterTextSplitter
17
 
18
+
19
  load_dotenv()
20
 
21
  MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
22
 
23
+ vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
24
+ vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
25
+ vectara_api_key = os.environ['VECTARA_API_KEY']
26
+
27
+ embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
28
+
29
+ vectara = Vectara(vectara_customer_id=vectara_customer_id,
30
+ vectara_corpus_id=vectara_corpus_id,
31
+ vectara_api_key=vectara_api_key)
32
+
33
+
34
+ summary_config = {"is_enabled": True, "max_results": 3, "response_lang": "eng"}
35
+ retriever = vectara.as_retriever(
36
+ search_kwargs={"k": 3, "summary_config": summary_config}
37
+ )
38
+
39
+ template = """
40
+ passage: You are a helpful assistant that understands BIM building documents.
41
+ passage: You will analyze BIM document metadata composed of filename, description, and engineering discipline.
42
+ passage: The metadata is written in German.
43
+ passage: Filename: {filename}, Description: {description}, Engineering discipline: {discipline}.
44
+ query: Does the filename match other filenames within the same discipline?
45
+ query: Does the description match the engineering discipline?
46
+ query: How different is the metadata to your curated information?
47
+ query: Highligh any discrepancies and comment on wether or not the metadata is anomalous.
48
+ """
49
+
50
+ prompt = PromptTemplate(template=template, input_variables=['filename', 'description', 'discipline'])
51
+
52
+
53
+ def get_sources(documents):
54
+ return documents[:-1]
55
+
56
+ def get_summary(documents):
57
+ return documents[-1].page_content
58
 
59
  def ingest(file_path):
60
  extension = os.path.splitext(file_path)[1].lower()
 
93
 
94
  You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the filename, a short description, and the engineering discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
95
 
96
+ Analyze the provided document, which could be in either German or English. Extract the filename, its description, and infer the engineering discipline it belongs to. Respond in both English and German. Document:
97
  context="
98
  """
99
  # plain text
 
130
  return json.loads(chat_completion.choices[0].message.content)
131
 
132
 
133
+ def analyze_metadata(filename, description, discipline):
134
+ formatted_prompt = prompt.format(filename=filename, description=description, discipline=discipline)
135
+ return (retriever | get_summary).invoke(formatted_prompt)
136
+
137
+
138
  if __name__ == "__main__":
139
  parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
140
  parser.add_argument("document", metavar="FILEPATH", type=str,