Carlos Salgado committed on
Commit
24dc52a
1 Parent(s): bbe64b5

move requirements and app to root, rewrite basic app

Browse files
.github/workflows/hugging_face.yml CHANGED
@@ -18,5 +18,5 @@ jobs:
18
  - name: Push to hub
19
  env:
20
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
21
- run: git push https://AIhackathons:$HF_TOKEN@huggingface.co/spaces/AIhackathons/docverifyrag main
22
 
 
18
  - name: Push to hub
19
  env:
20
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
21
+ run: git push https://salgadev:$HF_TOKEN@huggingface.co/spaces/salgadev/docverifyrag main
22
 
app.py CHANGED
@@ -1,219 +1,22 @@
1
- import time
2
- import streamlit as st
3
- from PyPDF2 import PdfReader
4
- from langchain.text_splitter import CharacterTextSplitter
5
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
- from langchain.vectorstores import FAISS
7
- from langchain.chat_models import ChatOpenAI
8
- from langchain.memory import ConversationBufferMemory
9
- from langchain.chains import ConversationalRetrievalChain
10
- import os
11
- import pickle
12
- from datetime import datetime
13
- from backend.generate_metadata import generate_metadata, ingest
14
-
15
-
16
- css = '''
17
- <style>
18
- .chat-message {
19
- padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
20
- }
21
- .chat-message.user {
22
- background-color: #2b313e
23
- }
24
- .chat-message.bot {
25
- background-color: #475063
26
- }
27
- .chat-message .avatar {
28
- width: 20%;
29
- }
30
- .chat-message .avatar img {
31
- max-width: 78px;
32
- max-height: 78px;
33
- border-radius: 50%;
34
- object-fit: cover;
35
- }
36
- .chat-message .message {
37
- width: 80%;
38
- padding: 0 1.5rem;
39
- color: #fff;
40
- }
41
- '''
42
- bot_template = '''
43
- <div class="chat-message bot">
44
- <div class="avatar">
45
- <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png"
46
- style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
47
- </div>
48
- <div class="message">{{MSG}}</div>
49
- </div>
50
- '''
51
- user_template = '''
52
- <div class="chat-message user">
53
- <div class="avatar">
54
- <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
55
- </div>
56
- <div class="message">{{MSG}}</div>
57
- </div>
58
- '''
59
-
60
-
61
- def get_pdf_text(pdf_docs):
62
- text = ""
63
- for pdf in pdf_docs:
64
- pdf_reader = PdfReader(pdf)
65
- for page in pdf_reader.pages:
66
- text += page.extract_text()
67
- return text
68
-
69
-
70
- def get_text_chunks(text):
71
- text_splitter = CharacterTextSplitter(
72
- separator="\n",
73
- chunk_size=1000,
74
- chunk_overlap=200,
75
- length_function=len
76
- )
77
- chunks = text_splitter.split_text(text)
78
- return chunks
79
-
80
-
81
- def get_vectorstore(text_chunks):
82
- embeddings = OpenAIEmbeddings()
83
- # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
84
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
85
- return vectorstore
86
-
87
-
88
- def get_conversation_chain(vectorstore):
89
- llm = ChatOpenAI()
90
- # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
91
-
92
- memory = ConversationBufferMemory(
93
- memory_key='chat_history', return_messages=True)
94
- conversation_chain = ConversationalRetrievalChain.from_llm(
95
- llm=llm,
96
- retriever=vectorstore.as_retriever(),
97
- memory=memory
98
- )
99
- return conversation_chain
100
-
101
-
102
- def handle_userinput(user_question):
103
- response = st.session_state.conversation({'question': user_question})
104
- st.session_state.chat_history = response['chat_history']
105
-
106
- for i, message in enumerate(st.session_state.chat_history):
107
- # Display user message
108
- if i % 2 == 0:
109
- st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
110
- else:
111
- print(message)
112
- # Display AI response
113
- st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
114
-
115
- # THIS DOESNT WORK, SOMEONE PLS FIX
116
- # Display source document information if available in the message
117
- if hasattr(message, 'source') and message.source:
118
- st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
119
-
120
-
121
- def safe_vec_store():
122
- # USE VECTARA INSTEAD
123
- os.makedirs('vectorstore', exist_ok=True)
124
- filename = 'vectores' + datetime.now().strftime('%Y%m%d%H%M') + '.pkl'
125
- file_path = os.path.join('vectorstore', filename)
126
- vector_store = st.session_state.vectorstore
127
-
128
- # Serialize and save the entire FAISS object using pickle
129
- with open(file_path, 'wb') as f:
130
- pickle.dump(vector_store, f)
131
-
132
-
133
- def main():
134
- st.set_page_config(page_title="Doc Verify RAG", page_icon=":hospital:")
135
- st.write(css, unsafe_allow_html=True)
136
- if "openai_api_key" not in st.session_state:
137
- st.session_state.openai_api_key = False
138
- if "openai_org" not in st.session_state:
139
- st.session_state.openai_org = False
140
- if "classify" not in st.session_state:
141
- st.session_state.classify = False
142
- def set_pw():
143
- st.session_state.openai_api_key = True
144
- st.subheader("Your documents")
145
- # OPENAI_ORG_ID = st.text_input("OPENAI ORG ID:")
146
- OPENAI_API_KEY = st.text_input("OPENAI API KEY:", type="password",
147
- disabled=st.session_state.openai_api_key, on_change=set_pw)
148
- if st.session_state.classify:
149
- pdf_doc = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=False)
150
- else:
151
- pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
152
- filenames = [file.name for file in pdf_docs if file is not None]
153
- if st.button("Process"):
154
- with st.spinner("Processing"):
155
- if st.session_state.classify:
156
- # THE CLASSIFICATION APP
157
- st.write("Classifying")
158
- plain_text_doc = ingest(pdf_doc.name)
159
- classification_result = generate_metadata(plain_text_doc)
160
- st.write(classification_result)
161
- else:
162
- # NORMAL RAG
163
- loaded_vec_store = None
164
- for filename in filenames:
165
- if ".pkl" in filename:
166
- file_path = os.path.join('vectorstore', filename)
167
- with open(file_path, 'rb') as f:
168
- loaded_vec_store = pickle.load(f)
169
- raw_text = get_pdf_text(pdf_docs)
170
- text_chunks = get_text_chunks(raw_text)
171
- vec = get_vectorstore(text_chunks)
172
- if loaded_vec_store:
173
- vec.merge_from(loaded_vec_store)
174
- st.warning("loaded vectorstore")
175
- if "vectorstore" in st.session_state:
176
- vec.merge_from(st.session_state.vectorstore)
177
- st.warning("merged to existing")
178
- st.session_state.vectorstore = vec
179
- st.session_state.conversation = get_conversation_chain(vec)
180
- st.success("data loaded")
181
-
182
-
183
- if "conversation" not in st.session_state:
184
- st.session_state.conversation = None
185
- if "chat_history" not in st.session_state:
186
- st.session_state.chat_history = None
187
-
188
- st.header("Doc Verify RAG :hospital:")
189
- user_question = st.text_input("Ask a question about your documents:")
190
- if user_question:
191
- handle_userinput(user_question)
192
- with st.sidebar:
193
-
194
- st.subheader("Classification Instrucitons")
195
- classifier_docs = st.file_uploader("Upload your instructions here and click on 'Process'", accept_multiple_files=True)
196
- filenames = [file.name for file in classifier_docs if file is not None]
197
-
198
- if st.button("Process Classification"):
199
- st.session_state.classify = True
200
- with st.spinner("Processing"):
201
- st.warning("set classify")
202
- time.sleep(3)
203
-
204
-
205
- # Save and Load Embeddings
206
- if st.button("Save Embeddings"):
207
- if "vectorstore" in st.session_state:
208
- safe_vec_store()
209
- # st.session_state.vectorstore.save_local("faiss_index")
210
- st.sidebar.success("saved")
211
- else:
212
- st.sidebar.warning("No embeddings to save. Please process documents first.")
213
-
214
- if st.button("Load Embeddings"):
215
- st.warning("this function is not in use, just upload the vectorstore")
216
-
217
-
218
- if __name__ == '__main__':
219
- main()
 
1
+ import streamlit as st
2
+ import io
3
+ import tempfile
4
+
5
+ from scripts import generate_metadata, ingest
6
+
7
+
8
+ st.title('PDF to Text Converter')
9
+ st.write('This app converts a PDF file to plain text.')
10
+
11
+ uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf","txt"])
12
+
13
+ if uploaded_file is not None:
14
+ try:
15
+ file_ext = uploaded_file.name.split('.')[-1].lower()
16
+ pdf_file = io.BytesIO(uploaded_file.read())
17
+ docs = ingest(pdf_file, file_ext)
18
+ metadata = generate_metadata(docs)
19
+ st.write('## Converted Text')
20
+ st.write(metadata)
21
+ except Exception as e:
22
+ st.error(f'Error: {e}')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/generate_metadata.py DELETED
@@ -1,102 +0,0 @@
1
- import os
2
- import io
3
- import argparse
4
- import json
5
- import openai
6
- import sys
7
- from dotenv import load_dotenv
8
- from langchain_community.document_loaders import TextLoader
9
- from langchain_community.document_loaders import UnstructuredPDFLoader
10
- from langchain_community.embeddings.fake import FakeEmbeddings
11
- from langchain_text_splitters import RecursiveCharacterTextSplitter
12
-
13
- load_dotenv()
14
-
15
-
16
- import io
17
-
18
- def ingest(file_obj, file_ext='pdf'):
19
- if file_ext == 'pdf':
20
- loader = UnstructuredPDFLoader(file_obj)
21
- elif file_ext == 'txt':
22
- loader = TextLoader(file_obj)
23
- else:
24
- raise NotImplementedError('Only .txt or .pdf files are supported')
25
-
26
- # transform locally
27
- documents = loader.load()
28
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0,
29
- separators=[
30
- "\n\n",
31
- "\n",
32
- " ",
33
- ",",
34
- "\uff0c", # Fullwidth comma
35
- "\u3001", # Ideographic comma
36
- "\uff0e", # Fullwidth full stop
37
- # "\u200B", # Zero-width space (Asian languages)
38
- # "\u3002", # Ideographic full stop (Asian languages)
39
- "",
40
- ])
41
- docs = text_splitter.split_documents(documents)
42
-
43
- return docs
44
-
45
-
46
- def generate_metadata(docs):
47
- prompt_template = """
48
- BimDiscipline = ['plumbing', 'network', 'heating', 'electrical', 'ventilation', 'architecture']
49
-
50
- You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the filename, a short description, and the engineering discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
51
-
52
- Analyze the provided document, which could be in either German or English. Extract the filename, its description, and infer the engineering discipline it belongs to. Document:
53
- context="
54
- """
55
- # plain text
56
- filepath = [doc.metadata for doc in docs][0]['source']
57
- context = "".join(
58
- [doc.page_content.replace('\n\n','').replace('..','') for doc in docs])
59
-
60
- prompt = f'{prompt_template}{context}"\nFilepath:{filepath}'
61
-
62
- #print(prompt)
63
-
64
- # Create client
65
- client = openai.OpenAI(
66
- base_url="https://api.together.xyz/v1",
67
- api_key=os.environ["TOGETHER_API_KEY"],
68
- #api_key=userdata.get('TOGETHER_API_KEY'),
69
- )
70
-
71
- # Call the LLM with the JSON schema
72
- chat_completion = client.chat.completions.create(
73
- model="mistralai/Mixtral-8x7B-Instruct-v0.1",
74
- messages=[
75
- {
76
- "role": "system",
77
- "content": f"You are a helpful assistant that responsds in JSON format"
78
- },
79
- {
80
- "role": "user",
81
- "content": prompt
82
- }
83
- ]
84
- )
85
-
86
- return json.loads(chat_completion.choices[0].message.content)
87
-
88
-
89
- if __name__ == "__main__":
90
- parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
91
- parser.add_argument("document", metavar="FILEPATH", type=str,
92
- help="Path to the BIM document")
93
-
94
- args = parser.parse_args()
95
-
96
- if not os.path.exists(args.document) or not os.path.isfile(args.document):
97
- print("File '{}' not found or not accessible.".format(args.document))
98
- sys.exit(-1)
99
-
100
- docs = ingest(args.document)
101
- metadata = generate_metadata(docs)
102
- print(metadata)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/requirements.txt → requirements.txt RENAMED
File without changes