SANDRAMSC committed on
Commit ce0ced2
1 Parent(s): 19f7272

Initial commit

frontend/app.py DELETED
@@ -1,210 +0,0 @@
- import time
- import streamlit as st
- from dotenv import load_dotenv
- from PyPDF2 import PdfReader
- from langchain.text_splitter import CharacterTextSplitter
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
- from langchain.vectorstores import FAISS
- from langchain.chat_models import ChatOpenAI
- from langchain.memory import ConversationBufferMemory
- from langchain.chains import ConversationalRetrievalChain
- import os
- import pickle
- from datetime import datetime
- from backend.generate_metadata import extract_metadata, ingest
-
-
- css = '''
- <style>
- .chat-message {
-     padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
- }
- .chat-message.user {
-     background-color: #2b313e
- }
- .chat-message.bot {
-     background-color: #475063
- }
- .chat-message .avatar {
-   width: 20%;
- }
- .chat-message .avatar img {
-   max-width: 78px;
-   max-height: 78px;
-   border-radius: 50%;
-   object-fit: cover;
- }
- .chat-message .message {
-   width: 80%;
-   padding: 0 1.5rem;
-   color: #fff;
- }
- '''
- bot_template = '''
- <div class="chat-message bot">
-     <div class="avatar">
-         <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
-     </div>
-     <div class="message">{{MSG}}</div>
- </div>
- '''
- user_template = '''
- <div class="chat-message user">
-     <div class="avatar">
-         <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
-     </div>
-     <div class="message">{{MSG}}</div>
- </div>
- '''
-
-
- def get_pdf_text(pdf_docs):
-     text = ""
-     for pdf in pdf_docs:
-         pdf_reader = PdfReader(pdf)
-         for page in pdf_reader.pages:
-             text += page.extract_text()
-     return text
-
-
- def get_text_chunks(text):
-     text_splitter = CharacterTextSplitter(
-         separator="\n",
-         chunk_size=1000,
-         chunk_overlap=200,
-         length_function=len
-     )
-     chunks = text_splitter.split_text(text)
-     return chunks
-
-
- def get_vectorstore(text_chunks):
-     embeddings = OpenAIEmbeddings()
-     # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
-     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-     return vectorstore
-
-
- def get_conversation_chain(vectorstore):
-     llm = ChatOpenAI()
-     # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
-
-     memory = ConversationBufferMemory(
-         memory_key='chat_history', return_messages=True)
-     conversation_chain = ConversationalRetrievalChain.from_llm(
-         llm=llm,
-         retriever=vectorstore.as_retriever(),
-         memory=memory
-     )
-     return conversation_chain
-
-
- def handle_userinput(user_question):
-     response = st.session_state.conversation({'question': user_question})
-     st.session_state.chat_history = response['chat_history']
-
-     for i, message in enumerate(st.session_state.chat_history):
-         # Display user message
-         if i % 2 == 0:
-             st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
-         else:
-             print(message)
-             # Display AI response
-             st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
-
-             # THIS DOESNT WORK, SOMEONE PLS FIX
-             # Display source document information if available in the message
-             if hasattr(message, 'source') and message.source:
-                 st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
-
-
-
- def safe_vec_store():
-     # USE VECTARA INSTEAD
-     os.makedirs('vectorstore', exist_ok=True)
-     filename = 'vectores' + datetime.now().strftime('%Y%m%d%H%M') + '.pkl'
-     file_path = os.path.join('vectorstore', filename)
-     vector_store = st.session_state.vectorstore
-
-     # Serialize and save the entire FAISS object using pickle
-     with open(file_path, 'wb') as f:
-         pickle.dump(vector_store, f)
-
-
- def main():
-     load_dotenv()
-     st.set_page_config(page_title="Doc Verify RAG", page_icon=":hospital:")
-     st.write(css, unsafe_allow_html=True)
-     st.session_state.classify = False
-     st.subheader("Your documents")
-     pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=not st.session_state.classify)
-     filenames = [file.name for file in pdf_docs if file is not None]
-
-     if st.button("Process"):
-         with st.spinner("Processing"):
-             if st.session_state.classify:
-                 # THE CLASSIFICATION APP
-                 plain_text_doc = ingest(pdf_docs)
-
-             # NORMAL RAG
-             loaded_vec_store = None
-             for filename in filenames:
-                 if ".pkl" in filename:
-                     file_path = os.path.join('vectorstore', filename)
-                     with open(file_path, 'rb') as f:
-                         loaded_vec_store = pickle.load(f)
-             raw_text = get_pdf_text(pdf_docs)
-             text_chunks = get_text_chunks(raw_text)
-             vec = get_vectorstore(text_chunks)
-             if loaded_vec_store:
-                 vec.merge_from(loaded_vec_store)
-                 st.warning("loaded vectorstore")
-             if "vectorstore" in st.session_state:
-                 vec.merge_from(st.session_state.vectorstore)
-                 st.warning("merged to existing")
-             st.session_state.vectorstore = vec
-             st.session_state.conversation = get_conversation_chain(vec)
-             st.success("data loaded")
-             if st.session_state.classify:
-                 # THE CLASSIFICATION APP
-                 classification_result = extract_metadata(plain_text_doc)
-                 st.write(classification_result)
-
-
-     if "conversation" not in st.session_state:
-         st.session_state.conversation = None
-     if "chat_history" not in st.session_state:
-         st.session_state.chat_history = None
-
-     st.header("Doc Verify RAG :hospital:")
-     user_question = st.text_input("Ask a question about your documents:")
-     if user_question:
-         handle_userinput(user_question)
-
-     with st.sidebar:
-
-         st.subheader("Classification Instrucitons")
-         classifier_docs = st.file_uploader("Upload your instructions here and click on 'Process'", accept_multiple_files=True)
-         filenames = [file.name for file in classifier_docs if file is not None]
-
-         if st.button("Process Classification"):
-             with st.spinner("Processing"):
-                 st.session_state.classify = True
-                 time.sleep(3)
-
-
-         # Save and Load Embeddings
-         if st.button("Save Embeddings"):
-             if "vectorstore" in st.session_state:
-                 safe_vec_store()
-                 # st.session_state.vectorstore.save_local("faiss_index")
-                 st.sidebar.success("saved")
-             else:
-                 st.sidebar.warning("No embeddings to save. Please process documents first.")
-
-         if st.button("Load Embeddings"):
-             st.warning("this function is not in use, just upload the vectorstore")
-
-
- if __name__ == '__main__':
-     main()
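
The deleted handle_userinput reads a source attribute from each chat message, which LangChain message objects do not carry (hence the "THIS DOESNT WORK, SOMEONE PLS FIX" comment above). Below is a minimal sketch, not part of the commit, of one way to surface sources with the same LangChain APIs the file already imports; the output_key argument and the assumption that source metadata was attached when the FAISS store was built are additions, not something in the original code.

# Sketch only (not part of the commit): return source documents from the chain
# instead of reading a non-existent `message.source` attribute.
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory


def get_conversation_chain_with_sources(vectorstore):
    # output_key tells the memory which chain output to store, since the chain
    # now returns both 'answer' and 'source_documents'.
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True, output_key='answer')
    return ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(),
        retriever=vectorstore.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )


# Usage inside handle_userinput (sketch):
#   response = st.session_state.conversation({'question': user_question})
#   for doc in response['source_documents']:
#       st.write(f"Source Document: {doc.metadata.get('source', 'n/a')}")
# Note: doc.metadata['source'] is only populated if metadatas were passed to
# FAISS.from_texts when the vectorstore was built.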
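
Likewise, safe_vec_store pickles the entire FAISS wrapper into vectorstore/*.pkl while the "Load Embeddings" button is a stub. A hedged sketch of the save_local / load_local route that the commented-out save_local("faiss_index") line beside the Save button already hints at; the folder name faiss_index is carried over from that comment, not from running code.

# Sketch only (not part of the commit): persist the FAISS store with
# LangChain's built-in helpers instead of pickling the whole object.
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS


def save_vectorstore(vectorstore, path="faiss_index"):
    # Writes the FAISS index and its docstore into the given folder.
    vectorstore.save_local(path)


def load_vectorstore(path="faiss_index"):
    # The same embedding model used at build time must be supplied on load;
    # newer LangChain releases may also require allow_dangerous_deserialization=True.
    return FAISS.load_local(path, OpenAIEmbeddings())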
frontend/src/components/Features.tsx CHANGED
@@ -58,7 +58,7 @@ export const Features = () => {
  </CardHeader>
  <CardFooter className="flex flex-wrap md:justify-center gap-4">
  <iframe
- src="https://sandramsc-docverifyrag.hf.space"
+ src="https://aihackathons-docverifyrag.hf.space"
  width="850"
  style={{ border: 'none' }}
  height="750"