Rams901 committed on
Commit 33a7c5b
0 Parent(s):

Duplicate from Rams901/rent-qa

Files changed (6)
  1. .gitattributes +36 -0
  2. README.md +13 -0
  3. app.py +278 -0
  4. rent_data/index.faiss +3 -0
  5. rent_data/index.pkl +3 -0
  6. requirements.txt +10 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ availability.json filter=lfs diff=lfs merge=lfs -text
+ rent_data/index.faiss filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Rent QA
+ emoji: null
+ colorFrom: blue
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 3.27.0
+ app_file: app.py
+ pinned: false
+ duplicated_from: Rams901/rent-qa
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,278 @@
+ import gradio as gr
+ import numpy as np
+ from langchain.document_loaders import UnstructuredPDFLoader
+ from langchain.indexes import VectorstoreIndexCreator
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.chains import LLMChain
+ from langchain import PromptTemplate
+ import re
+ import pandas as pd
+ from langchain.vectorstores import FAISS
+ import requests
+ from bs4 import BeautifulSoup
+ from typing import List
+ from langchain.document_loaders import YoutubeLoader
+ from langchain.schema import (
+     SystemMessage,
+     HumanMessage,
+     AIMessage
+ )
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.chat_models import ChatOpenAI
+
+
+ CHARACTER_CUT_OFF = 20000
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+
+ # Pre-built FAISS index shipped with the Space (rent_data/index.faiss + index.pkl)
+ embeddings = HuggingFaceEmbeddings()
+ db = FAISS.load_local('rent_data', embeddings)
+
+ llm = ChatOpenAI(
+     temperature=0,
+     model='gpt-3.5-turbo'
+ )
+
+ def remove_tags(soup: BeautifulSoup) -> str:
+     # Drop <style> and <script> tags entirely, then return the visible text
+     for data in soup(["style", "script"]):
+         data.decompose()
+     return " ".join(soup.stripped_strings)
+
+ def read_webpage(url: str) -> str:
+     print(f"Getting the response from url: {url}")
+     response = requests.get(url)
+     html_content = response.content
+
+     # Parse the HTML content using BeautifulSoup
+     soup = BeautifulSoup(html_content, "html.parser")
+
+     # Get all the text content from the relevant HTML tags
+     text_content = remove_tags(soup)
+
+     # for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "div"]:
+     #     for element in soup.find_all(tag):
+     #         text_content += element.get_text() + " "
+
+     print(text_content)
+     return text_content
+
+ def grab_transcript(url):
+     # Pull the transcript of a YouTube video as a single document
+     loader = YoutubeLoader.from_youtube_url(url)
+     transcript = loader.load()
+     return transcript[0].page_content
+
+ def process_webpages(urls: List[str]):
+     # A set to keep track of visited pages
+     visited_pages = set()
+     content = []
+
+     for url in urls:
+         aggregated_text = ""
+
+         try:
+             if 'youtube' not in url:
+                 visited_pages.add(url)
+                 aggregated_text += f"\nGetting the content of {url}:\n"
+
+                 try:
+                     aggregated_text += read_webpage(url)
+                 except Exception as e:
+                     print(e)
+                     aggregated_text += "No content found"
+             else:
+                 # YouTube URL: grab the transcript instead of scraping HTML
+                 aggregated_text += f"\nGetting the transcription of {url}:\n"
+                 aggregated_text += grab_transcript(url)
+
+         except Exception as e:
+             print(e)
+
+         content.append(aggregated_text)
+
+     return content
+
+ def extract_urls(text):
+     url_regex = r"(https?://\S+)"
+     urls = re.findall(url_regex, text)
+     return urls
+
+ def add_text(history, text):
+     # Append the user's message to the chat history; the bot reply is filled in later
+     print(history)
+     history = history + [(text, None)]
+     return history, ""
+
+ def create_db_from_urls(urls: List[str]) -> FAISS:
+     content = process_webpages(urls)
+     # 1K chunk size
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+     print(len(content), len(urls))
+     docs = text_splitter.create_documents(content)
+
+     # Merge the new documents with whatever is already in the global index
+     global db
+     if not isinstance(db, str):
+         docs += list(db.docstore._dict.values())
+
+     print(docs)
+     db = FAISS.from_documents(docs, embeddings)
+
+     return db
+
+ # Handle PDF uploads: split the files into chunks and rebuild the FAISS index
+ def add_file(history, files):
+     history = []
+     # With batch=True Gradio wraps each input in a list, so unwrap it first
+     files = files[0]
+     docs = []
+     for file in files:
+         loader = UnstructuredPDFLoader(file.name)
+         text = loader.load()
+         # pdf_content = pdf2text(file.name)
+         docs += text_splitter.split_documents(text)
+
+     # Keep any documents already present in the global index
+     global db
+     if not isinstance(db, str):
+         docs += list(db.docstore._dict.values())
+
+     print(docs)
+     history = history + [(f"{len(files)} PDF(s) Uploaded", None)]
+
+     db = FAISS.from_documents(docs, embeddings)
+     print(f"History in add file: {history}")
+     print(type(history), type(history[0]))
+     # batch=True expects each output wrapped in an extra list
+     return ([history],)
+
+ def qa_retrieve(chatlog):
+
+     print(f"Chatlog qa: {chatlog}")
+     query = chatlog[-1][0]
+     docs = ""
+     # Extracting urls from query
+     # urls = None
+
+     # if urls:
+     #     create_db_from_urls(urls)
+     #     global db
+     #     if not isinstance(db, str):
+     #         # local_db =
+     #         docs = list(db.docstore._dict.values())
+     #         print(docs)
+     #         db = FAISS.from_documents(docs, embeddings)
+     #         # db.merge_from(local_db)
+     #     else:
+     #         db = FAISS.from_documents(docs, embeddings)
+     global db
+     print(db)
+     # if isinstance(db, str):
+     #     discussion = [j for i in chatlog for j in i]
+     #     messages = [[SystemMessage(content="You are Wikibot. You are a Wikipedia assistant that can digest articles and answer questions based on your library of content.")]]
+     #     messages += [([HumanMessage(content=x[0])], AIMessage(content=x[-1])) for x in chatlog[:-1]]
+     #     messages = [j for i in messages for j in i]
+     #     print(messages)
+     #     messages.append(HumanMessage(content=chatlog[-1][0]))
+     #     print(messages)
+     #     response = llm(messages=messages).content
+     # else:
+
+     # Retrieve the four most similar chunks and stuff them into the prompt
+     docs = db.similarity_search(query, k=4)
+     docs_page_content = " ".join([d.page_content for d in docs])
+
+     prompt = PromptTemplate(
+         input_variables=["question", "docs"],
+         template="""
+         As a consultant, your role is to help the user analyse different cases of landlord and tenant behaviour.
+         You will answer questions about any of the information provided in the documents, giving as much help as possible
+         even when the information sought is not there. Your priority is to find the information the user asked for; if it
+         does not exist, reason about the best answer you can give from what you know.
+
+         Answer the following question: {question}
+         Use the following documents: {docs}
+
+         If you feel you don't have enough information to answer the question, say "I don't know".
+         """,
+     )
+
+     # llm = BardLLM()
+     chain = LLMChain(llm=llm, prompt=prompt)
+
+     response = chain.run(question=query, docs=docs_page_content)
+
+     chatlog[-1][1] = response
+     return chatlog
+
+ def flush():
+     # Reset the global index; the chatbot is cleared by returning None
+     global db
+     db = ""
+     return None
+
+ with gr.Blocks() as demo:
+     chatbot = gr.Chatbot([], elem_id="chatbot").style(height=750)
+
+     with gr.Row():
+         with gr.Column(scale=0.65):
+             txt = gr.components.Textbox(
+                 placeholder="Ask me anything", show_label=False
+             )
+
+         with gr.Column(scale=0.15, min_width=0):
+             btn = gr.UploadButton("📁", file_types=["text"], file_count='multiple')
+
+         # with gr.Row():
+         #     with gr.Column(scale=0.85):
+         #         url = gr.components.Textbox(
+         #             label="Website URLs",
+         #             placeholder="https://www.example.org/ https://www.example.com/",
+         #         )
+
+         with gr.Column(scale=0.15, min_width=0):
+             send_btn = gr.Button("📨")
+
+     with gr.Row():
+         with gr.Column():
+             clear = gr.Button("Clear")
+
+     # Hidden textbox used only as a dummy output slot for the lambda callbacks below
+     pdf_content = gr.Textbox("", visible=False)
+     txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
+         qa_retrieve, [chatbot], chatbot
+     ).then(lambda: None, outputs=[pdf_content])
+     btn.upload(add_file, [chatbot, btn], [chatbot], batch=True).then(qa_retrieve, [chatbot], chatbot)
+
+     send_btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
+         qa_retrieve, [chatbot], chatbot).then(lambda: None, outputs=[pdf_content])
+
+     clear.click(flush, None, outputs=chatbot, queue=False)
+
+ demo.queue(concurrency_count=4)
+ demo.launch()
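
Note that app.py never passes an API key to ChatOpenAI, so the model call relies on langchain picking up the standard OPENAI_API_KEY environment variable; on Spaces that would typically come from a repository secret (an assumption, since no secret is part of this commit). A minimal sketch of running the app outside Spaces with the key set in-process (the placeholder value is hypothetical):

    import os

    # ChatOpenAI() falls back to OPENAI_API_KEY when no key is passed explicitly
    os.environ.setdefault("OPENAI_API_KEY", "sk-REPLACE_ME")  # hypothetical placeholder

    import app  # importing app.py loads the index, builds the chain, and launches Gradio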
rent_data/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2944eec8a548a3d4c97a9f26d2a87054ede0b063f9139f2f788c37888f32b02
+ size 171764781
rent_data/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76630ed8bbc38dd10e113c088bca6a0531cd321469a3bcec99f427920468a35d
+ size 60863161
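
These two LFS files are the serialized vector store that app.py opens with FAISS.load_local('rent_data', embeddings); langchain's FAISS wrapper keeps the raw FAISS index in index.faiss and the pickled docstore/metadata in index.pkl. The source corpus itself is not part of this commit, so the following is only a sketch of how an index with this on-disk layout could have been produced, with rent_docs standing in for the real documents:

    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.vectorstores import FAISS

    rent_docs = ["...raw tenancy guidance text..."]  # hypothetical; the real corpus is not in the repo
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.create_documents(rent_docs)

    db = FAISS.from_documents(docs, HuggingFaceEmbeddings())
    db.save_local("rent_data")  # writes rent_data/index.faiss and rent_data/index.pkl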
requirements.txt ADDED
@@ -0,0 +1,10 @@
+
+ unstructured
+ pdf2image
+ langchain
+ gradio
+ openai
+ sentence_transformers
+ youtube-transcript-api
+ faiss-gpu
+ beautifulsoup4
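
With these requirements installed and the rent_data/ folder present, the retrieval half of the app can be smoke-tested without an OpenAI key. This is only a sketch mirroring what qa_retrieve does before handing the chunks to the LLM chain; the example query is made up:

    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    db = FAISS.load_local("rent_data", HuggingFaceEmbeddings())
    for doc in db.similarity_search("What notice must a landlord give before ending a tenancy?", k=4):
        print(doc.page_content[:200])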