mayajwilson76 Raghav001 committed on
Commit
b6791db
•
0 Parent(s):

Duplicate from Raghav001/PDF


Co-authored-by: Raghavan <Raghav001@users.noreply.huggingface.co>

Files changed (4)
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +312 -0
  4. requirements.txt +8 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: ChatPDF
+ emoji: 💻
+ colorFrom: gray
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.20.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ duplicated_from: Raghav001/PDF
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,312 @@
+ import requests
+ import json
+ import gradio as gr
+ # from concurrent.futures import ThreadPoolExecutor
+ import pdfplumber
+ import pandas as pd
+ import langchain
+ import time
+ from cnocr import CnOcr
+ import pinecone
+ import openai
+ from langchain.vectorstores import Pinecone
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.text_splitter import CharacterTextSplitter
+
+ # from langchain.document_loaders import PyPDFLoader
+ from langchain.document_loaders import UnstructuredWordDocumentLoader
+ from langchain.document_loaders import UnstructuredPowerPointLoader
+ # from langchain.document_loaders.image import UnstructuredImageLoader
+
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain import OpenAI
+
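+ # Local embedding model: MiniLM with CLS pooling. It embeds both the document
+ # passages and each user query for the cosine-similarity retrieval further down.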
+ from sentence_transformers import SentenceTransformer, models, util
+ word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2', do_lower_case=True)
+ pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
+ embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+ ocr = CnOcr()
+ # chat_url = 'https://Raghav001-API.hf.space/sale'
+ chat_url = 'https://Raghav001-API.hf.space/chatpdf'
+ chat_emd = 'https://Raghav001-API.hf.space/embedd'
+ headers = {
+     'Content-Type': 'application/json',
+ }
+ # thread_pool_executor = ThreadPoolExecutor(max_workers=4)
+ history_max_len = 500
+ all_max_len = 3000
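+ # Character budgets for prompt assembly: at most history_max_len characters of
+ # recent chat history are replayed, and the whole request (question + history +
+ # retrieved passages) is capped at all_max_len characters.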
+
+ # Initialize the Pinecone client and connect to the existing "test" index
+ pinecone.init(api_key="ffb1f594-0915-4ebf-835f-c1eaa62fdcdc", environment="us-west4-gcp-free")
+ index = pinecone.Index(index_name="test")
+
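+ # Helper that fetches an embedding from the remote API Space. It is kept for
+ # reference; the app currently embeds locally with `embedder` instead (the only
+ # callers are in commented-out ThreadPoolExecutor code).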
+ def get_emb(text):
+     emb_url = 'https://Raghav001-API.hf.space/embeddings'
+     data = {"content": text}
+     try:
+         result = requests.post(url=emb_url,
+                                data=json.dumps(data),
+                                headers=headers
+                                )
+         print("--------------------------------Embeddings-----------------------------------")
+         print(result.json()['data'][0]['embedding'])
+         return result.json()['data'][0]['embedding']
+     except Exception as e:
+         # `result` may be unbound (or not valid JSON) if the request itself
+         # failed, so only report the payload and the error
+         print('get_emb failed', 'data:', data, 'error:', e)
+
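+ # Split the extracted text into one passage per line, embed every passage
+ # locally, and push the full document into Pinecone via pine().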
+ def doc_emb(doc: str):
+     texts = doc.split('\n')
+     # futures = []
+     emb_list = embedder.encode(texts)
+     print('emb_list', emb_list)
+     # for text in texts:
+     #     futures.append(thread_pool_executor.submit(get_emb, text))
+     # for f in futures:
+     #     emb_list.append(f.result())
+     print('\n'.join(texts))
+     pine(doc)
+     return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
+         value="""Success! Let's talk"""), gr.Chatbot.update(visible=True)
+
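+ # Build the chat request: keep as much recent history as fits in
+ # history_max_len, rank all passages by cosine similarity to the query, and
+ # pack the best ones (plus their neighbouring paragraphs, since the splitter
+ # may cut mid-thought) into the all_max_len budget.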
+ def get_response(msg, bot, doc_text_list, doc_embeddings):
+     # future = thread_pool_executor.submit(get_emb, msg)
+     now_len = len(msg)
+     req_json = {'question': msg}
+     his_bg = -1
+     for i in range(len(bot) - 1, -1, -1):
+         if now_len + len(bot[i][0]) + len(bot[i][1]) > history_max_len:
+             break
+         now_len += len(bot[i][0]) + len(bot[i][1])
+         his_bg = i
+     req_json['history'] = [] if his_bg == -1 else bot[his_bg:]
+     # query_embedding = future.result()
+     query_embedding = embedder.encode([msg])
+     cos_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
+     score_index = [[score, i] for i, score in enumerate(cos_scores)]
+     score_index.sort(key=lambda x: x[0], reverse=True)
+     print('score_index:\n', score_index)
+     index_set, sub_doc_list = set(), []
+     for s_i in score_index:
+         doc = doc_text_list[s_i[1]]
+         if now_len + len(doc) > all_max_len:
+             break
+         index_set.add(s_i[1])
+         now_len += len(doc)
+         # The paragraph may have been truncated awkwardly, so also add the
+         # paragraphs immediately above and below it when they fit
+         if s_i[1] > 0 and s_i[1] - 1 not in index_set:
+             doc = doc_text_list[s_i[1] - 1]
+             if now_len + len(doc) > all_max_len:
+                 break
+             index_set.add(s_i[1] - 1)
+             now_len += len(doc)
+         if s_i[1] + 1 < len(doc_text_list) and s_i[1] + 1 not in index_set:
+             doc = doc_text_list[s_i[1] + 1]
+             if now_len + len(doc) > all_max_len:
+                 break
+             index_set.add(s_i[1] + 1)
+             now_len += len(doc)
+
+     index_list = sorted(index_set)
+     for i in index_list:
+         sub_doc_list.append(doc_text_list[i])
+     req_json['doc'] = '' if len(sub_doc_list) == 0 else '\n'.join(sub_doc_list)
+     data = {"content": json.dumps(req_json)}
+     print('data:\n', req_json)
+     result = requests.post(url=chat_url,
+                            data=json.dumps(data),
+                            headers=headers
+                            )
+     res = result.json()['content']
+     bot.append([msg, res])
+     # Only the last three exchanges are returned to the Chatbot component
+     return bot[max(0, len(bot) - 3):]
+
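+ # Route each uploaded file by extension (pdf / docx / anything else is treated
+ # as pptx) and flatten everything into one list of text passages. PDFs get
+ # per-page text extraction, OCR on embedded images, and tables serialized as
+ # one JSON record per row.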
+ def up_file(fls):
+     doc_text_list = []
+
+     names = []
+     for i in fls:
+         names.append(str(i.name))
+     print(names)
+
+     pdf = []
+     docs = []
+     pptx = []
+
+     for i in names:
+         if i[-3:] == "pdf":
+             pdf.append(i)
+         elif i[-4:] == "docx":
+             docs.append(i)
+         else:
+             pptx.append(i)
+
+     # PDF extraction
+     for idx, file in enumerate(pdf):
+         # `pdf_file` avoids shadowing the `pdf` list being iterated
+         with pdfplumber.open(file) as pdf_file:
+             for i in range(len(pdf_file.pages)):
+                 # Read page i+1 of the PDF document
+                 page = pdf_file.pages[i]
+                 res_list = page.extract_text().split('\n')[:-1]
+
+                 for j in range(len(page.images)):
+                     # Dump the image's binary stream to a file and OCR it
+                     img = page.images[j]
+                     file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
+                     with open(file_name, mode='wb') as f:
+                         f.write(img['stream'].get_data())
+                     try:
+                         res = ocr.ocr(file_name)
+                         # res = PyPDFLoader(file_name)
+                     except Exception as e:
+                         res = []
+                     if len(res) > 0:
+                         res_list.append(' '.join([r['text'] for r in res]))
+
+                 tables = page.extract_tables()
+                 for table in tables:
+                     # The first row is used as the header
+                     df = pd.DataFrame(table[1:], columns=table[0])
+                     try:
+                         records = json.loads(df.to_json(orient="records", force_ascii=False))
+                         for rec in records:
+                             res_list.append(json.dumps(rec, ensure_ascii=False))
+                     except Exception as e:
+                         res_list.append(str(df))
+
+                 doc_text_list += res_list
+
+     # pptx extraction
+     for i in pptx:
+         loader = UnstructuredPowerPointLoader(i)
+         data = loader.load()
+         doc_text_list.append(data)
+
+     # docx extraction
+     for i in docs:
+         loader = UnstructuredWordDocumentLoader(i)
+         data = loader.load()
+         doc_text_list.append(data)
+
+     # # Image extraction (not enabled)
+     # for i in jpg:
+     #     loader = UnstructuredImageLoader(i)
+     #     data = loader.load()
+     #     doc_text_list.append(data)
+
+     doc_text_list = [str(text).strip() for text in doc_text_list if len(str(text).strip()) > 0]
+     return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
+         visible=True), gr.Markdown.update(
+         value="Processing")
+
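+ # Chunk the raw document into ~350-word pieces and upsert them into the
+ # Pinecone "test" index through LangChain's Pinecone vector store.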
+ def pine(data):
+     global docstore  # shared with get_answer() below
+     char_text_spliter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+     # doc_text = char_text_spliter.split_documents(data)
+     doc_spilt = []
+     words = data.split(" ")
+     chunk_size = 350
+     # Slice the word list into fixed-size chunks; the final partial chunk is
+     # kept as well (the previous counter-based loop silently dropped it)
+     for start in range(0, len(words), chunk_size):
+         doc_spilt.append(" ".join(words[start:start + chunk_size]))
+
+     Embedding_model = "text-embedding-ada-002"
+     embeddings = OpenAIEmbeddings(openai_api_key="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")
+
+     # print(requests.post(url=chat_emd))  # debug call, disabled
+     # embeddings = requests.post(url=chat_emd,
+     #                            data=json.dumps(data),
+     #                            headers=headers
+     #                            )
+
+     pinecone.init(api_key="ffb1f594-0915-4ebf-835f-c1eaa62fdcdc",
+                   environment="us-west4-gcp-free"
+                   )
+
+     index_name = "test"
+     docstore = Pinecone.from_texts(doc_spilt, embeddings, index_name=index_name, namespace='a1')
+
+     return ''
+
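+ # Stuff-chain QA over the Pinecone store. Note: this helper is not wired into
+ # the Gradio UI below; the chat path goes through get_response().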
+ def get_answer(query_live):
+     # `openai_api_key` is the correct keyword argument (the original passed
+     # `openai='aaa'`); 'aaa' is still only a placeholder key
+     llm = OpenAI(temperature=0, openai_api_key='aaa')
+     qa_chain = load_qa_chain(llm, chain_type='stuff')
+     query = query_live
+     # `docstore` is the module-level store created by pine(), so a document
+     # must have been submitted before this is called
+     docs = docstore.similarity_search(query)
+     return qa_chain.run(input_documents=docs, question=query)
+
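+ # Gradio layout: upload on the left (file -> extracted text -> Submit), chat
+ # on the right. up_file() fills the textbox, doc_emb() indexes it and reveals
+ # the chat widgets, and get_response() answers each submitted message.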
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             file = gr.File(file_types=['.pdf'], label='Click to upload Document', file_count='multiple')
+             doc_bu = gr.Button(value='Submit', visible=False)
+
+             txt = gr.Textbox(label='result', visible=False)
+
+             doc_text_state = gr.State([])
+             doc_emb_state = gr.State([])
+
+         with gr.Column():
+             md = gr.Markdown("Please Upload the PDF")
+             chat_bot = gr.Chatbot(visible=False)
+             msg_txt = gr.Textbox(visible=False)
+             chat_bu = gr.Button(value='Clear', visible=False)
+
+     file.change(up_file, [file], [txt, doc_bu, md])  # reveal the extracted text
+     doc_bu.click(doc_emb, [txt], [doc_text_state, doc_emb_state, msg_txt, chat_bu, md, chat_bot])
+     msg_txt.submit(get_response, [msg_txt, chat_bot, doc_text_state, doc_emb_state], [chat_bot], queue=False)
+     chat_bu.click(lambda: None, None, chat_bot, queue=False)
+
+ if __name__ == "__main__":
+     demo.queue().launch(show_api=False)
+     # demo.queue().launch(share=False, server_name='172.22.2.54', server_port=9191)
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ pdfplumber
+ sentence_transformers
+ cnocr
+ langchain
+ unstructured
+ pinecone-client
+ openai
+ tiktoken