pritish committed on
Commit
c7de76c
·
1 Parent(s): b6f2cec

Initial Commit

Browse files
Files changed (2) hide show
  1. app.py +190 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.request
2
+ import fitz
3
+ import re
4
+ import numpy as np
5
+ import tensorflow_hub as hub
6
+ import openai
7
+ import gradio as gr
8
+ import os
9
+ from tqdm.auto import tqdm
10
+ from sklearn.neighbors import NearestNeighbors
11
+
12
+
13
def download_pdf(url, output_path):
    """Download the document at *url* and save it to *output_path*.

    Args:
        url: Address of the document. Only ``http`` and ``https`` URLs are
            accepted; surrounding whitespace is ignored.
        output_path: Local file path the downloaded bytes are written to.

    Raises:
        ValueError: If *url* does not use the ``http`` or ``https`` scheme.
    """
    # Local import keeps this block independent of the file-level imports.
    from urllib.parse import urlparse

    url = url.strip()
    # The URL is typed by the user in the web form; without this check
    # urlretrieve would also accept schemes such as file://.
    scheme = urlparse(url).scheme.lower()
    if scheme not in ('http', 'https'):
        raise ValueError(
            f'Unsupported URL scheme {scheme!r}; expected http or https.'
        )
    urllib.request.urlretrieve(url, output_path)
15
+
16
+
17
def preprocess(text):
    """Collapse every run of whitespace in *text* into a single space.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The text with each run of whitespace (spaces, tabs, newlines, ...)
        replaced by exactly one space. Leading and trailing whitespace is
        collapsed to a single space as well, not removed.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    # Newlines are whitespace too, so this single substitution also covers
    # what a separate '\n' -> ' ' replacement would do.
    return re.sub(r'\s+', ' ', text)
21
+
22
+
23
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the whitespace-normalised text of each page of a PDF.

    Args:
        path: Path of the PDF file, opened with ``fitz.open``.
        start_page: 1-based number of the first page to read. Values below
            1 are treated as 1.
        end_page: 1-based number of the last page to read (inclusive).
            ``None``, or a value past the end of the document, reads
            through the final page.

    Returns:
        A list with one cleaned text string per page read, in page order.
    """
    doc = fitz.open(path)
    try:
        total_pages = doc.page_count
        # Clamp the requested range to the document so that neither a
        # negative page index nor an index past the last page is requested.
        first = max(start_page, 1)
        last = total_pages if end_page is None else min(end_page, total_pages)
        return [
            preprocess(doc.load_page(i).get_text("text"))
            for i in tqdm(range(first - 1, last))
        ]
    finally:
        # Release the document even when a page fails to load or parse.
        doc.close()
39
+
40
+
41
def text_to_chunks(texts, word_length=100, start_page=1):
    """Split per-page texts into quoted, page-tagged chunks of words.

    Each page's text is split on single spaces and cut into consecutive
    pieces of ``word_length`` words. A short trailing piece is not emitted
    on its own page; it is prepended to the words of the following page.
    Only on the last page is a short piece emitted as-is.

    Args:
        texts: List of page texts, one string per page.
        word_length: Maximum number of words per chunk.
        start_page: Page number to label the first entry of *texts* with.

    Returns:
        A list of strings of the form ``[<page>] "<chunk text>"``, where
        ``<page>`` is the page on which the chunk was emitted.
    """
    pages = [t.split(' ') for t in texts]
    last_index = len(pages) - 1
    chunks = []
    carry = []  # short tail of the previous page, waiting to be merged

    for idx, page_words in enumerate(pages):
        words = carry + page_words
        carry = []
        for i in range(0, len(words), word_length):
            piece = words[i:i + word_length]
            if len(piece) < word_length and idx != last_index:
                # A short piece can only be the final one of the page;
                # hand it over to the next page instead of emitting it.
                carry = piece
                break
            body = ' '.join(piece).strip()
            chunks.append(f'[{idx + start_page}] "{body}"')
    return chunks
57
+
58
+
59
class SemanticSearch:
    """Nearest-neighbour lookup over sentence-encoder embeddings.

    The encoder is loaded from TensorFlow Hub on construction. ``fit``
    embeds a corpus and builds a ``NearestNeighbors`` index over it;
    calling the instance with a query returns the closest corpus entries.
    """

    def __init__(self):
        """Load the encoder and start in the unfitted state."""
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False
        # Populated by fit(); declared here so the attributes always exist.
        self.data = None
        self.embeddings = None
        self.nn = None

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed *data* and index it for nearest-neighbour queries.

        Args:
            data: Sequence of text entries forming the corpus.
            batch: Number of entries embedded per encoder call.
            n_neighbors: Number of neighbours a query returns; capped at
                the corpus size.

        Raises:
            ValueError: If *data* is empty.
        """
        if len(data) == 0:
            raise ValueError('Cannot fit SemanticSearch on an empty corpus.')

        # Build everything in locals first so that a failure part-way
        # through does not leave a half-updated, inconsistent index behind.
        embeddings = self.get_text_embedding(data, batch=batch)
        index = NearestNeighbors(n_neighbors=min(n_neighbors, len(embeddings)))
        index.fit(embeddings)

        self.data = data
        self.embeddings = embeddings
        self.nn = index
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the corpus entries nearest to *text*.

        Args:
            text: Query string.
            return_data: When true, return the matching corpus entries;
                otherwise return their indices into the corpus.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if not self.fitted:
            raise RuntimeError('SemanticSearch must be fitted before it is queried.')

        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in slices of *batch* entries and stack the results.

        Args:
            texts: Sequence of strings to embed.
            batch: Number of entries passed to the encoder per call.

        Returns:
            The per-batch encoder outputs stacked row-wise with ``np.vstack``.
        """
        embeddings = [
            self.use(texts[i:i + batch])
            for i in tqdm(range(0, len(texts), batch))
        ]
        return np.vstack(embeddings)
93
+
94
+
95
# The OpenAI key is read from the environment so that no secret lives in the
# source tree. Any key that was ever committed to version control must be
# treated as compromised and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
recommender = SemanticSearch()
97
+
98
def load_recommender(path, start_page=1):
    """Index the PDF at *path* in the module-level ``recommender``.

    The PDF is read from *start_page* onward, split into page-tagged
    chunks, and the chunks are fitted into the shared recommender.

    Args:
        path: Path of the PDF file to index.
        start_page: 1-based page to start reading from; also used as the
            page label of the first chunked page.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    page_texts = pdf_to_text(path, start_page=start_page)
    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'
104
+
105
+
106
def generate_text(prompt, engine="text-davinci-003"):
    """Request one completion for *prompt* and return its text.

    Args:
        prompt: Full prompt sent to ``openai.Completion.create``.
        engine: Name of the completion engine to use.

    Returns:
        The ``text`` of the first choice in the response.
    """
    request = dict(
        engine=engine,
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    response = openai.Completion.create(**request)
    return response.choices[0].text
117
+
118
+
119
def generate_answer(question):
    """Answer *question* from the indexed corpus.

    The chunks nearest to the question are fetched from the module-level
    ``recommender``, placed in a prompt together with answering
    instructions and the question, and the prompt is passed to
    ``generate_text``.

    Args:
        question: The user's query.

    Returns:
        The text returned by ``generate_text`` for the assembled prompt.
    """
    topn_chunks = recommender(question)

    # Joined with explicit spaces: adjacent string literals concatenate with
    # no separator, which would glue sentences together ("given.Cite ...").
    instructions = ' '.join((
        "Instructions: Compose a comprehensive reply to the query using the search results given.",
        "Cite each reference using [number] notation (every result has this number at the beginning).",
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects",
        "with the same name, create separate answers for each. Only include information found in the results and",
        "don't add any additional information. Make sure the answer is correct and don't output false content.",
        "If the text does not relate to the query, simply state 'Found Nothing'. Don't write 'Answer:'",
        "Directly start the answer.",
    ))

    sections = ['search results:\n\n']
    sections.extend(c + '\n\n' for c in topn_chunks)
    sections.append(instructions + '\n')
    sections.append(f"Query: {question}\n\n")

    return generate_text(''.join(sections))
137
+
138
+
139
def load_corpus(url, file):
    """Load a PDF, given either by URL or as an upload, into the recommender.

    Exactly one of *url* and *file* must be provided.

    Args:
        url: Address of a PDF to download; ``None`` or a blank string
            means "not provided".
        file: Uploaded file object exposing a ``name`` path attribute, or
            ``None`` when nothing was uploaded.

    Returns:
        A status message: an ``[ERROR]`` string when neither or both
        inputs are given, otherwise a confirmation that the corpus is
        loaded.
    """
    url = (url or '').strip()
    has_url = url != ''
    has_file = file is not None

    if not has_url and not has_file:
        return '[ERROR]: Both URL and PDF is empty. Provide atleast one.'

    if has_url and has_file:
        return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'

    if has_url:
        download_pdf(url, 'corpus.pdf')
        load_recommender('corpus.pdf')
    else:
        # Drops the eight characters that precede the four-character
        # extension of the uploaded path -- presumably a random suffix
        # added by the upload widget; TODO confirm against the Gradio
        # version in use.
        old_file_name = file.name
        file_name = old_file_name[:-12] + old_file_name[-4:]
        os.rename(old_file_name, file_name)
        load_recommender(file_name)

    return 'Corpus Loaded. Now you can ask Questions.'
159
+
160
+
161
def question_answer(question):
    """Validate *question* and answer it from the loaded corpus.

    Args:
        question: Text entered by the user.

    Returns:
        An ``[ERROR]`` message when the question is blank or no corpus has
        been fitted yet; otherwise the result of ``generate_answer``.
    """
    if not question.strip():
        return '[ERROR]: Question field is empty'

    if recommender.fitted:
        return generate_answer(question)

    return '[ERROR]: First, provide a URL or Upload a PDF and hit submit (see left panel)'
169
+
170
+
171
# Gradio UI: one row holding two groups -- corpus loading and question
# answering.
with gr.Blocks() as app:
    with gr.Row():

        # Corpus panel: supply a PDF by URL or upload, then submit to index it.
        with gr.Group():
            url = gr.Textbox(label='URL')
            # Closing tag fixed: the heading was previously left unclosed.
            gr.Markdown("<center><h5>or</h5></center>")
            file = gr.File(label='PDF', file_types=['.pdf'])
            stataus = gr.Textbox(label="Output")
            btn1 = gr.Button(value='Submit')
            btn1.style(full_width=True)
            btn1.click(load_corpus, inputs=[url, file], outputs=[stataus])

        # Question panel: ask about the loaded corpus and show the answer.
        with gr.Group():
            question = gr.Textbox(label='question')
            btn2 = gr.Button(value='Submit')
            btn2.style(full_width=True)
            answer = gr.Textbox(label='answer')
            btn2.click(question_answer, inputs=[question], outputs=[answer])

app.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ PyMuPDF
2
+ openai
3
+ tensorflow-hub==0.12.0