psyne committed on
Commit
657434d
0 Parent(s):

Duplicate from psyne/MLSLAzurePDFGPTMulti

Browse files
Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +188 -0
  4. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MLSLAzurePDFGPT
3
+ emoji: 🏆
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 3.32.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: unlicense
11
+ duplicated_from: psyne/MLSLAzurePDFGPTMulti
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.request
2
+ import fitz
3
+ import re
4
+ import numpy as np
5
+ import tensorflow_hub as hub
6
+ import openai
7
+ import gradio as gr
8
+ import os
9
+ from sklearn.neighbors import NearestNeighbors
10
+
11
+
12
def download_pdf(url, output_path):
    """Download the file at *url* (expected to be a PDF) to *output_path*."""
    fetch = urllib.request.urlretrieve
    fetch(url, output_path)
14
+
15
+
16
def preprocess(text):
    """Collapse every run of whitespace in *text* into a single space.

    Replaces the original two-step replace()+sub(): ``\\s`` already matches
    ``\\n``, so one regex pass produces identical output. The raw string
    fixes the invalid-escape-sequence warning raised by ``'\\s+'`` on
    modern Python.
    """
    return re.sub(r'\s+', ' ', text)
20
+
21
+
22
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract cleaned text from the pages of the PDF at *path*.

    Pages are 1-indexed; *end_page* defaults to the document's final page.
    Returns a list with one preprocessed string per page.
    """
    doc = fitz.open(path)
    last_page = doc.page_count if end_page is None else end_page

    pages = [
        preprocess(doc.load_page(page_idx).get_text("text"))
        for page_idx in range(start_page - 1, last_page)
    ]

    doc.close()
    return pages
38
+
39
+
40
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into ~*word_length*-word chunks tagged with page numbers.

    texts: list of page strings (one entry per page).
    word_length: target number of words per chunk.
    start_page: page number of texts[0], used for the leading ``[n]`` citation tag.

    A trailing fragment shorter than *word_length* is prepended to the next
    page's words instead of being emitted as a tiny chunk (except on the
    final page). Returns strings shaped like ``[3] "chunk text"``.

    Removed the unused ``page_nums`` local from the original.
    """
    text_toks = [t.split(' ') for t in texts]
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i + word_length]
            # Carry a short tail over to the next page rather than emitting it.
            if (i + word_length) > len(words) and (len(chunk) < word_length) and (
                    len(text_toks) != (idx + 1)):
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[{idx + start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks
56
+
57
+
58
class SemanticSearch:
    """Nearest-neighbour retrieval over text chunks embedded with the
    Universal Sentence Encoder loaded from TF-Hub."""

    def __init__(self):
        # The encoder is downloaded/loaded once at construction time.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed *data* (list of strings) and build the neighbour index."""
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        # Never ask for more neighbours than there are embedded chunks.
        k = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=k)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the stored chunks (or their indices) closest to *text*."""
        query_emb = self.use([text])
        hits = self.nn.kneighbors(query_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in hits]
        return hits

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in batches of *batch* and stack into one array."""
        pieces = [
            self.use(texts[start:start + batch])
            for start in range(0, len(texts), batch)
        ]
        return np.vstack(pieces)
89
+
90
+
91
# Module-level singleton index, populated by load_recommender() and queried
# by generate_answer(). Instantiating it loads the TF-Hub encoder.
recommender = SemanticSearch()
pdf_paths = []  # List to store multiple PDF paths
93
+
94
+
95
def load_recommender(paths, start_page=1):
    """Extract text from every PDF in *paths* and (re)fit the global index.

    Side effects: overwrites the module-level ``pdf_paths`` and refits
    ``recommender``. Returns a status string for the UI.
    """
    global recommender, pdf_paths
    pdf_paths = paths

    page_texts = []
    for pdf_path in paths:
        page_texts.extend(pdf_to_text(pdf_path, start_page=start_page))

    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'
104
+
105
+
106
def generate_text(prompt, engine="mlsgpt3"):
    """Send *prompt* to the (Azure) OpenAI completion endpoint named *engine*
    and return the first completion's text."""
    response = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return response.choices[0].text
117
+
118
+
119
def generate_answer(question):
    """Answer *question* using the globally indexed PDF corpus.

    Retrieves the most relevant chunks from the module-level ``recommender``,
    embeds them in a citation-style instruction prompt, and returns the LLM
    completion text.
    """
    topn_chunks = recommender(question)

    prompt = 'search results:\n\n'
    for c in topn_chunks:
        prompt += c + '\n\n'

    # BUG FIX: the original instruction string ended with a literal
    # "Query: {question}\nAnswer: " (a plain string, not an f-string), so the
    # prompt contained an unfilled "{question}" placeholder followed by a
    # second, real query line. The stray tail is removed here; the real
    # query is appended once, below.
    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. " \
              "Cite each reference using [number] notation (every result has this number at the beginning). " \
              "Citation should be done at the end of each sentence. If the search results mention multiple subjects " \
              "with the same name, create separate answers for each. Only include information found in the results and " \
              "don't add any additional information. Make sure the answer is correct and don't output false content. " \
              "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier " \
              "search results which have nothing to do with the question. Only answer what is asked. The " \
              "answer should be short and concise.\n\n"

    prompt += f"Query: {question}\nAnswer:"
    answer = generate_text(prompt)
    return answer
138
+
139
+
140
def question_answer(files, question, secret):
    """Gradio handler: validate inputs, index the uploaded PDFs, answer *question*.

    files: list of gradio temp-file objects (may be None or [] when nothing
        was uploaded).
    question: the user's query string.
    secret: shared secret that must match the ``Secret`` environment variable.
    Returns the answer text, or an '[ERROR]' string describing the problem.
    """
    api_key = os.environ.get('AzureKey')
    url_base = os.environ.get('AzureUrlBase')

    if api_key is None or url_base is None:
        return '[ERROR]: Please provide the Azure API Key and URL Base as environment variables.'

    openai.api_key = api_key
    openai.api_type = "azure"
    openai.api_base = url_base
    openai.api_version = "2022-12-01"

    # `not files` also covers None, which the file widget can deliver when
    # nothing is uploaded — the original `files == []` check missed that and
    # would crash in the rename loop below.
    if not files:
        return '[ERROR]: Please provide at least one PDF.'

    if secret != os.environ.get('Secret'):
        return '[Error]: Please provide the correct secret'

    loaded_files = []
    for file in files:
        old_file_name = file.name
        # Strips 12 characters before the 4-char extension — presumably the
        # random suffix gradio inserts into temp-file names. TODO confirm
        # against the pinned gradio version's naming scheme.
        file_name = old_file_name[:-12] + old_file_name[-4:]
        os.rename(old_file_name, file_name)
        loaded_files.append(file_name)
    load_recommender(loaded_files)

    if question.strip() == '':
        return '[ERROR]: Question field is empty.'

    return generate_answer(question)
172
+
173
+
174
title = 'AzurePDFGPT'
# Fixed garbled wording in the user-visible description
# ("indexing PDFs to in order to" -> "indexing PDFs in order to").
description = "A test platform for indexing PDFs in order to 'chat' with them. It is hardcoded to the Jaytest and MLSLGPT engine"

# NOTE(review): Interface is used here as a context manager; that works when
# Interface derives from Blocks (gradio 3.x) — confirm against the pinned
# sdk_version before upgrading gradio.
with gr.Interface(
    fn=question_answer,
    inputs=[
        gr.File(label='PDFs', file_types=['.pdf'], file_count="multiple"),
        gr.Textbox(label='Question'),
        gr.Textbox(label='Secret')
    ],
    outputs=gr.Textbox(label='Answer'),
    title=title,
    description=description
) as iface:
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ PyMuPDF
2
+ openai
3
+ tensorflow==2.9.2
4
+ tensorflow-hub==0.12.0
5
+ scikit-learn==1.0.2