"""Gradio app that indexes uploaded PDFs and answers questions about them.

Pages are extracted with PyMuPDF, split into word chunks, embedded with the
Universal Sentence Encoder and indexed with a nearest-neighbour search. The
chunks closest to the question are placed in a prompt that is sent to an
Azure-hosted OpenAI completion engine.

Environment variables read by this module:
    AzureKey      API key assigned to ``openai.api_key``.
    AzureUrlBase  Endpoint assigned to ``openai.api_base``.
    Engine        Engine name passed to ``openai.Completion.create``.
    Secret        Value the user must type into the "Secret" box.
"""
import os
import re
import urllib.request

import fitz
import gradio as gr
import numpy as np
import openai
import tensorflow_hub as hub
from sklearn.neighbors import NearestNeighbors


def download_pdf(url, output_path):
    """Download the resource at *url* and store it at *output_path*."""
    urllib.request.urlretrieve(url, output_path)


def preprocess(text):
    """Return *text* with every whitespace run collapsed to a single space.

    Newlines count as whitespace, so they are replaced as well.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text)


def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the preprocessed text of a page range from a PDF.

    Args:
        path: Location of the PDF file.
        start_page: First page to read, 1-based.
        end_page: Last page to read, 1-based and inclusive. ``None`` means
            the last page of the document.

    Returns:
        A list with one preprocessed string per page read.
    """
    doc = fitz.open(path)
    try:
        if end_page is None:
            end_page = doc.page_count
        return [
            preprocess(doc.load_page(page_index).get_text("text"))
            for page_index in range(start_page - 1, end_page)
        ]
    finally:
        # Release the document even when a page fails to load.
        doc.close()


def text_to_chunks(texts, word_length=150, start_page=1):
    """Split page texts into labelled chunks of at most *word_length* words.

    Each chunk is rendered as ``[page] "words ..."`` where ``page`` is the
    index of the entry in *texts* plus *start_page*. A trailing partial chunk
    is not emitted on its own page; it is prepended to the next page so that
    chunks stay close to *word_length* words. The final page keeps its
    partial chunk.

    Args:
        texts: One string per page.
        word_length: Maximum number of space-separated words per chunk.
        start_page: Number used to label the first entry of *texts*.

    Returns:
        The list of formatted chunk strings.
    """
    text_toks = [t.split(' ') for t in texts]
    last_idx = len(text_toks) - 1
    chunks = []

    # NOTE: text_toks[idx + 1] is rewritten while iterating; the list
    # iterator picks the updated entry up when it reaches that index.
    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i + word_length]
            if len(chunk) < word_length and idx != last_idx:
                # Short tail of a non-final page: carry it over.
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            body = ' '.join(chunk).strip()
            chunks.append(f'[{idx + start_page}]' + ' ' + '"' + body + '"')
    return chunks


class SemanticSearch:
    """Nearest-neighbour search over sentence-encoder embeddings."""

    def __init__(self):
        """Load the Universal Sentence Encoder model from TF Hub."""
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed *data* and build the neighbour index.

        Args:
            data: Sequence of text chunks to index.
            batch: Number of chunks embedded per encoder call.
            n_neighbors: Neighbours returned per query; capped at the number
                of embedded chunks.
        """
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the indexed chunks nearest to *text*.

        Args:
            text: Query string.
            return_data: When true return the chunk strings, otherwise the
                neighbour indices.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if not self.fitted:
            raise RuntimeError('SemanticSearch must be fitted before it is queried.')
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in slices of *batch* items and stack the results."""
        embeddings = [
            self.use(texts[i:i + batch])
            for i in range(0, len(texts), batch)
        ]
        return np.vstack(embeddings)


recommender = SemanticSearch()
pdf_paths = []  # List to store multiple PDF paths


def load_recommender(paths, start_page=1):
    """Index every PDF in *paths* with the module-level recommender.

    The pages of all files are concatenated before chunking, so the page
    labels in the chunks count through the whole set of files rather than
    restarting for each PDF.

    Args:
        paths: PDF file paths to index.
        start_page: First page to read from each PDF, 1-based.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    global pdf_paths
    pdf_paths = paths
    texts = []
    for path in paths:
        texts.extend(pdf_to_text(path, start_page=start_page))
    chunks = text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
    return 'Corpus Loaded.'


def generate_text(prompt):
    """Send *prompt* to the configured completion engine and return its text.

    Raises:
        KeyError: If the ``Engine`` environment variable is not set.
    """
    completions = openai.Completion.create(
        # os.environ is a mapping, not a callable: index it.
        engine=os.environ['Engine'],
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return completions.choices[0].text


def generate_answer(question):
    """Build a prompt from the chunks nearest to *question* and answer it."""
    topn_chunks = recommender(question)

    prompt = 'search results:\n\n'
    prompt += ''.join(c + '\n\n' for c in topn_chunks)
    prompt += (
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [number] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "
        "search results which have nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise.\n\n"
    )
    # The query is appended exactly once, with the question interpolated.
    prompt += f"Query: {question}\nAnswer:"

    return generate_text(prompt)


def question_answer(files, question, secret):
    """Gradio callback: index the uploaded PDFs and answer *question*.

    Args:
        files: Uploaded file objects from the ``gr.File`` input; each one
            exposes its path as ``.name``. May be empty or ``None``.
        question: Text typed into the "Question" box.
        secret: Text typed into the "Secret" box; must equal the ``Secret``
            environment variable.

    Returns:
        The generated answer, or a bracketed error message when the
        configuration or the inputs are invalid.
    """
    api_key = os.environ.get('AzureKey')
    url_base = os.environ.get('AzureUrlBase')
    if api_key is None or url_base is None:
        return '[ERROR]: Please provide the Azure API Key and URL Base as environment variables.'

    openai.api_key = api_key
    openai.api_type = "azure"
    openai.api_base = url_base
    openai.api_version = "2022-12-01"

    # `not files` also covers None, which would otherwise fail in the loop.
    if not files:
        return '[ERROR]: Please provide at least one PDF.'
    if secret != os.environ.get('Secret'):
        return '[Error]: Please provide the correct secret'
    # Validate the question before the costly rename-and-index step.
    if question.strip() == '':
        return '[ERROR]: Question field is empty.'

    loaded_files = []
    for file in files:
        old_file_name = file.name
        # Drops the 8 characters that precede the 4-character extension;
        # presumably the random suffix added to uploaded temp files -- confirm.
        file_name = old_file_name[:-12] + old_file_name[-4:]
        os.rename(old_file_name, file_name)
        loaded_files.append(file_name)
    load_recommender(loaded_files)

    return generate_answer(question)


title = 'AzurePDFGPT'
description = "A test platform for indexing PDFs to in order to 'chat' with them. It is hardcoded to the Jaytest and MLSLGPT engine"

with gr.Interface(
    fn=question_answer,
    inputs=[
        gr.File(label='PDFs', file_types=['.pdf'], file_count="multiple"),
        gr.Textbox(label='Question'),
        gr.Textbox(label='Secret')
    ],
    outputs=gr.Textbox(label='Answer'),
    title=title,
    description=description
) as iface:
    iface.launch()