MLSLPDFMulti

Sleeping

App Files Files Community

MLSLPDFMulti / app.py

psyne

Update app.py

9e50206 over 1 year ago

raw

history blame contribute delete

5.88 kB

	import urllib.request
	import fitz
	import re
	import numpy as np
	import tensorflow_hub as hub
	import openai
	import gradio as gr
	import os
	from sklearn.neighbors import NearestNeighbors


	def download_pdf(url, output_path):
	    """Fetch the resource at ``url`` and store it at ``output_path``.

	    Args:
	        url: Address of the file to download.
	        output_path: Local filesystem path the download is written to.
	    """
	    urllib.request.urlretrieve(url, filename=output_path)


	def preprocess(text):
	    """Collapse every run of whitespace in ``text`` into a single space.

	    Newlines, tabs and repeated spaces all count as whitespace. Leading and
	    trailing whitespace is collapsed to one space each, not stripped.

	    Args:
	        text: Raw text, e.g. as extracted from a PDF page.

	    Returns:
	        The text with each whitespace run replaced by one space.
	    """
	    # Raw string: '\s' in a plain literal is an invalid escape sequence and
	    # triggers a warning on recent Python versions. The pattern already
	    # matches '\n', so no separate newline replacement is needed.
	    return re.sub(r'\s+', ' ', text)


	def pdf_to_text(path, start_page=1, end_page=None):
	    """Extract the preprocessed text of a page range from a PDF.

	    Args:
	        path: Path of the PDF file, opened with ``fitz.open``.
	        start_page: 1-based number of the first page to read.
	        end_page: 1-based number of the last page to read (inclusive);
	            ``None`` means "through the last page of the document".

	    Returns:
	        A list with one whitespace-normalized string per page read, in
	        page order.
	    """
	    doc = fitz.open(path)
	    try:
	        total_pages = doc.page_count
	        if end_page is None:
	            end_page = total_pages

	        # Clamp the requested range to the pages that actually exist so an
	        # oversized end_page or a start_page below 1 cannot index outside
	        # the document.
	        first_index = max(start_page - 1, 0)
	        last_index = min(end_page, total_pages)

	        return [
	            preprocess(doc.load_page(i).get_text("text"))
	            for i in range(first_index, last_index)
	        ]
	    finally:
	        # Release the document even when extraction of a page fails.
	        doc.close()


	def text_to_chunks(texts, word_length=150, start_page=1):
	    """Split page texts into labelled chunks of at most ``word_length`` words.

	    Each page is split on single spaces. A trailing piece shorter than
	    ``word_length`` is not emitted for its own page; it is prepended to the
	    words of the following page instead, unless the page is the last one.
	    Every emitted chunk has the form ``[<page number>] "<words>"``, where
	    the page number is the page's position in ``texts`` plus ``start_page``.

	    Args:
	        texts: One string per page.
	        word_length: Maximum number of words per chunk.
	        start_page: Number assigned to the first entry of ``texts``.

	    Returns:
	        The list of labelled chunk strings.
	    """
	    final_position = len(texts) - 1
	    leftover = []  # short tail carried over from the previous page
	    labelled = []

	    for position, page_text in enumerate(texts):
	        tokens = leftover + page_text.split(' ')
	        leftover = []

	        for begin in range(0, len(tokens), word_length):
	            piece = tokens[begin:begin + word_length]
	            # Only the final slice of a page can be short; push it onto the
	            # next page so chunks stay close to word_length words.
	            if len(piece) < word_length and position != final_position:
	                leftover = piece
	                continue
	            body = ' '.join(piece).strip()
	            labelled.append(f'[{position + start_page}] "{body}"')

	    return labelled


	class SemanticSearch:
	    """Nearest-neighbour search over text embeddings.

	    Texts are embedded with the model loaded from TF Hub in ``__init__`` and
	    indexed with scikit-learn's ``NearestNeighbors``. Call ``fit`` with the
	    corpus first, then call the instance with a query string.
	    """

	    def __init__(self):
	        """Load the embedding model and mark the index as not yet fitted."""
	        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
	        self.fitted = False

	    def fit(self, data, batch=1000, n_neighbors=5):
	        """Embed ``data`` and build the nearest-neighbour index over it.

	        Args:
	            data: Sequence of text chunks forming the corpus.
	            batch: Number of texts embedded per model call.
	            n_neighbors: Number of neighbours returned per query; capped at
	                the corpus size.

	        Raises:
	            ValueError: If ``data`` is empty.
	        """
	        self.data = data
	        self.embeddings = self.get_text_embedding(data, batch=batch)
	        # NearestNeighbors cannot return more neighbours than it has samples.
	        n_neighbors = min(n_neighbors, len(self.embeddings))
	        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
	        self.nn.fit(self.embeddings)
	        self.fitted = True

	    def __call__(self, text, return_data=True):
	        """Return the corpus entries closest to ``text``.

	        Args:
	            text: Query string.
	            return_data: When true, return the matching corpus texts;
	                otherwise return their indices into the fitted data.

	        Raises:
	            RuntimeError: If ``fit`` has not been called yet.
	        """
	        # Fail with a clear message instead of an AttributeError on self.nn.
	        if not self.fitted:
	            raise RuntimeError('SemanticSearch must be fitted before it can be queried.')

	        inp_emb = self.use([text])
	        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

	        if return_data:
	            return [self.data[i] for i in neighbors]
	        return neighbors

	    def get_text_embedding(self, texts, batch=1000):
	        """Embed ``texts`` in batches and stack the results row-wise.

	        Args:
	            texts: Sequence of strings to embed.
	            batch: Number of texts passed to the model per call.

	        Returns:
	            A single array built with ``np.vstack`` from the per-batch
	            embeddings, one row per input text.

	        Raises:
	            ValueError: If ``texts`` is empty.
	        """
	        # np.vstack on an empty list raises an unhelpful ValueError; raise
	        # the same exception type with a message that names the real cause.
	        if len(texts) == 0:
	            raise ValueError('Cannot embed an empty collection of texts.')

	        embeddings = [
	            self.use(texts[i:i + batch])
	            for i in range(0, len(texts), batch)
	        ]
	        return np.vstack(embeddings)


	# Shared module-level search index. Constructing it runs
	# SemanticSearch.__init__, which loads the embedding model via hub.load,
	# so this happens once when the module is executed. load_recommender()
	# fits it and generate_answer() queries it.
	recommender = SemanticSearch()
	pdf_paths = [] # Paths of the PDFs most recently passed to load_recommender()


	def load_recommender(paths, start_page=1):
	    """Index the text of the given PDFs in the shared ``recommender``.

	    The pages of all PDFs are read in the order given, concatenated into a
	    single page list, chunked, and used to fit the module-level
	    ``recommender``. The module-level ``pdf_paths`` is replaced by ``paths``.

	    Args:
	        paths: Paths of the PDF files to index.
	        start_page: First page to read from each PDF; also passed to the
	            chunker as the number of the first page.

	    Returns:
	        The status string ``'Corpus Loaded.'``.
	    """
	    global pdf_paths
	    pdf_paths = paths

	    page_texts = [
	        page
	        for pdf in paths
	        for page in pdf_to_text(pdf, start_page=start_page)
	    ]
	    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
	    return 'Corpus Loaded.'


	def generate_text(prompt, engine="mlsgpt3"):
	    """Request a single completion for ``prompt`` and return its text.

	    Args:
	        prompt: Full prompt sent to the completion endpoint.
	        engine: Name of the engine passed to ``openai.Completion.create``.

	    Returns:
	        The ``text`` of the first choice in the response.
	    """
	    request = {
	        'engine': engine,
	        'prompt': prompt,
	        'max_tokens': 512,
	        'n': 1,
	        'stop': None,
	        'temperature': 0.7,
	    }
	    response = openai.Completion.create(**request)
	    return response.choices[0].text


	def generate_answer(question):
	    """Answer ``question`` from the chunks the recommender finds most relevant.

	    The prompt consists of the retrieved chunks under a ``search results:``
	    heading, a fixed instruction paragraph, and the query itself; it is sent
	    to ``generate_text`` and the completion is returned.

	    Args:
	        question: The user's query string.

	    Returns:
	        The text produced by ``generate_text`` for the assembled prompt.
	    """
	    topn_chunks = recommender(question)

	    search_results = 'search results:\n\n' + ''.join(c + '\n\n' for c in topn_chunks)

	    # The instructions used to end with a literal "Query: {question}\nAnswer: "
	    # that was never formatted, so the raw placeholder reached the model and
	    # the Query/Answer scaffold appeared twice. The query is now added once.
	    instructions = (
	        "Instructions: Compose a comprehensive reply to the query using the search results given. "
	        "Cite each reference using [number] notation (every result has this number at the beginning). "
	        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
	        "with the same name, create separate answers for each. Only include information found in the results and "
	        "don't add any additional information. Make sure the answer is correct and don't output false content. "
	        "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "
	        "search results which have nothing to do with the question. Only answer what is asked. The "
	        "answer should be short and concise.\n\n"
	    )

	    prompt = search_results + instructions + f"Query: {question}\nAnswer:"
	    return generate_text(prompt)


	def question_answer(files, question, secret):
	    """Validate the request, index the uploaded PDFs and answer the question.

	    Checks run in this order, and the first failure is returned as an error
	    string: Azure environment variables, uploaded files, secret, question.
	    Only after all checks pass is the ``openai`` module configured, the
	    files renamed and indexed, and the answer generated.

	    Args:
	        files: Uploaded file objects exposing a ``name`` path attribute;
	            may be ``None`` or empty when nothing was uploaded.
	        question: The user's question.
	        secret: Value that must equal the ``Secret`` environment variable.

	    Returns:
	        The generated answer, or a bracketed error message.
	    """
	    api_key = os.environ.get('AzureKey')
	    url_base = os.environ.get('AzureUrlBase')

	    if api_key is None or url_base is None:
	        return '[ERROR]: Please provide the Azure API Key and URL Base as environment variables.'

	    # `not files` also covers None, which would otherwise crash the loop
	    # below when no file has been uploaded.
	    if not files:
	        return '[ERROR]: Please provide at least one PDF.'

	    if secret != os.environ.get('Secret'):
	        return '[Error]: Please provide the correct secret'

	    # Reject an empty question before the expensive rename/embedding work.
	    if (question or '').strip() == '':
	        return '[ERROR]: Question field is empty.'

	    openai.api_key = api_key
	    openai.api_type = "azure"
	    openai.api_base = url_base
	    openai.api_version = "2022-12-01"

	    loaded_files = []
	    for file in files:
	        old_file_name = file.name
	        # Drop the 8 characters that precede the 4-character extension.
	        # NOTE(review): presumably a random suffix added by the upload
	        # layer to temp-file names -- confirm for the Gradio version in use.
	        file_name = old_file_name[:-12] + old_file_name[-4:]
	        os.rename(old_file_name, file_name)
	        loaded_files.append(file_name)

	    load_recommender(loaded_files)
	    return generate_answer(question)


	# Heading and description text passed to the Gradio interface below.
	title = 'AzurePDFGPT'
	description = "A test platform for indexing PDFs to in order to 'chat' with them. It is hardcoded to the Jaytest and MLSLGPT engine"

	# Build the web UI around question_answer: a multi-file upload restricted to
	# '.pdf', a question textbox and a secret textbox as inputs, and a single
	# answer textbox as output. The interface is launched as soon as this
	# module is executed.
	with gr.Interface(
	fn=question_answer,
	inputs=[
	gr.File(label='PDFs', file_types=['.pdf'], file_count="multiple"),
	gr.Textbox(label='Question'),
	gr.Textbox(label='Secret')
	],
	outputs=gr.Textbox(label='Answer'),
	title=title,
	description=description
	) as iface:
	iface.launch()