import base64
import os
from io import BytesIO

import faiss
import numpy as np
import openai
import pandas as pd
import requests
import streamlit as st
from openai.embeddings_utils import get_embedding, cosine_similarity
from PyPDF2 import PdfReader
# Chat history shared across calls, seeded with the system prompt
messages = [
    {"role": "system", "content": "You are SummarizeGPT, a large language model whose expertise is reading and summarizing scientific papers."}
]
class Chatbot():

    def parse_paper(self, pdf):
        # Parse the PDF into a list of dictionaries, one per paragraph,
        # holding the text, font size, and page number of each paragraph.
        print("Parsing paper")
        number_of_pages = len(pdf.pages)
        print(f"Total number of pages: {number_of_pages}")
        # Accumulates every paragraph in the PDF; returned at the end
        paper_text = []
        for i in range(number_of_pages):
            # pdf.pages is a sequence of Page objects
            page = pdf.pages[i]
            # Text elements extracted from the current page
            page_text = []
            def visitor_body(text, cm, tm, fontDict, fontSize):
                # tm is a 6-element tuple of floats representing a 2x3 text
                # matrix: the first four elements hold the scale and shear
                # factors, and the fifth and sixth hold the horizontal and
                # vertical translation, i.e. the x and y coordinates of the
                # text element on the page.
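                # For example, a hypothetical matrix tm = (1, 0, 0, 1, 72.0, 700.0)
                # would place its text at x=72.0, y=700.0 in PDF user-space points.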
                x = tm[4]
                y = tm[5]
                # Ignore headers, footers, and empty text: the y coordinate
                # filters out the header and footer regions of the page, and
                # the length check drops empty strings.
                if (y > 50 and y < 720) and (len(text.strip()) > 1):
                    page_text.append({
                        # Font size is used later to split paragraphs apart
                        'fontsize': fontSize,
                        # Strip whitespace and the \x03 control character
                        'text': text.strip().replace('\x03', ''),
                        # The coordinates are also used to split paragraphs
                        'x': x,
                        'y': y
                    })
            # Extract the text elements from the page via the visitor callback
            _ = page.extract_text(visitor_text=visitor_body)
            # Tracking variables for grouping text elements into paragraphs
            prev_y = None
            prev_font_size = None
            paragraph = ''
            # Accumulate text into `paragraph`, starting a new paragraph
            # whenever the y coordinate or font size jumps relative to the
            # previous text element.
            for idx, t in enumerate(page_text):
                if prev_y is not None:
                    y_diff = abs(t['y'] - prev_y)
                    font_size_diff = abs(t['fontsize'] - prev_font_size)
                    # A large vertical gap or font-size change marks a
                    # paragraph boundary
                    if y_diff > 10 or font_size_diff > 1:
                        paper_text.append({
                            'fontsize': prev_font_size,
                            'text': paragraph.strip(),
                            'page': i
                        })
                        paragraph = ''
                # Append the text and update the tracking variables
                paragraph += f" {t['text']}"
                prev_y = t['y']
                prev_font_size = t['fontsize']
                # Flush the last paragraph at the end of the page
                if idx == len(page_text) - 1:
                    paper_text.append({
                        'fontsize': prev_font_size,
                        'text': paragraph.strip(),
                        'page': i
                    })
        print("Done parsing paper")
        return paper_text
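    # Illustrative shape of the list parse_paper returns (values hypothetical):
    #   [{'fontsize': 9.96, 'text': 'Abstract. We propose ...', 'page': 0},
    #    {'fontsize': 11.95, 'text': '1 Introduction ...', 'page': 0}, ...]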
    def paper_df(self, pdf):
        print('Creating dataframe')
        # Drop very short fragments (stray headings, page numbers, etc.)
        filtered_pdf = []
        for row in pdf:
            if len(row['text']) < 30:
                continue
            filtered_pdf.append(row)
        df = pd.DataFrame(filtered_pdf)
        print(df.shape)
        print(df.head())
        # Remove rows with identical 'text' and 'page' values
        df = df.drop_duplicates(subset=['text', 'page'], keep='first')
        df['length'] = df['text'].apply(lambda x: len(x))
        print('Done creating dataframe')
        return df
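    # The resulting dataframe has one row per paragraph, with columns
    # 'fontsize', 'text', 'page', and 'length' (the character count of 'text').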
    def calculate_embeddings(self, df):
        print('Calculating embeddings')
        openai.api_key = os.getenv('OPENAI_API_KEY')
        embedding_model = "text-embedding-ada-002"
        # Embed each paragraph of the PDF and stack the vectors into a single
        # matrix. FAISS expects float32, and the matrix is stored globally so
        # search_embeddings can reach it.
        global embeddings
        embeddings = np.vstack(df.text.apply(lambda x: get_embedding(x, engine=embedding_model))).astype('float32')
        return df
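    # text-embedding-ada-002 returns 1536-dimensional vectors, so for a paper
    # with N surviving paragraphs `embeddings` has shape (N, 1536).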
    def search_embeddings(self, df, query, n=3, pprint=True):
        # Step 1. Embed the question being asked of the PDF
        query_embedding = np.array(
            get_embedding(query, engine="text-embedding-ada-002"),
            dtype='float32'
        )
        # Step 2. Create a FAISS index over the paragraph embeddings,
        # using the L2 distance metric
        d = embeddings.shape[1]
        index = faiss.IndexFlatL2(d)
        index.add(embeddings)
        # Step 3. Search the index for the n nearest paragraphs; D holds the
        # distances (smaller means more similar) and I the row indices
        D, I = index.search(query_embedding.reshape(1, d), n)
        # Step 4. Pull the top n rows from the dataframe
        results = df.iloc[I[0]].copy()
        results['similarity'] = D[0]
        results = results.reset_index(drop=True)
        # Record the page number and a text snippet for each result so the
        # answer can cite its sources
        global sources
        sources = []
        for i in range(n):
            sources.append({'Page ' + str(results.iloc[i]['page']): results.iloc[i]['text'][:150] + '...'})
        print(sources)
        return results.head(n)
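    # A minimal alternative sketch that skips FAISS and ranks paragraphs with
    # the cosine_similarity helper imported above. The method name is
    # hypothetical, and it assumes the rows of the global `embeddings` matrix
    # line up with the rows of df (as calculate_embeddings guarantees):
    def search_embeddings_cosine(self, df, query, n=3):
        q = get_embedding(query, engine="text-embedding-ada-002")
        sims = [cosine_similarity(e, q) for e in embeddings]
        # Highest cosine similarity first
        return df.assign(similarity=sims).nlargest(n, 'similarity')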
    def create_prompt(self, df, user_input):
        result = self.search_embeddings(df, user_input, n=3)
        print(result)
        prompt = """You are a large language model whose expertise is reading and providing answers about research papers.
        You are given a query and a series of text excerpts from a paper, retrieved in order of their embedding similarity to the query.
        You must use the given excerpts, as well as what you know from your model weights and knowledge of various fields of research, to provide an answer to the query
        that lines up with what was provided in the text.
        Given the question: """ + user_input + """
        and the following excerpts as data:
        1.""" + str(result.iloc[0]['text']) + """
        2.""" + str(result.iloc[1]['text']) + """
        3.""" + str(result.iloc[2]['text']) + """
        Return a detailed answer based on the paper. If the person asks you to summarize what is in the paper, do your best to provide a summary of the paper."""
        print('Done creating prompt')
        return prompt
    def gpt(self, prompt):
        openai.api_key = os.getenv('OPENAI_API_KEY')
        print('got API key')
        messages.append({"role": "user", "content": prompt})
        r = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
        answer = r['choices'][0]['message']['content']
        # `sources` was set as a global by search_embeddings
        response = {'answer': answer, 'sources': sources}
        return response
    def reply(self, prompt):
        print(prompt)
        # `df` is the global dataframe built by process_pdf/download_pdf
        prompt = self.create_prompt(df, prompt)
        return self.gpt(prompt)
def process_pdf(file):
    print("Processing pdf")
    pdf = PdfReader(BytesIO(file))
    chatbot = Chatbot()
    paper_text = chatbot.parse_paper(pdf)
    global df
    df = chatbot.paper_df(paper_text)
    df = chatbot.calculate_embeddings(df)
    print("Done processing pdf")
def download_pdf(url):
    chatbot = Chatbot()
    r = requests.get(str(url))
    print(r.headers)
    pdf = PdfReader(BytesIO(r.content))
    paper_text = chatbot.parse_paper(pdf)
    global df
    df = chatbot.paper_df(paper_text)
    df = chatbot.calculate_embeddings(df)
    print("Done processing pdf")
def show_pdf(file_content):
    # Embed the PDF in the page as a base64-encoded iframe
    base64_pdf = base64.b64encode(file_content).decode('utf-8')
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf"></iframe>'
    st.markdown(pdf_display, unsafe_allow_html=True)
def main():
    st.title("Research Paper Guru")
    st.subheader("Mike Ion - https://github.com/mikeion")
    st.subheader("Ask a question about a research paper and get an answer with sources!")
    st.subheader("Upload PDF or Enter URL")
    pdf_option = st.selectbox("Choose an option:", ["Upload PDF", "Enter URL"])
    chatbot = Chatbot()
    if pdf_option == "Upload PDF":
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
        if uploaded_file is not None:
            file_content = uploaded_file.read()
            process_pdf(file_content)
            st.success("PDF uploaded and processed successfully!")
            show_pdf(file_content)
    elif pdf_option == "Enter URL":
        url = st.text_input("Enter the URL of the PDF:")
        if url:
            if st.button("Download and process PDF"):
                try:
                    # Download once, then reuse the bytes for processing
                    # and display
                    r = requests.get(str(url))
                    content = r.content
                    process_pdf(content)
                    st.success("PDF downloaded and processed successfully!")
                    show_pdf(content)
                except Exception as e:
                    st.error(f"An error occurred while processing the PDF: {e}")
    query = st.text_input("Enter your query:")
    if query:
        if st.button("Get answer"):
            response = chatbot.reply(query)
            st.write(response['answer'])
            st.write("Sources:")
            for source in response['sources']:
                st.write(source)

if __name__ == "__main__":
    main()
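# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py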