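# PaperExtractGPT: a Gradio app that extracts structured metadata (title, journal,
# year, author, institution, email) from an uploaded scientific-paper PDF by sending
# its text to an OpenAI-compatible chat API.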
import gradio as gr
import base64
import os

import fitz  # PyMuPDF; imported at module level because view_pdf() also uses it
from openai import OpenAI

api_key = os.getenv('API_KEY')
base_url = os.getenv("BASE_URL")

client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)
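

# --- PDF text extraction ---
# Open the uploaded PDF with PyMuPDF and concatenate the plain text of every page;
# returns None if the file cannot be opened.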
def extract_pdf_pypdf(pdf_dir):
    path = pdf_dir
    try:
        doc = fitz.open(path)
    except Exception as ex:
        print("cannot read pdf: %s" % ex)
        return None
    page_count = doc.page_count
    file_content = ""
    for page in range(page_count):
        text = doc.load_page(page).get_text("text")
        # Guard against the table of contents containing "References"
        file_content += text + "\n\n"
    return file_content
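

# --- LLM call ---
# Send the chat messages to the configured model, stream the completion, and join the
# streamed chunks into one string; returns None on any failure so the caller can show
# a fallback message.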
def openai_api(messages):
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            # timeout=300,
            stream=True
        )
    except Exception as ex:
        print("API raised the following exception: %s" % ex)
        return None

    if completion:
        try:
            response_2_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else ""
                               for chunk in completion]
            # Number of streamed chunks (roughly one per token)
            print("response tokens:", len(response_2_list))
            response_2_content = ''.join(response_2_list)
            return response_2_content
        except Exception as ex:
            print("Collecting the streamed response raised the following exception: %s" % ex)
            return None
    else:
        print("No completion returned.")
        return None
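

# --- Extraction endpoint ---
# Build the chat prompt (system role, then the full PDF text wrapped in the user
# message, followed by the user's extraction instruction) and return the model's answer.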
def predict(input_text, pdf_file):
    if pdf_file is None:
        return "Please upload a PDF file to proceed."

    file_content = extract_pdf_pypdf(pdf_file.name)
    if file_content is None:
        return "Could not read the uploaded PDF. Please try re-uploading."

    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text:
'''
{{""" + file_content + """}}
'''
""" + input_text}
    ]
    extract_result = openai_api(messages)
    return extract_result or "Too many users. Please wait a moment!"
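

# --- PDF preview ---
# Copy the first `max_pages` pages into a new in-memory PDF, base64-encode it, and
# return an <embed> tag so the preview renders directly in the browser.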
def view_pdf(pdf_file, max_pages=3):
    if pdf_file is None:
        return "Please upload a PDF file to view."
    try:
        # Open the PDF file
        doc = fitz.open(pdf_file.name)

        # Only read up to `max_pages` pages to reduce size for large PDFs
        preview_pdf = fitz.open()  # Create an empty PDF for the preview
        for page_num in range(min(max_pages, doc.page_count)):
            preview_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)

        # Save the preview as a temporary in-memory file
        pdf_data = preview_pdf.tobytes()

        # Encode as base64 for embedding in HTML
        b64_data = base64.b64encode(pdf_data).decode('utf-8')
        return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
    except Exception as e:
        print(f"Error displaying PDF: {e}")
        return "Error displaying PDF. Please try re-uploading."
en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
If any of this information is not available in the paper, please replace it with the string `""`. If a property contains multiple entities, please use a list to contain them.
"""

en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format?
If any of this information is not available in the paper, please replace it with the string `""`. If a property contains multiple entities, please use a list to contain them.
"""

examples = [[en_1], [en_2]]
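

# --- Gradio UI ---
# Left column: PDF upload and preview; right column: prompt input with the two example
# prompts above plus Generate/Clear buttons; the extracted result is shown as Markdown below.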
with gr.Blocks(title="PaperExtractGPT") as demo:
    gr.Markdown(
        '''<p align="center">
<h1 align="center"> Paper Extract GPT </h1>
<p> How to use:
<br> <strong>1</strong>: Upload your PDF.
<br> <strong>2</strong>: Click "View PDF" to preview it.
<br> <strong>3</strong>: Enter your extraction prompt in the input box.
<br> <strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
</p>
'''
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown('## Upload PDF')
            file_input = gr.File(label="Upload your PDF", type="filepath")
            viewer_button = gr.Button("View PDF")
            file_out = gr.HTML(label="PDF Preview")
        with gr.Column():
            model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
            example = gr.Examples(examples=examples, inputs=model_input)
            with gr.Row():
                gen = gr.Button("Generate")
                clr = gr.Button("Clear")
    outputs = gr.Markdown(label='Output', show_label=True, value="""| Title | Journal | Year | Author | Institution | Email |
|-------|---------|------|--------|-------------|-------|
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School of Mines, Dhanbad | "" |
""")
    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
    clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
    viewer_button.click(view_pdf, inputs=file_input, outputs=file_out)

demo.launch()