Spaces:
Sleeping
Sleeping
File size: 5,961 Bytes
73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 606ffde f704336 606ffde 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
import gradio as gr
import base64
import os
from openai import OpenAI
# OpenAI-compatible client configured entirely from environment variables,
# so the same code can target any OpenAI-style endpoint (here it is pointed
# at a claude-3-5-sonnet model via BASE_URL).
api_key = os.getenv('API_KEY')  # secret API key; None if unset
base_url = os.getenv("BASE_URL")  # endpoint of the OpenAI-compatible service
client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)
def extract_pdf_pypdf(pdf_dir):
    """Extract plain text from every page of a PDF.

    Args:
        pdf_dir: Path to the PDF file on disk.

    Returns:
        The text of all pages, each page followed by a blank line
        (matching the original ``text + "\\n\\n"`` accumulation), or
        ``None`` if the file cannot be opened as a PDF.
    """
    import fitz  # PyMuPDF; imported lazily, as in the original

    try:
        doc = fitz.open(pdf_dir)
    except Exception as ex:  # narrowed from a bare `except:`; report the cause
        print("can not read pdf")
        print(ex)
        return None
    try:
        # Join at C speed instead of quadratic `+=` string building.
        pages = [doc.load_page(i).get_text("text") for i in range(doc.page_count)]
    finally:
        doc.close()  # release the file handle even if extraction fails
    return "".join(text + "\n\n" for text in pages)
def openai_api(messages):
    """Send a streaming chat-completion request and return the full text.

    Args:
        messages: OpenAI-style chat messages (list of role/content dicts).

    Returns:
        The assistant's complete response text, or ``None`` if the request
        or the stream consumption fails.
    """
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            # timeout=300,
            stream=True
        )
    except Exception as ex:
        print("api 出现如下异常%s" % ex)
        return None
    # NOTE: the original `if completion:` / `else` branch was unreachable —
    # a stream object is always truthy — so the else arm is removed.
    try:
        # Some providers emit keep-alive chunks with an empty `choices`
        # list; guard against IndexError as well as a None delta content.
        response_2_list = [
            chunk.choices[0].delta.content or ""
            for chunk in completion
            if chunk.choices
        ]
        print("response tokens:", len(response_2_list))
        return ''.join(response_2_list)
    except Exception as ex:
        print("第二轮 出现如下异常%s" % ex)
        return None
def predict(input_text, pdf_file):
    """Extract information from an uploaded PDF according to a user prompt.

    Args:
        input_text: The user's extraction prompt.
        pdf_file: A Gradio file object with a ``.name`` path, or ``None``.

    Returns:
        The model's extraction result, or a user-facing message when the
        file is missing/unreadable or the API is unavailable.
    """
    if pdf_file is None:
        return "Please upload a PDF file to proceed."
    file_content = extract_pdf_pypdf(pdf_file.name)
    # Bug fix: extract_pdf_pypdf returns None for unreadable PDFs; the
    # original concatenated None into the prompt and raised TypeError.
    if file_content is None:
        return "Could not read the uploaded PDF. Please try another file."
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text:
'''
{{""" + file_content + """}}
'''
""" + input_text}
    ]
    extract_result = openai_api(messages)
    return extract_result or "Too many users. Please wait a moment!"
def view_pdf(pdf_file, max_pages=3):
    """Render the first ``max_pages`` pages of a PDF as embedded HTML.

    Args:
        pdf_file: A Gradio file object with a ``.name`` path, or ``None``.
        max_pages: Maximum number of pages included in the preview, to
            keep the base64 payload small for large PDFs.

    Returns:
        An ``<embed>`` HTML snippet with the preview as a base64 data URI,
        or a user-facing error message.
    """
    if pdf_file is None:
        return "Please upload a PDF file to view."
    # Bug fix: `fitz` was only imported inside extract_pdf_pypdf, so this
    # function raised NameError at runtime; import it locally here.
    import fitz
    try:
        doc = fitz.open(pdf_file.name)
        preview_pdf = fitz.open()  # empty PDF to receive the preview pages
        try:
            for page_num in range(min(max_pages, doc.page_count)):
                preview_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
            pdf_data = preview_pdf.tobytes()  # in-memory serialization, no temp file
        finally:
            preview_pdf.close()
            doc.close()
        # Encode as base64 so the PDF can be embedded directly in the HTML.
        b64_data = base64.b64encode(pdf_data).decode('utf-8')
        return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
    except Exception as e:
        print(f"Error displaying PDF: {e}")
        return "Error displaying PDF. Please try re-uploading."
# Built-in example prompts offered under the input box: en_1 asks for a
# markdown-table answer, en_2 for JSON. Both target the same bibliographic
# fields (title/journal/year/author/institution/email).
en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
"""
en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format?
If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
"""
# gr.Examples expects a list of rows, one value per input component.
examples = [[en_1], [en_2]]
# --- Gradio UI -------------------------------------------------------------
# Layout: PDF upload + preview on the left column, prompt input, examples,
# action buttons and the extraction output on the right column.
# NOTE(review): original indentation was mangled by extraction; the widget
# nesting below is reconstructed — confirm against the running Space.
with gr.Blocks(title="PaperExtractGPT") as demo:
    # Usage instructions shown at the top of the page.
    gr.Markdown(
        '''<p align="center">
<h1 align="center"> Paper Extract GPT </h1>
<p> How to use:
<br> <strong>1</strong>: Upload your PDF.
<br> <strong>2</strong>: Click "View PDF" to preview it.
<br> <strong>3</strong>: Enter your extraction prompt in the input box.
<br> <strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
</p>
'''
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown('## Upload PDF')
            file_input = gr.File(label="Upload your PDF", type="filepath")
            viewer_button = gr.Button("View PDF")
            file_out = gr.HTML(label="PDF Preview")  # receives the <embed> preview HTML
        with gr.Column():
            model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
            example = gr.Examples(examples=examples, inputs=model_input)
            with gr.Row():
                gen = gr.Button("Generate")
                clr = gr.Button("Clear")
            # Default value demonstrates the expected markdown-table output.
            outputs = gr.Markdown(label='Output', show_label=True, value="""| Title | Journal | Year | Author | Institution | Email |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
""")
    # Event wiring: Generate runs the extraction, Clear empties both text
    # widgets, View PDF renders the preview.
    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
    clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
    viewer_button.click(view_pdf, inputs=file_input, outputs=file_out)
demo.launch()
|