Spaces:
Running
Running
File size: 5,647 Bytes
900c0a5 73a2cf2 900c0a5 f704336 900c0a5 f704336 73a2cf2 f704336 73a2cf2 f704336 900c0a5 f704336 900c0a5 f704336 900c0a5 f704336 900c0a5 f704336 73a2cf2 900c0a5 f704336 900c0a5 606ffde 900c0a5 606ffde 900c0a5 606ffde 900c0a5 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 f704336 73a2cf2 900c0a5 73a2cf2 f704336 900c0a5 73a2cf2 900c0a5 f704336 73a2cf2 f704336 73a2cf2 900c0a5 73a2cf2 f704336 900c0a5 73a2cf2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
from openai import OpenAI
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
from pathlib import Path
import os
# Endpoint credentials are read from the environment; either value is None
# when its variable is unset.
api_key = os.environ.get("API_KEY")
base_url = os.environ.get("BASE_URL")

# Shared client used for every chat-completion request in this module.
client = OpenAI(base_url=base_url, api_key=api_key)
def extract_pdf_pypdf(pdf_dir):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_dir: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all readable pages, each followed by a blank line, or
        ``None`` if the document could not be opened. Pages that fail to
        load are reported on stdout and skipped.
    """
    try:
        doc = fitz.open(pdf_dir)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None

    page_texts = []
    try:
        for page in range(doc.page_count):
            try:
                page_texts.append(doc.load_page(page).get_text("text") + "\n\n")
            except Exception as e:
                # Best effort: one unreadable page must not lose the others.
                print(f"Error reading page {page}: {e}")
    finally:
        # Release the document handle even if something unexpected fails.
        doc.close()
    return "".join(page_texts)
def openai_api(messages):
    """Send a streamed chat request and return the concatenated reply.

    Args:
        messages: Chat messages as a list of ``role``/``content`` dicts.

    Returns:
        The full response text, or ``None`` if the request or the stream
        failed for any reason (the error is printed to stdout).
    """
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            stream=True,
        )
        parts = []
        for chunk in completion:
            # A stream chunk may carry an empty ``choices`` list; indexing it
            # would raise and throw away everything received so far.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                parts.append(content)
        return "".join(parts)
    except Exception as ex:
        print("API error:", ex)
        return None
def predict(input_text, pdf_file):
    """Run the user's extraction prompt against the text of an uploaded PDF.

    Args:
        input_text: Extraction instructions typed by the user.
        pdf_file: Uploaded file; either a path string or an object exposing
            the path as ``.name``. ``None`` when nothing was uploaded.

    Returns:
        The model's reply, or a user-facing message when no file was
        uploaded, the PDF could not be read, or the API call failed.
    """
    if pdf_file is None:
        return "Please upload a PDF file to proceed."

    # Accept both a plain path and a file-like wrapper with ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    file_content = extract_pdf_pypdf(pdf_path)
    if file_content is None:
        # Concatenating None into the prompt below would raise TypeError.
        return "Could not read the uploaded PDF. Please upload a valid PDF file."

    user_prompt = (
        "Provided Text:\n'''\n{{" + file_content + "}}\n'''\n" + (input_text or "")
    )
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": user_prompt},
    ]
    extract_result = openai_api(messages)
    # openai_api returns None on any failure; an empty reply is treated alike.
    return extract_result or "Too many users. Please wait a moment!"
def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF to render.
        image_folder: Directory that receives the images; created if missing.
        dpi: Rendering resolution passed to ``get_pixmap``.

    Returns:
        List of image paths (as strings) in page order, named
        ``page_<n>.png`` with ``n`` starting at 1.
    """
    # Create the folder that stores the rendered images.
    output_dir = Path(image_folder)
    os.makedirs(output_dir, exist_ok=True)

    pdf_document = fitz.open(pdf_path)
    image_paths = []
    try:
        # Render each page at the requested DPI and save it as PNG.
        for page_number, page in enumerate(pdf_document, start=1):
            pix = page.get_pixmap(dpi=dpi)
            image_path = output_dir / f"page_{page_number}.png"
            Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path)
            image_paths.append(str(image_path))
    finally:
        # Close the document even when rendering or saving a page fails.
        pdf_document.close()
    return image_paths
def display_pdf_images(file):
    """Return the image paths to show in the PDF preview gallery.

    Args:
        file: Path of the uploaded PDF, or ``None``/empty when nothing has
            been uploaded yet.

    Returns:
        List of rendered page image paths; empty when there is no file.
    """
    if not file:
        # "View PDF" clicked before any upload: nothing to render.
        return []
    return convert_pdf_to_images(file)
# Example prompts offered in the UI. They are identical except for the
# requested output format, so both are built from one template.
_PROMPT_TEMPLATE = (
    "Could you please help me extract the information of "
    "'title'/'journal'/'year'/'author'/'institution'/'email' "
    "from the previous content in a {output_format} format?\n"
    "If any of this information was not available in the paper, "
    'please replace it with the string `""`. '
    "If the property contains multiple entities, please use a list to contain.\n"
)
en_1 = _PROMPT_TEMPLATE.format(output_format="markdown table")
en_2 = _PROMPT_TEMPLATE.format(output_format="JSON")

# gr.Examples expects one row (a list) per example.
examples = [[prompt] for prompt in (en_1, en_2)]
# Usage instructions rendered at the top of the page.
_USAGE_HTML = '''<h1 align="center"> Paper Extract GPT </h1>
<p>How to use:
<br><strong>1</strong>: Upload your PDF.
<br><strong>2</strong>: Click "View PDF" to preview it.
<br><strong>3</strong>: Enter your extraction prompt in the input box.
<br><strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
</p>'''

# Placeholder result shown in the output area before the first extraction.
_SAMPLE_OUTPUT = """| Title | Journal | Year | Author | Institution | Email |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
"""


def _clear_fields():
    """Return updates that blank the prompt box and the output area."""
    return [gr.update(value=""), gr.update(value="")]


# NOTE(review): row/column nesting was reconstructed from a source whose
# indentation was lost -- confirm the layout against the deployed app.
with gr.Blocks(title="PaperExtractGPT") as demo:
    gr.Markdown(_USAGE_HTML)
    with gr.Row():
        # Left column: upload and preview of the PDF.
        with gr.Column():
            file_input = gr.File(type="filepath", label="Upload your PDF")
            example = gr.Examples(inputs=file_input, examples=[["./sample.pdf"]])
            viewer_button = gr.Button("View PDF")
            file_out = gr.Gallery(
                label="PDF Viewer",
                columns=1,
                height="auto",
                object_fit="contain",
            )
        # Right column: prompt entry, actions and the extraction result.
        with gr.Column():
            model_input = gr.Textbox(
                label="Input Prompt",
                placeholder="Enter your extraction prompt here",
                lines=7,
            )
            example = gr.Examples(inputs=model_input, examples=examples)
            with gr.Row():
                gen = gr.Button("Generate")
                clr = gr.Button("Clear")
            outputs = gr.Markdown(value=_SAMPLE_OUTPUT, label="Output")

    # Event wiring; registered inside the Blocks context.
    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
    clr.click(fn=_clear_fields, inputs=None, outputs=[model_input, outputs])
    viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)

demo.launch()
|