from openai import OpenAI
import gradio as gr
import fitz  # PyMuPDF
from PIL import Image
from pathlib import Path
import os

api_key = os.getenv('API_KEY')
base_url = os.getenv("BASE_URL")

client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)


def extract_pdf_pypdf(pdf_dir):
    """Extract plain text from every page of the PDF using PyMuPDF."""
    try:
        doc = fitz.open(pdf_dir)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None

    page_count = doc.page_count
    file_content = ""
    for page in range(page_count):
        try:
            text = doc.load_page(page).get_text("text")
            file_content += text + "\n\n"
        except Exception as e:
            print(f"Error reading page {page}: {e}")
            continue

    return file_content


def openai_api(messages):
    """Send the chat messages to the OpenAI-compatible endpoint and collect the streamed reply."""
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            stream=True
        )
        response = ''.join(
            [chunk.choices[0].delta.content if chunk.choices[0].delta.content else ""
             for chunk in completion])
        return response
    except Exception as ex:
        print("API error:", ex)
        return None


def predict(input_text, pdf_file):
    if pdf_file is None:
        return "Please upload a PDF file to proceed."

    file_content = extract_pdf_pypdf(pdf_file.name)
    if not file_content:
        return "Could not read any text from the uploaded PDF."

    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {
            "role": "user",
            "content": """Provided Text:
'''
{{""" + file_content + """}}
'''
""" + input_text,
        },
    ]
    extract_result = openai_api(messages)
    return extract_result or "Too many users. Please wait a moment!"


def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
    # Create the folder that will hold the page images
    os.makedirs(image_folder, exist_ok=True)

    # Open the PDF document
    pdf_document = fitz.open(pdf_path)
    image_paths = []

    # Render each PDF page as a high-DPI image
    for page_number in range(len(pdf_document)):
        page = pdf_document[page_number]
        pix = page.get_pixmap(dpi=dpi)
        image_path = Path(image_folder) / f"page_{page_number + 1}.png"
        Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path)
        image_paths.append(str(image_path))  # Collect the image path for each page

    pdf_document.close()
    return image_paths


def display_pdf_images(file):
    # Convert the PDF into high-resolution images
    image_paths = convert_pdf_to_images(file)
    return image_paths  # Return the list of image paths for the gallery


en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format? If any of this information is not available in the paper, please replace it with the string `""`. If a property contains multiple entities, please use a list to contain them.
"""

en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format? If any of this information is not available in the paper, please replace it with the string `""`. If a property contains multiple entities, please use a list to contain them.
"""

examples = [[en_1], [en_2]]

with gr.Blocks(title="PaperExtractGPT") as demo:
    gr.Markdown(
        '''

Paper Extract GPT

How to use:
1: Upload your PDF.
2: Click "View PDF" to preview it.
3: Enter your extraction prompt in the input box.
4: Click "Generate" to extract, and the extracted information will display below.

        '''
    )
    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload your PDF", type="filepath")
            example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
            viewer_button = gr.Button("View PDF")
            file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
        with gr.Column():
            model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here',
                                     label='Input Prompt')
            example = gr.Examples(examples=examples, inputs=model_input)
            with gr.Row():
                gen = gr.Button("Generate")
                clr = gr.Button("Clear")
            outputs = gr.Markdown(
                label='Output',
                value="""| Title | Journal | Year | Author | Institution | Email |
|-------|---------|------|--------|-------------|-------|
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School of Mines, Dhanbad | "" |
""",
            )

    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
    clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None,
              outputs=[model_input, outputs])
    viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)

demo.launch()
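
# A minimal sketch of exercising the extraction pipeline without the web UI.
# It assumes API_KEY/BASE_URL are already set in the environment and that a
# "sample.pdf" sits next to this script; the SimpleNamespace wrapper is only
# there to mimic the `.name` attribute that `predict` reads from the uploaded
# file object. Kept commented out so it does not change the app's behavior:
#
#     from types import SimpleNamespace
#     print(predict(en_2, SimpleNamespace(name="./sample.pdf")))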