from openai import OpenAI
import gradio as gr
import fitz  # PyMuPDF
from PIL import Image
from pathlib import Path
import os

# Credentials and endpoint are read from the environment so they never live in the source.
api_key = os.getenv('API_KEY')
base_url = os.getenv("BASE_URL")
client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)


def extract_pdf_pypdf(pdf_dir):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_dir: Path of the PDF file to open with PyMuPDF.

    Returns:
        The concatenated page texts, each followed by a blank line, or
        ``None`` when the document cannot be opened. Pages that fail to
        load are reported on stdout and skipped.
    """
    try:
        doc = fitz.open(pdf_dir)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None

    page_texts = []
    try:
        for page in range(doc.page_count):
            try:
                text = doc.load_page(page).get_text("text")
                page_texts.append(text + "\n\n")
            except Exception as e:
                # Best effort: one unreadable page must not discard the rest.
                print(f"Error reading page {page}: {e}")
                continue
    finally:
        # Release the file handle even if iteration fails unexpectedly.
        doc.close()
    return "".join(page_texts)


def openai_api(messages):
    """Send a chat request and return the full streamed reply as one string.

    Args:
        messages: List of chat message dicts (``role`` / ``content``).

    Returns:
        The concatenated text of all streamed chunks, or ``None`` when the
        request or the stream raises (the error is printed to stdout).
    """
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            stream=True,
        )
        # A chunk with an empty `choices` list would raise IndexError and,
        # through the broad except below, throw away the whole reply; skip
        # such chunks instead. NOTE(review): whether the configured backend
        # ever sends them is not visible here.
        response = ''.join(
            chunk.choices[0].delta.content or ""
            for chunk in completion
            if chunk.choices
        )
        return response
    except Exception as ex:
        print("API error:", ex)
        return None


def predict(input_text, pdf_file):
    """Run the extraction prompt against the text of an uploaded PDF.

    Args:
        input_text: The user's extraction instruction, appended after the
            paper text in the prompt.
        pdf_file: The uploaded file. Either a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``; ``None`` when nothing was uploaded.

    Returns:
        The model's reply, or a human-readable message when no file was
        uploaded, the PDF could not be read, or the API call failed.
    """
    if pdf_file is None:
        return "Please upload a PDF file to proceed."

    # Accept a plain path as well as an upload wrapper that carries `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    file_content = extract_pdf_pypdf(pdf_path)
    if file_content is None:
        # extract_pdf_pypdf signals an unreadable PDF with None; concatenating
        # it into the prompt would raise TypeError.
        return "Failed to read the PDF file. Please check the file and try again."

    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text: ''' {{""" + file_content + """}} ''' """ + (input_text or "")}
    ]
    extract_result = openai_api(messages)
    return extract_result or "Too many users. Please wait a moment!"
def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300): # 创建存储图像的文件夹 os.makedirs(image_folder, exist_ok=True) # 打开PDF文档 pdf_document = fitz.open(pdf_path) image_paths = [] # 遍历每一页PDF,并生成高DPI的图像 for page_number in range(len(pdf_document)): page = pdf_document[page_number] pix = page.get_pixmap(dpi=dpi) image_path = Path(image_folder) / f"page_{page_number + 1}.png" Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path) image_paths.append(str(image_path)) # 收集每一页的图像路径 pdf_document.close() return image_paths def display_pdf_images(file): # 转换PDF为高清图像 image_paths = convert_pdf_to_images(file) return image_paths # 返回图像路径列表以显示 en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format? If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain. """ en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format? If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain. """ examples = [[en_1], [en_2]] with gr.Blocks(title="PaperExtractGPT") as demo: gr.Markdown( '''
How to use:
1: Upload your PDF.
2: Click "View PDF" to preview it.
3: Enter your extraction prompt in the input box.
4: Click "Generate" to extract, and the extracted information will display below.