import gradio as gr import base64 import os from openai import OpenAI api_key = os.getenv('API_KEY') base_url = os.getenv("BASE_URL") client = OpenAI( api_key=api_key, base_url=base_url, ) def extract_pdf_pypdf(pdf_dir): import fitz path = pdf_dir try: doc = fitz.open(path) except: print("can not read pdf") return None page_count = doc.page_count file_content = "" for page in range(page_count): text = doc.load_page(page).get_text("text") # 防止目录中包含References file_content += text + "\n\n" return file_content def openai_api(messages): try: completion = client.chat.completions.create( model="claude-3-5-sonnet-20240620", messages=messages, temperature=0.1, max_tokens=8192, # timeout=300, stream=True ) except Exception as ex: print("api 出现如下异常%s" % ex) return None if completion: try: response_2_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in completion] print("response tokens:", len(response_2_list)) response_2_content = ''.join(response_2_list) return response_2_content except Exception as ex: print("第二轮 出现如下异常%s" % ex) return None else: print("第二轮出现异常") return None def predict(input_text, pdf_file): if pdf_file is None: return "Please upload a PDF file to proceed." file_content = extract_pdf_pypdf(pdf_file.name) messages = [ { "role": "system", "content": "You are an expert in information extraction from scientific literature.", }, {"role": "user", "content": """Provided Text: ''' {{""" + file_content + """}} ''' """ + input_text} ] extract_result = openai_api(messages) return extract_result or "Too many users. Please wait a moment!" def view_pdf(pdf_file): if pdf_file is None: return "Please upload a PDF file to view." with open(pdf_file.name, 'rb') as f: pdf_data = f.read() b64_data = base64.b64encode(pdf_data).decode('utf-8') return f"" en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format? If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain. """ en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format? If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain. """ examples = [[en_1], [en_2]] with gr.Blocks(title="PaperExtractGPT") as demo: gr.Markdown( '''
How to use:
1: Upload your PDF.
2: Click "View PDF" to preview it.
3: Enter your extraction prompt in the input box.
4: Click "Generate" to extract, and the extracted information will display below.