from openai import OpenAI
import gradio as gr
import fitz  # PyMuPDF
from PIL import Image
from pathlib import Path
import os


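# API credentials are read from the environment; BASE_URL points the OpenAI
# client at an OpenAI-compatible endpoint (here one that serves a Claude model).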
api_key = os.getenv("API_KEY")
base_url = os.getenv("BASE_URL")

client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)


def extract_pdf_pypdf(pdf_path):
    """Extract plain text from every page of a PDF using PyMuPDF (fitz)."""
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None

    file_content = ""
    for page in range(doc.page_count):
        try:
            text = doc.load_page(page).get_text("text")
            file_content += text + "\n\n"
        except Exception as e:
            print(f"Error reading page {page}: {e}")
            continue

    doc.close()
    return file_content


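# Send the chat request with streaming enabled and return the fully assembled
# reply (the chunks are concatenated before returning, so the caller gets one string).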
def openai_api(messages):
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            stream=True
        )
        # Consume the stream and concatenate the delta chunks into one string.
        response = "".join(
            chunk.choices[0].delta.content or "" for chunk in completion
        )
        return response
    except Exception as ex:
        print("API error:", ex)
        return None


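# Gradio callback: extract text from the uploaded PDF, wrap it in the prompt
# together with the user's instruction, and return the model's answer.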
def predict(input_text, pdf_file):
    if pdf_file is None:
        return "Please upload a PDF file to proceed."

    # gr.File(type="filepath") passes the uploaded file's path as a string
    file_content = extract_pdf_pypdf(pdf_file)
    if not file_content:
        return "Could not extract any text from the uploaded PDF."

    user_content = (
        "Provided Text:\n"
        "'''\n"
        "{{" + file_content + "}}\n"
        "'''\n"
        + input_text
    )
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": user_content},
    ]
    extract_result = openai_api(messages)

    return extract_result or "The request failed (possibly due to heavy load). Please try again in a moment."


def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
    # Create the folder that will store the rendered page images
    os.makedirs(image_folder, exist_ok=True)

    # Open the PDF document
    pdf_document = fitz.open(pdf_path)
    image_paths = []

    # Render each page at high DPI and save it as a PNG image
    for page_number in range(len(pdf_document)):
        page = pdf_document[page_number]
        pix = page.get_pixmap(dpi=dpi)
        image_path = Path(image_folder) / f"page_{page_number + 1}.png"
        Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path)
        image_paths.append(str(image_path))  # Collect the path of each page image

    pdf_document.close()
    return image_paths


def display_pdf_images(file):
    # Convert the PDF into high-resolution page images
    image_paths = convert_pdf_to_images(file)
    return image_paths  # Return the list of image paths for the gallery to display


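# Example extraction prompts offered in the UI: one asks for a markdown table,
# the other for JSON output.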
en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
"""

en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format?
If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
"""

examples = [[en_1], [en_2]]

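# Gradio UI: PDF upload and preview gallery on the left; extraction prompt,
# action buttons, and the markdown output on the right.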
with gr.Blocks(title="PaperExtractGPT") as demo:
    gr.Markdown(
        '''<h1 align="center"> Paper Extract GPT </h1>
        <p>How to use:
        <br><strong>1</strong>: Upload your PDF.
        <br><strong>2</strong>: Click "View PDF" to preview it.
        <br><strong>3</strong>: Enter your extraction prompt in the input box.
        <br><strong>4</strong>: Click "Generate" to run the extraction; the results will be displayed below.
        </p>'''
    )
    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload your PDF", type="filepath")
            gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
            viewer_button = gr.Button("View PDF")
            file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")

        with gr.Column():
            model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
            gr.Examples(examples=examples, inputs=model_input)
            with gr.Row():
                gen = gr.Button("Generate")
                clr = gr.Button("Clear")
            outputs = gr.Markdown(label='Output', value="""| Title                                       | Journal            | Year | Author                                        | Institution                                           | Email                 |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School of Mines, Dhanbad | "" |
""")

    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
    clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
    viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)

demo.launch()
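
# To run locally (assuming the script is saved as app.py; the filename is an
# assumption, while the variable names below are taken from the code above):
#   export API_KEY=<your key>
#   export BASE_URL=<your OpenAI-compatible endpoint>
#   python app.py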