import gradio as gr
import base64
import os

import fitz  # PyMuPDF; imported at module level because both extract_pdf_pypdf and view_pdf use it
from openai import OpenAI

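# API credentials and endpoint are read from environment variables so the key is not hard-coded;
# BASE_URL lets the OpenAI client point at an OpenAI-compatible gateway.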
api_key = os.getenv('API_KEY')
base_url = os.getenv("BASE_URL")

client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)


def extract_pdf_pypdf(pdf_dir):
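    """Extract plain text from every page of a PDF with PyMuPDF (fitz).

    Returns the concatenated page text, or None if the file cannot be opened.
    """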
    path = pdf_dir

    try:
        doc = fitz.open(path)
    except Exception:
        print("cannot read PDF")
        return None

    page_count = doc.page_count
    file_content = ""
    for page in range(page_count):
        text = doc.load_page(page).get_text("text")
        # Prevent the table of contents from containing "References"
        file_content += text + "\n\n"

    return file_content


def openai_api(messages):
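    """Send the chat messages to the configured OpenAI-compatible endpoint and
    return the streamed completion as a single string, or None on failure."""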
    try:
        completion = client.chat.completions.create(
            model="claude-3-5-sonnet-20240620",
            messages=messages,
            temperature=0.1,
            max_tokens=8192,
            # timeout=300,
            stream=True
        )
    except Exception as ex:
        print("API call raised an exception: %s" % ex)
        return None

    if completion:
        try:
            # Collect the streamed chunks and join them into the full response text
            response_2_list = [chunk.choices[0].delta.content or "" for chunk in completion]
            print("response chunks:", len(response_2_list))

            response_2_content = ''.join(response_2_list)
            return response_2_content
        except Exception as ex:
            print("Exception while reading the streamed response: %s" % ex)
            return None
    else:
        print("The API returned no completion")
        return None


def predict(input_text, pdf_file):
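    """Gradio handler: read the uploaded PDF, build the extraction prompt, and
    return the model's response."""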
    if pdf_file is None:
        return "Please upload a PDF file to proceed."

    # gr.File(type="filepath") may pass a plain path string or a file object, depending on the Gradio version
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    file_content = extract_pdf_pypdf(pdf_path)
    if file_content is None:
        return "Could not read the uploaded PDF. Please re-upload and try again."
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text:
    '''
    {{""" + file_content + """}}
    '''
                                        """ + input_text}
    ]
    extract_result = openai_api(messages)

    return extract_result or "Too many users. Please wait a moment!"


def view_pdf(pdf_file, max_pages=3):
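    """Render the first `max_pages` pages of the uploaded PDF as a base64-encoded
    <embed> tag so it can be previewed inside the Gradio HTML component."""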
    if pdf_file is None:
        return "Please upload a PDF file to view."

    try:
        # Open the PDF file (handle both a path string and a file object)
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        doc = fitz.open(pdf_path)

        # Only read up to `max_pages` pages to reduce size for large PDFs
        preview_pdf = fitz.open()  # Create an empty PDF for the preview
        for page_num in range(min(max_pages, doc.page_count)):
            preview_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)

        # Save the preview as a temporary in-memory file
        pdf_data = preview_pdf.tobytes()

        # Encode as base64 for embedding in HTML
        b64_data = base64.b64encode(pdf_data).decode('utf-8')
        return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"

    except Exception as e:
        print(f"Error displaying PDF: {e}")
        return "Error displaying PDF. Please try re-uploading."


en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
"""

en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format?
If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
"""

examples = [[en_1], [en_2]]

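# Gradio UI: a PDF upload/preview column on the left, and a prompt/output column on the right.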
with gr.Blocks(title="PaperExtractGPT") as demo:
    gr.Markdown(
        '''<p align="center">
        <h1 align="center"> Paper Extract GPT </h1>
        <p> How to use:
        <br> <strong>1</strong>: Upload your PDF.
        <br> <strong>2</strong>: Click "View PDF" to preview it.
        <br> <strong>3</strong>: Enter your extraction prompt in the input box.
        <br> <strong>4</strong>: Click "Generate" to run the extraction; the extracted information will be displayed below.
        </p>
        '''
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown('## Upload PDF')
            file_input = gr.File(label="Upload your PDF", type="filepath")
            viewer_button = gr.Button("View PDF")
            file_out = gr.HTML(label="PDF Preview")

        with gr.Column():
            model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
            example = gr.Examples(examples=examples, inputs=model_input)
            with gr.Row():
                gen = gr.Button("Generate")
                clr = gr.Button("Clear")
            outputs = gr.Markdown(label='Output', show_label=True,  value="""| Title                                       | Journal            | Year | Author                                        | Institution                                           | Email                 |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School of Mines, Dhanbad | "" |
""")

    gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
    clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
    viewer_button.click(view_pdf, inputs=file_input, outputs=file_out)

demo.launch()