File size: 2,485 Bytes
dd82de8
c97f1f4
 
365f519
3106957
c97f1f4
 
6f64ce3
87a496b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3388102
87a496b
 
 
 
 
 
 
 
 
 
 
 
 
75a7908
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import subprocess
import sys

# Packages installed at start-up, one tuple of ``pip install`` arguments each.
# Only PyMuPDF is installed for the ``fitz`` import: the PyPI distribution
# that is literally named "fitz" is a different project sharing that module
# name, so it is deliberately not installed.
_PIP_INSTALLS = (
    ("-q", "git+https://github.com/huggingface/transformers.git"),
    (
        "torch",
        "torchvision",
        "torchaudio",
        "--extra-index-url",
        "https://download.pytorch.org/whl/cpu",
    ),
    ("PyMuPDF",),
)

for _pip_args in _PIP_INSTALLS:
    # ``sys.executable -m pip`` targets the interpreter running this script,
    # and the argument list avoids going through a shell. ``check=False``
    # keeps the installs best-effort: a failed install does not abort start-up.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *_pip_args],
        check=False,
    )


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import gradio as gr
import re
import fitz



# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The tokenizer and the seq2seq model are loaded once, at import time, from
# the same checkpoint; the model is then moved to the selected device.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
model = model.to(device)

class GUI:
    """Answer questions about a PDF by prompting the module-level model.

    The class keeps no state of its own; its methods use the module-level
    ``tokenizer``, ``model`` and ``device`` objects.
    """

    # Compiled once; matches any run of whitespace, newlines included.
    _WHITESPACE = re.compile(r"\s+")

    def preprocess(self, text):
        """Return *text* with every run of whitespace collapsed to one space."""
        return self._WHITESPACE.sub(" ", text)

    def query_from_list(self, query, options, tok_len):
        """Prompt the model with *query* as the question and *options* as context.

        Args:
            query: The question text inserted into the prompt.
            options: The context text placed after the question.
            tok_len: Passed to ``model.generate`` as ``max_new_tokens``.

        Returns:
            The list returned by ``tokenizer.batch_decode`` with special
            tokens skipped.
        """
        t5query = f"""Question: "{query}" Context: {options}"""
        # No truncation argument is passed, so the whole prompt is tokenized.
        inputs = tokenizer(t5query, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_new_tokens=tok_len)
        return tokenizer.batch_decode(outputs, skip_special_tokens=True)

    def begin(self, pdf, question, start_page=1, end_page=None):
        """Extract text from a page range of *pdf* and answer *question* from it.

        Args:
            pdf: The document to read; handed to ``fitz.open`` unchanged.
            question: The question to ask about the extracted text.
            start_page: First page to read, 1-based. Values below 1 are
                treated as 1.
            end_page: Last page to read, 1-based and inclusive. ``None`` or a
                value past the end of the document means the last page.

        Returns:
            The result of ``query_from_list`` for *question* over the
            extracted text, limited to 30 new tokens.
        """
        doc = fitz.open(pdf)
        try:
            total_pages = doc.page_count
            if end_page is None or end_page > total_pages:
                end_page = total_pages
            # Page numbers are 1-based for callers, 0-based for load_page.
            first_index = max(start_page, 1) - 1
            pdf_text = "".join(
                self.preprocess(doc.load_page(i).get_text("text"))
                for i in range(first_index, end_page)
            )
        finally:
            # Release the document even if text extraction fails.
            doc.close()

        return self.query_from_list(question, pdf_text, 30)

# Single shared instance; its bound ``begin`` method is the Gradio callback.
app = GUI()

# Text shown around the interface.
title = "Get answers from your document with questions with Flan-T5"
description = "Results will show up in a few seconds."

# HTML rendered below the interface (no unmatched quote after "FLAN-T5").
article = "<b>References</b><br>[1] FLAN-T5  <a href='https://huggingface.co/docs/transformers/model_doc/flan-t5'>Transformers Link</a><br>"

css = """.output_image, .input_image {height: 600px !important}"""

# Two inputs (the PDF upload and the question) map onto the first two
# parameters of ``app.begin``; its page-range parameters keep their defaults.
iface = gr.Interface(
    fn=app.begin,
    inputs=[
        gr.File(label="PDF File", file_types=['.pdf']),
        gr.Textbox(label="Question"),
    ],
    outputs=gr.Text(label="Answer Summary"),
    title=title,
    description=description,
    article=article,
    css=css,
    analytics_enabled=True,
    enable_queue=True,
)

iface.launch(inline=False, share=False, debug=False)