File size: 2,002 Bytes
b8cce11
bfca8e7
b8cce11
ce57726
 
b8cce11
 
 
 
 
ce57726
 
 
 
 
b8cce11
ce57726
b8cce11
ce57726
 
 
 
bfca8e7
 
 
ce57726
 
 
b8cce11
 
ce57726
 
 
b8cce11
ce57726
b8cce11
 
3c17b68
ce57726
 
 
 
 
bfca8e7
 
ce57726
bfca8e7
 
 
b8cce11
ce57726
 
bfca8e7
ce57726
b8cce11
 
ce57726
 
bfca8e7
b8cce11
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
import pandas as pd
import re
import os
import fitz
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Identifier handed to ``from_pretrained`` for both the tokenizer and the
# seq2seq model, kept in one place so the two always match.
MODEL_NAME = "potsawee/t5-large-generation-squad-QuestionAnswer"

# Both objects are created once at import time and reused as module globals
# by the generation function below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def extract_text_from_pdf(pdf_file_path) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file_path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        The text of all pages joined in page order (an empty string when
        the document yields no pages).
    """
    # Opening the document as a context manager guarantees it is closed
    # again, even when text extraction raises part-way through.
    with fitz.open(pdf_file_path) as doc:
        return "".join(page.get_text() for page in doc)

def generate_question_answer_pairs(pdf_file):
    """Generate question/answer pairs from an uploaded PDF.

    The PDF text is split after every ``.``, ``!`` or ``?``; each non-blank
    fragment is fed to the module-level ``model`` and the decoded output is
    split on ``"?"`` into a question and an answer. Outputs without a ``"?"``
    are dropped. All pairs are written to ``QAPairs.csv`` in the current
    working directory.

    Args:
        pdf_file: The uploaded file, either a path string or an object that
            exposes the path as ``.name``. ``None`` when nothing was uploaded.

    Returns:
        A ``(text, csv_path)`` tuple: ``text`` lists every pair in a
        human-readable form and ``csv_path`` is ``"QAPairs.csv"``. When
        ``pdf_file`` is ``None``, ``text`` is a prompt to upload a file and
        ``csv_path`` is ``None``.
    """
    if pdf_file is None:
        # Always return two values: the interface that calls this function
        # declares two outputs (textbox and file).
        return "Please upload a PDF file", None

    # Accept both a plain path string and a file wrapper carrying ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    pdf_text = extract_text_from_pdf(pdf_path)

    questions = []
    answers = []

    for sentence in re.split(r'(?<=[.!?])', pdf_text):
        # The zero-width split leaves empty / whitespace-only fragments
        # (e.g. after the final punctuation mark); running the model on
        # them is wasted work, so skip them.
        if not sentence.strip():
            continue

        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model.generate(input_ids, max_length=100, num_return_sequences=1)
        question_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Text up to the first "?" is the question; the segment right after
        # it is the answer. Outputs without any "?" cannot be split.
        qa_parts = question_answer.split("?")
        if len(qa_parts) < 2:
            continue
        questions.append(qa_parts[0] + "?")
        answers.append(qa_parts[1].strip())

    # Build the frame once instead of concatenating a one-row frame per pair.
    df = pd.DataFrame({'Question': questions, 'Answer': answers})
    df.to_csv("QAPairs.csv")

    result = "".join(
        f"Question: {question}\nAnswer: {answer}\n\n"
        for question, answer in zip(questions, answers)
    )
    return result, "QAPairs.csv"

# --- Gradio UI wiring ---------------------------------------------------
title = "Question-Answer Pairs Generation"

# One upload widget as input; a textbox and a downloadable file as outputs.
input_file = gr.File(label="Upload a PDF file")
output_file = gr.File(label="Download as csv")
output_text = gr.Textbox()

# The output list order (textbox, file) must match the order of the values
# returned by generate_question_answer_pairs.
interface = gr.Interface(
    generate_question_answer_pairs,
    input_file,
    [output_text, output_file],
    title=title,
)

# Start the web app.
interface.launch()