# Hugging Face Spaces page header captured together with this file
# (web-page residue, not part of the program):
#   Space status: Runtime error
#   File size: 2,002 Bytes
#   Blame column (commit per source line): b8cce11 bfca8e7 b8cce11 ce57726 b8cce11 ce57726 b8cce11 ce57726 b8cce11 ce57726 bfca8e7 ce57726 b8cce11 ce57726 b8cce11 ce57726 b8cce11 3c17b68 ce57726 bfca8e7 ce57726 bfca8e7 b8cce11 ce57726 bfca8e7 ce57726 b8cce11 ce57726 bfca8e7 b8cce11
#   Line-number gutter: 1 through 62
import gradio as gr
import pandas as pd
import re
import os
import fitz
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Hub identifier shared by the tokenizer and the seq2seq model so the two
# always come from the same checkpoint. Both are loaded once, at import time.
_CHECKPOINT = "potsawee/t5-large-generation-squad-QuestionAnswer"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
def extract_text_from_pdf(pdf_file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        without any separator.
    """
    doc = fitz.open(pdf_file_path)
    try:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the document handle, even if text extraction fails.
        doc.close()
def _split_question_answer(generated):
    """Split generated "question? answer" text at the first "?".

    Returns a ``(question, answer)`` tuple, or ``None`` when the text
    contains no "?" and therefore cannot be split.
    """
    question, mark, answer = generated.partition("?")
    if not mark:
        return None
    # partition keeps everything after the first "?", so an answer that
    # itself contains "?" is not truncated.
    return question + mark, answer.strip()


def generate_question_answer_pairs(pdf_file):
    """Generate question/answer pairs from an uploaded PDF.

    The PDF text is split after every ".", "!" or "?"; each non-blank
    fragment is fed to the module-level ``model`` and the decoded output is
    split into a question and an answer. All pairs are written to
    ``QAPairs.csv`` in the current working directory.

    Args:
        pdf_file: The uploaded file as delivered by the Gradio File input:
            either a path string or an object whose ``name`` attribute is
            the path. ``None`` means nothing was uploaded.

    Returns:
        A ``(text, csv_path)`` tuple matching the interface's two outputs.
        ``text`` lists the pairs as "Question: ...\\nAnswer: ...\\n\\n"
        entries and ``csv_path`` is ``"QAPairs.csv"``. When no file was
        uploaded, ``text`` is a prompt to upload one and ``csv_path`` is
        ``None``.
    """
    if pdf_file is None:
        # Two outputs are declared, so two values must always be returned.
        return "Please upload a PDF file", None

    # Accept both a plain path string and a file wrapper exposing `.name`.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    pdf_text = extract_text_from_pdf(pdf_path)

    rows = []
    # Zero-width split: each fragment keeps its terminating punctuation.
    for sentence in re.split(r'(?<=[.!?])', pdf_text):
        # The split yields empty/whitespace-only fragments (e.g. after the
        # final punctuation mark); there is nothing to ask about those.
        if not sentence.strip():
            continue
        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model.generate(input_ids, max_length=100, num_return_sequences=1)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pair = _split_question_answer(generated)
        if pair is not None:
            rows.append(pair)

    # Build the frame once instead of concatenating a new frame per pair.
    df = pd.DataFrame(rows, columns=['Question', 'Answer'])
    df.to_csv("QAPairs.csv")

    result = "".join(
        f"Question: {question}\nAnswer: {answer}\n\n" for question, answer in rows
    )
    return result, "QAPairs.csv"
# Gradio UI wiring: a single PDF upload goes in; the generated pairs come
# back as text plus a downloadable CSV file.
title = "Question-Answer Pairs Generation"
input_file = gr.File(label="Upload a PDF file")
output_file = gr.File(label="Download as csv")
output_text = gr.Textbox()
interface = gr.Interface(
    fn=generate_question_answer_pairs,
    inputs=input_file,
    # Order must match the (text, csv_path) tuple returned by the callback.
    outputs=[output_text, output_file],
    title=title,
)
# Start the web app as soon as the module is executed.
interface.launch()