Spaces:
Runtime error
Runtime error
import gradio as gr | |
import pandas as pd | |
import re | |
import os | |
import fitz | |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
# The tokenizer and the seq2seq model are loaded from the same Hugging Face
# checkpoint, once at import time, and shared by every request.
_CHECKPOINT = "potsawee/t5-large-generation-squad-QuestionAnswer"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
def extract_text_from_pdf(pdf_file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string holding the text of all pages; ``""`` when the
        document yields no pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even when text extraction raises part-way through.
    with fitz.open(pdf_file_path) as doc:
        return "".join(page.get_text() for page in doc)
# A sentence ends at '.', '!' or '?' followed by whitespace. Requiring the
# whitespace keeps tokens such as "3.14" or "file.pdf" in one piece.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")


def _split_sentences(text):
    """Return the non-blank sentences of *text*, stripped of outer whitespace."""
    pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [piece for piece in pieces if piece]


def _parse_question_answer(generated):
    """Split decoded model output at its first '?' into (question, answer).

    Returns ``None`` when the text contains no '?', i.e. no question was
    produced.
    """
    question, mark, answer = generated.partition("?")
    if not mark:
        return None
    # Everything after the first '?' is the answer, even if it contains
    # further question marks.
    return question + "?", answer.strip()


def generate_question_answer_pairs(pdf_file):
    """Generate question/answer pairs for each sentence of an uploaded PDF.

    Each sentence of the PDF text is passed to the module-level ``model``;
    decoded outputs that contain a '?' are split into a question and an
    answer. The pairs are written to ``QAPairs.csv`` in the current working
    directory and also rendered as plain text.

    Args:
        pdf_file: The uploaded file object (its ``.name`` attribute is used
            as the path of the PDF), or ``None`` when nothing was uploaded.

    Returns:
        A ``(text, csv_path)`` tuple matching the two declared outputs:
        the formatted pairs and ``"QAPairs.csv"``. When ``pdf_file`` is
        ``None`` the tuple is ``("Please upload a PDF file", None)``.
    """
    if pdf_file is None:
        # Two outputs are declared for this function, so both slots must be
        # filled; None leaves the file output empty.
        return "Please upload a PDF file", None

    pdf_text = extract_text_from_pdf(pdf_file.name)

    rows = []
    for sentence in _split_sentences(pdf_text):
        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model.generate(input_ids, max_length=100, num_return_sequences=1)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pair = _parse_question_answer(decoded)
        if pair is not None:
            rows.append(pair)

    result = "".join(
        f"Question: {question}\nAnswer: {answer}\n\n" for question, answer in rows
    )

    # Build the frame once instead of concatenating inside the loop.
    df = pd.DataFrame(rows, columns=["Question", "Answer"])
    df.to_csv("QAPairs.csv")
    return result, "QAPairs.csv"
# UI components: one PDF upload in; a text box and a downloadable CSV out.
input_file = gr.File(label="Upload a PDF file")
output_file = gr.File(label="Download as csv")
output_text = gr.Textbox()
title = "Question-Answer Pairs Generation"

# The output order must match the (text, csv_path) order returned by
# generate_question_answer_pairs.
interface = gr.Interface(
    generate_question_answer_pairs,
    inputs=input_file,
    outputs=[output_text, output_file],
    title=title,
)

# Start the web app when this file is executed.
interface.launch()