File size: 3,856 Bytes
92da267
 
 
 
 
13305fa
 
92da267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66c4010
92da267
 
66c4010
 
 
92da267
 
 
 
 
 
 
a9b92de
8399959
 
8812439
a9b92de
 
92da267
 
9182f58
 
 
a9b92de
 
9182f58
 
 
 
a9b92de
9182f58
92da267
 
66c4010
e5cdcd9
 
 
 
 
 
 
8812439
e5cdcd9
 
 
 
 
 
 
 
 
 
 
 
 
 
799fc30
 
 
 
 
 
 
 
9d0aa3d
799fc30
 
8399959
 
799fc30
 
8399959
799fc30
 
92da267
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import nltk
from nltk import tokenize

# Download the sentence-tokenizer models used by tokenize.sent_tokenize below.
nltk.download('punkt')

# Dutch T5 tokenizer; '<sep>' is added so the model's multi-question output
# can be split apart (see hf_run_model).
checkpoint = "yhavinga/t5-base-dutch"
tokenizer = T5TokenizerFast.from_pretrained(checkpoint)
tokenizer.sep_token = '<sep>'
tokenizer.add_tokens(['<sep>'])

# End-to-end Dutch question-generation model (fine-tuned T5).
hfmodel = T5ForConditionalGeneration.from_pretrained("Michelvh/t5-end2end-questions-generation-dutch")

def hf_run_model(input_string, **generator_args):
    """Generate Dutch questions for *input_string* with the T5 model.

    Parameters:
        input_string: source text to generate questions from.
        generator_args: optional overrides for ``model.generate`` keyword
            arguments (e.g. ``num_beams=8``); merged over the defaults below.

    Returns:
        A list with one entry per returned sequence; each entry is the list
        of questions obtained by splitting the decoded text on '<sep>'.
    """
    # Defaults first, then caller overrides. The original code rebuilt the
    # dict unconditionally, silently discarding every caller-supplied kwarg.
    args = {
        "max_length": 256,
        "num_beams": 4,
        "length_penalty": 1.5,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
        "num_return_sequences": 1,
    }
    args.update(generator_args)
    input_string = "generate questions: " + input_string + " </s>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = hfmodel.generate(input_ids, **args)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    # Each decoded sequence packs several questions separated by '<sep>'.
    output = [item.split("<sep>") for item in output]
    return output


def chunk_text(text, framesize=5):
    """Split *text* into overlapping windows of *framesize* sentences.

    A sliding window of stride 1 is used, so consecutive frames overlap by
    ``framesize - 1`` sentences.

    Parameters:
        text: input text; split into sentences with NLTK.
        framesize: number of sentences per frame.

    Returns:
        List of frame strings. If the text has fewer sentences than
        *framesize*, a single frame containing all of them is returned
        (the original code returned an empty list, dropping the text).
    """
    sentences = tokenize.sent_tokenize(text)
    if not sentences:
        return []
    if len(sentences) <= framesize:
        # Short text: one frame with everything instead of losing it all.
        return [" ".join(sentences)]
    lastindex = len(sentences) - framesize + 1
    return [" ".join(sentences[i:i + framesize]) for i in range(lastindex)]


def flatten(l):
    """Concatenate the sub-lists of *l* into one flat list."""
    flat = []
    for sub in l:
        flat.extend(sub)
    return flat


def run_model_with_frames(text, framesize=4, overlap=3, progress=gr.Progress()):
    """Generate a deduplicated set of questions for *text*, frame by frame.

    Parameters:
        text: Dutch input text.
        framesize: sentences per frame (UI label: "Batch size").
        overlap: sentences shared between consecutive frames; must be
            strictly smaller than *framesize*.
        progress: Gradio progress tracker (injected by Gradio; the
            ``gr.Progress()`` default is the documented Gradio idiom).

    Returns:
        Newline-terminated questions as one string, or an error message
        when *overlap* is not smaller than *framesize*.
    """
    # overlap == framesize would give create_frames a step of 0 and hang,
    # so reject ">=" (the original only rejected ">").
    if overlap >= framesize:
        return "Overlap should be smaller than batch size"
    frames = create_frames(text, framesize, overlap)
    progress(0, desc="Starting...", total=len(frames))
    result = set()  # set deduplicates questions repeated across frames
    for counter, frame in enumerate(frames, start=1):
        for question in flatten(hf_run_model(frame)):
            result.add(question.strip())
        progress(counter, desc="Generating...")
    # The original line was `progress=(counter, desc="Done")`, a SyntaxError;
    # it was clearly meant as a call.
    progress(len(frames), desc="Done")
    return "".join(entry + "\n" for entry in result)


def create_frames(text, framesize=4, overlap=3):
    """Split *text* into frames of *framesize* sentences that overlap by
    *overlap* sentences.

    The window advances by ``framesize - overlap`` sentences each step; the
    final frame is always the last *framesize* sentences so no trailing text
    is skipped. Returns a list of frame strings (empty for empty input).
    """
    sentences = tokenize.sent_tokenize(text)
    total = len(sentences)
    step = framesize - overlap
    frames = []
    start = 0
    while start < total:
        end = start + framesize
        if end >= total:
            # Tail reached: anchor the last frame to the end and stop.
            frames.append(" ".join(sentences[-framesize:]))
            break
        frames.append(" ".join(sentences[start:end]))
        start += step
    return frames


def ensure_questionmark(question):
    """Return *question* with a trailing "?" appended unless already present."""
    return question if question.endswith("?") else question + "?"

description = """
Input some Dutch text and click the button to generate some questions! 
The model is currently set up to generate as many questions, but this 
can take a couple of minutes so have some patience ;)
The optimal text lenght is porbably around 8-10 lines. Longer text 
will obviously take longer. Please keep in mind that this is a work in 
progress and might still be a little bit buggy."""

# Build and launch the Gradio UI.
with gr.Blocks() as iface:
    gr.Markdown(description)
    context = gr.Textbox(label="Input text")
    frame_size = gr.Number(value=5, label="Batch size", info="Size of the subparts that are used to generate questions. Increase to speed up the generation", precision=0)
    # The help text used to say "bigger than batch size", contradicting the
    # validation in run_model_with_frames, which requires overlap < batch size.
    overlap = gr.Number(value=4, label="Overlap", info="Overlap between batches. Should be smaller than batch size. Decrease to speed up generation", precision=0)
    questions = gr.Textbox(label="Questions")
    generate_btn = gr.Button("Generate questions")
    generate_btn.click(fn=run_model_with_frames, inputs=[context, frame_size, overlap], outputs=questions, api_name="generate_questions")

iface.launch()