File size: 3,856 Bytes
92da267
 
 
 
 
13305fa
 
92da267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66c4010
92da267
 
66c4010
 
 
92da267
 
 
 
 
 
 
a9b92de
8399959
 
8812439
a9b92de
 
92da267
 
9182f58
 
 
a9b92de
 
9182f58
 
 
 
a9b92de
9182f58
92da267
 
66c4010
e5cdcd9
 
 
 
 
 
 
8812439
e5cdcd9
 
 
 
 
 
 
 
 
 
 
 
 
 
799fc30
 
 
 
 
 
 
 
9d0aa3d
799fc30
 
8399959
 
799fc30
 
8399959
799fc30
 
92da267
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import nltk
from nltk import tokenize

# Download the sentence-tokenizer models used by tokenize.sent_tokenize below.
nltk.download('punkt')

# Dutch T5 tokenizer; '<sep>' is added so the model's multi-question output
# can be split apart (see hf_run_model).
checkpoint = "yhavinga/t5-base-dutch"
tokenizer = T5TokenizerFast.from_pretrained(checkpoint)
tokenizer.sep_token = '<sep>'
tokenizer.add_tokens(['<sep>'])

# End-to-end Dutch question-generation model (fine-tuned T5).
hfmodel = T5ForConditionalGeneration.from_pretrained("Michelvh/t5-end2end-questions-generation-dutch")

def hf_run_model(input_string, **generator_args):
    """Generate Dutch questions for *input_string* with the T5 model.

    Parameters:
        input_string: source text to generate questions from.
        generator_args: optional overrides for ``model.generate`` keyword
            arguments (e.g. ``num_beams=8``); merged over the defaults below.

    Returns:
        A list with one entry per returned sequence; each entry is the list
        of questions obtained by splitting the decoded text on '<sep>'.
    """
    # Defaults first, then caller overrides. The original code rebuilt the
    # dict unconditionally, silently discarding every caller-supplied kwarg.
    args = {
        "max_length": 256,
        "num_beams": 4,
        "length_penalty": 1.5,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
        "num_return_sequences": 1,
    }
    args.update(generator_args)
    input_string = "generate questions: " + input_string + " </s>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = hfmodel.generate(input_ids, **args)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    # Each decoded sequence packs several questions separated by '<sep>'.
    output = [item.split("<sep>") for item in output]
    return output


def chunk_text(text, framesize=5):
    """Split *text* into overlapping windows of *framesize* sentences.

    A sliding window of stride 1 is used, so consecutive frames overlap by
    ``framesize - 1`` sentences.

    Parameters:
        text: input text; split into sentences with NLTK.
        framesize: number of sentences per frame.

    Returns:
        List of frame strings. If the text has fewer sentences than
        *framesize*, a single frame containing all of them is returned
        (the original code returned an empty list, dropping the text).
    """
    sentences = tokenize.sent_tokenize(text)
    if not sentences:
        return []
    if len(sentences) <= framesize:
        # Short text: one frame with everything instead of losing it all.
        return [" ".join(sentences)]
    lastindex = len(sentences) - framesize + 1
    return [" ".join(sentences[i:i + framesize]) for i in range(lastindex)]


def flatten(l):
    """Concatenate the sub-lists of *l* into one flat list."""
    flat = []
    for sub in l:
        flat.extend(sub)
    return flat


def run_model_with_frames(text, framesize=4, overlap=3, progress=gr.Progress()):
    """Generate a deduplicated set of questions for *text*, frame by frame.

    Parameters:
        text: Dutch input text.
        framesize: sentences per frame (UI label: "Batch size").
        overlap: sentences shared between consecutive frames; must be
            strictly smaller than *framesize*.
        progress: Gradio progress tracker (injected by Gradio; the
            ``gr.Progress()`` default is the documented Gradio idiom).

    Returns:
        Newline-terminated questions as one string, or an error message
        when *overlap* is not smaller than *framesize*.
    """
    # overlap == framesize would give create_frames a step of 0 and hang,
    # so reject ">=" (the original only rejected ">").
    if overlap >= framesize:
        return "Overlap should be smaller than batch size"
    frames = create_frames(text, framesize, overlap)
    progress(0, desc="Starting...", total=len(frames))
    result = set()  # set deduplicates questions repeated across frames
    for counter, frame in enumerate(frames, start=1):
        for question in flatten(hf_run_model(frame)):
            result.add(question.strip())
        progress(counter, desc="Generating...")
    # The original line was `progress=(counter, desc="Done")`, a SyntaxError;
    # it was clearly meant as a call.
    progress(len(frames), desc="Done")
    return "".join(entry + "\n" for entry in result)


def create_frames(text, framesize=4, overlap=3):
    """Split *text* into frames of *framesize* sentences that overlap by
    *overlap* sentences.

    The window advances by ``framesize - overlap`` sentences each step; the
    final frame is always the last *framesize* sentences so no trailing text
    is skipped. Returns a list of frame strings (empty for empty input).
    """
    sentences = tokenize.sent_tokenize(text)
    total = len(sentences)
    step = framesize - overlap
    frames = []
    start = 0
    while start < total:
        end = start + framesize
        if end >= total:
            # Tail reached: anchor the last frame to the end and stop.
            frames.append(" ".join(sentences[-framesize:]))
            break
        frames.append(" ".join(sentences[start:end]))
        start += step
    return frames


def ensure_questionmark(question):
    """Return *question* with a trailing "?" appended unless already present."""
    return question if question.endswith("?") else question + "?"

description = """
Input some Dutch text and click the button to generate some questions! 
The model is currently set up to generate as many questions, but this 
can take a couple of minutes so have some patience ;)
The optimal text lenght is porbably around 8-10 lines. Longer text 
will obviously take longer. Please keep in mind that this is a work in 
progress and might still be a little bit buggy."""

# Build and launch the Gradio UI.
with gr.Blocks() as iface:
    gr.Markdown(description)
    context = gr.Textbox(label="Input text")
    frame_size = gr.Number(value=5, label="Batch size", info="Size of the subparts that are used to generate questions. Increase to speed up the generation", precision=0)
    # The help text used to say "bigger than batch size", contradicting the
    # validation in run_model_with_frames, which requires overlap < batch size.
    overlap = gr.Number(value=4, label="Overlap", info="Overlap between batches. Should be smaller than batch size. Decrease to speed up generation", precision=0)
    questions = gr.Textbox(label="Questions")
    generate_btn = gr.Button("Generate questions")
    generate_btn.click(fn=run_model_with_frames, inputs=[context, frame_size, overlap], outputs=questions, api_name="generate_questions")

iface.launch()