from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def init_model():
    """Load the T5-based paraphraser and its tokenizer from the Hugging Face Hub."""
    para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
    para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
    return para_tokenizer, para_model


def paraphrase(
    question,
    para_tokenizer,
    para_model,
    num_beams=10,
    num_beam_groups=10,
    num_return_sequences=10,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    max_length=64,
):
    """Generate `num_return_sequences` paraphrases of `question` via diverse beam search."""
    # The model was fine-tuned with the task prefix "paraphrase: ".
    input_ids = para_tokenizer(
        f"paraphrase: {question}",
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids
    # num_beam_groups > 1 selects diverse beam search, which is deterministic;
    # diversity_penalty pushes the beam groups toward different phrasings.
    outputs = para_model.generate(
        input_ids,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return res


def generate_paraphrase(question):
    """Convenience wrapper: load the model and paraphrase `question` in one call."""
    # Note: init_model() reloads the tokenizer and model on every call; for
    # repeated use, load them once and pass them to paraphrase() directly.
    para_tokenizer, para_model = init_model()
    res = paraphrase(question, para_tokenizer, para_model)
    return res
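
# A minimal usage sketch (assumes the checkpoint can be fetched from the
# Hugging Face Hub; the example question is illustrative):
if __name__ == "__main__":
    para_tokenizer, para_model = init_model()  # load once, reuse across calls
    for candidate in paraphrase(
        "What are the best places to see in New York?",
        para_tokenizer,
        para_model,
        num_return_sequences=5,
    ):
        print(candidate)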