from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline
import random
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # fetch the stopword list if it is not already cached
# Masking model: replace one randomly chosen non-stopword with the [MASK] token
def mask_non_stopword(sentence):
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence  # nothing to mask: every token is a stopword
    word_to_mask = random.choice(non_stop_words)
    # Mask at token level so substrings of other words are left untouched
    words[words.index(word_to_mask)] = '[MASK]'
    return ' '.join(words)
# Load tokenizer and model for the masked language model
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
def mask(sentence):
    # Expects a sentence containing [MASK]; returns the pipeline's candidate completions
    predictions = fill_mask(sentence)
    masked_sentences = [prediction['sequence'] for prediction in predictions]
    return masked_sentences
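
# Minimal usage sketch (the sample sentence below is illustrative, not from the original):
# mask one non-stopword, then let the fill-mask pipeline propose augmented variants.
text = "The quick brown fox jumps over the lazy dog"
masked = mask_non_stopword(text)
print(masked)          # e.g. "The quick brown fox jumps over the lazy [MASK]"
augmented = mask(masked)
for candidate in augmented:
    print(candidate)   # top-k sentences with [MASK] filled in by BERT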