from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
import random
import math

import nltk
from nltk.corpus import stopwords

# The stopword list requires a one-time download of the NLTK corpus.
nltk.download('stopwords', quiet=True)
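# NOTE: the functions below rely on the module-level `fill_mask` pipeline,
# which is created near the bottom of this script; it only needs to exist
# by the time the functions are called.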
|
def mask_non_stopword(sentence):
    """Mask one randomly chosen non-stopword and return the model's top guesses."""
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    word_to_mask = random.choice(non_stop_words)
    # Replace only the first occurrence of the chosen word.
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    scores = [pred['score'] for pred in predictions]
    tokens = [pred['token_str'] for pred in predictions]
    return masked_sentence, scores, tokens
|
|
def mask_non_stopword_pseudorandom(sentence):
    """Like mask_non_stopword, but seeded so the masked word is reproducible."""
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    random.seed(10)  # fixed seed: the same sentence always masks the same word
    word_to_mask = random.choice(non_stop_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    scores = [pred['score'] for pred in predictions]
    tokens = [pred['token_str'] for pred in predictions]
    return masked_sentence, scores, tokens
|
|
def high_entropy_words(sentence, non_melting_points):
    """Mask the candidate word whose predictions have the highest entropy.

    Stopwords and any word appearing in `non_melting_points` are never masked.
    """
    stop_words = set(stopwords.words('english'))
    words = sentence.split()

    non_melting_words = set()
    for _, point in non_melting_points:
        non_melting_words.update(point.lower().split())

    candidate_words = [
        word for word in words
        if word.lower() not in stop_words and word.lower() not in non_melting_words
    ]

    if not candidate_words:
        return sentence, None, None

    max_entropy = -float('inf')
    max_entropy_word = None
    best_predictions = None

    for word in candidate_words:
        masked_sentence = sentence.replace(word, '[MASK]', 1)
        predictions = fill_mask(masked_sentence)

        # Shannon entropy over the top-5 candidates; a flatter distribution
        # (higher entropy) means the model is least certain about this slot.
        entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])

        if entropy > max_entropy:
            max_entropy = entropy
            max_entropy_word = word
            best_predictions = predictions

    # Report the predictions for the winning word, not for whichever
    # candidate happened to be processed last.
    masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
    scores = [pred['score'] for pred in best_predictions]
    tokens = [pred['token_str'] for pred in best_predictions]
    return masked_sentence, scores, tokens
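# The entropy above is computed only over the pipeline's default top-5
# candidates, i.e. a truncated distribution. As a sketch (not part of the
# original script), the helper below asks the pipeline for more candidates
# via its top_k argument to get a broader estimate; k=50 is an arbitrary,
# illustrative choice.
def prediction_entropy(masked_sentence, k=50):
    predictions = fill_mask(masked_sentence, top_k=k)
    return -sum(pred['score'] * math.log(pred['score']) for pred in predictions)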
|
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

non_melting_points = [(1, 'Jewish'), (2, 'messages'), (3, 'stab')]
a, b, c = high_entropy_words(
    "A former Cornell University student was sentenced to 21 months in prison "
    "on Monday after admitting that he had posted a series of online messages "
    "last fall in which he threatened to stab, rape and behead Jewish people",
    non_melting_points,
)
print(f"scores type: {type(b)}")
print(f"scores content: {b}")
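# A minimal usage sketch for the two random-masking helpers; the sample
# sentence is only illustrative.
sample = "The quick brown fox jumps over the lazy dog"
masked, scores, tokens = mask_non_stopword(sample)
print(f"random mask: {masked} -> {tokens}")
masked, scores, tokens = mask_non_stopword_pseudorandom(sample)
print(f"seeded mask: {masked} -> {tokens}")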