from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline
import random
import math
import nltk
from nltk.corpus import stopwords

# The NLTK stopword list must be downloaded once before first use.
nltk.download('stopwords', quiet=True)
# Masking Model
def mask_non_stopword(sentence):
    """Mask one randomly chosen non-stopword and return the fill-mask predictions."""
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    word_to_mask = random.choice(non_stop_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    predicted_words = [pred['token_str'] for pred in predictions]
    scores = [pred['score'] for pred in predictions]
    return masked_sentence, predicted_words, scores
def mask_non_stopword_pseudorandom(sentence):
    """Deterministic variant: a fixed seed makes the masked word reproducible."""
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    random.seed(10)  # fixed seed, so the same word is masked on every call
    word_to_mask = random.choice(non_stop_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    predicted_words = [pred['token_str'] for pred in predictions]
    scores = [pred['score'] for pred in predictions]
    return masked_sentence, predicted_words, scores
def high_entropy_words(sentence, non_melting_points):
    """Mask the candidate word whose fill-mask distribution has the highest entropy.

    Words belonging to any non-melting point (protected entity) are never masked.
    """
    stop_words = set(stopwords.words('english'))
    words = sentence.split()

    # Collect every word that belongs to a protected (non-melting) entity.
    non_melting_words = set()
    for _, point in non_melting_points:
        non_melting_words.update(point.lower().split())

    candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
    if not candidate_words:
        return sentence, None, None

    max_entropy = -float('inf')
    max_entropy_word = None
    best_predictions = None
    for word in candidate_words:
        masked_sentence = sentence.replace(word, '[MASK]', 1)
        predictions = fill_mask(masked_sentence)
        # Shannon entropy over the top-5 predictions: higher entropy means the
        # model is less certain about the masked slot.
        entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
        if entropy > max_entropy:
            max_entropy = entropy
            max_entropy_word = word
            best_predictions = predictions  # keep the winning word's predictions

    masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
    predicted_words = [pred['token_str'] for pred in best_predictions]
    scores = [pred['score'] for pred in best_predictions]
    return masked_sentence, predicted_words, scores
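# A quick numeric illustration of the entropy heuristic above (illustrative
# values, not model output): a peaked distribution means the model is
# confident, a flat one means it is unsure, and the flat one wins the mask.
#   peaked = [0.90, 0.04, 0.03, 0.02, 0.01]
#   flat   = [0.20, 0.20, 0.20, 0.20, 0.20]
#   -sum(p * math.log(p) for p in peaked)  ->  ~0.45
#   -sum(p * math.log(p) for p in flat)    ->  ~1.61  (highest entropy, masked)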
# Load tokenizer and model for the masked language model
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
non_melting_points = [(1, 'Jewish'), (2, 'messages'), (3, 'stab')]
a, b, c = high_entropy_words("A former Cornell University student was sentenced to 21 months in prison on Monday after admitting that he had posted a series of online messages last fall in which he threatened to stab, rape and behead Jewish people", non_melting_points)
print(f"masked sentence: {a}")
print(f"predicted words: {b}")
print(f"scores: {c}")