from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline
import random
from nltk.corpus import stopwords
import math
# Masking Model
def mask_non_stopword(sentence):
    """Mask one randomly chosen non-stopword and return the fill-mask predictions."""
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    word_to_mask = random.choice(non_stop_words)
    # Replace only the first occurrence so the sentence ends up with a single [MASK]
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    scores = [pred['score'] for pred in predictions]
    tokens = [pred['token_str'] for pred in predictions]
    return masked_sentence, scores, tokens
def mask_non_stopword_pseudorandom(sentence):
    """Same as mask_non_stopword, but with a fixed seed so the choice is repeatable."""
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    random.seed(10)  # fixed seed: the same sentence always yields the same mask
    word_to_mask = random.choice(non_stop_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    scores = [pred['score'] for pred in predictions]
    tokens = [pred['token_str'] for pred in predictions]
    return masked_sentence, scores, tokens
def high_entropy_words(sentence, non_melting_points):
    """Mask the candidate word whose top-5 fill-mask predictions have the highest entropy.

    non_melting_points is a list of (index, phrase) tuples; their words are
    excluded from masking along with stopwords.
    """
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_melting_words = set()
    for _, point in non_melting_points:
        non_melting_words.update(point.lower().split())
    candidate_words = [word for word in words
                       if word.lower() not in stop_words and word.lower() not in non_melting_words]
    if not candidate_words:
        return sentence, None, None
    max_entropy = -float('inf')
    max_entropy_word = None
    best_predictions = None
    for word in candidate_words:
        masked_sentence = sentence.replace(word, '[MASK]', 1)
        predictions = fill_mask(masked_sentence)
        # Shannon entropy over the top 5 predictions
        entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
        if entropy > max_entropy:
            max_entropy = entropy
            max_entropy_word = word
            best_predictions = predictions  # keep the predictions for the winning word
    masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
    scores = [pred['score'] for pred in best_predictions]
    tokens = [pred['token_str'] for pred in best_predictions]
    return masked_sentence, scores, tokens
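# A minimal sketch of the entropy computation above, using made-up
# (hypothetical) prediction lists in the same format the fill-mask
# pipeline returns. A flat distribution scores higher entropy than a
# peaked one, which is why high_entropy_words prefers ambiguous slots.
_flat_preds = [{'score': s} for s in (0.2, 0.2, 0.2, 0.2, 0.2)]
_peaked_preds = [{'score': s} for s in (0.96, 0.01, 0.01, 0.01, 0.01)]
_flat_entropy = -sum(p['score'] * math.log(p['score']) for p in _flat_preds[:5])
_peaked_entropy = -sum(p['score'] * math.log(p['score']) for p in _peaked_preds[:5])
# _flat_entropy (~1.61) > _peaked_entropy (~0.22)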
# Load tokenizer and model for masked language model
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
non_melting_points = [(1, 'Jewish'), (2, 'messages'), (3, 'stab')]
a, b, c = high_entropy_words("A former Cornell University student was sentenced to 21 months in prison on Monday after admitting that he had posted a series of online messages last fall in which he threatened to stab, rape and behead Jewish people", non_melting_points)
print(f"logits type: {type(b)}")
print(f"logits content: {b}")