from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline
import random
from nltk.corpus import stopwords
import math
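# The stopword filtering below relies on the NLTK stopwords corpus. If it has not
# been downloaded in the current environment, stopwords.words('english') raises a
# LookupError; the two lines below (an optional setup step, not part of the
# original script) fetch it once.
# import nltk
# nltk.download('stopwords')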

# Masking Model
def mask_non_stopword(sentence):
    # Mask one randomly chosen non-stopword and return the masked sentence
    # together with the fill-mask prediction scores and predicted tokens.
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    word_to_mask = random.choice(non_stop_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    scores = [pred['score'] for pred in predictions]
    predicted_tokens = [pred['token_str'] for pred in predictions]
    return masked_sentence, scores, predicted_tokens

def mask_non_stopword_pseudorandom(sentence):
    # Same as mask_non_stopword, but with a fixed seed so the choice of the
    # masked word is reproducible.
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None
    random.seed(10)
    word_to_mask = random.choice(non_stop_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    predictions = fill_mask(masked_sentence)
    scores = [pred['score'] for pred in predictions]
    predicted_tokens = [pred['token_str'] for pred in predictions]
    return masked_sentence, scores, predicted_tokens

def high_entropy_words(sentence, non_melting_points):
    # Mask the candidate word whose top-5 fill-mask predictions have the
    # highest entropy, skipping stopwords and non-melting-point words.
    stop_words = set(stopwords.words('english'))
    words = sentence.split()

    non_melting_words = set()
    for _, point in non_melting_points:
        non_melting_words.update(point.lower().split())

    candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]

    if not candidate_words:
        return sentence, None, None

    max_entropy = -float('inf')
    max_entropy_word = None
    best_predictions = None

    for word in candidate_words:
        masked_sentence = sentence.replace(word, '[MASK]', 1)
        predictions = fill_mask(masked_sentence)

        # Shannon entropy over the top 5 predictions
        entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])

        if entropy > max_entropy:
            max_entropy = entropy
            max_entropy_word = word
            best_predictions = predictions

    # Return the predictions for the highest-entropy word, not those of the
    # last candidate inspected in the loop.
    masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
    scores = [pred['score'] for pred in best_predictions]
    predicted_tokens = [pred['token_str'] for pred in best_predictions]
    return masked_sentence, scores, predicted_tokens

# Load tokenizer and model for masked language model
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
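# fill_mask returns a list of candidate dicts (the top 5 by default), each with
# 'score' (softmax probability), 'token' (vocabulary id), 'token_str' (decoded
# token) and 'sequence' (the sentence with [MASK] filled in); the helpers above
# read only 'score' and 'token_str'.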

non_melting_points = [(1, 'Jewish'), (2, 'messages'), (3, 'stab')]
a, b, c = high_entropy_words("A former Cornell University student was sentenced to 21 months in prison on Monday after admitting that he had posted a series of online messages last fall in which he threatened to stab, rape and behead Jewish people", non_melting_points)
print(f"logits type: {type(b)}")
print(f"logits content: {b}")