jgyasu committed on
Commit
436c4c1
·
verified ·
1 Parent(s): 5d9cd0b

Upload folder using huggingface_hub

Browse files
.gitignore CHANGED
@@ -1 +1,2 @@
1
- __pycache__
 
 
1
+ .env
2
+ __pycache__/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
app.py CHANGED
@@ -1,19 +1,22 @@
1
  import nltk
2
  nltk.download('stopwords')
3
- from transformers import AutoTokenizer
4
- from transformers import AutoModelForSeq2SeqLM
5
  import plotly.graph_objs as go
6
  from transformers import pipeline
7
- from matplotlib.colors import ListedColormap, rgb2hex
8
  import random
9
  import gradio as gr
10
  from tree import generate_subplot1, generate_subplot2
11
  from paraphraser import generate_paraphrase
12
- from lcs import find_common_subsequences
13
- from highlighter import highlight_common_words, highlight_common_words_dict
14
  from entailment import analyze_entailment
15
  from masking_methods import mask_non_stopword, mask_non_stopword_pseudorandom, high_entropy_words
16
  from sampling_methods import sample_word
 
 
 
 
17
 
18
 
19
  # Function for the Gradio interface
@@ -21,8 +24,10 @@ def model(prompt):
21
  user_prompt = prompt
22
  paraphrased_sentences = generate_paraphrase(user_prompt)
23
  analyzed_paraphrased_sentences, selected_sentences, discarded_sentences = analyze_entailment(user_prompt, paraphrased_sentences, 0.7)
24
- length_accepted_sentences = len(selected_sentences)
25
  common_grams = find_common_subsequences(user_prompt, selected_sentences)
 
 
26
 
27
  masked_sentences = []
28
  masked_words = []
@@ -51,7 +56,8 @@ def model(prompt):
51
  sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='temperature', temperature=1.0))
52
  sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='greedy', temperature=1.0))
53
 
54
- print(len(sampled_sentences))
 
55
 
56
  colors = ["red", "blue", "brown", "green"]
57
 
@@ -83,7 +89,60 @@ def model(prompt):
83
  masked_index += 3
84
  sampled_index += 12
85
 
86
- return [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + trees1 + trees2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
 
89
  with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
@@ -127,8 +186,27 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
127
  tree2 = gr.Plot()
128
  tree2_tabs.append(tree2)
129
 
130
- submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  clear_button.click(lambda: "", inputs=None, outputs=user_input)
132
- clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs)
133
 
134
  demo.launch(share=True)
 
1
  import nltk
2
  nltk.download('stopwords')
3
+ # from transformers import AutoTokenizer
4
+ # from transformers import AutoModelForSeq2SeqLM
5
  import plotly.graph_objs as go
6
  from transformers import pipeline
 
7
  import random
8
  import gradio as gr
9
  from tree import generate_subplot1, generate_subplot2
10
  from paraphraser import generate_paraphrase
11
+ from lcs import find_common_subsequences, find_common_gram_positions
12
+ from highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
13
  from entailment import analyze_entailment
14
  from masking_methods import mask_non_stopword, mask_non_stopword_pseudorandom, high_entropy_words
15
  from sampling_methods import sample_word
16
+ from detectability import SentenceDetectabilityCalculator
17
+ from distortion import SentenceDistortionCalculator
18
+ from euclidean_distance import SentenceEuclideanDistanceCalculator
19
+ from threeD_plot import gen_three_D_plot
20
 
21
 
22
  # Function for the Gradio interface
 
24
  user_prompt = prompt
25
  paraphrased_sentences = generate_paraphrase(user_prompt)
26
  analyzed_paraphrased_sentences, selected_sentences, discarded_sentences = analyze_entailment(user_prompt, paraphrased_sentences, 0.7)
27
+ print(analyze_entailment(user_prompt, paraphrased_sentences, 0.7))
28
  common_grams = find_common_subsequences(user_prompt, selected_sentences)
29
+ subsequences = [subseq for _, subseq in common_grams]
30
+ common_grams_position = find_common_gram_positions(selected_sentences, subsequences)
31
 
32
  masked_sentences = []
33
  masked_words = []
 
56
  sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='temperature', temperature=1.0))
57
  sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='greedy', temperature=1.0))
58
 
59
+
60
+
61
 
62
  colors = ["red", "blue", "brown", "green"]
63
 
 
89
  masked_index += 3
90
  sampled_index += 12
91
 
92
+ reparaphrased_sentences = generate_paraphrase(sampled_sentences)
93
+
94
+ len_reparaphrased_sentences = len(reparaphrased_sentences)
95
+
96
+ reparaphrased_sentences_list = []
97
+
98
+ # Process the sentences in batches of 10
99
+ for i in range(0, len_reparaphrased_sentences, 10):
100
+ # Get the current batch of 10 sentences
101
+ batch = reparaphrased_sentences[i:i + 10]
102
+
103
+ # Check if the batch has exactly 10 sentences
104
+ if len(batch) == 10:
105
+ # Call the display_sentences function and store the result in the list
106
+ html_block = reparaphrased_sentences_html(batch)
107
+ reparaphrased_sentences_list.append(html_block)
108
+
109
+ distortion_list = []
110
+ detectability_list = []
111
+ euclidean_dist_list = []
112
+
113
+ distortion_calculator = SentenceDistortionCalculator(user_prompt, reparaphrased_sentences)
114
+ distortion_calculator.calculate_all_metrics()
115
+ distortion_calculator.normalize_metrics()
116
+ distortion_calculator.calculate_combined_distortion()
117
+
118
+ distortion = distortion_calculator.get_combined_distortions()
119
+
120
+ for each in distortion.items():
121
+ distortion_list.append(each[1])
122
+
123
+ detectability_calculator = SentenceDetectabilityCalculator(user_prompt, reparaphrased_sentences)
124
+ detectability_calculator.calculate_all_metrics()
125
+ detectability_calculator.normalize_metrics()
126
+ detectability_calculator.calculate_combined_detectability()
127
+
128
+ detectability = detectability_calculator.get_combined_detectabilities()
129
+
130
+ for each in detectability.items():
131
+ detectability_list.append(each[1])
132
+
133
+ euclidean_dist_calculator = SentenceEuclideanDistanceCalculator(user_prompt, reparaphrased_sentences)
134
+ euclidean_dist_calculator.calculate_all_metrics()
135
+ euclidean_dist_calculator.normalize_metrics()
136
+ euclidean_dist_calculator.get_normalized_metrics()
137
+
138
+ euclidean_dist = detectability_calculator.get_combined_detectabilities()
139
+
140
+ for each in euclidean_dist.items():
141
+ euclidean_dist_list.append(each[1])
142
+
143
+ three_D_plot = gen_three_D_plot(detectability_list, distortion_list, euclidean_dist_list)
144
+
145
+ return [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + trees1 + trees2 + reparaphrased_sentences_list + [three_D_plot]
146
 
147
 
148
  with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
 
186
  tree2 = gr.Plot()
187
  tree2_tabs.append(tree2)
188
 
189
+ # Adding the "Re-paraphrased Sentences" section
190
+ with gr.Row():
191
+ gr.Markdown("### Re-paraphrased Sentences") # Label for re-paraphrased sentences
192
+
193
+ # Adding tabs for the re-paraphrased sentences
194
+ with gr.Row():
195
+ with gr.Tabs():
196
+ reparaphrased_sentences_tabs = []
197
+ for i in range(120): # 120 tabs for 120 batches of sentences
198
+ with gr.TabItem(f"Sentence {i+1}"):
199
+ reparaphrased_sent_html = gr.HTML() # Placeholder for each batch
200
+ reparaphrased_sentences_tabs.append(reparaphrased_sent_html)
201
+
202
+ with gr.Row():
203
+ gr.Markdown("### 3D Plot for Sweet Spot")
204
+ with gr.Row():
205
+ three_D_plot = gr.Plot()
206
+
207
+
208
+ submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs + reparaphrased_sentences_tabs + [three_D_plot])
209
  clear_button.click(lambda: "", inputs=None, outputs=user_input)
210
+ clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs + reparaphrased_sentences_tabs + [three_D_plot])
211
 
212
  demo.launch(share=True)
detectability.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import nltk
3
+
4
+ import numpy as np
5
+ import torch
6
+ import matplotlib.pyplot as plt
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ from transformers import BertModel, BertTokenizer
9
+ from sentence_transformers import SentenceTransformer
10
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
11
+
12
+ # Download NLTK data if not already present
13
+ nltk.download('punkt', quiet=True)
14
+ detectability_val={}
15
class SentenceDetectabilityCalculator:
    """
    Score how detectable paraphrases are relative to one original sentence.

    Three similarity metrics are computed for every paraphrase: a smoothed
    BLEU score, the cosine similarity of mean-pooled BERT embeddings, and the
    cosine similarity of SentenceTransformer embeddings. Each metric is then
    min-max normalized across the paraphrase set and the three normalized
    values are combined with a root mean square.

    Intended call order:
        calculate_all_metrics() -> normalize_metrics() ->
        calculate_combined_detectability() -> get_combined_detectabilities()

    Every result dictionary is keyed "Sentence_<n>", numbered from 1 in the
    order of ``paraphrased_sentences``.
    """

    def __init__(self, original_sentence, paraphrased_sentences):
        """
        Store the inputs, create empty result containers and load the models.

        Args:
            original_sentence: The reference sentence (a string).
            paraphrased_sentences: Sequence of paraphrase strings to score.
        """
        self.original_sentence = original_sentence
        self.paraphrased_sentences = paraphrased_sentences

        # Raw metric dictionaries, filled by calculate_all_metrics().
        self.bleu_scores = {}
        self.cosine_similarities = {}
        self.sts_scores = {}

        # Normalized metric dictionaries, filled by normalize_metrics().
        self.normalized_bleu = {}
        self.normalized_cosine = {}
        self.normalized_sts = {}

        # Combined detectability, filled by calculate_combined_detectability().
        self.combined_detectabilities = {}

        # Pre-trained BERT backs the cosine-similarity metric and the
        # SentenceTransformer backs the STS metric.
        # NOTE(review): the weights are loaded for every instance; if this
        # class is instantiated per request, consider sharing the models.
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.sts_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def calculate_all_metrics(self):
        """
        Calculate the three raw metrics for each paraphrased sentence.

        The embeddings of the original sentence are computed once and reused
        for every paraphrase.
        """
        original_embedding = self._get_sentence_embedding(self.original_sentence)
        sts_original_embedding = self.sts_model.encode(self.original_sentence)

        for idx, paraphrased_sentence in enumerate(self.paraphrased_sentences, start=1):
            key = f"Sentence_{idx}"

            # BLEU score against the original sentence.
            self.bleu_scores[key] = self._calculate_bleu(
                self.original_sentence, paraphrased_sentence
            )

            # Cosine similarity of mean-pooled BERT embeddings.
            paraphrase_embedding = self._get_sentence_embedding(paraphrased_sentence)
            self.cosine_similarities[key] = cosine_similarity(
                [original_embedding], [paraphrase_embedding]
            )[0][0]

            # STS score: cosine similarity of SentenceTransformer embeddings.
            sts_paraphrase_embedding = self.sts_model.encode(paraphrased_sentence)
            self.sts_scores[key] = cosine_similarity(
                [sts_original_embedding], [sts_paraphrase_embedding]
            )[0][0]

    def normalize_metrics(self):
        """
        Min-max normalize every raw metric to the range 0..1.
        """
        self.normalized_bleu = self._normalize_dict(self.bleu_scores)
        self.normalized_cosine = self._normalize_dict(self.cosine_similarities)
        self.normalized_sts = self._normalize_dict(self.sts_scores)

    def calculate_combined_detectability(self):
        """
        Combine the normalized metrics with a root mean square per sentence.

        Requires normalize_metrics() to have run; with no sentences this is a
        no-op and the combined dictionary stays empty.
        """
        for key in self.normalized_bleu:
            rms = np.sqrt(
                (
                    self.normalized_bleu[key] ** 2 +
                    self.normalized_cosine[key] ** 2 +
                    self.normalized_sts[key] ** 2
                ) / 3
            )
            self.combined_detectabilities[key] = rms

    def plot_metrics(self):
        """
        Plot each normalized metric and the combined detectability.

        One matplotlib figure is shown per metric, in a random line color.
        """
        keys = list(self.normalized_bleu)
        indices = np.arange(len(keys))

        # One series per figure, in the order the figures are shown.
        metrics = {
            'BLEU Score': [self.normalized_bleu[key] for key in keys],
            'Cosine Similarity': [self.normalized_cosine[key] for key in keys],
            'STS Score': [self.normalized_sts[key] for key in keys],
            'Combined Detectability': [self.combined_detectabilities[key] for key in keys]
        }

        for metric_name, values in metrics.items():
            plt.figure(figsize=(12, 6))
            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
            plt.xlabel('Sentence Index')
            plt.ylabel('Normalized Value (0-1)')
            plt.title(f'Normalized {metric_name}')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

    # Private methods for metric calculations
    def _calculate_bleu(self, reference, candidate):
        """
        Return the smoothed sentence-level BLEU score of candidate vs reference.

        Both strings are tokenized with nltk.word_tokenize; smoothing method 1
        is applied.
        """
        reference_tokens = nltk.word_tokenize(reference)
        candidate_tokens = nltk.word_tokenize(candidate)
        smoothing = SmoothingFunction().method1
        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)

    def _get_sentence_embedding(self, sentence):
        """
        Return a sentence embedding: the mean of BERT's last hidden states.

        Input is truncated to 512 tokens and gradients are disabled.
        """
        tokens = self.bert_tokenizer(
            sentence, return_tensors='pt', padding=True, truncation=True, max_length=512
        )
        with torch.no_grad():
            outputs = self.bert_model(**tokens)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def _normalize_dict(self, metric_dict):
        """
        Min-max normalize the values of a dictionary to the range 0..1.

        Args:
            metric_dict: Mapping of sentence key to a numeric metric value.

        Returns:
            A new dict with the same keys in the same order. An empty input
            yields an empty dict; if all values are equal, every value is 0.
        """
        # An empty paraphrase list leaves the metric dicts empty; min()/max()
        # of a zero-size array would raise ValueError, so short-circuit.
        if not metric_dict:
            return {}

        values = np.array(list(metric_dict.values()))
        min_val = values.min()
        value_range = values.max() - min_val
        # Avoid division by zero if all values are the same.
        if value_range == 0:
            normalized_values = np.zeros_like(values)
        else:
            normalized_values = (values - min_val) / value_range
        return dict(zip(metric_dict, normalized_values))

    # Getter methods
    def get_normalized_metrics(self):
        """
        Return the three normalized metric dictionaries keyed by metric name.
        """
        return {
            'BLEU Score': self.normalized_bleu,
            'Cosine Similarity': self.normalized_cosine,
            'STS Score': self.normalized_sts
        }

    def get_combined_detectabilities(self):
        """
        Return the dictionary of combined detectability values.
        """
        return self.combined_detectabilities
163
+
164
+
165
+ # Example usage
166
+ if __name__ == "__main__":
167
+ # Original sentence
168
+ original_sentence = "The quick brown fox jumps over the lazy dog"
169
+
170
+ # Paraphrased sentences
171
+ paraphrased_sentences = [
172
+ # Original 1: "A swift auburn fox leaps across a sleepy canine."
173
+ "The swift auburn fox leaps across a sleepy canine.",
174
+ "A quick auburn fox leaps across a sleepy canine.",
175
+ "A swift ginger fox leaps across a sleepy canine.",
176
+ "A swift auburn fox bounds across a sleepy canine.",
177
+ "A swift auburn fox leaps across a tired canine.",
178
+ "Three swift auburn foxes leap across a sleepy canine.",
179
+ "The vulpine specimen rapidly traverses over a dormant dog.",
180
+ "Like lightning, the russet hunter soars over the drowsy guardian.",
181
+ "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
182
+ "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
183
+ "A swift auburn predator navigates across a lethargic pet.",
184
+ "Subject A (fox) demonstrates velocity over Subject B (dog).",
185
+
186
+ # Original 2: "The agile russet fox bounds over an idle hound."
187
+ "Some agile russet foxes bound over an idle hound.",
188
+ "The nimble russet fox bounds over an idle hound.",
189
+ "The agile brown fox bounds over an idle hound.",
190
+ "The agile russet fox jumps over an idle hound.",
191
+ "The agile russet fox bounds over a lazy hound.",
192
+ "Two agile russet foxes bound over an idle hound.",
193
+ "A dexterous vulpine surpasses a stationary canine.",
194
+ "Quick as thought, the copper warrior sails over the guardian.",
195
+ "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
196
+ "A dexterous V. vulpes exceeds the plane of an inactive canine.",
197
+ "An agile russet hunter maneuvers above a resting hound.",
198
+ "Test subject F-1 achieves displacement superior to subject D-1.",
199
+
200
+ # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
201
+ "The nimble mahogany vulpine vaults above a drowsy dog.",
202
+ "A swift mahogany vulpine vaults above a drowsy dog.",
203
+ "A nimble reddish vulpine vaults above a drowsy dog.",
204
+ "A nimble mahogany fox vaults above a drowsy dog.",
205
+ "A nimble mahogany vulpine leaps above a drowsy dog.",
206
+ "Four nimble mahogany vulpines vault above a drowsy dog.",
207
+ "An agile specimen of reddish fur surpasses a somnolent canine.",
208
+ "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
209
+ "Tha quick brown beastie jumps o'er the tired pup, aye.",
210
+ "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
211
+ "A nimble rust-colored predator crosses above a drowsy pet.",
212
+ "Observed: Subject Red executes vertical motion over Subject Gray.",
213
+
214
+ # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
215
+ "A speedy copper-colored fox hops over the lethargic pup.",
216
+ "The quick copper-colored fox hops over the lethargic pup.",
217
+ "The speedy bronze fox hops over the lethargic pup.",
218
+ "The speedy copper-colored fox jumps over the lethargic pup.",
219
+ "The speedy copper-colored fox hops over the tired pup.",
220
+ "Multiple speedy copper-colored foxes hop over the lethargic pup.",
221
+ "A rapid vulpine of bronze hue traverses an inactive young canine.",
222
+ "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
223
+ "Tha fast copper beastie leaps o'er the sleepy wee dog.",
224
+ "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
225
+ "A fleet copper-toned predator moves past a sluggish young dog.",
226
+ "Field note: Adult fox subject exceeds puppy subject vertically.",
227
+
228
+ # Original 5: "A rapid tawny fox springs over a sluggish dog."
229
+ "The rapid tawny fox springs over a sluggish dog.",
230
+ "A quick tawny fox springs over a sluggish dog.",
231
+ "A rapid golden fox springs over a sluggish dog.",
232
+ "A rapid tawny fox jumps over a sluggish dog.",
233
+ "A rapid tawny fox springs over a lazy dog.",
234
+ "Six rapid tawny foxes spring over a sluggish dog.",
235
+ "An expeditious yellowish vulpine surpasses a torpid canine.",
236
+ "Fast as a bullet, the golden hunter vaults over the idle guard.",
237
+ "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
238
+ "One V. vulpes displays rapid transit over one inactive C. familiaris.",
239
+ "A speedy yellow-brown predator bypasses a motionless dog.",
240
+ "Log entry: Vulpine subject achieves swift vertical displacement.",
241
+
242
+ # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
243
+ "A fleet-footed chestnut fox soars above an indolent canine.",
244
+ "The swift chestnut fox soars above an indolent canine.",
245
+ "The fleet-footed brown fox soars above an indolent canine.",
246
+ "The fleet-footed chestnut fox leaps above an indolent canine.",
247
+ "The fleet-footed chestnut fox soars above a lazy canine.",
248
+ "Several fleet-footed chestnut foxes soar above an indolent canine.",
249
+ "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
250
+ "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
251
+ "Tha quick brown beastie sails o'er the sleepy hound, ken.",
252
+ "Single agile V. vulpes achieves elevation above stationary canine.",
253
+ "A nimble brown predator glides over an unmoving domestic animal.",
254
+ "Research note: Brown subject displays superior vertical mobility.",
255
+
256
+ # Original 7: "A fast ginger fox hurdles past a slothful dog."
257
+ "The fast ginger fox hurdles past a slothful dog.",
258
+ "A quick ginger fox hurdles past a slothful dog.",
259
+ "A fast red fox hurdles past a slothful dog.",
260
+ "A fast ginger fox jumps past a slothful dog.",
261
+ "A fast ginger fox hurdles past a lazy dog.",
262
+ "Five fast ginger foxes hurdle past a slothful dog.",
263
+ "A rapid orange vulpine bypasses a lethargic canine.",
264
+ "Quick as lightning, the flame-colored hunter races past the lazy guard.",
265
+ "Tha swift ginger beastie leaps past the tired doggy, ye see.",
266
+ "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
267
+ "A speedy red-orange predator overtakes a motionless dog.",
268
+ "Data point: Orange subject demonstrates rapid transit past Gray subject.",
269
+
270
+ # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
271
+ "A spry rusty-colored fox jumps across a dozing hound.",
272
+ "The agile rusty-colored fox jumps across a dozing hound.",
273
+ "The spry reddish fox jumps across a dozing hound.",
274
+ "The spry rusty-colored fox leaps across a dozing hound.",
275
+ "The spry rusty-colored fox jumps across a sleeping hound.",
276
+ "Multiple spry rusty-colored foxes jump across a dozing hound.",
277
+ "An agile rust-toned vulpine traverses a somnolent canine.",
278
+ "Nimble as thought, the copper hunter bounds over the resting guard.",
279
+ "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
280
+ "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
281
+ "A lithe rust-tinted predator moves past a slumbering dog.",
282
+ "Observation: Russet subject exhibits agility over dormant subject.",
283
+
284
+ # Original 9: "A quick tan fox leaps over an inactive dog."
285
+ "The quick tan fox leaps over an inactive dog.",
286
+ "A swift tan fox leaps over an inactive dog.",
287
+ "A quick beige fox leaps over an inactive dog.",
288
+ "A quick tan fox jumps over an inactive dog.",
289
+ "A quick tan fox leaps over a motionless dog.",
290
+ "Seven quick tan foxes leap over an inactive dog.",
291
+ "A rapid light-brown vulpine surpasses a stationary canine.",
292
+ "Fast as wind, the sand-colored hunter soars over the still guard.",
293
+ "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
294
+ "One agile fawn V. vulpes traverses one immobile C. familiaris.",
295
+ "A fleet tan-colored predator bypasses an unmoving dog.",
296
+ "Field report: Tan subject demonstrates movement over static subject.",
297
+
298
+ # Original 10: "The brisk auburn vulpine bounces over a listless canine."
299
+ "Some brisk auburn vulpines bounce over a listless canine.",
300
+ "The quick auburn vulpine bounces over a listless canine.",
301
+ "The brisk russet vulpine bounces over a listless canine.",
302
+ "The brisk auburn fox bounces over a listless canine.",
303
+ "The brisk auburn vulpine jumps over a listless canine.",
304
+ "Five brisk auburn vulpines bounce over a listless canine.",
305
+ "The expeditious specimen supersedes a quiescent Canis lupus.",
306
+ "Swift as wind, the russet hunter vaults over the idle guardian.",
307
+ "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
308
+ "One V. vulpes achieves displacement over inactive C. familiaris.",
309
+ "A high-velocity auburn predator traverses an immobile animal.",
310
+ "Final observation: Red subject shows mobility over Gray subject."
311
+ ]
312
+
313
+
314
+ # Initialize the calculator
315
+ calculator = SentenceDetectabilityCalculator(original_sentence, paraphrased_sentences)
316
+
317
+ # Calculate all metrics
318
+ calculator.calculate_all_metrics()
319
+
320
+ # Normalize the metrics
321
+ calculator.normalize_metrics()
322
+
323
+ # Calculate combined detectability
324
+ calculator.calculate_combined_detectability()
325
+
326
+ # Retrieve the normalized metrics and combined detectabilities
327
+ normalized_metrics = calculator.get_normalized_metrics()
328
+ combined_detectabilities = calculator.get_combined_detectabilities()
329
+ detectability_val=combined_detectabilities
330
+
331
+ # Display the results
332
+ # print("Normalized Metrics:")
333
+ # for metric_name, metric_dict in normalized_metrics.items():
334
+ # print(f"\n{metric_name}:")
335
+ # for key, value in metric_dict.items():
336
+ # print(f"{key}: {value:.4f}")
337
+
338
+ print("\nCombined Detectabilities:")
339
+ for each in combined_detectabilities.items():
340
+ print(f"{each[1]}")
341
+
342
+ # Plot the metrics
343
+ # calculator.plot_metrics()
distortion.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import nltk
3
+ import numpy as np
4
+ import torch
5
+ import matplotlib.pyplot as plt
6
+ from scipy.special import rel_entr
7
+ from collections import Counter
8
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
9
+ distortion_val={}
10
+ # Download NLTK data if not already present
11
+ nltk.download('punkt', quiet=True)
12
+
13
+ class SentenceDistortionCalculator:
14
+ """
15
+ A class to calculate and analyze distortion metrics between an original sentence and modified sentences.
16
+ """
17
+
18
def __init__(self, original_sentence, modified_sentences):
    """
    Store the sentences, create empty result containers and load GPT-2.

    Args:
        original_sentence: The reference sentence.
        modified_sentences: The sentences to compare against the reference.
    """
    # Inputs to compare.
    self.original_sentence = original_sentence
    self.modified_sentences = modified_sentences

    # Raw per-sentence metrics, filled by calculate_all_metrics().
    self.levenshtein_distances = {}
    self.word_level_changes = {}
    self.kl_divergences = {}
    self.perplexities = {}

    # 0..1 rescaled counterparts, filled by normalize_metrics().
    self.normalized_levenshtein = {}
    self.normalized_word_changes = {}
    self.normalized_kl_divergences = {}
    self.normalized_perplexities = {}

    # Final per-sentence score, filled by calculate_combined_distortion().
    self.combined_distortions = {}

    # GPT-2 tokenizer and language model used for the perplexity metric.
    self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    self.model = GPT2LMHeadModel.from_pretrained("gpt2")
    # Inference only: switch the model to evaluation mode.
    self.model.eval()
44
+
45
def calculate_all_metrics(self):
    """
    Fill the four raw metric dictionaries, one entry per modified sentence.

    Entries are keyed "Sentence_<n>", numbered from 1 in list order.
    """
    for position, sentence in enumerate(self.modified_sentences, start=1):
        label = f"Sentence_{position}"
        self.levenshtein_distances[label] = self._calculate_levenshtein_distance(sentence)
        self.word_level_changes[label] = self._calculate_word_level_change(sentence)
        self.kl_divergences[label] = self._calculate_kl_divergence(sentence)
        self.perplexities[label] = self._calculate_perplexity(sentence)
55
+
56
def normalize_metrics(self):
    """
    Rescale every raw metric dictionary to the range 0..1.

    Each raw dictionary is passed through self._normalize_dict and the result
    is stored in the matching ``normalized_*`` attribute.
    """
    rescale = self._normalize_dict
    self.normalized_levenshtein = rescale(self.levenshtein_distances)
    self.normalized_word_changes = rescale(self.word_level_changes)
    self.normalized_kl_divergences = rescale(self.kl_divergences)
    self.normalized_perplexities = rescale(self.perplexities)
64
+
65
def calculate_combined_distortion(self):
    """
    Combine the four normalized metrics into one score per sentence.

    The score is the root mean square of the normalized Levenshtein, word
    change, KL divergence and perplexity values, stored in
    self.combined_distortions under the same sentence key.
    """
    for label, levenshtein in self.normalized_levenshtein.items():
        sum_of_squares = (
            levenshtein ** 2 +
            self.normalized_word_changes[label] ** 2 +
            self.normalized_kl_divergences[label] ** 2 +
            self.normalized_perplexities[label] ** 2
        )
        self.combined_distortions[label] = np.sqrt(sum_of_squares / 4)
79
+
80
def plot_metrics(self):
    """
    Show one figure per normalized metric plus one for the combined score.

    Each figure plots the metric value against the sentence index, drawn in
    a randomly chosen color.
    """
    import matplotlib.pyplot as plt

    labels = list(self.normalized_levenshtein.keys())
    x_positions = np.arange(len(labels))

    # (figure title, source dictionary) in display order.
    sources = (
        ('Levenshtein Distance', self.normalized_levenshtein),
        ('Word-Level Changes', self.normalized_word_changes),
        ('KL Divergence', self.normalized_kl_divergences),
        ('Perplexity', self.normalized_perplexities),
        ('Combined Distortion', self.combined_distortions),
    )
    # Collect every series before drawing, so a missing key fails up front.
    series = {
        title: [table[label] for label in labels]
        for title, table in sources
    }

    for title, y_values in series.items():
        plt.figure(figsize=(12, 6))
        plt.plot(x_positions, y_values, marker='o', color=np.random.rand(3,))
        plt.xlabel('Sentence Index')
        plt.ylabel('Normalized Value (0-1)')
        plt.title(f'Normalized {title}')
        plt.grid(True)
        plt.tight_layout()
        plt.show()
108
+
109
+ # Private methods for metric calculations
110
def _calculate_levenshtein_distance(self, modified_sentence):
    """
    Return nltk.edit_distance between the original and the modified sentence.
    """
    reference = self.original_sentence
    distance = nltk.edit_distance(reference, modified_sentence)
    return distance
115
+
116
+ def _calculate_word_level_change(self, modified_sentence):
117
+ """
118
+ Calculate the proportion of word-level changes between the original and modified sentence.
119
+ """
120
+ original_words = self.original_sentence.split()
121
+ modified_words = modified_sentence.split()
122
+ total_words = max(len(original_words), len(modified_words))
123
+ changed_words = sum(1 for o, m in zip(original_words, modified_words) if o != m)
124
+ # Account for extra words in the modified sentence
125
+ changed_words += abs(len(original_words) - len(modified_words))
126
+ distortion = changed_words / total_words
127
+ return distortion
128
+
129
+ def _calculate_kl_divergence(self, modified_sentence):
130
+ """
131
+ Calculate the KL Divergence between the word distributions of the original and modified sentence.
132
+ """
133
+ original_counts = Counter(self.original_sentence.lower().split())
134
+ modified_counts = Counter(modified_sentence.lower().split())
135
+ all_words = set(original_counts.keys()).union(set(modified_counts.keys()))
136
+ original_probs = np.array([original_counts.get(word, 0) for word in all_words], dtype=float)
137
+ modified_probs = np.array([modified_counts.get(word, 0) for word in all_words], dtype=float)
138
+
139
+ # Add smoothing to avoid division by zero
140
+ original_probs += 1e-10
141
+ modified_probs += 1e-10
142
+
143
+ # Normalize to create probability distributions
144
+ original_probs /= original_probs.sum()
145
+ modified_probs /= modified_probs.sum()
146
+
147
+ kl_divergence = np.sum(rel_entr(original_probs, modified_probs))
148
+ return kl_divergence
149
+
150
+ def _calculate_perplexity(self, sentence):
151
+ """
152
+ Calculate the perplexity of a sentence using GPT-2.
153
+ """
154
+ encodings = self.tokenizer(sentence, return_tensors='pt')
155
+ max_length = self.model.config.n_positions
156
+ stride = max_length
157
+
158
+ lls = []
159
+ for i in range(0, encodings.input_ids.size(1), stride):
160
+ begin_loc = i
161
+ end_loc = min(i + stride, encodings.input_ids.size(1))
162
+ trg_len = end_loc - begin_loc
163
+
164
+ input_ids = encodings.input_ids[:, begin_loc:end_loc]
165
+ target_ids = input_ids.clone()
166
+
167
+ with torch.no_grad():
168
+ outputs = self.model(input_ids, labels=target_ids)
169
+ log_likelihood = outputs.loss * trg_len
170
+
171
+ lls.append(log_likelihood)
172
+
173
+ ppl = torch.exp(torch.stack(lls).sum() / end_loc)
174
+ return ppl.item()
175
+
176
+ def _normalize_dict(self, metric_dict):
177
+ """
178
+ Normalize the values in a dictionary to be between 0 and 1.
179
+ """
180
+ values = np.array(list(metric_dict.values()))
181
+ min_val = values.min()
182
+ max_val = values.max()
183
+ # Avoid division by zero if all values are the same
184
+ if max_val - min_val == 0:
185
+ normalized_values = np.zeros_like(values)
186
+ else:
187
+ normalized_values = (values - min_val) / (max_val - min_val)
188
+ return dict(zip(metric_dict.keys(), normalized_values))
189
+
190
+ # Getter methods
191
def get_normalized_metrics(self):
    """
    Return the four normalized metric dictionaries keyed by display name.
    """
    display_names = (
        'Levenshtein Distance',
        'Word-Level Changes',
        'KL Divergence',
        'Perplexity',
    )
    tables = (
        self.normalized_levenshtein,
        self.normalized_word_changes,
        self.normalized_kl_divergences,
        self.normalized_perplexities,
    )
    return dict(zip(display_names, tables))
201
+
202
def get_combined_distortions(self):
    """
    Return the combined-distortion dictionary (the stored object, not a copy).
    """
    distortions = self.combined_distortions
    return distortions
207
+
208
+ # # Example usage
209
+ # if __name__ == "__main__":
210
+ # # Original sentence
211
+ # original_sentence = "The quick brown fox jumps over the lazy dog"
212
+
213
+
214
+ # paraphrased_sentences = [
215
+ # # Original 1: "A swift auburn fox leaps across a sleepy canine."
216
+ # "The swift auburn fox leaps across a sleepy canine.",
217
+ # "A quick auburn fox leaps across a sleepy canine.",
218
+ # "A swift ginger fox leaps across a sleepy canine.",
219
+ # "A swift auburn fox bounds across a sleepy canine.",
220
+ # "A swift auburn fox leaps across a tired canine.",
221
+ # "Three swift auburn foxes leap across a sleepy canine.",
222
+ # "The vulpine specimen rapidly traverses over a dormant dog.",
223
+ # "Like lightning, the russet hunter soars over the drowsy guardian.",
224
+ # "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
225
+ # "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
226
+ # "A swift auburn predator navigates across a lethargic pet.",
227
+ # "Subject A (fox) demonstrates velocity over Subject B (dog).",
228
+
229
+ # # Original 2: "The agile russet fox bounds over an idle hound."
230
+ # "Some agile russet foxes bound over an idle hound.",
231
+ # "The nimble russet fox bounds over an idle hound.",
232
+ # "The agile brown fox bounds over an idle hound.",
233
+ # "The agile russet fox jumps over an idle hound.",
234
+ # "The agile russet fox bounds over a lazy hound.",
235
+ # "Two agile russet foxes bound over an idle hound.",
236
+ # "A dexterous vulpine surpasses a stationary canine.",
237
+ # "Quick as thought, the copper warrior sails over the guardian.",
238
+ # "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
239
+ # "A dexterous V. vulpes exceeds the plane of an inactive canine.",
240
+ # "An agile russet hunter maneuvers above a resting hound.",
241
+ # "Test subject F-1 achieves displacement superior to subject D-1.",
242
+
243
+ # # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
244
+ # "The nimble mahogany vulpine vaults above a drowsy dog.",
245
+ # "A swift mahogany vulpine vaults above a drowsy dog.",
246
+ # "A nimble reddish vulpine vaults above a drowsy dog.",
247
+ # "A nimble mahogany fox vaults above a drowsy dog.",
248
+ # "A nimble mahogany vulpine leaps above a drowsy dog.",
249
+ # "Four nimble mahogany vulpines vault above a drowsy dog.",
250
+ # "An agile specimen of reddish fur surpasses a somnolent canine.",
251
+ # "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
252
+ # "Tha quick brown beastie jumps o'er the tired pup, aye.",
253
+ # "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
254
+ # "A nimble rust-colored predator crosses above a drowsy pet.",
255
+ # "Observed: Subject Red executes vertical motion over Subject Gray.",
256
+
257
+ # # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
258
+ # "A speedy copper-colored fox hops over the lethargic pup.",
259
+ # "The quick copper-colored fox hops over the lethargic pup.",
260
+ # "The speedy bronze fox hops over the lethargic pup.",
261
+ # "The speedy copper-colored fox jumps over the lethargic pup.",
262
+ # "The speedy copper-colored fox hops over the tired pup.",
263
+ # "Multiple speedy copper-colored foxes hop over the lethargic pup.",
264
+ # "A rapid vulpine of bronze hue traverses an inactive young canine.",
265
+ # "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
266
+ # "Tha fast copper beastie leaps o'er the sleepy wee dog.",
267
+ # "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
268
+ # "A fleet copper-toned predator moves past a sluggish young dog.",
269
+ # "Field note: Adult fox subject exceeds puppy subject vertically.",
270
+
271
+ # # Original 5: "A rapid tawny fox springs over a sluggish dog."
272
+ # "The rapid tawny fox springs over a sluggish dog.",
273
+ # "A quick tawny fox springs over a sluggish dog.",
274
+ # "A rapid golden fox springs over a sluggish dog.",
275
+ # "A rapid tawny fox jumps over a sluggish dog.",
276
+ # "A rapid tawny fox springs over a lazy dog.",
277
+ # "Six rapid tawny foxes spring over a sluggish dog.",
278
+ # "An expeditious yellowish vulpine surpasses a torpid canine.",
279
+ # "Fast as a bullet, the golden hunter vaults over the idle guard.",
280
+ # "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
281
+ # "One V. vulpes displays rapid transit over one inactive C. familiaris.",
282
+ # "A speedy yellow-brown predator bypasses a motionless dog.",
283
+ # "Log entry: Vulpine subject achieves swift vertical displacement.",
284
+
285
+ # # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
286
+ # "A fleet-footed chestnut fox soars above an indolent canine.",
287
+ # "The swift chestnut fox soars above an indolent canine.",
288
+ # "The fleet-footed brown fox soars above an indolent canine.",
289
+ # "The fleet-footed chestnut fox leaps above an indolent canine.",
290
+ # "The fleet-footed chestnut fox soars above a lazy canine.",
291
+ # "Several fleet-footed chestnut foxes soar above an indolent canine.",
292
+ # "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
293
+ # "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
294
+ # "Tha quick brown beastie sails o'er the sleepy hound, ken.",
295
+ # "Single agile V. vulpes achieves elevation above stationary canine.",
296
+ # "A nimble brown predator glides over an unmoving domestic animal.",
297
+ # "Research note: Brown subject displays superior vertical mobility.",
298
+
299
+ # # Original 7: "A fast ginger fox hurdles past a slothful dog."
300
+ # "The fast ginger fox hurdles past a slothful dog.",
301
+ # "A quick ginger fox hurdles past a slothful dog.",
302
+ # "A fast red fox hurdles past a slothful dog.",
303
+ # "A fast ginger fox jumps past a slothful dog.",
304
+ # "A fast ginger fox hurdles past a lazy dog.",
305
+ # "Five fast ginger foxes hurdle past a slothful dog.",
306
+ # "A rapid orange vulpine bypasses a lethargic canine.",
307
+ # "Quick as lightning, the flame-colored hunter races past the lazy guard.",
308
+ # "Tha swift ginger beastie leaps past the tired doggy, ye see.",
309
+ # "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
310
+ # "A speedy red-orange predator overtakes a motionless dog.",
311
+ # "Data point: Orange subject demonstrates rapid transit past Gray subject.",
312
+
313
+ # # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
314
+ # "A spry rusty-colored fox jumps across a dozing hound.",
315
+ # "The agile rusty-colored fox jumps across a dozing hound.",
316
+ # "The spry reddish fox jumps across a dozing hound.",
317
+ # "The spry rusty-colored fox leaps across a dozing hound.",
318
+ # "The spry rusty-colored fox jumps across a sleeping hound.",
319
+ # "Multiple spry rusty-colored foxes jump across a dozing hound.",
320
+ # "An agile rust-toned vulpine traverses a somnolent canine.",
321
+ # "Nimble as thought, the copper hunter bounds over the resting guard.",
322
+ # "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
323
+ # "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
324
+ # "A lithe rust-tinted predator moves past a slumbering dog.",
325
+ # "Observation: Russet subject exhibits agility over dormant subject.",
326
+
327
+ # # Original 9: "A quick tan fox leaps over an inactive dog."
328
+ # "The quick tan fox leaps over an inactive dog.",
329
+ # "A swift tan fox leaps over an inactive dog.",
330
+ # "A quick beige fox leaps over an inactive dog.",
331
+ # "A quick tan fox jumps over an inactive dog.",
332
+ # "A quick tan fox leaps over a motionless dog.",
333
+ # "Seven quick tan foxes leap over an inactive dog.",
334
+ # "A rapid light-brown vulpine surpasses a stationary canine.",
335
+ # "Fast as wind, the sand-colored hunter soars over the still guard.",
336
+ # "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
337
+ # "One agile fawn V. vulpes traverses one immobile C. familiaris.",
338
+ # "A fleet tan-colored predator bypasses an unmoving dog.",
339
+ # "Field report: Tan subject demonstrates movement over static subject.",
340
+
341
+ # # Original 10: "The brisk auburn vulpine bounces over a listless canine."
342
+ # "Some brisk auburn vulpines bounce over a listless canine.",
343
+ # "The quick auburn vulpine bounces over a listless canine.",
344
+ # "The brisk russet vulpine bounces over a listless canine.",
345
+ # "The brisk auburn fox bounces over a listless canine.",
346
+ # "The brisk auburn vulpine jumps over a listless canine.",
347
+ # "Five brisk auburn vulpines bounce over a listless canine.",
348
+ # "The expeditious specimen supersedes a quiescent Canis lupus.",
349
+ # "Swift as wind, the russet hunter vaults over the idle guardian.",
350
+ # "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
351
+ # "One V. vulpes achieves displacement over inactive C. familiaris.",
352
+ # "A high-velocity auburn predator traverses an immobile animal.",
353
+ # "Final observation: Red subject shows mobility over Gray subject."
354
+ # ]
355
+
356
+
357
+ # # Initialize the calculator
358
+ # calculator = SentenceDistortionCalculator(original_sentence, paraphrased_sentences)
359
+
360
+ # # Calculate all metrics
361
+ # calculator.calculate_all_metrics()
362
+
363
+ # # Normalize the metrics
364
+ # calculator.normalize_metrics()
365
+
366
+ # # Calculate combined distortion
367
+ # calculator.calculate_combined_distortion()
368
+
369
+ # # Retrieve the normalized metrics and combined distortions
370
+ # normalized_metrics = calculator.get_normalized_metrics()
371
+ # combined_distortions = calculator.get_combined_distortions()
372
+ # distortion_val=combined_distortions
373
+ # # Display the results
374
+ # print("Normalized Metrics:")
375
+ # for metric_name, metric_dict in normalized_metrics.items():
376
+ # print(f"\n{metric_name}:")
377
+ # for key, value in metric_dict.items():
378
+ # print(f"{key}: {value:.4f}")
379
+
380
+ # print("\nCombined Distortions:")
381
+ # for key, value in combined_distortions.items():
382
+ # print(f"{key}: {value:.4f}")
383
+
384
+ # # Plot the metrics
385
+ # calculator.plot_metrics()
euclidean_distance.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import euclidean_distances
6
+ euclidean_val={}
7
class SentenceEuclideanDistanceCalculator:
    """
    Calculate and analyze the Euclidean distance between the embedding of an
    original sentence and the embeddings of paraphrased sentences.

    Typical use: calculate_all_metrics(), then normalize_metrics(), then
    get_normalized_metrics() and/or plot_metrics().
    """

    def __init__(self, original_sentence, paraphrased_sentences):
        """
        Initialize the calculator with the original sentence and a list of paraphrased sentences.

        Loads the 'sentence-transformers/all-MiniLM-L6-v2' embedding model.
        """
        self.original_sentence = original_sentence
        self.paraphrased_sentences = paraphrased_sentences

        # Raw distances, keyed "Sentence_<n>" (1-based).
        self.euclidean_distances = {}

        # Min-max normalized distances, same keys.
        self.normalized_euclidean = {}

        # Load SentenceTransformer model for embedding calculation
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    def calculate_all_metrics(self):
        """
        Calculate Euclidean distance between the original and each paraphrased sentence.

        Results are stored in self.euclidean_distances.
        """
        # The original embedding is the same for every comparison; compute it once.
        original_embedding = self._get_sentence_embedding(self.original_sentence)

        for idx, paraphrased_sentence in enumerate(self.paraphrased_sentences, start=1):
            key = f"Sentence_{idx}"
            paraphrase_embedding = self._get_sentence_embedding(paraphrased_sentence)
            # euclidean_distances returns a 1x1 matrix for two single vectors.
            self.euclidean_distances[key] = euclidean_distances(
                [original_embedding], [paraphrase_embedding]
            )[0][0]

    def normalize_metrics(self):
        """
        Normalize the Euclidean distances to be between 0 and 1.
        """
        self.normalized_euclidean = self._normalize_dict(self.euclidean_distances)

    def plot_metrics(self):
        """
        Plot the normalized Euclidean distances in a graph.
        """
        keys = list(self.normalized_euclidean.keys())
        indices = np.arange(len(keys))

        plt.figure(figsize=(12, 6))
        plt.plot(
            indices,
            [self.normalized_euclidean[key] for key in keys],
            marker='o',
            color=np.random.rand(3,),
        )
        plt.xlabel('Sentence Index')
        plt.ylabel('Normalized Euclidean Distance (0-1)')
        plt.title('Normalized Euclidean Distance')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    # Private methods for metric calculations
    def _get_sentence_embedding(self, sentence):
        """
        Get sentence embedding using the SentenceTransformer model.
        """
        return self.model.encode(sentence)

    def _normalize_dict(self, metric_dict):
        """
        Min-max normalize the values in a dictionary to be between 0 and 1.

        Returns a new dict with the same keys. If all values are equal every
        entry is 0. An empty input yields an empty dict.
        """
        # min()/max() of an empty array raise ValueError; nothing to normalize.
        if not metric_dict:
            return {}

        values = np.array(list(metric_dict.values()))
        min_val = values.min()
        max_val = values.max()
        # Avoid division by zero if all values are the same
        if max_val - min_val == 0:
            normalized_values = np.zeros_like(values)
        else:
            normalized_values = (values - min_val) / (max_val - min_val)
        return dict(zip(metric_dict.keys(), normalized_values))

    # Getter methods
    def get_normalized_metrics(self):
        """
        Get the normalized Euclidean distances as a dictionary.
        """
        return self.normalized_euclidean
91
+
92
+
93
+ # # Example usage
94
+ # if __name__ == "__main__":
95
+ # # Original sentence
96
+ # original_sentence = "The quick brown fox jumps over the lazy dog"
97
+
98
+ # # Paraphrased sentences
99
+ # paraphrased_sentences = [
100
+ # # Original 1: "A swift auburn fox leaps across a sleepy canine."
101
+ # "The swift auburn fox leaps across a sleepy canine.",
102
+ # "A quick auburn fox leaps across a sleepy canine.",
103
+ # "A swift ginger fox leaps across a sleepy canine.",
104
+ # "A swift auburn fox bounds across a sleepy canine.",
105
+ # "A swift auburn fox leaps across a tired canine.",
106
+ # "Three swift auburn foxes leap across a sleepy canine.",
107
+ # "The vulpine specimen rapidly traverses over a dormant dog.",
108
+ # "Like lightning, the russet hunter soars over the drowsy guardian.",
109
+ # "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
110
+ # "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
111
+ # "A swift auburn predator navigates across a lethargic pet.",
112
+ # "Subject A (fox) demonstrates velocity over Subject B (dog).",
113
+
114
+ # # Original 2: "The agile russet fox bounds over an idle hound."
115
+ # "Some agile russet foxes bound over an idle hound.",
116
+ # "The nimble russet fox bounds over an idle hound.",
117
+ # "The agile brown fox bounds over an idle hound.",
118
+ # "The agile russet fox jumps over an idle hound.",
119
+ # "The agile russet fox bounds over a lazy hound.",
120
+ # "Two agile russet foxes bound over an idle hound.",
121
+ # "A dexterous vulpine surpasses a stationary canine.",
122
+ # "Quick as thought, the copper warrior sails over the guardian.",
123
+ # "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
124
+ # "A dexterous V. vulpes exceeds the plane of an inactive canine.",
125
+ # "An agile russet hunter maneuvers above a resting hound.",
126
+ # "Test subject F-1 achieves displacement superior to subject D-1.",
127
+
128
+ # # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
129
+ # "The nimble mahogany vulpine vaults above a drowsy dog.",
130
+ # "A swift mahogany vulpine vaults above a drowsy dog.",
131
+ # "A nimble reddish vulpine vaults above a drowsy dog.",
132
+ # "A nimble mahogany fox vaults above a drowsy dog.",
133
+ # "A nimble mahogany vulpine leaps above a drowsy dog.",
134
+ # "Four nimble mahogany vulpines vault above a drowsy dog.",
135
+ # "An agile specimen of reddish fur surpasses a somnolent canine.",
136
+ # "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
137
+ # "Tha quick brown beastie jumps o'er the tired pup, aye.",
138
+ # "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
139
+ # "A nimble rust-colored predator crosses above a drowsy pet.",
140
+ # "Observed: Subject Red executes vertical motion over Subject Gray.",
141
+
142
+ # # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
143
+ # "A speedy copper-colored fox hops over the lethargic pup.",
144
+ # "The quick copper-colored fox hops over the lethargic pup.",
145
+ # "The speedy bronze fox hops over the lethargic pup.",
146
+ # "The speedy copper-colored fox jumps over the lethargic pup.",
147
+ # "The speedy copper-colored fox hops over the tired pup.",
148
+ # "Multiple speedy copper-colored foxes hop over the lethargic pup.",
149
+ # "A rapid vulpine of bronze hue traverses an inactive young canine.",
150
+ # "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
151
+ # "Tha fast copper beastie leaps o'er the sleepy wee dog.",
152
+ # "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
153
+ # "A fleet copper-toned predator moves past a sluggish young dog.",
154
+ # "Field note: Adult fox subject exceeds puppy subject vertically.",
155
+
156
+ # # Original 5: "A rapid tawny fox springs over a sluggish dog."
157
+ # "The rapid tawny fox springs over a sluggish dog.",
158
+ # "A quick tawny fox springs over a sluggish dog.",
159
+ # "A rapid golden fox springs over a sluggish dog.",
160
+ # "A rapid tawny fox jumps over a sluggish dog.",
161
+ # "A rapid tawny fox springs over a lazy dog.",
162
+ # "Six rapid tawny foxes spring over a sluggish dog.",
163
+ # "An expeditious yellowish vulpine surpasses a torpid canine.",
164
+ # "Fast as a bullet, the golden hunter vaults over the idle guard.",
165
+ # "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
166
+ # "One V. vulpes displays rapid transit over one inactive C. familiaris.",
167
+ # "A speedy yellow-brown predator bypasses a motionless dog.",
168
+ # "Log entry: Vulpine subject achieves swift vertical displacement.",
169
+
170
+ # # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
171
+ # "A fleet-footed chestnut fox soars above an indolent canine.",
172
+ # "The swift chestnut fox soars above an indolent canine.",
173
+ # "The fleet-footed brown fox soars above an indolent canine.",
174
+ # "The fleet-footed chestnut fox leaps above an indolent canine.",
175
+ # "The fleet-footed chestnut fox soars above a lazy canine.",
176
+ # "Several fleet-footed chestnut foxes soar above an indolent canine.",
177
+ # "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
178
+ # "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
179
+ # "Tha quick brown beastie sails o'er the sleepy hound, ken.",
180
+ # "Single agile V. vulpes achieves elevation above stationary canine.",
181
+ # "A nimble brown predator glides over an unmoving domestic animal.",
182
+ # "Research note: Brown subject displays superior vertical mobility.",
183
+
184
+ # # Original 7: "A fast ginger fox hurdles past a slothful dog."
185
+ # "The fast ginger fox hurdles past a slothful dog.",
186
+ # "A quick ginger fox hurdles past a slothful dog.",
187
+ # "A fast red fox hurdles past a slothful dog.",
188
+ # "A fast ginger fox jumps past a slothful dog.",
189
+ # "A fast ginger fox hurdles past a lazy dog.",
190
+ # "Five fast ginger foxes hurdle past a slothful dog.",
191
+ # "A rapid orange vulpine bypasses a lethargic canine.",
192
+ # "Quick as lightning, the flame-colored hunter races past the lazy guard.",
193
+ # "Tha swift ginger beastie leaps past the tired doggy, ye see.",
194
+ # "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
195
+ # "A speedy red-orange predator overtakes a motionless dog.",
196
+ # "Data point: Orange subject demonstrates rapid transit past Gray subject.",
197
+
198
+ # # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
199
+ # "A spry rusty-colored fox jumps across a dozing hound.",
200
+ # "The agile rusty-colored fox jumps across a dozing hound.",
201
+ # "The spry reddish fox jumps across a dozing hound.",
202
+ # "The spry rusty-colored fox leaps across a dozing hound.",
203
+ # "The spry rusty-colored fox jumps across a sleeping hound.",
204
+ # "Multiple spry rusty-colored foxes jump across a dozing hound.",
205
+ # "An agile rust-toned vulpine traverses a somnolent canine.",
206
+ # "Nimble as thought, the copper hunter bounds over the resting guard.",
207
+ # "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
208
+ # "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
209
+ # "A lithe rust-tinted predator moves past a slumbering dog.",
210
+ # "Observation: Russet subject exhibits agility over dormant subject.",
211
+
212
+ # # Original 9: "A quick tan fox leaps over an inactive dog."
213
+ # "The quick tan fox leaps over an inactive dog.",
214
+ # "A swift tan fox leaps over an inactive dog.",
215
+ # "A quick beige fox leaps over an inactive dog.",
216
+ # "A quick tan fox jumps over an inactive dog.",
217
+ # "A quick tan fox leaps over a motionless dog.",
218
+ # "Seven quick tan foxes leap over an inactive dog.",
219
+ # "A rapid light-brown vulpine surpasses a stationary canine.",
220
+ # "Fast as wind, the sand-colored hunter soars over the still guard.",
221
+ # "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
222
+ # "One agile fawn V. vulpes traverses one immobile C. familiaris.",
223
+ # "A fleet tan-colored predator bypasses an unmoving dog.",
224
+ # "Field report: Tan subject demonstrates movement over static subject.",
225
+
226
+ # # Original 10: "The brisk auburn vulpine bounces over a listless canine."
227
+ # "Some brisk auburn vulpines bounce over a listless canine.",
228
+ # "The quick auburn vulpine bounces over a listless canine.",
229
+ # "The brisk russet vulpine bounces over a listless canine.",
230
+ # "The brisk auburn fox bounces over a listless canine.",
231
+ # "The brisk auburn vulpine jumps over a listless canine.",
232
+ # "Five brisk auburn vulpines bounce over a listless canine.",
233
+ # "The expeditious specimen supersedes a quiescent Canis lupus.",
234
+ # "Swift as wind, the russet hunter vaults over the idle guardian.",
235
+ # "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
236
+ # "One V. vulpes achieves displacement over inactive C. familiaris.",
237
+ # "A high-velocity auburn predator traverses an immobile animal.",
238
+ # "Final observation: Red subject shows mobility over Gray subject."
239
+ # ]
240
+
241
+
242
+ # # Initialize the calculator
243
+ # calculator = SentenceEuclideanDistanceCalculator(original_sentence, paraphrased_sentences)
244
+
245
+ # # Calculate Euclidean distances
246
+ # calculator.calculate_all_metrics()
247
+
248
+ # # Normalize the distances
249
+ # calculator.normalize_metrics()
250
+
251
+ # # Retrieve the normalized Euclidean distances
252
+ # normalized_metrics = calculator.get_normalized_metrics()
253
+ # euclidean_val=normalized_metrics
254
+
255
+ # # Display the results
256
+ # print("Normalized Euclidean Distances:")
257
+ # for key, value in normalized_metrics.items():
258
+ # print(f"{key}: {value:.4f}")
259
+
260
+ # # Plot the metrics
261
+ # calculator.plot_metrics()
gpt_mask_filling.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+ openai.api_key = os.getenv("API_KEY")
8
+
9
+
10
+ # Takes a sentence containing [MASK] tokens and asks the model for a list of dicts, each mapping a masked position to a list of possible replacement words
11
def predict_masked_words(sentence, n_suggestions=5):
    """
    Ask the chat model for candidate fillers for every [MASK] in a sentence.

    Builds a few-shot prompt, sends it to gpt-3.5-turbo through
    openai.ChatCompletion.create, prints the model's reply and returns it.

    Args:
        sentence: Text containing one or more "[MASK]" placeholders.
        n_suggestions: Number of candidate words requested per mask.

    Returns:
        The raw reply text from the model. The prompt asks for a JSON-style
        list of dictionaries, but the reply is not parsed or validated here.
    """
    prompt = (
        f"Given a sentence with masked words, masked word can be one or more than one, indicated by [MASK], generate {n_suggestions} possible words to fill each mask. "
        # Use n_suggestions here too; this was hard-coded to 5 and contradicted the line above.
        f"Return the results as a list of dictionaries, where each dictionary key is a masked word and its value is a list of {n_suggestions} potential words to fill that mask.\n\n"
        "Example input: \"The [MASK] fox [MASK] over the [MASK] dog.\"\n\n"
        "Example output:\n"
        "[\n"
        " {\n"
        " \"[MASK]1\": [\"quick\", \"sly\", \"red\", \"clever\", \"sneaky\"]\n"
        " },\n"
        " {\n"
        " \"[MASK]2\": [\"jumped\", \"leaped\", \"hopped\", \"sprang\", \"bounded\"]\n"
        " },\n"
        " {\n"
        " \"[MASK]3\": [\"lazy\", \"sleeping\", \"brown\", \"tired\", \"old\"]\n"
        " }\n"
        "]\n\n"
        "Example input: \"The [MASK] [MASK] ran swiftly across the [MASK] field.\"\n\n"
        "Example output:\n"
        "[\n"
        " {\n"
        " \"[MASK]1\": [\"tall\", \"fierce\", \"young\", \"old\", \"beautiful\"]\n"
        " },\n"
        " {\n"
        " \"[MASK]2\": [\"lion\", \"tiger\", \"horse\", \"cheetah\", \"deer\"]\n"
        " },\n"
        " {\n"
        " \"[MASK]3\": [\"green\", \"wide\", \"sunny\", \"open\", \"empty\"]\n"
        " }\n"
        "]\n\n"
        "Example input: \"It was a [MASK] day when the train arrived at the station.\"\n\n"
        "Example output:\n"
        "[\n"
        " {\n"
        " \"[MASK]1\": [\"sunny\", \"rainy\", \"cloudy\", \"foggy\", \"stormy\"]\n"
        # No trailing comma after the last element, so the example is valid JSON.
        " }\n"
        "]\n\n"
        "Now, please process the following sentence:\n"
        f"{sentence}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        # NOTE(review): 100 tokens looks tight for several masks with
        # n_suggestions words each; the reply may be cut off - confirm.
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.7
    )

    content = response['choices'][0]['message']['content']
    # Keep printing for existing callers that rely on stdout.
    print(content)
    return content
67
+
68
+
69
+ # sentence = "Evacuations and storm [MASK] began on Sunday night as forecasters projected that Hurricane Dorian would hit into Florida’s west coast on Wednesday as a major hurricane packing life-threatening winds and storm surge."
70
+ # predict_masked_words(sentence, n_suggestions=5)
highlighter.py CHANGED
@@ -83,4 +83,22 @@ def highlight_common_words_dict(common_words, sentences, title):
83
  <h3 style="margin-top: 0; font-size: 1em; color: #111827;">{title}</h3>
84
  <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
85
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  '''
 
83
  <h3 style="margin-top: 0; font-size: 1em; color: #111827;">{title}</h3>
84
  <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
85
  </div>
86
+ '''
87
+
88
def reparaphrased_sentences_html(sentences):
    """
    Render sentences as a numbered list inside a styled HTML card.

    Each sentence is prefixed with its 1-based index and the entries are
    separated by two <br> tags.
    """
    numbered = [f"{number}. {text}" for number, text in enumerate(sentences, start=1)]
    final_html = "<br><br>".join(numbered)

    return f'''
    <div style="border: solid 1px #ccc; padding: 16px; background-color: #FFFFFF; color: #374151;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
        <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
    </div>
    '''
lcs.py CHANGED
@@ -4,7 +4,6 @@ from nltk.corpus import stopwords
4
  def find_common_subsequences(sentence, str_list):
5
  stop_words = set(stopwords.words('english'))
6
  sentence = sentence.lower()
7
-
8
  str_list = [s.lower() for s in str_list]
9
 
10
  def is_present(subseq, str_list):
@@ -17,17 +16,17 @@ def find_common_subsequences(sentence, str_list):
17
  filtered_words = [word for word in words if word.lower() not in stop_words]
18
  return " ".join(filtered_words)
19
 
20
- sentence = remove_stop_words_and_special_chars(sentence)
21
- str_list = [remove_stop_words_and_special_chars(s) for s in str_list]
22
 
23
- words = sentence.split()
24
  common_grams = []
25
  added_phrases = set()
26
 
27
- for n in range(5, 0, -1):
28
  for i in range(len(words) - n + 1):
29
- subseq = " ".join(words[i:i+n])
30
- if is_present(subseq, str_list) and not any(subseq in phrase for phrase in added_phrases):
31
  common_grams.append((i, subseq))
32
  added_phrases.add(subseq)
33
 
@@ -39,8 +38,62 @@ def find_common_subsequences(sentence, str_list):
39
 
40
  return indexed_common_grams
41
 
42
- # Example usage
43
- # sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time 5since the Holocaust” to be Jewish in the United States."
44
- # str_list = ['']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- # print(find_common_subsequences(sentence, str_list))
 
4
  def find_common_subsequences(sentence, str_list):
5
  stop_words = set(stopwords.words('english'))
6
  sentence = sentence.lower()
 
7
  str_list = [s.lower() for s in str_list]
8
 
9
  def is_present(subseq, str_list):
 
16
  filtered_words = [word for word in words if word.lower() not in stop_words]
17
  return " ".join(filtered_words)
18
 
19
+ cleaned_sentence = remove_stop_words_and_special_chars(sentence)
20
+ cleaned_str_list = [remove_stop_words_and_special_chars(s) for s in str_list]
21
 
22
+ words = cleaned_sentence.split()
23
  common_grams = []
24
  added_phrases = set()
25
 
26
+ for n in range(5, 0, -1): # Check n-grams from size 5 to 1
27
  for i in range(len(words) - n + 1):
28
+ subseq = " ".join(words[i:i + n])
29
+ if is_present(subseq, cleaned_str_list) and not any(subseq in phrase for phrase in added_phrases):
30
  common_grams.append((i, subseq))
31
  added_phrases.add(subseq)
32
 
 
38
 
39
  return indexed_common_grams
40
 
41
def find_common_gram_positions(str_list, common_grams):
    """Locate each common gram in every sentence.

    Punctuation is stripped and text is lower-cased before matching. A gram
    counts as present in a sentence when every one of its words occurs
    somewhere in that sentence (the words need not be adjacent); its position
    is the 1-based index of the first occurrence of the gram's first word.

    Args:
        str_list: Iterable of sentences (str).
        common_grams: Iterable of gram strings.

    Returns:
        A list with one entry per sentence; each entry is a list with one
        position per gram, or -1 when the gram is not present. Grams that are
        empty after cleaning are reported as -1.
    """
    # Clean the grams once up front: the result does not depend on the
    # sentence, and materialising also makes one-shot iterables safe to reuse.
    gram_word_lists = [
        re.sub(r'[^\w\s]', '', gram).lower().split() for gram in common_grams
    ]

    positions = []
    for sentence in str_list:
        words = re.sub(r'[^\w\s]', '', sentence).lower().split()

        # 1-based position of the first occurrence of every word.
        first_position = {}
        for idx, word in enumerate(words, start=1):
            first_position.setdefault(word, idx)

        sentence_positions = []
        for gram_words in gram_word_lists:
            # The emptiness guard prevents an IndexError on gram_words[0]:
            # all() over an empty list is True.
            if gram_words and all(word in first_position for word in gram_words):
                sentence_positions.append(first_position[gram_words[0]])
            else:
                sentence_positions.append(-1)  # Common gram not found

        positions.append(sentence_positions)

    return positions
73
+
74
+
75
+ # # Example usage
76
+ # sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."
77
+ # str_list = [
78
+ # 'During a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump stated that being Jewish in the United States has never been more hazardous since the Holocaust.',
79
+ # 'At a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump declared that being Jewish in the United States has never been more hazardous since the Holocaust.',
80
+ # 'Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania, and stated that being Jewish in the United States has never been more perilous since the Holocaust.',
81
+ # 'Donald Trump made the statement at a campaign rally in Wilkes-Barre, Pennsylvania, saying that being Jewish in the United States has never been more dangerous since the Holocaust.',
82
+ # 'Last month, Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania and stated that being Jewish in the United States has never been more hazardous than during World War II.',
83
+ # 'In Wilkes-Barre, Pennsylvania, Donald Trump spoke at a campaign rally and claimed that the Holocaust was always more hazardous for Jews in the United States.',
84
+ # 'A campaign rally in Wilkes-Barre, Pennsylvania saw Donald Trump declare that being Jewish in the United States has never been more perilous since WWII.',
85
+ # 'Speaking at a campaign rally in Wilkes-Barre, Pennsylvania today, Donald Trump declared that being Jewish has never been more hazardous in the United States since the Holocaust.',
86
+ # 'During his campaign rally in Wilkes-Barre, Pennsylvania today Donald Trump stated: "There has never been a safer place for being Jewish in the United States since the Holocaust."',
87
+ # 'At a campaign rally in Wilkes-Barre, Pennsylvania (pictured), Donald Trump said, "There has never been... gotten worse for being Jewish in America since the Holocaust."'
88
+ # ]
89
+
90
+ # # Find common subsequences
91
+ # common_grams = find_common_subsequences(sentence, str_list)
92
+ # # Extract the subsequences from the common grams for position checking
93
+ # subsequences = [subseq for _, subseq in common_grams]
94
+
95
+ # # Find positions of the common grams
96
+ # common_gram_positions = find_common_gram_positions(str_list, subsequences)
97
+
98
 
99
+ # print(common_grams)
masking_methods.py CHANGED
@@ -1,73 +1,31 @@
1
- # from transformers import AutoTokenizer, AutoModelForMaskedLM
2
- # from transformers import pipeline
3
- # import random
4
- # from nltk.corpus import stopwords
5
- # import math
6
-
7
- # # Masking Model
8
- # def mask_non_stopword(sentence):
9
- # stop_words = set(stopwords.words('english'))
10
- # words = sentence.split()
11
- # non_stop_words = [word for word in words if word.lower() not in stop_words]
12
- # if not non_stop_words:
13
- # return sentence
14
- # word_to_mask = random.choice(non_stop_words)
15
- # masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
16
- # return masked_sentence
17
-
18
- # def mask_non_stopword_pseudorandom(sentence):
19
- # stop_words = set(stopwords.words('english'))
20
- # words = sentence.split()
21
- # non_stop_words = [word for word in words if word.lower() not in stop_words]
22
- # if not non_stop_words:
23
- # return sentence
24
- # random.seed(10)
25
- # word_to_mask = random.choice(non_stop_words)
26
- # masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
27
- # return masked_sentence
28
-
29
- # def high_entropy_words(sentence, non_melting_points):
30
- # stop_words = set(stopwords.words('english'))
31
- # words = sentence.split()
32
-
33
- # non_melting_words = set()
34
- # for _, point in non_melting_points:
35
- # non_melting_words.update(point.lower().split())
36
-
37
- # candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
38
-
39
- # if not candidate_words:
40
- # return sentence
41
-
42
- # max_entropy = -float('inf')
43
- # max_entropy_word = None
44
-
45
- # for word in candidate_words:
46
- # masked_sentence = sentence.replace(word, '[MASK]', 1)
47
- # predictions = fill_mask(masked_sentence)
48
-
49
- # # Calculate entropy based on top 5 predictions
50
- # entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
51
-
52
- # if entropy > max_entropy:
53
- # max_entropy = entropy
54
- # max_entropy_word = word
55
-
56
- # return sentence.replace(max_entropy_word, '[MASK]', 1)
57
-
58
-
59
- # # Load tokenizer and model for masked language model
60
- # tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
61
- # model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
62
- # fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
63
-
64
  from transformers import AutoTokenizer, AutoModelForMaskedLM
65
  from transformers import pipeline
66
  import random
67
  from nltk.corpus import stopwords
68
  import math
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- # Masking Model
71
  def mask_non_stopword(sentence):
72
  stop_words = set(stopwords.words('english'))
73
  words = sentence.split()
@@ -76,10 +34,10 @@ def mask_non_stopword(sentence):
76
  return sentence, None, None
77
  word_to_mask = random.choice(non_stop_words)
78
  masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
79
- predictions = fill_mask(masked_sentence)
80
- words = [pred['score'] for pred in predictions]
81
- logits = [pred['token_str'] for pred in predictions]
82
- return masked_sentence, words, logits
83
 
84
  def mask_non_stopword_pseudorandom(sentence):
85
  stop_words = set(stopwords.words('english'))
@@ -87,54 +45,148 @@ def mask_non_stopword_pseudorandom(sentence):
87
  non_stop_words = [word for word in words if word.lower() not in stop_words]
88
  if not non_stop_words:
89
  return sentence, None, None
90
- random.seed(10)
91
  word_to_mask = random.choice(non_stop_words)
92
  masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
93
- predictions = fill_mask(masked_sentence)
94
- words = [pred['score'] for pred in predictions]
95
- logits = [pred['token_str'] for pred in predictions]
96
- return masked_sentence, words, logits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  def high_entropy_words(sentence, non_melting_points):
99
  stop_words = set(stopwords.words('english'))
100
  words = sentence.split()
101
-
102
  non_melting_words = set()
103
  for _, point in non_melting_points:
104
  non_melting_words.update(point.lower().split())
105
-
106
  candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
107
-
108
  if not candidate_words:
109
  return sentence, None, None
110
-
111
  max_entropy = -float('inf')
112
  max_entropy_word = None
113
  max_logits = None
114
-
115
  for word in candidate_words:
116
  masked_sentence = sentence.replace(word, '[MASK]', 1)
117
- predictions = fill_mask(masked_sentence)
 
118
 
119
  # Calculate entropy based on top 5 predictions
120
- entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
121
-
 
 
122
  if entropy > max_entropy:
123
  max_entropy = entropy
124
  max_entropy_word = word
125
- max_logits = [pred['score'] for pred in predictions]
126
-
 
 
 
127
  masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
128
- words = [pred['score'] for pred in predictions]
129
- logits = [pred['token_str'] for pred in predictions]
130
- return masked_sentence, words, logits
131
 
132
- # Load tokenizer and model for masked language model
133
- tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
134
- model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
135
- fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- non_melting_points = [(1, 'Jewish'), (2, 'messages'), (3, 'stab')]
138
- a, b, c = high_entropy_words("A former Cornell University student was sentenced to 21 months in prison on Monday after admitting that he had posted a series of online messages last fall in which he threatened to stab, rape and behead Jewish people", non_melting_points)
139
- print(f"logits type: {type(b)}")
140
- print(f"logits content: {b}")
 
1
+ import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from transformers import AutoTokenizer, AutoModelForMaskedLM
3
  from transformers import pipeline
4
  import random
5
  from nltk.corpus import stopwords
6
  import math
7
+ from vocabulary_split import split_vocabulary, filter_logits
8
+
9
+ # Load tokenizer and model for masked language model
10
+ tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
11
+ model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
12
+ fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
13
+
14
# Get permissible vocabulary and turn it into a boolean mask with one entry
# per tokenizer id (True where the id is one of the permissible values).
permissible, _ = split_vocabulary(seed=42)
# Build the id set once: testing membership against dict.values() is a linear
# scan, which made this mask quadratic in the vocabulary size.
_permissible_ids = set(permissible.values())
permissible_indices = torch.tensor([i in _permissible_ids for i in range(len(tokenizer))])
17
+
18
def get_logits_for_mask(model, tokenizer, sentence):
    """Return the model's vocabulary logits at the mask position(s) of a sentence.

    Args:
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` attribute.
        tokenizer: Callable tokenizer exposing ``mask_token_id``.
        sentence: Text containing the mask token(s).

    Returns:
        The logits of the first batch item at every mask position, with
        size-1 dimensions squeezed away (so a single mask yields a 1-D tensor).
    """
    encoded = tokenizer(sentence, return_tensors="pt")

    # Column indices (token positions) where the input id is the mask id.
    is_mask = encoded["input_ids"] == tokenizer.mask_token_id
    _, mask_columns = torch.where(is_mask)

    # Inference only: no gradients needed.
    with torch.no_grad():
        vocab_scores = model(**encoded).logits

    return vocab_scores[0, mask_columns, :].squeeze()
28
 
 
29
  def mask_non_stopword(sentence):
30
  stop_words = set(stopwords.words('english'))
31
  words = sentence.split()
 
34
  return sentence, None, None
35
  word_to_mask = random.choice(non_stop_words)
36
  masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
37
+ logits = get_logits_for_mask(model, tokenizer, masked_sentence)
38
+ filtered_logits = filter_logits(logits, permissible_indices)
39
+ words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
40
+ return masked_sentence, filtered_logits.tolist(), words
41
 
42
  def mask_non_stopword_pseudorandom(sentence):
43
  stop_words = set(stopwords.words('english'))
 
45
  non_stop_words = [word for word in words if word.lower() not in stop_words]
46
  if not non_stop_words:
47
  return sentence, None, None
48
+ random.seed(10) # Fixed seed for pseudo-randomness
49
  word_to_mask = random.choice(non_stop_words)
50
  masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
51
+ logits = get_logits_for_mask(model, tokenizer, masked_sentence)
52
+ filtered_logits = filter_logits(logits, permissible_indices)
53
+ words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
54
+ return masked_sentence, filtered_logits.tolist(), words
55
+
56
+ # New function: mask words between LCS points
57
def mask_between_lcs(sentence, lcs_points):
    """Mask one random word in every gap around the given LCS word indices.

    One word is masked before the first LCS point, one strictly between each
    pair of consecutive LCS points, and one after the last LCS point, whenever
    the corresponding gap contains at least one word.

    Args:
        sentence: Text to mask; it is split on whitespace.
        lcs_points: Sequence of 0-based word indices into the split sentence.
            # NOTE(review): the gap logic reads as if the indices are sorted
            # ascending and within range - confirm against callers.

    Returns:
        A tuple ``(masked_sentence, logits_list, top_words_list)``:
        the space-joined sentence with ``[MASK]`` tokens, and, per mask in
        sentence order, the filtered logits (as a list) and the five
        top-scoring decoded tokens. Both lists are empty when nothing was
        masked.
    """
    words = sentence.split()
    masked_indices = []

    def mask_random_word(low, high):
        # Replace one randomly chosen word in [low, high] (inclusive).
        idx = random.randint(low, high)
        words[idx] = '[MASK]'
        masked_indices.append(idx)

    if lcs_points:
        # Gap between the first word and the first LCS point
        if lcs_points[0] > 0:
            mask_random_word(0, lcs_points[0] - 1)

        # Gaps between consecutive LCS points
        for start, end in zip(lcs_points, lcs_points[1:]):
            if end - start > 1:
                mask_random_word(start + 1, end - 1)

        # Gap between the last LCS point and the last word
        if lcs_points[-1] < len(words) - 1:
            mask_random_word(lcs_points[-1] + 1, len(words) - 1)

    masked_sentence = ' '.join(words)

    # Nothing to predict: skip the forward pass.
    if not masked_indices:
        return masked_sentence, [], []

    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
    # get_logits_for_mask squeezes its result, so a single mask comes back as
    # a 1-D tensor; restore the (num_masks, vocab) layout before indexing rows.
    if logits.dim() == 1:
        logits = logits.unsqueeze(0)

    # Process each masked token separately
    top_words_list = []
    logits_list = []
    for mask_logits in logits[:len(masked_indices)]:
        filtered_logits = filter_logits(mask_logits, permissible_indices)
        logits_list.append(filtered_logits.tolist())
        top_5_indices = filtered_logits.topk(5).indices.tolist()
        top_words_list.append([tokenizer.decode([token_id]) for token_id in top_5_indices])

    return masked_sentence, logits_list, top_words_list
98
+
99
 
100
def high_entropy_words(sentence, non_melting_points):
    """Mask the candidate word whose top-5 predictions have the highest entropy.

    Candidates are the whitespace-separated tokens of ``sentence`` that are
    neither English stop words nor words of a non-melting point. Each
    candidate is masked in turn and the masked-LM logits at the mask are
    scored.

    Args:
        sentence: Text to mask.
        non_melting_points: Iterable of ``(index, phrase)`` pairs; only the
            phrase is used, and its lower-cased words are excluded from
            masking.

    Returns:
        ``(masked_sentence, logits, words)`` where ``logits`` is the filtered
        logits (as a list) for the chosen mask and ``words`` are the five
        highest-scoring decoded tokens in ascending score order, or
        ``(sentence, None, None)`` when no word could be selected.
    """
    import re  # local import: only the whole-token masking below needs it

    stop_words = set(stopwords.words('english'))
    words = sentence.split()

    non_melting_words = set()
    for _, point in non_melting_points:
        non_melting_words.update(point.lower().split())

    # Deduplicate (order preserved): a repeated token masks the same first
    # occurrence, so scoring it again would only repeat a forward pass.
    candidate_words = list(dict.fromkeys(
        word for word in words
        if word.lower() not in stop_words and word.lower() not in non_melting_words
    ))

    if not candidate_words:
        return sentence, None, None

    def mask_first_token(word):
        # Replace the first whole whitespace-delimited token equal to `word`.
        # A plain str.replace would also match inside a longer token that
        # appears earlier in the sentence (e.g. "Jew" inside "Jewish").
        pattern = r'(?<!\S)' + re.escape(word) + r'(?!\S)'
        return re.sub(pattern, '[MASK]', sentence, count=1)

    max_entropy = -float('inf')
    max_entropy_word = None
    max_logits = None

    for word in candidate_words:
        masked_sentence = mask_first_token(word)
        logits = get_logits_for_mask(model, tokenizer, masked_sentence)
        filtered_logits = filter_logits(logits, permissible_indices)

        # Calculate entropy based on top 5 predictions
        probs = torch.softmax(filtered_logits, dim=-1)
        top_5_probs = probs.topk(5).values
        entropy = -torch.sum(top_5_probs * torch.log(top_5_probs))

        if entropy > max_entropy:
            max_entropy = entropy
            max_entropy_word = word
            max_logits = filtered_logits

    # Reached when no candidate produced a comparable (non-NaN) entropy.
    if max_entropy_word is None:
        return sentence, None, None

    masked_sentence = mask_first_token(max_entropy_word)
    words = [tokenizer.decode([i]) for i in max_logits.argsort()[-5:]]
    return masked_sentence, max_logits.tolist(), words
 
138
 
139
+ # New function: mask based on part of speech
140
def mask_by_pos(sentence, pos_to_mask=('NOUN', 'VERB', 'ADJ')):
    """Mask one random word whose part of speech is in ``pos_to_mask``.

    Args:
        sentence: Text to mask.
        pos_to_mask: Parts of speech eligible for masking. Accepts the
            coarse names ``'NOUN'``, ``'VERB'``, ``'ADJ'`` and ``'ADV'`` as
            well as two-letter Penn Treebank prefixes such as ``'NN'``.

    Returns:
        ``(masked_sentence, logits, words)`` where ``logits`` is the filtered
        logits (as a list) at the mask and ``words`` are the five
        highest-scoring decoded tokens in ascending score order, or
        ``(sentence, None, None)`` when no word qualifies.
    """
    import nltk
    nltk.download('averaged_perceptron_tagger', quiet=True)

    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)

    # nltk.pos_tag emits Penn Treebank tags (NN, NNS, VBD, JJ, ...), so the
    # coarse names must be translated to the matching two-letter prefix;
    # comparing them directly never matched anything.
    coarse_to_penn_prefix = {'NOUN': 'NN', 'VERB': 'VB', 'ADJ': 'JJ', 'ADV': 'RB'}
    wanted_prefixes = {coarse_to_penn_prefix.get(tag, tag) for tag in pos_to_mask}

    maskable_words = [word for word, pos in pos_tags if pos[:2] in wanted_prefixes]

    if not maskable_words:
        return sentence, None, None

    word_to_mask = random.choice(maskable_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)

    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
    filtered_logits = filter_logits(logits, permissible_indices)
    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]

    return masked_sentence, filtered_logits.tolist(), words
160
+
161
+ # New function: mask named entities
162
def mask_named_entity(sentence):
    """Mask one random word that belongs to a named entity.

    Args:
        sentence: Text to mask.

    Returns:
        ``(masked_sentence, logits, words)`` where ``logits`` is the filtered
        logits (as a list) at the mask and ``words`` are the five
        highest-scoring decoded tokens in ascending score order, or
        ``(sentence, None, None)`` when no named entity is found.
    """
    import nltk
    # pos_tag needs the tagger model as well as the chunker resources.
    nltk.download('averaged_perceptron_tagger', quiet=True)
    nltk.download('maxent_ne_chunker', quiet=True)
    nltk.download('words', quiet=True)

    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    named_entities = nltk.ne_chunk(pos_tags)

    # ne_chunk returns a tree whose children are plain (word, tag) tuples for
    # ordinary tokens and nltk.Tree subtrees for named entities. Checking the
    # tag of each leaf (always a string) never found any entity.
    maskable_words = [
        word
        for chunk in named_entities
        if isinstance(chunk, nltk.Tree)
        for word, _tag in chunk.leaves()
    ]

    if not maskable_words:
        return sentence, None, None

    word_to_mask = random.choice(maskable_words)
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)

    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
    filtered_logits = filter_logits(logits, permissible_indices)
    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]

    return masked_sentence, filtered_logits.tolist(), words
184
+
185
+
186
+ # sentence = "This is a sample sentence with some LCS points"
187
+ # lcs_points = [2, 5, 8] # Indices of LCS points
188
+ # masked_sentence, logits_list, top_words_list = mask_between_lcs(sentence, lcs_points)
189
 
190
+ # print("Masked Sentence:", masked_sentence)
191
+ # for idx, top_words in enumerate(top_words_list):
192
+ # print(f"Top words for mask {idx+1}:", top_words)
 
paraphraser.py CHANGED
@@ -1,31 +1,83 @@
1
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
-
3
- # Function to Initialize the Model
4
- def init_model():
5
- para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
6
- para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
7
- return para_tokenizer, para_model
8
-
9
- # Function to Paraphrase the Text
10
- def paraphrase(question, para_tokenizer, para_model, num_beams=10, num_beam_groups=10, num_return_sequences=10, repetition_penalty=10.0, diversity_penalty=3.0, no_repeat_ngram_size=2, temperature=0.7, max_length=64):
11
- input_ids = para_tokenizer(
12
- f'paraphrase: {question}',
13
- return_tensors="pt", padding="longest",
14
- max_length=max_length,
15
- truncation=True,
16
- ).input_ids
17
- outputs = para_model.generate(
18
- input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
19
- num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
20
- num_beams=num_beams, num_beam_groups=num_beam_groups,
21
- max_length=max_length, diversity_penalty=diversity_penalty
22
- )
23
- res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
24
- return res
25
-
26
- def generate_paraphrase(question):
27
- para_tokenizer, para_model = init_model()
28
- res = paraphrase(question, para_tokenizer, para_model)
29
- return res
30
-
31
- # print(generate_paraphrase("Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time 5since the Holocaust” to be Jewish in the United States."))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
+
3
+ # # Function to Initialize the Model
4
+ # def init_model():
5
+ # para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
6
+ # para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
7
+ # return para_tokenizer, para_model
8
+
9
+ # # Function to Paraphrase the Text
10
+ # def paraphrase(question, para_tokenizer, para_model, num_beams=10, num_beam_groups=10, num_return_sequences=10, repetition_penalty=10.0, diversity_penalty=3.0, no_repeat_ngram_size=2, temperature=0.7, max_length=64):
11
+ # input_ids = para_tokenizer(
12
+ # f'paraphrase: {question}',
13
+ # return_tensors="pt", padding="longest",
14
+ # max_length=max_length,
15
+ # truncation=True,
16
+ # ).input_ids
17
+ # outputs = para_model.generate(
18
+ # input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
19
+ # num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
20
+ # num_beams=num_beams, num_beam_groups=num_beam_groups,
21
+ # max_length=max_length, diversity_penalty=diversity_penalty
22
+ # )
23
+ # res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
24
+ # return res
25
+
26
+ # def generate_paraphrase(question):
27
+ # para_tokenizer, para_model = init_model()
28
+ # res = paraphrase(question, para_tokenizer, para_model)
29
+ # return res
30
+
31
+ # print(generate_paraphrase("Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time 5since the Holocaust” to be Jewish in the United States."))
32
+
33
+ '''
34
+ Accepts a sentence or a list of sentences and returns a list of all their paraphrases using GPT-4.
35
+ '''
36
+
37
+ from openai import OpenAI
38
+ from dotenv import load_dotenv
39
+ load_dotenv()
40
+ import os
41
+
42
+ key = os.getenv("OPENAI_API_KEY")
43
+
44
+ # Initialize the OpenAI client
45
+ client = OpenAI(
46
+ api_key=key # Replace with your actual API key
47
+ )
48
+
49
+ # Function to paraphrase sentences using GPT-4
50
def generate_paraphrase(sentences, model="gpt-4o", num_paraphrases=10, max_tokens=150, temperature=0.7):
    """Paraphrase one or more sentences with an OpenAI chat model.

    Args:
        sentences: A single sentence (str) or an iterable of sentences.
        model: Name of the chat model to query.
        num_paraphrases: Number of completions requested per sentence.
        max_tokens: Completion token limit passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        A flat list with the stripped paraphrases of all sentences, in input
        order. Sentences whose request fails are reported on stdout and
        contribute nothing to the result.
    """
    # Ensure sentences is a list even if a single sentence is passed
    if isinstance(sentences, str):
        sentences = [sentences]

    paraphrased_sentences_list = []

    for sentence in sentences:
        full_prompt = f"Paraphrase the following text: '{sentence}'"
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": full_prompt,
                    }
                ],
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                n=num_paraphrases  # Number of paraphrased sentences to generate
            )
            # A choice may carry no text (content is None); skip it instead of
            # letting .strip() raise and discard every other paraphrase
            # returned for this sentence.
            paraphrased_sentences_list.extend(
                choice.message.content.strip()
                for choice in chat_completion.choices
                if choice.message.content is not None
            )
        except Exception as e:
            # Best effort: report the failure and continue with the next sentence.
            print(f"Error paraphrasing sentence '{sentence}': {e}")

    return paraphrased_sentences_list
80
+
81
if __name__ == "__main__":
    # Manual smoke test. Guarded so that importing this module does not
    # trigger a (billable) API request.
    result = generate_paraphrase("Mayor Eric Adams did not attend the first candidate forum for the New York City mayoral race, but his record — and the criminal charges he faces — received plenty of attention on Saturday from the Democrats who are running to unseat him.")

    print(len(result))
requirements.txt CHANGED
@@ -14,4 +14,6 @@ nltk
14
  tenacity
15
  pandas
16
  graphviz==0.20.3
17
- gradio
 
 
 
14
  tenacity
15
  pandas
16
  graphviz==0.20.3
17
+ gradio==4.29.0
18
+ openai
19
+ python-dotenv
sampling_methods.py CHANGED
@@ -1,55 +1,42 @@
1
- # import torch
2
- # import random
3
-
4
- # def sample_word(words, logits, sampling_technique='inverse_transform', temperature=1.0):
5
- # if sampling_technique == 'inverse_transform':
6
- # probs = torch.softmax(torch.tensor(logits), dim=-1)
7
- # cumulative_probs = torch.cumsum(probs, dim=-1)
8
- # random_prob = random.random()
9
- # sampled_index = torch.where(cumulative_probs >= random_prob)[0][0]
10
- # elif sampling_technique == 'exponential_minimum':
11
- # probs = torch.softmax(torch.tensor(logits), dim=-1)
12
- # exp_probs = torch.exp(-torch.log(probs))
13
- # random_probs = torch.rand_like(exp_probs)
14
- # sampled_index = torch.argmax(random_probs * exp_probs)
15
- # elif sampling_technique == 'temperature':
16
- # scaled_logits = torch.tensor(logits) / temperature
17
- # probs = torch.softmax(scaled_logits, dim=-1)
18
- # sampled_index = torch.multinomial(probs, 1).item()
19
- # elif sampling_technique == 'greedy':
20
- # sampled_index = torch.argmax(torch.tensor(logits)).item()
21
- # else:
22
- # raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")
23
-
24
- # sampled_word = words[sampled_index]
25
- # return sampled_word
26
-
27
  import torch
28
  import random
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  def sample_word(sentence, words, logits, sampling_technique='inverse_transform', temperature=1.0):
 
 
31
  if sampling_technique == 'inverse_transform':
32
- probs = torch.softmax(torch.tensor(logits), dim=-1)
33
  cumulative_probs = torch.cumsum(probs, dim=-1)
34
  random_prob = random.random()
35
  sampled_index = torch.where(cumulative_probs >= random_prob)[0][0]
36
  elif sampling_technique == 'exponential_minimum':
37
- probs = torch.softmax(torch.tensor(logits), dim=-1)
38
  exp_probs = torch.exp(-torch.log(probs))
39
  random_probs = torch.rand_like(exp_probs)
40
  sampled_index = torch.argmax(random_probs * exp_probs)
41
  elif sampling_technique == 'temperature':
42
- scaled_logits = torch.tensor(logits) / temperature
43
- probs = torch.softmax(scaled_logits, dim=-1)
44
  sampled_index = torch.multinomial(probs, 1).item()
45
  elif sampling_technique == 'greedy':
46
- sampled_index = torch.argmax(torch.tensor(logits)).item()
47
  else:
48
  raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")
49
-
50
- sampled_word = words[sampled_index]
51
-
52
  # Replace [MASK] with the sampled word
53
  filled_sentence = sentence.replace('[MASK]', sampled_word)
54
-
55
  return filled_sentence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import random
3
+ from vocabulary_split import split_vocabulary, filter_logits
4
+ # from transformers import AutoTokenizer, AutoModelForMaskedLM
5
+ from masking_methods import tokenizer
6
+
7
+ # Load tokenizer and model for masked language model
8
+ # tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
9
+ # model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
10
+
11
+
12
# Get permissible vocabulary and turn it into a boolean mask with one entry
# per tokenizer id (True where the id is one of the permissible values).
permissible, _ = split_vocabulary(seed=42)
# Build the id set once: testing membership against dict.values() is a linear
# scan, which made this mask quadratic in the vocabulary size.
_permissible_ids = set(permissible.values())
permissible_indices = torch.tensor([i in _permissible_ids for i in range(len(tokenizer))])
15
 
16
def sample_word(sentence, words, logits, sampling_technique='inverse_transform', temperature=1.0):
    """Sample a token for the mask and return the sentence with it filled in.

    Args:
        sentence: Text containing ``[MASK]``; every occurrence is replaced by
            the sampled token.
        words: Unused; kept for backward compatibility with existing callers.
        logits: Per-token logits for the mask position (anything accepted by
            ``torch.tensor``).
            # NOTE(review): assumed to be one value per tokenizer id - confirm.
        sampling_technique: One of ``'inverse_transform'``,
            ``'exponential_minimum'``, ``'temperature'`` or ``'greedy'``.
        temperature: Divisor applied to the logits before the softmax
            (ignored by ``'greedy'``).

    Returns:
        The sentence with ``[MASK]`` replaced by the decoded sampled token.

    Raises:
        ValueError: If ``sampling_technique`` is not one of the supported names.
    """
    filtered_logits = filter_logits(torch.tensor(logits), permissible_indices)

    if sampling_technique == 'inverse_transform':
        probs = torch.softmax(filtered_logits / temperature, dim=-1)
        cumulative_probs = torch.cumsum(probs, dim=-1)
        random_prob = random.random()
        hits = torch.where(cumulative_probs >= random_prob)[0]
        if hits.numel() > 0:
            sampled_index = hits[0].item()
        else:
            # Rounding can leave the final cumulative sum slightly below the
            # draw; fall back to the last token with non-zero probability.
            sampled_index = torch.nonzero(probs > 0)[-1].item()
    elif sampling_technique == 'exponential_minimum':
        probs = torch.softmax(filtered_logits / temperature, dim=-1)
        # Exponential race: with E_i ~ Exp(1), argmin(E_i / p_i) selects
        # index i with probability p_i. Zero-probability tokens get an
        # infinite ratio and are never selected. (The previous
        # argmax(u_i / p_i) favoured the least likely tokens.)
        exp_draws = -torch.log(torch.rand_like(probs))
        sampled_index = torch.argmin(exp_draws / probs).item()
    elif sampling_technique == 'temperature':
        probs = torch.softmax(filtered_logits / temperature, dim=-1)
        sampled_index = torch.multinomial(probs, 1).item()
    elif sampling_technique == 'greedy':
        sampled_index = torch.argmax(filtered_logits).item()
    else:
        raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")

    sampled_word = tokenizer.decode([sampled_index])

    # Replace [MASK] with the sampled word
    filled_sentence = sentence.replace('[MASK]', sampled_word)

    return filled_sentence
threeD_plot.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import numpy as np
2
+ # import plotly.graph_objects as go
3
+ # from scipy.interpolate import griddata
4
+
5
+ # def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
6
+ # detectability = np.array(detectability_val)
7
+ # distortion = np.array(distortion_val)
8
+ # euclidean = np.array(euclidean_val)
9
+
10
+ # # Find the closest point to the origin
11
+ # distances_to_origin = np.linalg.norm(np.array([distortion, detectability, euclidean]).T, axis=1)
12
+ # closest_point_index = np.argmin(distances_to_origin)
13
+
14
+ # # Determine the closest points to each axis
15
+ # closest_to_x_axis = np.argmin(distortion)
16
+ # closest_to_y_axis = np.argmin(detectability)
17
+ # closest_to_z_axis = np.argmin(euclidean)
18
+
19
+ # # Use the detected closest point as the "sweet spot"
20
+ # sweet_spot_detectability = detectability[closest_point_index]
21
+ # sweet_spot_distortion = distortion[closest_point_index]
22
+ # sweet_spot_euclidean = euclidean[closest_point_index]
23
+
24
+ # # Create a meshgrid from the data
25
+ # x_grid, y_grid = np.meshgrid(np.linspace(min(detectability), max(detectability), 30),
26
+ # np.linspace(min(distortion), max(distortion), 30))
27
+
28
+ # # Interpolate z values (Euclidean distances) to fit the grid
29
+ # z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='linear')
30
+
31
+ # if z_grid is None:
32
+ # raise ValueError("griddata could not generate a valid interpolation. Check your input data.")
33
+
34
+ # # Create the 3D contour plot with the Plasma color scale
35
+ # fig = go.Figure(data=go.Surface(
36
+ # z=z_grid,
37
+ # x=x_grid,
38
+ # y=y_grid,
39
+ # contours={
40
+ # "z": {"show": True, "start": min(euclidean), "end": max(euclidean), "size": 0.1, "usecolormap": True}
41
+ # },
42
+ # colorscale='Plasma'
43
+ # ))
44
+
45
+ # # Add a marker for the sweet spot
46
+ # fig.add_trace(go.Scatter3d(
47
+ # x=[sweet_spot_detectability],
48
+ # y=[sweet_spot_distortion],
49
+ # z=[sweet_spot_euclidean],
50
+ # mode='markers+text',
51
+ # marker=dict(size=10, color='red', symbol='circle'),
52
+ # text=["Sweet Spot"],
53
+ # textposition="top center"
54
+ # ))
55
+
56
+ # # Set axis labels
57
+ # fig.update_layout(
58
+ # scene=dict(
59
+ # xaxis_title='Detectability Score',
60
+ # yaxis_title='Distortion Score',
61
+ # zaxis_title='Euclidean Distance'
62
+ # ),
63
+ # margin=dict(l=0, r=0, b=0, t=0)
64
+ # )
65
+
66
+ # return fig
67
+
68
+
69
+ import numpy as np
70
+ import plotly.graph_objects as go
71
+ from scipy.interpolate import griddata
72
+
73
def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
    """Plot Euclidean distance as a surface over (detectability, distortion).

    A "sweet spot" marker is placed on the sample with the highest composite
    score, where the score is the normalised detectability minus the sum of the
    normalised distortion and normalised Euclidean distance.

    Args:
        detectability_val: Sequence of detectability scores (x axis).
        distortion_val: Sequence of distortion scores (y axis).
        euclidean_val: Sequence of Euclidean distances (z axis).

    Returns:
        A ``plotly.graph_objects.Figure`` holding the interpolated surface and
        the sweet-spot marker.

    Raises:
        ValueError: If the interpolation yields no finite value on the grid.
    """
    detectability = np.array(detectability_val)
    distortion = np.array(distortion_val)
    euclidean = np.array(euclidean_val)

    def _normalize(values):
        """Min-max scale ``values`` to [0, 1]; a constant series maps to zeros."""
        low = values.min()
        span = values.max() - low
        if span == 0:
            # All samples equal: avoid 0/0 (NaN), which would make the
            # composite score NaN and the argmax below meaningless.
            return np.zeros(values.shape, dtype=float)
        return (values - low) / span

    # Normalize the values to range [0, 1]
    norm_detectability = _normalize(detectability)
    norm_distortion = _normalize(distortion)
    norm_euclidean = _normalize(euclidean)

    # Composite score: maximize detectability, minimize distortion and Euclidean distance
    # We subtract distortion and euclidean as we want them minimized.
    composite_score = norm_detectability - (norm_distortion + norm_euclidean)

    # Find the index of the maximum score (sweet spot)
    sweet_spot_index = np.argmax(composite_score)

    # Sweet spot values (reported in the original, un-normalised units)
    sweet_spot_detectability = detectability[sweet_spot_index]
    sweet_spot_distortion = distortion[sweet_spot_index]
    sweet_spot_euclidean = euclidean[sweet_spot_index]

    # Create a 30x30 meshgrid spanning the observed data range
    x_grid, y_grid = np.meshgrid(
        np.linspace(detectability.min(), detectability.max(), 30),
        np.linspace(distortion.min(), distortion.max(), 30),
    )

    # Interpolate z values (Euclidean distances) to fit the grid
    z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='linear')

    # griddata signals "no data here" with NaN rather than None, so a failed
    # interpolation shows up as a grid without a single finite value.
    if z_grid is None or np.isnan(z_grid).all():
        raise ValueError("griddata could not generate a valid interpolation. Check your input data.")

    # Create the 3D contour plot with the Plasma color scale
    fig = go.Figure(data=go.Surface(
        z=z_grid,
        x=x_grid,
        y=y_grid,
        contours={
            "z": {"show": True, "start": euclidean.min(), "end": euclidean.max(), "size": 0.1, "usecolormap": True}
        },
        colorscale='Plasma'
    ))

    # Add a marker for the sweet spot
    fig.add_trace(go.Scatter3d(
        x=[sweet_spot_detectability],
        y=[sweet_spot_distortion],
        z=[sweet_spot_euclidean],
        mode='markers+text',
        marker=dict(size=10, color='red', symbol='circle'),
        text=["Sweet Spot"],
        textposition="top center"
    ))

    # Set axis labels
    fig.update_layout(
        scene=dict(
            xaxis_title='Detectability Score',
            yaxis_title='Distortion Score',
            zaxis_title='Euclidean Distance'
        ),
        margin=dict(l=0, r=0, b=0, t=0)
    )

    return fig
tree.py CHANGED
@@ -1,341 +1,3 @@
1
- # import plotly.graph_objects as go
2
- # import textwrap
3
- # import re
4
- # from collections import defaultdict
5
-
6
- # def generate_subplot1(paraphrased_sentence, scheme_sentences, highlight_info):
7
- # # Combine nodes into one list with appropriate labels
8
- # nodes = [paraphrased_sentence] + scheme_sentences
9
- # nodes[0] += ' L0' # Paraphrased sentence is level 0
10
- # for i in range(1, len(nodes)):
11
- # nodes[i] += ' L1' # Scheme sentences are level 1
12
-
13
- # # Define the highlight_words function
14
- # def highlight_words(sentence, color_map):
15
- # for word, color in color_map.items():
16
- # sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
17
- # return sentence
18
-
19
- # # Clean and wrap nodes, and highlight specified words globally
20
- # cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
21
- # global_color_map = dict(highlight_info)
22
- # highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
23
- # wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=50)) for node in highlighted_nodes]
24
-
25
- # # Function to determine tree levels and create edges dynamically
26
- # def get_levels_and_edges(nodes):
27
- # levels = {}
28
- # edges = []
29
- # for i, node in enumerate(nodes):
30
- # level = int(node.split()[-1][1])
31
- # levels[i] = level
32
-
33
- # # Add edges from L0 to all L1 nodes
34
- # root_node = next(i for i, level in levels.items() if level == 0)
35
- # for i, level in levels.items():
36
- # if level == 1:
37
- # edges.append((root_node, i))
38
-
39
- # return levels, edges
40
-
41
- # # Get levels and dynamic edges
42
- # levels, edges = get_levels_and_edges(nodes)
43
- # max_level = max(levels.values(), default=0)
44
-
45
- # # Calculate positions
46
- # positions = {}
47
- # level_heights = defaultdict(int)
48
- # for node, level in levels.items():
49
- # level_heights[level] += 1
50
-
51
- # y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
52
- # x_gap = 2
53
- # l1_y_gap = 10
54
-
55
- # for node, level in levels.items():
56
- # if level == 1:
57
- # positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
58
- # else:
59
- # positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
60
- # y_offsets[level] += 1
61
-
62
- # # Function to highlight words in a wrapped node string
63
- # def color_highlighted_words(node, color_map):
64
- # parts = re.split(r'(\{\{.*?\}\})', node)
65
- # colored_parts = []
66
- # for part in parts:
67
- # match = re.match(r'\{\{(.*?)\}\}', part)
68
- # if match:
69
- # word = match.group(1)
70
- # color = color_map.get(word, 'black')
71
- # colored_parts.append(f"<span style='color: {color};'>{word}</span>")
72
- # else:
73
- # colored_parts.append(part)
74
- # return ''.join(colored_parts)
75
-
76
- # # Define the text for each edge
77
- # edge_texts = [
78
- # "Highest Entropy Masking",
79
- # "Pseudo-random Masking",
80
- # "Random Masking",
81
- # "Greedy Sampling",
82
- # "Temperature Sampling",
83
- # "Exponential Minimum Sampling",
84
- # "Inverse Transform Sampling",
85
- # "Greedy Sampling",
86
- # "Temperature Sampling",
87
- # "Exponential Minimum Sampling",
88
- # "Inverse Transform Sampling",
89
- # "Greedy Sampling",
90
- # "Temperature Sampling",
91
- # "Exponential Minimum Sampling",
92
- # "Inverse Transform Sampling"
93
- # ]
94
-
95
- # # Create figure
96
- # fig1 = go.Figure()
97
-
98
- # # Add nodes to the figure
99
- # for i, node in enumerate(wrapped_nodes):
100
- # colored_node = color_highlighted_words(node, global_color_map)
101
- # x, y = positions[i]
102
- # fig1.add_trace(go.Scatter(
103
- # x=[-x], # Reflect the x coordinate
104
- # y=[y],
105
- # mode='markers',
106
- # marker=dict(size=10, color='blue'),
107
- # hoverinfo='none'
108
- # ))
109
- # fig1.add_annotation(
110
- # x=-x, # Reflect the x coordinate
111
- # y=y,
112
- # text=colored_node,
113
- # showarrow=False,
114
- # xshift=15,
115
- # align="center",
116
- # font=dict(size=12),
117
- # bordercolor='black',
118
- # borderwidth=1,
119
- # borderpad=2,
120
- # bgcolor='white',
121
- # width=300,
122
- # height=120
123
- # )
124
-
125
- # # Add edges and text above each edge
126
- # for i, edge in enumerate(edges):
127
- # x0, y0 = positions[edge[0]]
128
- # x1, y1 = positions[edge[1]]
129
- # fig1.add_trace(go.Scatter(
130
- # x=[-x0, -x1], # Reflect the x coordinates
131
- # y=[y0, y1],
132
- # mode='lines',
133
- # line=dict(color='black', width=1)
134
- # ))
135
-
136
- # # Calculate the midpoint of the edge
137
- # mid_x = (-x0 + -x1) / 2
138
- # mid_y = (y0 + y1) / 2
139
-
140
- # # Adjust y position to shift text upwards
141
- # text_y_position = mid_y + 0.8 # Increase this value to shift the text further upwards
142
-
143
- # # Add text annotation above the edge
144
- # fig1.add_annotation(
145
- # x=mid_x,
146
- # y=text_y_position,
147
- # text=edge_texts[i], # Use the text specific to this edge
148
- # showarrow=False,
149
- # font=dict(size=12),
150
- # align="center"
151
- # )
152
-
153
- # fig1.update_layout(
154
- # showlegend=False,
155
- # margin=dict(t=20, b=20, l=20, r=20),
156
- # xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
157
- # yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
158
- # width=1435, # Adjusted width to accommodate more levels
159
- # height=1000 # Adjusted height to accommodate more levels
160
- # )
161
-
162
- # return fig1
163
-
164
-
165
-
166
- # def generate_subplot2(scheme_sentences, sampled_sentence, highlight_info):
167
- # # Combine nodes into one list with appropriate labels
168
- # nodes = scheme_sentences + sampled_sentence
169
- # para_len = len(scheme_sentences)
170
-
171
- # # Reassign levels: L1 -> L0, L2 -> L1
172
- # for i in range(para_len):
173
- # nodes[i] += ' L0' # Scheme sentences are now level 0
174
- # for i in range(para_len, len(nodes)):
175
- # nodes[i] += ' L1' # Sampled sentences are now level 1
176
-
177
- # # Define the highlight_words function
178
- # def highlight_words(sentence, color_map):
179
- # for word, color in color_map.items():
180
- # sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
181
- # return sentence
182
-
183
- # # Clean and wrap nodes, and highlight specified words globally
184
- # cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
185
- # global_color_map = dict(highlight_info)
186
- # highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
187
- # wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=80)) for node in highlighted_nodes]
188
-
189
- # # Function to determine tree levels and create edges dynamically
190
- # def get_levels_and_edges(nodes):
191
- # levels = {}
192
- # edges = []
193
- # for i, node in enumerate(nodes):
194
- # level = int(node.split()[-1][1])
195
- # levels[i] = level
196
-
197
- # # Add edges from L0 to all L1 nodes
198
- # l0_indices = [i for i, level in levels.items() if level == 0]
199
- # l1_indices = [i for i, level in levels.items() if level == 1]
200
-
201
- # # Ensure there are exactly 3 L0 nodes
202
- # if len(l0_indices) < 3:
203
- # raise ValueError("There should be exactly 3 L0 nodes to attach edges correctly.")
204
-
205
- # # Split L1 nodes into 3 groups of 4 for attaching to L0 nodes
206
- # for i, l1_node in enumerate(l1_indices):
207
- # if i < 4:
208
- # edges.append((l0_indices[0], l1_node)) # Connect to the first L0 node
209
- # elif i < 8:
210
- # edges.append((l0_indices[1], l1_node)) # Connect to the second L0 node
211
- # else:
212
- # edges.append((l0_indices[2], l1_node)) # Connect to the third L0 node
213
-
214
- # return levels, edges
215
-
216
- # # Get levels and dynamic edges
217
- # levels, edges = get_levels_and_edges(nodes)
218
-
219
- # # Calculate positions
220
- # positions = {}
221
- # level_heights = defaultdict(int)
222
- # for node, level in levels.items():
223
- # level_heights[level] += 1
224
-
225
- # y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
226
- # x_gap = 2
227
- # l1_y_gap = 10
228
-
229
- # for node, level in levels.items():
230
- # if level == 1:
231
- # positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
232
- # else:
233
- # positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
234
- # y_offsets[level] += 1
235
-
236
- # # Function to highlight words in a wrapped node string
237
- # def color_highlighted_words(node, color_map):
238
- # parts = re.split(r'(\{\{.*?\}\})', node)
239
- # colored_parts = []
240
- # for part in parts:
241
- # match = re.match(r'\{\{(.*?)\}\}', part)
242
- # if match:
243
- # word = match.group(1)
244
- # color = color_map.get(word, 'black')
245
- # colored_parts.append(f"<span style='color: {color};'>{word}</span>")
246
- # else:
247
- # colored_parts.append(part)
248
- # return ''.join(colored_parts)
249
-
250
- # # Define the text for each edge
251
- # edge_texts = [
252
- # "Highest Entropy Masking",
253
- # "Pseudo-random Masking",
254
- # "Random Masking",
255
- # "Greedy Sampling",
256
- # "Temperature Sampling",
257
- # "Exponential Minimum Sampling",
258
- # "Inverse Transform Sampling",
259
- # "Greedy Sampling",
260
- # "Temperature Sampling",
261
- # "Exponential Minimum Sampling",
262
- # "Inverse Transform Sampling",
263
- # "Greedy Sampling",
264
- # "Temperature Sampling",
265
- # "Exponential Minimum Sampling",
266
- # "Inverse Transform Sampling"
267
- # ]
268
-
269
- # # Create figure
270
- # fig2 = go.Figure()
271
-
272
- # # Add nodes to the figure
273
- # for i, node in enumerate(wrapped_nodes):
274
- # colored_node = color_highlighted_words(node, global_color_map)
275
- # x, y = positions[i]
276
- # fig2.add_trace(go.Scatter(
277
- # x=[-x], # Reflect the x coordinate
278
- # y=[y],
279
- # mode='markers',
280
- # marker=dict(size=10, color='blue'),
281
- # hoverinfo='none'
282
- # ))
283
- # fig2.add_annotation(
284
- # x=-x, # Reflect the x coordinate
285
- # y=y,
286
- # text=colored_node,
287
- # showarrow=False,
288
- # xshift=15,
289
- # align="center",
290
- # font=dict(size=12),
291
- # bordercolor='black',
292
- # borderwidth=1,
293
- # borderpad=2,
294
- # bgcolor='white',
295
- # width=450,
296
- # height=65
297
- # )
298
-
299
- # # Add edges and text above each edge
300
- # for i, edge in enumerate(edges):
301
- # x0, y0 = positions[edge[0]]
302
- # x1, y1 = positions[edge[1]]
303
- # fig2.add_trace(go.Scatter(
304
- # x=[-x0, -x1], # Reflect the x coordinates
305
- # y=[y0, y1],
306
- # mode='lines',
307
- # line=dict(color='black', width=1)
308
- # ))
309
-
310
- # # Calculate the midpoint of the edge
311
- # mid_x = (-x0 + -x1) / 2
312
- # mid_y = (y0 + y1) / 2
313
-
314
- # # Adjust y position to shift text upwards
315
- # text_y_position = mid_y + 0.8 # Increase this value to shift the text further upwards
316
-
317
- # # Add text annotation above the edge
318
- # fig2.add_annotation(A surprising aspect of tests, specifically self-testing soon after exposure to new material, is that they can significantly improve your ability to learn, apply, and maintain new knowledge.
319
- # x=mid_x,
320
- # y=text_y_position,
321
- # text=edge_texts[i], # Use the text specific to this edge
322
- # showarrow=False,
323
- # font=dict(size=12),
324
- # align="center"
325
- # )
326
-
327
- # fig2.update_layout(
328
- # showlegend=False,
329
- # margin=dict(t=20, b=20, l=20, r=20),
330
- # xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
331
- # yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
332
- # width=1435, # Adjusted width to accommodate more levels
333
- # height=1000 # Adjusted height to accommodate more levels
334
- # )
335
-
336
- # return fig2
337
-
338
-
339
  import plotly.graph_objects as go
340
  import textwrap
341
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import plotly.graph_objects as go
2
  import textwrap
3
  import re
vocabulary_split.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
3
+ import torch
4
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
5
+ model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
6
def split_vocabulary(seed=42):
    """Randomly partition the tokenizer vocabulary into two buckets.

    Each vocabulary entry is assigned to the permissible bucket with
    probability 0.5, using a private random generator seeded with ``seed``.
    The module-level ``tokenizer`` is reused; nothing is reloaded per call and
    the global ``random`` module state is left untouched.

    Args:
        seed: Seed for the private random generator driving the split.

    Returns:
        A ``(permissible, non_permissible)`` tuple of dicts mapping each token
        string to its vocabulary index.
    """
    # Visit tokens in ascending id order so the same seed always yields the
    # same split, independent of the iteration order of get_vocab().
    vocab = sorted(tokenizer.get_vocab().items(), key=lambda entry: entry[1])

    # A dedicated generator produces the same draw sequence as seeding the
    # global module would, without resetting randomness for other callers.
    rng = random.Random(seed)

    # Split the vocabulary into permissible and non-permissible buckets
    permissible = {}
    non_permissible = {}

    for word, index in vocab:
        if rng.random() < 0.5:  # 50% chance of being permissible
            permissible[word] = index
        else:
            non_permissible[word] = index

    return permissible, non_permissible
28
+
29
def get_logits_for_mask(model, tokenizer, sentence):
    """Return the model's logits at the mask token position(s) of ``sentence``.

    Args:
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with a ``logits`` attribute.
        tokenizer: Callable tokenizer exposing ``mask_token_id``.
        sentence: Text containing the tokenizer's mask token.

    Returns:
        The logits of the first batch item at the mask position(s), with
        singleton dimensions squeezed away.
    """
    encoded = tokenizer(sentence, return_tensors="pt")
    is_mask = encoded["input_ids"] == tokenizer.mask_token_id
    # torch.where yields (batch index, sequence position); keep the positions.
    mask_positions = torch.where(is_mask)[1]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        all_logits = model(**encoded).logits

    return all_logits[0, mask_positions, :].squeeze()
39
+
40
def filter_logits(logits, permissible_indices):
    """Return a copy of ``logits`` with non-permissible entries set to ``-inf``.

    The mask is applied along the last (vocabulary) dimension, so both a single
    logit vector and a stack of vectors (one row per mask position) are
    supported. The input tensor is never modified.

    Args:
        logits: Floating-point tensor whose last dimension indexes the vocabulary.
        permissible_indices: 1-D boolean mask over vocabulary ids; True keeps
            the entry. If it is longer than the vocabulary dimension it is
            truncated; if shorter, the uncovered ids are treated as
            non-permissible.

    Returns:
        A new tensor (singleton dimensions squeezed when the input has more
        than one dimension) with masked-out entries equal to ``-inf``.
    """
    filtered_logits = logits.clone()
    if filtered_logits.dim() > 1:
        filtered_logits = filtered_logits.squeeze()

    mask = torch.as_tensor(permissible_indices, dtype=torch.bool, device=filtered_logits.device)
    vocab_size = filtered_logits.shape[-1]
    if mask.shape[0] > vocab_size:
        mask = mask[:vocab_size]
    elif mask.shape[0] < vocab_size:
        # Ids the mask does not cover were never marked permissible.
        padding = torch.zeros(vocab_size - mask.shape[0], dtype=torch.bool, device=mask.device)
        mask = torch.cat([mask, padding])

    # Index the last dimension explicitly so batched logits are masked per
    # vocabulary entry instead of per row.
    filtered_logits[..., ~mask] = float('-inf')
    return filtered_logits
48
+
49
# Usage example
# NOTE(review): these statements execute whenever the module is imported,
# including the model forward pass below.
permissible, non_permissible = split_vocabulary(seed=42)
# Collect the permissible ids into a set once: testing membership against
# ``permissible.values()`` directly is a linear scan for every vocabulary id.
_permissible_ids = set(permissible.values())
permissible_indices = torch.tensor([i in _permissible_ids for i in range(len(tokenizer))])

# When sampling:
sentence = "The [MASK] is bright today."
logits = get_logits_for_mask(model, tokenizer, sentence)
filtered_logits = filter_logits(logits, permissible_indices)
# Use filtered_logits for sampling
watermark_detector.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ from nltk.corpus import stopwords
3
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
4
+ from vocabulary_split import split_vocabulary, filter_logits
5
+ import torch
6
+ from lcs import find_common_subsequences
7
+ from paraphraser import generate_paraphrase
8
+
9
+ nltk.download('punkt', quiet=True)
10
+ nltk.download('stopwords', quiet=True)
11
+
12
+ tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
13
+ model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
14
+
15
permissible, _ = split_vocabulary(seed=42)
# Collect the permissible ids into a set once: testing membership against
# ``permissible.values()`` directly is a linear scan for every vocabulary id.
_permissible_ids = set(permissible.values())
# Boolean mask with one entry per tokenizer id; True marks a permissible token.
permissible_indices = torch.tensor([i in _permissible_ids for i in range(len(tokenizer))])
17
+
18
def get_non_melting_points(original_sentence):
    """Return the subsequences shared by a sentence and its paraphrases.

    Paraphrases come from ``generate_paraphrase``; the shared subsequences are
    whatever ``find_common_subsequences`` reports for the sentence against them.
    """
    return find_common_subsequences(
        original_sentence,
        generate_paraphrase(original_sentence),
    )
22
+
23
def get_word_between_points(sentence, start_point, end_point):
    """Find the first non-stop-word token lying between two anchor phrases.

    The anchors are located in token space (not character space), so the
    returned index can be used directly on ``nltk.word_tokenize(sentence)``.

    Args:
        sentence: The sentence to search.
        start_point: Indexable whose element ``[1]`` is the text of the first
            anchor (presumably an ``(index, text)`` pair from
            ``find_common_subsequences`` -- confirm against that helper).
        end_point: Same structure as ``start_point``, for the second anchor.

    Returns:
        ``(word, token_index)`` for the first token after the start anchor and
        before the end anchor that is not an English stop word, or
        ``(None, None)`` if an anchor cannot be located or no such token exists.
    """
    words = nltk.word_tokenize(sentence)
    stop_words = set(stopwords.words('english'))

    def _find_run(phrase, search_from):
        """Return (first, past_last) token positions of ``phrase``, or None."""
        phrase_tokens = nltk.word_tokenize(phrase)
        width = len(phrase_tokens)
        for pos in range(search_from, len(words) - width + 1):
            if words[pos:pos + width] == phrase_tokens:
                return pos, pos + width
        return None

    start_run = _find_run(start_point[1], 0)
    if start_run is None:
        return None, None
    # The end anchor must come after the start anchor.
    end_run = _find_run(end_point[1], start_run[1])
    if end_run is None:
        return None, None

    # Walk positions rather than words so the returned index is the token's
    # actual position, even when the same word also occurs earlier.
    for position in range(start_run[1], end_run[0]):
        word = words[position]
        if word.lower() not in stop_words:
            return word, position
    return None, None
33
+
34
def get_logits_for_mask(sentence):
    """Return the module-level model's logits at the mask position(s).

    Args:
        sentence: Text containing the tokenizer's mask token.

    Returns:
        The logits of the first batch item at the mask position(s), with
        singleton dimensions squeezed away.
    """
    encoded = tokenizer(sentence, return_tensors="pt")
    is_mask = encoded["input_ids"] == tokenizer.mask_token_id
    # torch.where yields (batch index, sequence position); keep the positions.
    mask_positions = torch.where(is_mask)[1]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        all_logits = model(**encoded).logits

    return all_logits[0, mask_positions, :].squeeze()
44
+
45
def detect_watermark(sentence):
    """Check whether a word between two non-melting points looks watermarked.

    The chosen word is replaced by ``[MASK]``, the masked-LM logits are
    restricted to the permissible vocabulary, and the word is compared against
    the five highest-scoring permissible predictions.

    Args:
        sentence: The sentence to inspect.

    Returns:
        A ``(detected, message)`` tuple: a bool plus a human-readable reason.
    """
    anchors = get_non_melting_points(sentence)
    if len(anchors) < 2:
        return False, "Not enough non-melting points found."

    word_to_check, position = get_word_between_points(sentence, anchors[0], anchors[1])
    if word_to_check is None:
        return False, "No suitable word found between non-melting points."

    # Rebuild the sentence with the candidate token swapped for the mask token.
    tokens = nltk.word_tokenize(sentence)
    tokens_with_mask = tokens[:position] + ['[MASK]'] + tokens[position + 1:]
    masked_sentence = ' '.join(tokens_with_mask)

    allowed_logits = filter_logits(get_logits_for_mask(masked_sentence), permissible_indices)

    # argsort is ascending, so the last five ids carry the highest scores.
    best_ids = allowed_logits.argsort()[-5:]
    predicted_words = [tokenizer.decode([token_id]) for token_id in best_ids]

    if word_to_check in predicted_words:
        return True, f"Watermark detected. The word '{word_to_check}' is in the permissible vocabulary."
    return False, f"No watermark detected. The word '{word_to_check}' is not in the permissible vocabulary."
69
+
70
+ # Example usage
71
+ # if __name__ == "__main__":
72
+ # test_sentence = "The quick brown fox jumps over the lazy dog."
73
+ # is_watermarked, message = detect_watermark(test_sentence)
74
+ # print(f"Is the sentence watermarked? {is_watermarked}")
75
+ # print(f"Detection message: {message}")