Upload folder using huggingface_hub

Files changed:
- .gitignore +2 -1
- .gradio/certificate.pem +31 -0
- app.py +88 -10
- detectability.py +343 -0
- distortion.py +385 -0
- euclidean_distance.py +261 -0
- gpt_mask_filling.py +70 -0
- highlighter.py +18 -0
- lcs.py +64 -11
- masking_methods.py +146 -94
- paraphraser.py +83 -31
- requirements.txt +3 -1
- sampling_methods.py +22 -35
- threeD_plot.py +137 -0
- tree.py +0 -338
- vocabulary_split.py +57 -0
- watermark_detector.py +75 -0
.gitignore CHANGED
@@ -1 +1,2 @@
-
+.env
+__pycache__/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
app.py CHANGED
@@ -1,19 +1,22 @@
 import nltk
 nltk.download('stopwords')
-from transformers import AutoTokenizer
-from transformers import AutoModelForSeq2SeqLM
+# from transformers import AutoTokenizer
+# from transformers import AutoModelForSeq2SeqLM
 import plotly.graph_objs as go
 from transformers import pipeline
-from matplotlib.colors import ListedColormap, rgb2hex
 import random
 import gradio as gr
 from tree import generate_subplot1, generate_subplot2
 from paraphraser import generate_paraphrase
-from lcs import find_common_subsequences
-from highlighter import highlight_common_words, highlight_common_words_dict
+from lcs import find_common_subsequences, find_common_gram_positions
+from highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
 from entailment import analyze_entailment
 from masking_methods import mask_non_stopword, mask_non_stopword_pseudorandom, high_entropy_words
 from sampling_methods import sample_word
+from detectability import SentenceDetectabilityCalculator
+from distortion import SentenceDistortionCalculator
+from euclidean_distance import SentenceEuclideanDistanceCalculator
+from threeD_plot import gen_three_D_plot
 
 
 # Function for the Gradio interface
@@ -21,8 +24,10 @@ def model(prompt):
     user_prompt = prompt
     paraphrased_sentences = generate_paraphrase(user_prompt)
     analyzed_paraphrased_sentences, selected_sentences, discarded_sentences = analyze_entailment(user_prompt, paraphrased_sentences, 0.7)
-
+    print(analyze_entailment(user_prompt, paraphrased_sentences, 0.7))
     common_grams = find_common_subsequences(user_prompt, selected_sentences)
+    subsequences = [subseq for _, subseq in common_grams]
+    common_grams_position = find_common_gram_positions(selected_sentences, subsequences)
 
     masked_sentences = []
     masked_words = []
@@ -51,7 +56,8 @@
     sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='temperature', temperature=1.0))
     sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='greedy', temperature=1.0))
 
-
+
+
 
     colors = ["red", "blue", "brown", "green"]
 
@@ -83,7 +89,60 @@ def model(prompt):
     masked_index += 3
     sampled_index += 12
 
-
+    reparaphrased_sentences = generate_paraphrase(sampled_sentences)
+
+    len_reparaphrased_sentences = len(reparaphrased_sentences)
+
+    reparaphrased_sentences_list = []
+
+    # Process the sentences in batches of 10
+    for i in range(0, len_reparaphrased_sentences, 10):
+        # Get the current batch of 10 sentences
+        batch = reparaphrased_sentences[i:i + 10]
+
+        # Check if the batch has exactly 10 sentences
+        if len(batch) == 10:
+            # Render the batch to HTML and store the result in the list
+            html_block = reparaphrased_sentences_html(batch)
+            reparaphrased_sentences_list.append(html_block)
+
+    distortion_list = []
+    detectability_list = []
+    euclidean_dist_list = []
+
+    distortion_calculator = SentenceDistortionCalculator(user_prompt, reparaphrased_sentences)
+    distortion_calculator.calculate_all_metrics()
+    distortion_calculator.normalize_metrics()
+    distortion_calculator.calculate_combined_distortion()
+
+    distortion = distortion_calculator.get_combined_distortions()
+
+    for each in distortion.items():
+        distortion_list.append(each[1])
+
+    detectability_calculator = SentenceDetectabilityCalculator(user_prompt, reparaphrased_sentences)
+    detectability_calculator.calculate_all_metrics()
+    detectability_calculator.normalize_metrics()
+    detectability_calculator.calculate_combined_detectability()
+
+    detectability = detectability_calculator.get_combined_detectabilities()
+
+    for each in detectability.items():
+        detectability_list.append(each[1])
+
+    euclidean_dist_calculator = SentenceEuclideanDistanceCalculator(user_prompt, reparaphrased_sentences)
+    euclidean_dist_calculator.calculate_all_metrics()
+    euclidean_dist_calculator.normalize_metrics()
+
+    # Read the normalized distances from the Euclidean calculator
+    euclidean_dist = euclidean_dist_calculator.get_normalized_metrics()
+
+    for each in euclidean_dist.items():
+        euclidean_dist_list.append(each[1])
+
+    three_D_plot = gen_three_D_plot(detectability_list, distortion_list, euclidean_dist_list)
+
+    return [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + trees1 + trees2 + reparaphrased_sentences_list + [three_D_plot]
 
 
 with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
@@ -127,8 +186,27 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
             tree2 = gr.Plot()
             tree2_tabs.append(tree2)
 
-    submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs)
+    # Adding the "Re-paraphrased Sentences" section
+    with gr.Row():
+        gr.Markdown("### Re-paraphrased Sentences")  # Label for re-paraphrased sentences
+
+    # Adding tabs for the re-paraphrased sentences
+    with gr.Row():
+        with gr.Tabs():
+            reparaphrased_sentences_tabs = []
+            for i in range(120):  # 120 tabs for 120 batches of sentences
+                with gr.TabItem(f"Sentence {i+1}"):
+                    reparaphrased_sent_html = gr.HTML()  # Placeholder for each batch
+                    reparaphrased_sentences_tabs.append(reparaphrased_sent_html)
+
+    with gr.Row():
+        gr.Markdown("### 3D Plot for Sweet Spot")
+    with gr.Row():
+        three_D_plot = gr.Plot()
+
+
+    submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs + reparaphrased_sentences_tabs + [three_D_plot])
     clear_button.click(lambda: "", inputs=None, outputs=user_input)
-    clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs)
+    clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs + reparaphrased_sentences_tabs + [three_D_plot])
 
 demo.launch(share=True)
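The three metric modules added below share one call pattern, which the new code in app.py follows: construct with (original, paraphrases), compute raw metrics, normalize, then read the combined values. A minimal sketch of that pipeline with toy inputs (gen_three_D_plot is defined in the threeD_plot.py file added by this commit):

from distortion import SentenceDistortionCalculator
from detectability import SentenceDetectabilityCalculator
from euclidean_distance import SentenceEuclideanDistanceCalculator
from threeD_plot import gen_three_D_plot

prompt = "The quick brown fox jumps over the lazy dog"
paraphrases = [
    "A quick auburn fox leaps across a sleepy canine.",
    "The agile russet fox bounds over an idle hound.",
]

# Distortion: Levenshtein, word-level change, KL divergence, GPT-2 perplexity
dist = SentenceDistortionCalculator(prompt, paraphrases)
dist.calculate_all_metrics()
dist.normalize_metrics()
dist.calculate_combined_distortion()
distortion_vals = list(dist.get_combined_distortions().values())

# Detectability: BLEU, BERT cosine similarity, STS score
det = SentenceDetectabilityCalculator(prompt, paraphrases)
det.calculate_all_metrics()
det.normalize_metrics()
det.calculate_combined_detectability()
detectability_vals = list(det.get_combined_detectabilities().values())

# Euclidean distance between sentence embeddings
euc = SentenceEuclideanDistanceCalculator(prompt, paraphrases)
euc.calculate_all_metrics()
euc.normalize_metrics()
euclidean_vals = list(euc.get_normalized_metrics().values())

fig = gen_three_D_plot(detectability_vals, distortion_vals, euclidean_vals)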
detectability.py ADDED
@@ -0,0 +1,343 @@
# Import necessary libraries
import nltk

import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer
from sentence_transformers import SentenceTransformer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Download NLTK data if not already present
nltk.download('punkt', quiet=True)

detectability_val = {}


class SentenceDetectabilityCalculator:
    """
    A class to calculate and analyze detectability metrics between an original sentence and paraphrased sentences.
    """

    def __init__(self, original_sentence, paraphrased_sentences):
        """
        Initialize the calculator with the original sentence and a list of paraphrased sentences.
        """
        self.original_sentence = original_sentence
        self.paraphrased_sentences = paraphrased_sentences

        # Raw metric dictionaries
        self.bleu_scores = {}
        self.cosine_similarities = {}
        self.sts_scores = {}

        # Normalized metric dictionaries
        self.normalized_bleu = {}
        self.normalized_cosine = {}
        self.normalized_sts = {}

        # Combined detectability dictionary
        self.combined_detectabilities = {}

        # Load pre-trained BERT and SentenceTransformer for cosine similarity and STS score
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.sts_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def calculate_all_metrics(self):
        """
        Calculate all detectability metrics for each paraphrased sentence.
        """
        original_embedding = self._get_sentence_embedding(self.original_sentence)
        sts_original_embedding = self.sts_model.encode(self.original_sentence)

        for idx, paraphrased_sentence in enumerate(self.paraphrased_sentences):
            key = f"Sentence_{idx+1}"

            # BLEU Score
            self.bleu_scores[key] = self._calculate_bleu(self.original_sentence, paraphrased_sentence)

            # Cosine Similarity
            paraphrase_embedding = self._get_sentence_embedding(paraphrased_sentence)
            self.cosine_similarities[key] = cosine_similarity([original_embedding], [paraphrase_embedding])[0][0]

            # STS Score
            sts_paraphrase_embedding = self.sts_model.encode(paraphrased_sentence)
            self.sts_scores[key] = cosine_similarity([sts_original_embedding], [sts_paraphrase_embedding])[0][0]

    def normalize_metrics(self):
        """
        Normalize all metrics to be between 0 and 1.
        """
        self.normalized_bleu = self._normalize_dict(self.bleu_scores)
        self.normalized_cosine = self._normalize_dict(self.cosine_similarities)
        self.normalized_sts = self._normalize_dict(self.sts_scores)

    def calculate_combined_detectability(self):
        """
        Calculate the combined detectability using the root mean square of the normalized metrics.
        """
        for key in self.normalized_bleu.keys():
            rms = np.sqrt(
                (
                    self.normalized_bleu[key] ** 2 +
                    self.normalized_cosine[key] ** 2 +
                    self.normalized_sts[key] ** 2
                ) / 3
            )
            self.combined_detectabilities[key] = rms

    def plot_metrics(self):
        """
        Plot each normalized metric and the combined detectability in separate graphs.
        """
        keys = list(self.normalized_bleu.keys())
        indices = np.arange(len(keys))

        # Prepare data for plotting
        metrics = {
            'BLEU Score': [self.normalized_bleu[key] for key in keys],
            'Cosine Similarity': [self.normalized_cosine[key] for key in keys],
            'STS Score': [self.normalized_sts[key] for key in keys],
            'Combined Detectability': [self.combined_detectabilities[key] for key in keys]
        }

        # Plot each metric separately
        for metric_name, values in metrics.items():
            plt.figure(figsize=(12, 6))
            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
            plt.xlabel('Sentence Index')
            plt.ylabel('Normalized Value (0-1)')
            plt.title(f'Normalized {metric_name}')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

    # Private methods for metric calculations
    def _calculate_bleu(self, reference, candidate):
        """
        Calculate the BLEU score between the original and paraphrased sentence using smoothing.
        """
        reference_tokens = nltk.word_tokenize(reference)
        candidate_tokens = nltk.word_tokenize(candidate)
        smoothing = SmoothingFunction().method1
        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)

    def _get_sentence_embedding(self, sentence):
        """
        Get sentence embedding using BERT.
        """
        tokens = self.bert_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.bert_model(**tokens)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def _normalize_dict(self, metric_dict):
        """
        Normalize the values in a dictionary to be between 0 and 1.
        """
        values = np.array(list(metric_dict.values()))
        min_val = values.min()
        max_val = values.max()
        # Avoid division by zero if all values are the same
        if max_val - min_val == 0:
            normalized_values = np.zeros_like(values)
        else:
            normalized_values = (values - min_val) / (max_val - min_val)
        return dict(zip(metric_dict.keys(), normalized_values))

    # Getter methods
    def get_normalized_metrics(self):
        """
        Get all normalized metrics as a dictionary.
        """
        return {
            'BLEU Score': self.normalized_bleu,
            'Cosine Similarity': self.normalized_cosine,
            'STS Score': self.normalized_sts
        }

    def get_combined_detectabilities(self):
        """
        Get the dictionary of combined detectability values.
        """
        return self.combined_detectabilities


# Example usage
if __name__ == "__main__":
    # Original sentence
    original_sentence = "The quick brown fox jumps over the lazy dog"

    # Paraphrased sentences
    paraphrased_sentences = [
        # Original 1: "A swift auburn fox leaps across a sleepy canine."
        "The swift auburn fox leaps across a sleepy canine.",
        "A quick auburn fox leaps across a sleepy canine.",
        "A swift ginger fox leaps across a sleepy canine.",
        "A swift auburn fox bounds across a sleepy canine.",
        "A swift auburn fox leaps across a tired canine.",
        "Three swift auburn foxes leap across a sleepy canine.",
        "The vulpine specimen rapidly traverses over a dormant dog.",
        "Like lightning, the russet hunter soars over the drowsy guardian.",
        "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
        "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
        "A swift auburn predator navigates across a lethargic pet.",
        "Subject A (fox) demonstrates velocity over Subject B (dog).",

        # Original 2: "The agile russet fox bounds over an idle hound."
        "Some agile russet foxes bound over an idle hound.",
        "The nimble russet fox bounds over an idle hound.",
        "The agile brown fox bounds over an idle hound.",
        "The agile russet fox jumps over an idle hound.",
        "The agile russet fox bounds over a lazy hound.",
        "Two agile russet foxes bound over an idle hound.",
        "A dexterous vulpine surpasses a stationary canine.",
        "Quick as thought, the copper warrior sails over the guardian.",
        "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
        "A dexterous V. vulpes exceeds the plane of an inactive canine.",
        "An agile russet hunter maneuvers above a resting hound.",
        "Test subject F-1 achieves displacement superior to subject D-1.",

        # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
        "The nimble mahogany vulpine vaults above a drowsy dog.",
        "A swift mahogany vulpine vaults above a drowsy dog.",
        "A nimble reddish vulpine vaults above a drowsy dog.",
        "A nimble mahogany fox vaults above a drowsy dog.",
        "A nimble mahogany vulpine leaps above a drowsy dog.",
        "Four nimble mahogany vulpines vault above a drowsy dog.",
        "An agile specimen of reddish fur surpasses a somnolent canine.",
        "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
        "Tha quick brown beastie jumps o'er the tired pup, aye.",
        "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
        "A nimble rust-colored predator crosses above a drowsy pet.",
        "Observed: Subject Red executes vertical motion over Subject Gray.",

        # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
        "A speedy copper-colored fox hops over the lethargic pup.",
        "The quick copper-colored fox hops over the lethargic pup.",
        "The speedy bronze fox hops over the lethargic pup.",
        "The speedy copper-colored fox jumps over the lethargic pup.",
        "The speedy copper-colored fox hops over the tired pup.",
        "Multiple speedy copper-colored foxes hop over the lethargic pup.",
        "A rapid vulpine of bronze hue traverses an inactive young canine.",
        "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
        "Tha fast copper beastie leaps o'er the sleepy wee dog.",
        "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
        "A fleet copper-toned predator moves past a sluggish young dog.",
        "Field note: Adult fox subject exceeds puppy subject vertically.",

        # Original 5: "A rapid tawny fox springs over a sluggish dog."
        "The rapid tawny fox springs over a sluggish dog.",
        "A quick tawny fox springs over a sluggish dog.",
        "A rapid golden fox springs over a sluggish dog.",
        "A rapid tawny fox jumps over a sluggish dog.",
        "A rapid tawny fox springs over a lazy dog.",
        "Six rapid tawny foxes spring over a sluggish dog.",
        "An expeditious yellowish vulpine surpasses a torpid canine.",
        "Fast as a bullet, the golden hunter vaults over the idle guard.",
        "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
        "One V. vulpes displays rapid transit over one inactive C. familiaris.",
        "A speedy yellow-brown predator bypasses a motionless dog.",
        "Log entry: Vulpine subject achieves swift vertical displacement.",

        # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
        "A fleet-footed chestnut fox soars above an indolent canine.",
        "The swift chestnut fox soars above an indolent canine.",
        "The fleet-footed brown fox soars above an indolent canine.",
        "The fleet-footed chestnut fox leaps above an indolent canine.",
        "The fleet-footed chestnut fox soars above a lazy canine.",
        "Several fleet-footed chestnut foxes soar above an indolent canine.",
        "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
        "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
        "Tha quick brown beastie sails o'er the sleepy hound, ken.",
        "Single agile V. vulpes achieves elevation above stationary canine.",
        "A nimble brown predator glides over an unmoving domestic animal.",
        "Research note: Brown subject displays superior vertical mobility.",

        # Original 7: "A fast ginger fox hurdles past a slothful dog."
        "The fast ginger fox hurdles past a slothful dog.",
        "A quick ginger fox hurdles past a slothful dog.",
        "A fast red fox hurdles past a slothful dog.",
        "A fast ginger fox jumps past a slothful dog.",
        "A fast ginger fox hurdles past a lazy dog.",
        "Five fast ginger foxes hurdle past a slothful dog.",
        "A rapid orange vulpine bypasses a lethargic canine.",
        "Quick as lightning, the flame-colored hunter races past the lazy guard.",
        "Tha swift ginger beastie leaps past the tired doggy, ye see.",
        "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
        "A speedy red-orange predator overtakes a motionless dog.",
        "Data point: Orange subject demonstrates rapid transit past Gray subject.",

        # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
        "A spry rusty-colored fox jumps across a dozing hound.",
        "The agile rusty-colored fox jumps across a dozing hound.",
        "The spry reddish fox jumps across a dozing hound.",
        "The spry rusty-colored fox leaps across a dozing hound.",
        "The spry rusty-colored fox jumps across a sleeping hound.",
        "Multiple spry rusty-colored foxes jump across a dozing hound.",
        "An agile rust-toned vulpine traverses a somnolent canine.",
        "Nimble as thought, the copper hunter bounds over the resting guard.",
        "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
        "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
        "A lithe rust-tinted predator moves past a slumbering dog.",
        "Observation: Russet subject exhibits agility over dormant subject.",

        # Original 9: "A quick tan fox leaps over an inactive dog."
        "The quick tan fox leaps over an inactive dog.",
        "A swift tan fox leaps over an inactive dog.",
        "A quick beige fox leaps over an inactive dog.",
        "A quick tan fox jumps over an inactive dog.",
        "A quick tan fox leaps over a motionless dog.",
        "Seven quick tan foxes leap over an inactive dog.",
        "A rapid light-brown vulpine surpasses a stationary canine.",
        "Fast as wind, the sand-colored hunter soars over the still guard.",
        "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
        "One agile fawn V. vulpes traverses one immobile C. familiaris.",
        "A fleet tan-colored predator bypasses an unmoving dog.",
        "Field report: Tan subject demonstrates movement over static subject.",

        # Original 10: "The brisk auburn vulpine bounces over a listless canine."
        "Some brisk auburn vulpines bounce over a listless canine.",
        "The quick auburn vulpine bounces over a listless canine.",
        "The brisk russet vulpine bounces over a listless canine.",
        "The brisk auburn fox bounces over a listless canine.",
        "The brisk auburn vulpine jumps over a listless canine.",
        "Five brisk auburn vulpines bounce over a listless canine.",
        "The expeditious specimen supersedes a quiescent Canis lupus.",
        "Swift as wind, the russet hunter vaults over the idle guardian.",
        "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
        "One V. vulpes achieves displacement over inactive C. familiaris.",
        "A high-velocity auburn predator traverses an immobile animal.",
        "Final observation: Red subject shows mobility over Gray subject."
    ]

    # Initialize the calculator
    calculator = SentenceDetectabilityCalculator(original_sentence, paraphrased_sentences)

    # Calculate all metrics
    calculator.calculate_all_metrics()

    # Normalize the metrics
    calculator.normalize_metrics()

    # Calculate combined detectability
    calculator.calculate_combined_detectability()

    # Retrieve the normalized metrics and combined detectabilities
    normalized_metrics = calculator.get_normalized_metrics()
    combined_detectabilities = calculator.get_combined_detectabilities()
    detectability_val = combined_detectabilities

    # Display the results
    # print("Normalized Metrics:")
    # for metric_name, metric_dict in normalized_metrics.items():
    #     print(f"\n{metric_name}:")
    #     for key, value in metric_dict.items():
    #         print(f"{key}: {value:.4f}")

    print("\nCombined Detectabilities:")
    for each in combined_detectabilities.items():
        print(f"{each[1]}")

    # Plot the metrics
    # calculator.plot_metrics()
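In symbols (notation introduced here for clarity), _normalize_dict min-max scales each raw metric across the paraphrase set, and calculate_combined_detectability takes the root mean square of the three normalized scores:

$$\hat m_s = \frac{m_s - \min_{s'} m_{s'}}{\max_{s'} m_{s'} - \min_{s'} m_{s'}}, \qquad \mathrm{detectability}(s) = \sqrt{\tfrac{1}{3}\left(\hat b_s^{\,2} + \hat c_s^{\,2} + \hat t_s^{\,2}\right)}$$

where $\hat b_s$, $\hat c_s$, $\hat t_s$ are the normalized BLEU, BERT cosine-similarity, and STS scores for paraphrase $s$. When all raw values are identical, the code maps every normalized value to 0 instead of dividing by zero.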
distortion.py ADDED
@@ -0,0 +1,385 @@
# Import necessary libraries
import nltk
import numpy as np
import torch
import matplotlib.pyplot as plt
from scipy.special import rel_entr
from collections import Counter
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

distortion_val = {}

# Download NLTK data if not already present
nltk.download('punkt', quiet=True)


class SentenceDistortionCalculator:
    """
    A class to calculate and analyze distortion metrics between an original sentence and modified sentences.
    """

    def __init__(self, original_sentence, modified_sentences):
        """
        Initialize the calculator with the original sentence and a list of modified sentences.
        """
        self.original_sentence = original_sentence
        self.modified_sentences = modified_sentences

        # Raw metric dictionaries
        self.levenshtein_distances = {}
        self.word_level_changes = {}
        self.kl_divergences = {}
        self.perplexities = {}

        # Normalized metric dictionaries
        self.normalized_levenshtein = {}
        self.normalized_word_changes = {}
        self.normalized_kl_divergences = {}
        self.normalized_perplexities = {}

        # Combined distortion dictionary
        self.combined_distortions = {}

        # Initialize GPT-2 model and tokenizer for perplexity calculation
        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        self.model = GPT2LMHeadModel.from_pretrained("gpt2")
        self.model.eval()  # Set model to evaluation mode

    def calculate_all_metrics(self):
        """
        Calculate all distortion metrics for each modified sentence.
        """
        for idx, modified_sentence in enumerate(self.modified_sentences):
            key = f"Sentence_{idx+1}"
            self.levenshtein_distances[key] = self._calculate_levenshtein_distance(modified_sentence)
            self.word_level_changes[key] = self._calculate_word_level_change(modified_sentence)
            self.kl_divergences[key] = self._calculate_kl_divergence(modified_sentence)
            self.perplexities[key] = self._calculate_perplexity(modified_sentence)

    def normalize_metrics(self):
        """
        Normalize all metrics to be between 0 and 1.
        """
        self.normalized_levenshtein = self._normalize_dict(self.levenshtein_distances)
        self.normalized_word_changes = self._normalize_dict(self.word_level_changes)
        self.normalized_kl_divergences = self._normalize_dict(self.kl_divergences)
        self.normalized_perplexities = self._normalize_dict(self.perplexities)

    def calculate_combined_distortion(self):
        """
        Calculate the combined distortion using the root mean square of the normalized metrics.
        """
        for key in self.normalized_levenshtein.keys():
            rms = np.sqrt(
                (
                    self.normalized_levenshtein[key] ** 2 +
                    self.normalized_word_changes[key] ** 2 +
                    self.normalized_kl_divergences[key] ** 2 +
                    self.normalized_perplexities[key] ** 2
                ) / 4
            )
            self.combined_distortions[key] = rms

    def plot_metrics(self):
        """
        Plot each normalized metric and the combined distortion in separate graphs.
        """
        keys = list(self.normalized_levenshtein.keys())
        indices = np.arange(len(keys))

        # Prepare data for plotting
        metrics = {
            'Levenshtein Distance': [self.normalized_levenshtein[key] for key in keys],
            'Word-Level Changes': [self.normalized_word_changes[key] for key in keys],
            'KL Divergence': [self.normalized_kl_divergences[key] for key in keys],
            'Perplexity': [self.normalized_perplexities[key] for key in keys],
            'Combined Distortion': [self.combined_distortions[key] for key in keys]
        }

        # Plot each metric separately
        for metric_name, values in metrics.items():
            plt.figure(figsize=(12, 6))
            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
            plt.xlabel('Sentence Index')
            plt.ylabel('Normalized Value (0-1)')
            plt.title(f'Normalized {metric_name}')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

    # Private methods for metric calculations
    def _calculate_levenshtein_distance(self, modified_sentence):
        """
        Calculate the Levenshtein Distance between the original and modified sentence.
        """
        return nltk.edit_distance(self.original_sentence, modified_sentence)

    def _calculate_word_level_change(self, modified_sentence):
        """
        Calculate the proportion of word-level changes between the original and modified sentence.
        """
        original_words = self.original_sentence.split()
        modified_words = modified_sentence.split()
        total_words = max(len(original_words), len(modified_words))
        changed_words = sum(1 for o, m in zip(original_words, modified_words) if o != m)
        # Account for extra words in the modified sentence
        changed_words += abs(len(original_words) - len(modified_words))
        distortion = changed_words / total_words
        return distortion

    def _calculate_kl_divergence(self, modified_sentence):
        """
        Calculate the KL Divergence between the word distributions of the original and modified sentence.
        """
        original_counts = Counter(self.original_sentence.lower().split())
        modified_counts = Counter(modified_sentence.lower().split())
        all_words = set(original_counts.keys()).union(set(modified_counts.keys()))
        original_probs = np.array([original_counts.get(word, 0) for word in all_words], dtype=float)
        modified_probs = np.array([modified_counts.get(word, 0) for word in all_words], dtype=float)

        # Add smoothing to avoid division by zero
        original_probs += 1e-10
        modified_probs += 1e-10

        # Normalize to create probability distributions
        original_probs /= original_probs.sum()
        modified_probs /= modified_probs.sum()

        kl_divergence = np.sum(rel_entr(original_probs, modified_probs))
        return kl_divergence

    def _calculate_perplexity(self, sentence):
        """
        Calculate the perplexity of a sentence using GPT-2.
        """
        encodings = self.tokenizer(sentence, return_tensors='pt')
        max_length = self.model.config.n_positions
        stride = max_length

        lls = []
        for i in range(0, encodings.input_ids.size(1), stride):
            begin_loc = i
            end_loc = min(i + stride, encodings.input_ids.size(1))
            trg_len = end_loc - begin_loc

            input_ids = encodings.input_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()

            with torch.no_grad():
                outputs = self.model(input_ids, labels=target_ids)
                log_likelihood = outputs.loss * trg_len

            lls.append(log_likelihood)

        ppl = torch.exp(torch.stack(lls).sum() / end_loc)
        return ppl.item()

    def _normalize_dict(self, metric_dict):
        """
        Normalize the values in a dictionary to be between 0 and 1.
        """
        values = np.array(list(metric_dict.values()))
        min_val = values.min()
        max_val = values.max()
        # Avoid division by zero if all values are the same
        if max_val - min_val == 0:
            normalized_values = np.zeros_like(values)
        else:
            normalized_values = (values - min_val) / (max_val - min_val)
        return dict(zip(metric_dict.keys(), normalized_values))

    # Getter methods
    def get_normalized_metrics(self):
        """
        Get all normalized metrics as a dictionary.
        """
        return {
            'Levenshtein Distance': self.normalized_levenshtein,
            'Word-Level Changes': self.normalized_word_changes,
            'KL Divergence': self.normalized_kl_divergences,
            'Perplexity': self.normalized_perplexities
        }

    def get_combined_distortions(self):
        """
        Get the dictionary of combined distortion values.
        """
        return self.combined_distortions

# # Example usage
# if __name__ == "__main__":
#     # Original sentence
#     original_sentence = "The quick brown fox jumps over the lazy dog"

#     paraphrased_sentences = [
#         # Original 1: "A swift auburn fox leaps across a sleepy canine."
#         "The swift auburn fox leaps across a sleepy canine.",
#         "A quick auburn fox leaps across a sleepy canine.",
#         "A swift ginger fox leaps across a sleepy canine.",
#         "A swift auburn fox bounds across a sleepy canine.",
#         "A swift auburn fox leaps across a tired canine.",
#         "Three swift auburn foxes leap across a sleepy canine.",
#         "The vulpine specimen rapidly traverses over a dormant dog.",
#         "Like lightning, the russet hunter soars over the drowsy guardian.",
#         "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
#         "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
#         "A swift auburn predator navigates across a lethargic pet.",
#         "Subject A (fox) demonstrates velocity over Subject B (dog).",

#         # Original 2: "The agile russet fox bounds over an idle hound."
#         "Some agile russet foxes bound over an idle hound.",
#         "The nimble russet fox bounds over an idle hound.",
#         "The agile brown fox bounds over an idle hound.",
#         "The agile russet fox jumps over an idle hound.",
#         "The agile russet fox bounds over a lazy hound.",
#         "Two agile russet foxes bound over an idle hound.",
#         "A dexterous vulpine surpasses a stationary canine.",
#         "Quick as thought, the copper warrior sails over the guardian.",
#         "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
#         "A dexterous V. vulpes exceeds the plane of an inactive canine.",
#         "An agile russet hunter maneuvers above a resting hound.",
#         "Test subject F-1 achieves displacement superior to subject D-1.",

#         # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
#         "The nimble mahogany vulpine vaults above a drowsy dog.",
#         "A swift mahogany vulpine vaults above a drowsy dog.",
#         "A nimble reddish vulpine vaults above a drowsy dog.",
#         "A nimble mahogany fox vaults above a drowsy dog.",
#         "A nimble mahogany vulpine leaps above a drowsy dog.",
#         "Four nimble mahogany vulpines vault above a drowsy dog.",
#         "An agile specimen of reddish fur surpasses a somnolent canine.",
#         "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
#         "Tha quick brown beastie jumps o'er the tired pup, aye.",
#         "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
#         "A nimble rust-colored predator crosses above a drowsy pet.",
#         "Observed: Subject Red executes vertical motion over Subject Gray.",

#         # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
#         "A speedy copper-colored fox hops over the lethargic pup.",
#         "The quick copper-colored fox hops over the lethargic pup.",
#         "The speedy bronze fox hops over the lethargic pup.",
#         "The speedy copper-colored fox jumps over the lethargic pup.",
#         "The speedy copper-colored fox hops over the tired pup.",
#         "Multiple speedy copper-colored foxes hop over the lethargic pup.",
#         "A rapid vulpine of bronze hue traverses an inactive young canine.",
#         "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
#         "Tha fast copper beastie leaps o'er the sleepy wee dog.",
#         "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
#         "A fleet copper-toned predator moves past a sluggish young dog.",
#         "Field note: Adult fox subject exceeds puppy subject vertically.",

#         # Original 5: "A rapid tawny fox springs over a sluggish dog."
#         "The rapid tawny fox springs over a sluggish dog.",
#         "A quick tawny fox springs over a sluggish dog.",
#         "A rapid golden fox springs over a sluggish dog.",
#         "A rapid tawny fox jumps over a sluggish dog.",
#         "A rapid tawny fox springs over a lazy dog.",
#         "Six rapid tawny foxes spring over a sluggish dog.",
#         "An expeditious yellowish vulpine surpasses a torpid canine.",
#         "Fast as a bullet, the golden hunter vaults over the idle guard.",
#         "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
#         "One V. vulpes displays rapid transit over one inactive C. familiaris.",
#         "A speedy yellow-brown predator bypasses a motionless dog.",
#         "Log entry: Vulpine subject achieves swift vertical displacement.",

#         # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
#         "A fleet-footed chestnut fox soars above an indolent canine.",
#         "The swift chestnut fox soars above an indolent canine.",
#         "The fleet-footed brown fox soars above an indolent canine.",
#         "The fleet-footed chestnut fox leaps above an indolent canine.",
#         "The fleet-footed chestnut fox soars above a lazy canine.",
#         "Several fleet-footed chestnut foxes soar above an indolent canine.",
#         "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
#         "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
#         "Tha quick brown beastie sails o'er the sleepy hound, ken.",
#         "Single agile V. vulpes achieves elevation above stationary canine.",
#         "A nimble brown predator glides over an unmoving domestic animal.",
#         "Research note: Brown subject displays superior vertical mobility.",

#         # Original 7: "A fast ginger fox hurdles past a slothful dog."
#         "The fast ginger fox hurdles past a slothful dog.",
#         "A quick ginger fox hurdles past a slothful dog.",
#         "A fast red fox hurdles past a slothful dog.",
#         "A fast ginger fox jumps past a slothful dog.",
#         "A fast ginger fox hurdles past a lazy dog.",
#         "Five fast ginger foxes hurdle past a slothful dog.",
#         "A rapid orange vulpine bypasses a lethargic canine.",
#         "Quick as lightning, the flame-colored hunter races past the lazy guard.",
#         "Tha swift ginger beastie leaps past the tired doggy, ye see.",
#         "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
#         "A speedy red-orange predator overtakes a motionless dog.",
#         "Data point: Orange subject demonstrates rapid transit past Gray subject.",

#         # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
#         "A spry rusty-colored fox jumps across a dozing hound.",
#         "The agile rusty-colored fox jumps across a dozing hound.",
#         "The spry reddish fox jumps across a dozing hound.",
#         "The spry rusty-colored fox leaps across a dozing hound.",
#         "The spry rusty-colored fox jumps across a sleeping hound.",
#         "Multiple spry rusty-colored foxes jump across a dozing hound.",
#         "An agile rust-toned vulpine traverses a somnolent canine.",
#         "Nimble as thought, the copper hunter bounds over the resting guard.",
#         "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
#         "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
#         "A lithe rust-tinted predator moves past a slumbering dog.",
#         "Observation: Russet subject exhibits agility over dormant subject.",

#         # Original 9: "A quick tan fox leaps over an inactive dog."
#         "The quick tan fox leaps over an inactive dog.",
#         "A swift tan fox leaps over an inactive dog.",
#         "A quick beige fox leaps over an inactive dog.",
#         "A quick tan fox jumps over an inactive dog.",
#         "A quick tan fox leaps over a motionless dog.",
#         "Seven quick tan foxes leap over an inactive dog.",
#         "A rapid light-brown vulpine surpasses a stationary canine.",
#         "Fast as wind, the sand-colored hunter soars over the still guard.",
#         "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
#         "One agile fawn V. vulpes traverses one immobile C. familiaris.",
#         "A fleet tan-colored predator bypasses an unmoving dog.",
#         "Field report: Tan subject demonstrates movement over static subject.",

#         # Original 10: "The brisk auburn vulpine bounces over a listless canine."
#         "Some brisk auburn vulpines bounce over a listless canine.",
#         "The quick auburn vulpine bounces over a listless canine.",
#         "The brisk russet vulpine bounces over a listless canine.",
#         "The brisk auburn fox bounces over a listless canine.",
#         "The brisk auburn vulpine jumps over a listless canine.",
#         "Five brisk auburn vulpines bounce over a listless canine.",
#         "The expeditious specimen supersedes a quiescent Canis lupus.",
#         "Swift as wind, the russet hunter vaults over the idle guardian.",
#         "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
#         "One V. vulpes achieves displacement over inactive C. familiaris.",
#         "A high-velocity auburn predator traverses an immobile animal.",
#         "Final observation: Red subject shows mobility over Gray subject."
#     ]

#     # Initialize the calculator
#     calculator = SentenceDistortionCalculator(original_sentence, paraphrased_sentences)

#     # Calculate all metrics
#     calculator.calculate_all_metrics()

#     # Normalize the metrics
#     calculator.normalize_metrics()

#     # Calculate combined distortion
#     calculator.calculate_combined_distortion()

#     # Retrieve the normalized metrics and combined distortions
#     normalized_metrics = calculator.get_normalized_metrics()
#     combined_distortions = calculator.get_combined_distortions()
#     distortion_val = combined_distortions
#     # Display the results
#     print("Normalized Metrics:")
#     for metric_name, metric_dict in normalized_metrics.items():
#         print(f"\n{metric_name}:")
#         for key, value in metric_dict.items():
#             print(f"{key}: {value:.4f}")

#     print("\nCombined Distortions:")
#     for key, value in combined_distortions.items():
#         print(f"{key}: {value:.4f}")

#     # Plot the metrics
#     calculator.plot_metrics()
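The two less self-explanatory distortion metrics, in symbols (notation introduced here): _calculate_kl_divergence compares smoothed bag-of-words distributions $P$ (original) and $Q$ (modified), and _calculate_perplexity exponentiates GPT-2's mean token negative log-likelihood:

$$\mathrm{KL}(P \,\|\, Q) = \sum_{w} P(w)\,\log\frac{P(w)}{Q(w)}, \qquad \mathrm{PPL}(x_{1:N}) = \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N}\log p_\theta(x_i \mid x_{<i})\right)$$

Both are then min-max normalized and folded into the four-term root mean square in calculate_combined_distortion.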
euclidean_distance.py
ADDED
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import necessary libraries
|
2 |
+
import numpy as np
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
from sklearn.metrics.pairwise import euclidean_distances
|
6 |
+
euclidean_val={}
|
7 |
+
class SentenceEuclideanDistanceCalculator:
|
8 |
+
"""
|
9 |
+
A class to calculate and analyze Euclidean distance between an original sentence and paraphrased sentences.
|
10 |
+
"""
|
11 |
+
|
12 |
+
def __init__(self, original_sentence, paraphrased_sentences):
|
13 |
+
"""
|
14 |
+
Initialize the calculator with the original sentence and a list of paraphrased sentences.
|
15 |
+
"""
|
16 |
+
self.original_sentence = original_sentence
|
17 |
+
self.paraphrased_sentences = paraphrased_sentences
|
18 |
+
|
19 |
+
# Euclidean distance dictionary
|
20 |
+
self.euclidean_distances = {}
|
21 |
+
|
22 |
+
# Normalized Euclidean distances
|
23 |
+
self.normalized_euclidean = {}
|
24 |
+
|
25 |
+
# Load SentenceTransformer model for embedding calculation
|
26 |
+
self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
27 |
+
|
28 |
+
def calculate_all_metrics(self):
|
29 |
+
"""
|
30 |
+
Calculate Euclidean distance between the original and each paraphrased sentence.
|
31 |
+
"""
|
32 |
+
original_embedding = self._get_sentence_embedding(self.original_sentence)
|
33 |
+
|
34 |
+
for idx, paraphrased_sentence in enumerate(self.paraphrased_sentences):
|
35 |
+
key = f"Sentence_{idx+1}"
|
36 |
+
|
37 |
+
# Euclidean Distance
|
38 |
+
paraphrase_embedding = self._get_sentence_embedding(paraphrased_sentence)
|
39 |
+
self.euclidean_distances[key] = euclidean_distances([original_embedding], [paraphrase_embedding])[0][0]
|
40 |
+
|
41 |
+
def normalize_metrics(self):
|
42 |
+
"""
|
43 |
+
Normalize all metrics to be between 0 and 1.
|
44 |
+
"""
|
45 |
+
self.normalized_euclidean = self._normalize_dict(self.euclidean_distances)
|
46 |
+
|
47 |
+
def plot_metrics(self):
|
48 |
+
"""
|
49 |
+
Plot the normalized Euclidean distances in a graph.
|
50 |
+
"""
|
51 |
+
keys = list(self.normalized_euclidean.keys())
|
52 |
+
indices = np.arange(len(keys))
|
53 |
+
|
54 |
+
# Prepare data for plotting
|
55 |
+
plt.figure(figsize=(12, 6))
|
56 |
+
plt.plot(indices, [self.normalized_euclidean[key] for key in keys], marker='o', color=np.random.rand(3,))
|
57 |
+
plt.xlabel('Sentence Index')
|
58 |
+
plt.ylabel('Normalized Euclidean Distance (0-1)')
|
59 |
+
plt.title('Normalized Euclidean Distance')
|
60 |
+
plt.grid(True)
|
61 |
+
plt.tight_layout()
|
62 |
+
plt.show()
|
63 |
+
|
64 |
+
# Private methods for metric calculations
|
65 |
+
def _get_sentence_embedding(self, sentence):
|
66 |
+
"""
|
67 |
+
Get sentence embedding using the SentenceTransformer model.
|
68 |
+
"""
|
69 |
+
return self.model.encode(sentence)
|
70 |
+
|
71 |
+
def _normalize_dict(self, metric_dict):
|
72 |
+
"""
|
73 |
+
Normalize the values in a dictionary to be between 0 and 1.
|
74 |
+
"""
|
75 |
+
values = np.array(list(metric_dict.values()))
|
76 |
+
min_val = values.min()
|
77 |
+
max_val = values.max()
|
78 |
+
# Avoid division by zero if all values are the same
|
79 |
+
if max_val - min_val == 0:
|
80 |
+
normalized_values = np.zeros_like(values)
|
81 |
+
else:
|
82 |
+
normalized_values = (values - min_val) / (max_val - min_val)
|
83 |
+
return dict(zip(metric_dict.keys(), normalized_values))
|
84 |
+
|
85 |
+
# Getter methods
|
86 |
+
def get_normalized_metrics(self):
|
87 |
+
"""
|
88 |
+
Get the normalized Euclidean distances as a dictionary.
|
89 |
+
"""
|
90 |
+
return self.normalized_euclidean
|
91 |
+
|
92 |
+
|
# # Example usage
# if __name__ == "__main__":
#     # Original sentence
#     original_sentence = "The quick brown fox jumps over the lazy dog"

#     # Paraphrased sentences
#     paraphrased_sentences = [
#         # Original 1: "A swift auburn fox leaps across a sleepy canine."
#         "The swift auburn fox leaps across a sleepy canine.",
#         "A quick auburn fox leaps across a sleepy canine.",
#         "A swift ginger fox leaps across a sleepy canine.",
#         "A swift auburn fox bounds across a sleepy canine.",
#         "A swift auburn fox leaps across a tired canine.",
#         "Three swift auburn foxes leap across a sleepy canine.",
#         "The vulpine specimen rapidly traverses over a dormant dog.",
#         "Like lightning, the russet hunter soars over the drowsy guardian.",
#         "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
#         "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
#         "A swift auburn predator navigates across a lethargic pet.",
#         "Subject A (fox) demonstrates velocity over Subject B (dog).",

#         # Original 2: "The agile russet fox bounds over an idle hound."
#         "Some agile russet foxes bound over an idle hound.",
#         "The nimble russet fox bounds over an idle hound.",
#         "The agile brown fox bounds over an idle hound.",
#         "The agile russet fox jumps over an idle hound.",
#         "The agile russet fox bounds over a lazy hound.",
#         "Two agile russet foxes bound over an idle hound.",
#         "A dexterous vulpine surpasses a stationary canine.",
#         "Quick as thought, the copper warrior sails over the guardian.",
#         "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
#         "A dexterous V. vulpes exceeds the plane of an inactive canine.",
#         "An agile russet hunter maneuvers above a resting hound.",
#         "Test subject F-1 achieves displacement superior to subject D-1.",

#         # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
#         "The nimble mahogany vulpine vaults above a drowsy dog.",
#         "A swift mahogany vulpine vaults above a drowsy dog.",
#         "A nimble reddish vulpine vaults above a drowsy dog.",
#         "A nimble mahogany fox vaults above a drowsy dog.",
#         "A nimble mahogany vulpine leaps above a drowsy dog.",
#         "Four nimble mahogany vulpines vault above a drowsy dog.",
#         "An agile specimen of reddish fur surpasses a somnolent canine.",
#         "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
#         "Tha quick brown beastie jumps o'er the tired pup, aye.",
#         "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
#         "A nimble rust-colored predator crosses above a drowsy pet.",
#         "Observed: Subject Red executes vertical motion over Subject Gray.",

#         # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
#         "A speedy copper-colored fox hops over the lethargic pup.",
#         "The quick copper-colored fox hops over the lethargic pup.",
#         "The speedy bronze fox hops over the lethargic pup.",
#         "The speedy copper-colored fox jumps over the lethargic pup.",
#         "The speedy copper-colored fox hops over the tired pup.",
#         "Multiple speedy copper-colored foxes hop over the lethargic pup.",
#         "A rapid vulpine of bronze hue traverses an inactive young canine.",
#         "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
#         "Tha fast copper beastie leaps o'er the sleepy wee dog.",
#         "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
#         "A fleet copper-toned predator moves past a sluggish young dog.",
#         "Field note: Adult fox subject exceeds puppy subject vertically.",

#         # Original 5: "A rapid tawny fox springs over a sluggish dog."
#         "The rapid tawny fox springs over a sluggish dog.",
#         "A quick tawny fox springs over a sluggish dog.",
#         "A rapid golden fox springs over a sluggish dog.",
#         "A rapid tawny fox jumps over a sluggish dog.",
#         "A rapid tawny fox springs over a lazy dog.",
#         "Six rapid tawny foxes spring over a sluggish dog.",
#         "An expeditious yellowish vulpine surpasses a torpid canine.",
#         "Fast as a bullet, the golden hunter vaults over the idle guard.",
#         "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
#         "One V. vulpes displays rapid transit over one inactive C. familiaris.",
#         "A speedy yellow-brown predator bypasses a motionless dog.",
#         "Log entry: Vulpine subject achieves swift vertical displacement.",

#         # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
#         "A fleet-footed chestnut fox soars above an indolent canine.",
#         "The swift chestnut fox soars above an indolent canine.",
#         "The fleet-footed brown fox soars above an indolent canine.",
#         "The fleet-footed chestnut fox leaps above an indolent canine.",
#         "The fleet-footed chestnut fox soars above a lazy canine.",
#         "Several fleet-footed chestnut foxes soar above an indolent canine.",
#         "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
#         "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
#         "Tha quick brown beastie sails o'er the sleepy hound, ken.",
#         "Single agile V. vulpes achieves elevation above stationary canine.",
#         "A nimble brown predator glides over an unmoving domestic animal.",
#         "Research note: Brown subject displays superior vertical mobility.",

#         # Original 7: "A fast ginger fox hurdles past a slothful dog."
#         "The fast ginger fox hurdles past a slothful dog.",
#         "A quick ginger fox hurdles past a slothful dog.",
#         "A fast red fox hurdles past a slothful dog.",
#         "A fast ginger fox jumps past a slothful dog.",
#         "A fast ginger fox hurdles past a lazy dog.",
#         "Five fast ginger foxes hurdle past a slothful dog.",
#         "A rapid orange vulpine bypasses a lethargic canine.",
#         "Quick as lightning, the flame-colored hunter races past the lazy guard.",
#         "Tha swift ginger beastie leaps past the tired doggy, ye see.",
#         "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
#         "A speedy red-orange predator overtakes a motionless dog.",
#         "Data point: Orange subject demonstrates rapid transit past Gray subject.",

#         # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
#         "A spry rusty-colored fox jumps across a dozing hound.",
#         "The agile rusty-colored fox jumps across a dozing hound.",
#         "The spry reddish fox jumps across a dozing hound.",
#         "The spry rusty-colored fox leaps across a dozing hound.",
#         "The spry rusty-colored fox jumps across a sleeping hound.",
#         "Multiple spry rusty-colored foxes jump across a dozing hound.",
#         "An agile rust-toned vulpine traverses a somnolent canine.",
#         "Nimble as thought, the copper hunter bounds over the resting guard.",
#         "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
#         "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
#         "A lithe rust-tinted predator moves past a slumbering dog.",
#         "Observation: Russet subject exhibits agility over dormant subject.",

#         # Original 9: "A quick tan fox leaps over an inactive dog."
#         "The quick tan fox leaps over an inactive dog.",
#         "A swift tan fox leaps over an inactive dog.",
#         "A quick beige fox leaps over an inactive dog.",
#         "A quick tan fox jumps over an inactive dog.",
#         "A quick tan fox leaps over a motionless dog.",
#         "Seven quick tan foxes leap over an inactive dog.",
#         "A rapid light-brown vulpine surpasses a stationary canine.",
#         "Fast as wind, the sand-colored hunter soars over the still guard.",
#         "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
#         "One agile fawn V. vulpes traverses one immobile C. familiaris.",
#         "A fleet tan-colored predator bypasses an unmoving dog.",
#         "Field report: Tan subject demonstrates movement over static subject.",

#         # Original 10: "The brisk auburn vulpine bounces over a listless canine."
#         "Some brisk auburn vulpines bounce over a listless canine.",
#         "The quick auburn vulpine bounces over a listless canine.",
#         "The brisk russet vulpine bounces over a listless canine.",
#         "The brisk auburn fox bounces over a listless canine.",
#         "The brisk auburn vulpine jumps over a listless canine.",
#         "Five brisk auburn vulpines bounce over a listless canine.",
#         "The expeditious specimen supersedes a quiescent Canis lupus.",
#         "Swift as wind, the russet hunter vaults over the idle guardian.",
#         "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
#         "One V. vulpes achieves displacement over inactive C. familiaris.",
#         "A high-velocity auburn predator traverses an immobile animal.",
#         "Final observation: Red subject shows mobility over Gray subject."
#     ]


#     # Initialize the calculator
#     calculator = SentenceEuclideanDistanceCalculator(original_sentence, paraphrased_sentences)

#     # Calculate Euclidean distances
#     calculator.calculate_all_metrics()

#     # Normalize the distances
#     calculator.normalize_metrics()

#     # Retrieve the normalized Euclidean distances
#     normalized_metrics = calculator.get_normalized_metrics()
#     euclidean_val = normalized_metrics

#     # Display the results
#     print("Normalized Euclidean Distances:")
#     for key, value in normalized_metrics.items():
#         print(f"{key}: {value:.4f}")

#     # Plot the metrics
#     calculator.plot_metrics()
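For reference, the min-max normalization in `_normalize_dict` maps each distance x to (x - min) / (max - min), with a zero fallback when every value is identical. A minimal standalone sketch of just that step (the input values here are made up):

import numpy as np

# Min-max normalization with a guard against division by zero (made-up values).
vals = np.array([0.8, 1.2, 1.2, 2.0])
span = vals.max() - vals.min()
normalized = np.zeros_like(vals) if span == 0 else (vals - vals.min()) / span
print(normalized)  # [0.         0.33333333 0.33333333 1.        ]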
gpt_mask_filling.py
ADDED
@@ -0,0 +1,70 @@
import openai
import os
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv("API_KEY")


# Takes in a sentence and returns a list of dicts consisting of key-value pairs of
# masked words and lists of the possible replacements
def predict_masked_words(sentence, n_suggestions=5):

    prompt = (
        f"Given a sentence with one or more masked words, each indicated by [MASK], generate {n_suggestions} possible words to fill each mask. "
        "Return the results as a list of dictionaries, where each dictionary key is a masked word and its value is a list of 5 potential words to fill that mask.\n\n"
        "Example input: \"The [MASK] fox [MASK] over the [MASK] dog.\"\n\n"
        "Example output:\n"
        "[\n"
        "  {\n"
        "    \"[MASK]1\": [\"quick\", \"sly\", \"red\", \"clever\", \"sneaky\"]\n"
        "  },\n"
        "  {\n"
        "    \"[MASK]2\": [\"jumped\", \"leaped\", \"hopped\", \"sprang\", \"bounded\"]\n"
        "  },\n"
        "  {\n"
        "    \"[MASK]3\": [\"lazy\", \"sleeping\", \"brown\", \"tired\", \"old\"]\n"
        "  }\n"
        "]\n\n"
        "Example input: \"The [MASK] [MASK] ran swiftly across the [MASK] field.\"\n\n"
        "Example output:\n"
        "[\n"
        "  {\n"
        "    \"[MASK]1\": [\"tall\", \"fierce\", \"young\", \"old\", \"beautiful\"]\n"
        "  },\n"
        "  {\n"
        "    \"[MASK]2\": [\"lion\", \"tiger\", \"horse\", \"cheetah\", \"deer\"]\n"
        "  },\n"
        "  {\n"
        "    \"[MASK]3\": [\"green\", \"wide\", \"sunny\", \"open\", \"empty\"]\n"
        "  }\n"
        "]\n\n"
        "Example input: \"It was a [MASK] day when the train arrived at the station.\"\n\n"
        "Example output:\n"
        "[\n"
        "  {\n"
        "    \"[MASK]1\": [\"sunny\", \"rainy\", \"cloudy\", \"foggy\", \"stormy\"]\n"
        "  },\n"
        "]\n\n"
        "Now, please process the following sentence:\n"
        f"{sentence}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.7
    )

    print(response['choices'][0]['message']['content'])


# sentence = "Evacuations and storm [MASK] began on Sunday night as forecasters projected that Hurricane Dorian would hit into Florida’s west coast on Wednesday as a major hurricane packing life-threatening winds and storm surge."
# predict_masked_words(sentence, n_suggestions=5)
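`predict_masked_words` only prints the raw completion. If a caller needs the suggestions as Python objects, the bracketed format the prompt asks for can be parsed with `json`; a minimal sketch, assuming the model actually returns valid JSON like the few-shot examples (`raw` below is a hypothetical response):

import json

raw = '[{"[MASK]1": ["quick", "sly", "red", "clever", "sneaky"]}]'  # hypothetical model output
for entry in json.loads(raw):          # list of {mask label: candidate words}
    for mask, words in entry.items():
        print(mask, words)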
highlighter.py
CHANGED
@@ -83,4 +83,22 @@ def highlight_common_words_dict(common_words, sentences, title):
         <h3 style="margin-top: 0; font-size: 1em; color: #111827;">{title}</h3>
         <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
     </div>
     '''
+
+def reparaphrased_sentences_html(sentences):
+
+    formatted_sentences = []
+
+    for idx, sentence in enumerate(sentences, start=1):
+        # Add index to each sentence
+        sentence_with_idx = f"{idx}. {sentence}"
+        formatted_sentences.append(sentence_with_idx)
+
+    final_html = "<br><br>".join(formatted_sentences)
+
+    return f'''
+    <div style="border: solid 1px #ccc; padding: 16px; background-color: #FFFFFF; color: #374151;
+                box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
+        <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
+    </div>
+    '''
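A quick usage sketch of the new helper (the input sentences are made up):

from highlighter import reparaphrased_sentences_html

html = reparaphrased_sentences_html(["A swift fox leaps.", "A quick fox jumps."])
# html wraps "1. A swift fox leaps.<br><br>2. A quick fox jumps." in the styled <div>s
print(html)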
lcs.py
CHANGED
@@ -4,7 +4,6 @@ from nltk.corpus import stopwords
 def find_common_subsequences(sentence, str_list):
     stop_words = set(stopwords.words('english'))
     sentence = sentence.lower()
-
     str_list = [s.lower() for s in str_list]
 
     def is_present(subseq, str_list):
@@ -17,17 +16,17 @@ def find_common_subsequences(sentence, str_list):
         filtered_words = [word for word in words if word.lower() not in stop_words]
         return " ".join(filtered_words)
 
-
-
+    cleaned_sentence = remove_stop_words_and_special_chars(sentence)
+    cleaned_str_list = [remove_stop_words_and_special_chars(s) for s in str_list]
 
-    words =
+    words = cleaned_sentence.split()
     common_grams = []
     added_phrases = set()
 
-    for n in range(5, 0, -1):
+    for n in range(5, 0, -1):  # Check n-grams from size 5 to 1
         for i in range(len(words) - n + 1):
-            subseq = " ".join(words[i:i+n])
-            if is_present(subseq,
+            subseq = " ".join(words[i:i + n])
+            if is_present(subseq, cleaned_str_list) and not any(subseq in phrase for phrase in added_phrases):
                 common_grams.append((i, subseq))
                 added_phrases.add(subseq)
 
@@ -39,8 +38,62 @@ def find_common_subsequences(sentence, str_list):
 
     return indexed_common_grams
 
-
-#
-
+def find_common_gram_positions(str_list, common_grams):
+    # Initialize a list to hold positions for each sentence
+    positions = []
+
+    for sentence in str_list:
+        # Number each word in the sentence
+        words = re.sub(r'[^\w\s]', '', sentence).lower().split()
+        word_positions = {word: [] for word in words}
+
+        for idx, word in enumerate(words):
+            word_positions[word].append(idx + 1)  # Store 1-based index positions
+
+        # Create a list to store positions of common grams for the current sentence
+        sentence_positions = []
+
+        for gram in common_grams:
+            # Clean the gram for matching
+            cleaned_gram = re.sub(r'[^\w\s]', '', gram).lower()
+            gram_words = cleaned_gram.split()
+
+            # Check for the position of the common gram in the current sentence
+            if all(word in word_positions for word in gram_words):
+                # Get the position of the first word of the common gram
+                start_idx = word_positions[gram_words[0]][0]
+                sentence_positions.append(start_idx)
+            else:
+                sentence_positions.append(-1)  # Common gram not found
+
+        # Append the positions for the current sentence to the main positions list
+        positions.append(sentence_positions)
+
+    return positions
+
+
+# # Example usage
+# sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."
+# str_list = [
+#     'During a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump stated that being Jewish in the United States has never been more hazardous since the Holocaust.',
+#     'At a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump declared that being Jewish in the United States has never been more hazardous since the Holocaust.',
+#     'Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania, and stated that being Jewish in the United States has never been more perilous since the Holocaust.',
+#     'Donald Trump made the statement at a campaign rally in Wilkes-Barre, Pennsylvania, saying that being Jewish in the United States has never been more dangerous since the Holocaust.',
+#     'Last month, Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania and stated that being Jewish in the United States has never been more hazardous than during World War II.',
+#     'In Wilkes-Barre, Pennsylvania, Donald Trump spoke at a campaign rally and claimed that the Holocaust was always more hazardous for Jews in the United States.',
+#     'A campaign rally in Wilkes-Barre, Pennsylvania saw Donald Trump declare that being Jewish in the United States has never been more perilous since WWII.',
+#     'Speaking at a campaign rally in Wilkes-Barre, Pennsylvania today, Donald Trump declared that being Jewish has never been more hazardous in the United States since the Holocaust.',
+#     'During his campaign rally in Wilkes-Barre, Pennsylvania today Donald Trump stated: "There has never been a safer place for being Jewish in the United States since the Holocaust."',
+#     'At a campaign rally in Wilkes-Barre, Pennsylvania (pictured), Donald Trump said, "There has never been... gotten worse for being Jewish in America since the Holocaust."'
+# ]

+# # Find common subsequences
+# common_grams = find_common_subsequences(sentence, str_list)
+# # Extract the subsequences from the common grams for position checking
+# subsequences = [subseq for _, subseq in common_grams]
+
+# # Find positions of the common grams
+# common_gram_positions = find_common_gram_positions(str_list, subsequences)
 
+# print(common_grams)
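A small self-contained check of the two functions together (toy inputs; assumes the NLTK stopword corpus is downloaded):

from lcs import find_common_subsequences, find_common_gram_positions

sentence = "the quick brown fox jumps"
str_list = ["a quick brown fox runs", "the quick brown fox jumps high"]
common_grams = find_common_subsequences(sentence, str_list)
positions = find_common_gram_positions(str_list, [g for _, g in common_grams])
print(common_grams)  # e.g. indexed n-grams such as (index, 'quick brown fox')
print(positions)     # one list of 1-based start positions (or -1) per sentence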
masking_methods.py
CHANGED
@@ -1,73 +1,31 @@
-
-# from transformers import pipeline
-# import random
-# from nltk.corpus import stopwords
-# import math
-
-# # Masking Model
-# def mask_non_stopword(sentence):
-#     stop_words = set(stopwords.words('english'))
-#     words = sentence.split()
-#     non_stop_words = [word for word in words if word.lower() not in stop_words]
-#     if not non_stop_words:
-#         return sentence
-#     word_to_mask = random.choice(non_stop_words)
-#     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-#     return masked_sentence
-
-# def mask_non_stopword_pseudorandom(sentence):
-#     stop_words = set(stopwords.words('english'))
-#     words = sentence.split()
-#     non_stop_words = [word for word in words if word.lower() not in stop_words]
-#     if not non_stop_words:
-#         return sentence
-#     random.seed(10)
-#     word_to_mask = random.choice(non_stop_words)
-#     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-#     return masked_sentence
-
-# def high_entropy_words(sentence, non_melting_points):
-#     stop_words = set(stopwords.words('english'))
-#     words = sentence.split()
-
-#     non_melting_words = set()
-#     for _, point in non_melting_points:
-#         non_melting_words.update(point.lower().split())
-
-#     candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
-
-#     if not candidate_words:
-#         return sentence
-
-#     max_entropy = -float('inf')
-#     max_entropy_word = None
-
-#     for word in candidate_words:
-#         masked_sentence = sentence.replace(word, '[MASK]', 1)
-#         predictions = fill_mask(masked_sentence)
-
-#         # Calculate entropy based on top 5 predictions
-#         entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
-
-#         if entropy > max_entropy:
-#             max_entropy = entropy
-#             max_entropy_word = word
-
-#     return sentence.replace(max_entropy_word, '[MASK]', 1)
-
-
-# # Load tokenizer and model for masked language model
-# tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
-# model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
-# fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
-
+import torch
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 from transformers import pipeline
 import random
 from nltk.corpus import stopwords
 import math
+from vocabulary_split import split_vocabulary, filter_logits
+
+# Load tokenizer and model for masked language model
+tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+
+# Get permissible vocabulary
+permissible, _ = split_vocabulary(seed=42)
+permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
+
+def get_logits_for_mask(model, tokenizer, sentence):
+    inputs = tokenizer(sentence, return_tensors="pt")
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    logits = outputs.logits
+    mask_token_logits = logits[0, mask_token_index, :]
+    return mask_token_logits.squeeze()
 
-# Masking Model
 def mask_non_stopword(sentence):
     stop_words = set(stopwords.words('english'))
     words = sentence.split()
@@ -76,10 +34,10 @@ def mask_non_stopword(sentence):
         return sentence, None, None
     word_to_mask = random.choice(non_stop_words)
     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-
-
-
-    return masked_sentence,
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
+    return masked_sentence, filtered_logits.tolist(), words
@@ -87,54 +45,148 @@ def mask_non_stopword_pseudorandom(sentence):
     non_stop_words = [word for word in words if word.lower() not in stop_words]
     if not non_stop_words:
         return sentence, None, None
-    random.seed(10)
+    random.seed(10)  # Fixed seed for pseudo-randomness
     word_to_mask = random.choice(non_stop_words)
     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-
-
-
-    return masked_sentence,
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
+    return masked_sentence, filtered_logits.tolist(), words
+
+# New function: mask words between LCS points
+def mask_between_lcs(sentence, lcs_points):
+    words = sentence.split()
+    masked_indices = []
+
+    # Mask between first word and first LCS point
+    if lcs_points and lcs_points[0] > 0:
+        idx = random.randint(0, lcs_points[0]-1)
+        words[idx] = '[MASK]'
+        masked_indices.append(idx)
+
+    # Mask between LCS points
+    for i in range(len(lcs_points) - 1):
+        start, end = lcs_points[i], lcs_points[i+1]
+        if end - start > 1:
+            mask_index = random.randint(start + 1, end - 1)
+            words[mask_index] = '[MASK]'
+            masked_indices.append(mask_index)
+
+    # Mask between last LCS point and last word
+    if lcs_points and lcs_points[-1] < len(words) - 1:
+        idx = random.randint(lcs_points[-1]+1, len(words)-1)
+        words[idx] = '[MASK]'
+        masked_indices.append(idx)
+
+    masked_sentence = ' '.join(words)
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+
+    # Now process each masked token separately
+    top_words_list = []
+    logits_list = []
+    for i in range(len(masked_indices)):
+        logits_i = logits[i]
+        if logits_i.dim() > 1:
+            logits_i = logits_i.squeeze()
+        filtered_logits_i = filter_logits(logits_i, permissible_indices)
+        logits_list.append(filtered_logits_i.tolist())
+        top_5_indices = filtered_logits_i.topk(5).indices.tolist()
+        top_words = [tokenizer.decode([i]) for i in top_5_indices]
+        top_words_list.append(top_words)
+
+    return masked_sentence, logits_list, top_words_list
 
 def high_entropy_words(sentence, non_melting_points):
     stop_words = set(stopwords.words('english'))
     words = sentence.split()
 
     non_melting_words = set()
     for _, point in non_melting_points:
         non_melting_words.update(point.lower().split())
 
     candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
 
     if not candidate_words:
         return sentence, None, None
 
     max_entropy = -float('inf')
     max_entropy_word = None
     max_logits = None
 
     for word in candidate_words:
         masked_sentence = sentence.replace(word, '[MASK]', 1)
-
+        logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+        filtered_logits = filter_logits(logits, permissible_indices)
 
         # Calculate entropy based on top 5 predictions
-
-
+        probs = torch.softmax(filtered_logits, dim=-1)
+        top_5_probs = probs.topk(5).values
+        entropy = -torch.sum(top_5_probs * torch.log(top_5_probs))
+
         if entropy > max_entropy:
             max_entropy = entropy
             max_entropy_word = word
-            max_logits =
-
+            max_logits = filtered_logits
+
+    if max_entropy_word is None:
+        return sentence, None, None
+
     masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
-    words = [
-
-    return masked_sentence, words, logits
+    words = [tokenizer.decode([i]) for i in max_logits.argsort()[-5:]]
+    return masked_sentence, max_logits.tolist(), words
 
-#
-
-print(f"
-print(f"logits content: {b}")
+# New function: mask based on part of speech
+def mask_by_pos(sentence, pos_to_mask=['NN', 'VB', 'JJ']):
+    # Defaults are Penn Treebank prefixes (nouns, verbs, adjectives), since
+    # nltk.pos_tag returns Penn tags such as 'NNS' or 'VBD'
+    import nltk
+    nltk.download('averaged_perceptron_tagger', quiet=True)
+
+    words = nltk.word_tokenize(sentence)
+    pos_tags = nltk.pos_tag(words)
+
+    maskable_words = [word for word, pos in pos_tags if pos[:2] in pos_to_mask]
+
+    if not maskable_words:
+        return sentence, None, None
+
+    word_to_mask = random.choice(maskable_words)
+    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
+
+    return masked_sentence, filtered_logits.tolist(), words
+
+# New function: mask named entities
+def mask_named_entity(sentence):
+    import nltk
+    nltk.download('maxent_ne_chunker', quiet=True)
+    nltk.download('words', quiet=True)
+
+    words = nltk.word_tokenize(sentence)
+    pos_tags = nltk.pos_tag(words)
+    named_entities = nltk.ne_chunk(pos_tags)
+
+    # Named-entity chunks come back as nltk.Tree subtrees; collect the words inside them
+    maskable_words = [leaf[0] for subtree in named_entities if isinstance(subtree, nltk.Tree) for leaf in subtree.leaves()]
+
+    if not maskable_words:
+        return sentence, None, None
+
+    word_to_mask = random.choice(maskable_words)
+    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+
+    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
+
+    return masked_sentence, filtered_logits.tolist(), words
+
+
+# sentence = "This is a sample sentence with some LCS points"
+# lcs_points = [2, 5, 8]  # Indices of LCS points
+# masked_sentence, logits_list, top_words_list = mask_between_lcs(sentence, lcs_points)
+
+# print("Masked Sentence:", masked_sentence)
+# for idx, top_words in enumerate(top_words_list):
+#     print(f"Top words for mask {idx+1}:", top_words)
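The selection rule in `high_entropy_words` is Shannon entropy, H = -sum(p * log p), over the model's top-5 token probabilities: the flatter the distribution, the harder the mask is to predict, and that word is the one that gets masked. A standalone illustration with made-up probabilities:

import torch

peaked = torch.tensor([0.90, 0.04, 0.03, 0.02, 0.01])  # model is confident
flat = torch.tensor([0.22, 0.21, 0.20, 0.19, 0.18])    # model is unsure
for p in (peaked, flat):
    print(float(-(p * p.log()).sum()))  # the flat distribution prints the larger entropy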
paraphraser.py
CHANGED
@@ -1,31 +1,83 @@
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
-# Function to Initialize the Model
-def init_model():
-    para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
-    para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
-    return para_tokenizer, para_model
-
-# Function to Paraphrase the Text
-def paraphrase(question, para_tokenizer, para_model, num_beams=10, num_beam_groups=10, num_return_sequences=10, repetition_penalty=10.0, diversity_penalty=3.0, no_repeat_ngram_size=2, temperature=0.7, max_length=64):
-    input_ids = para_tokenizer(
-        f'paraphrase: {question}',
-        return_tensors="pt", padding="longest",
-        max_length=max_length,
-        truncation=True,
-    ).input_ids
-    outputs = para_model.generate(
-        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
-        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
-        num_beams=num_beams, num_beam_groups=num_beam_groups,
-        max_length=max_length, diversity_penalty=diversity_penalty
-    )
-    res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
-    return res
-
-def generate_paraphrase(question):
-    para_tokenizer, para_model = init_model()
-    res = paraphrase(question, para_tokenizer, para_model)
-    return res
-
-# print(generate_paraphrase("Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time 5since the Holocaust” to be Jewish in the United States."))
+# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+# # Function to Initialize the Model
+# def init_model():
+#     para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+#     para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+#     return para_tokenizer, para_model
+
+# # Function to Paraphrase the Text
+# def paraphrase(question, para_tokenizer, para_model, num_beams=10, num_beam_groups=10, num_return_sequences=10, repetition_penalty=10.0, diversity_penalty=3.0, no_repeat_ngram_size=2, temperature=0.7, max_length=64):
+#     input_ids = para_tokenizer(
+#         f'paraphrase: {question}',
+#         return_tensors="pt", padding="longest",
+#         max_length=max_length,
+#         truncation=True,
+#     ).input_ids
+#     outputs = para_model.generate(
+#         input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
+#         num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
+#         num_beams=num_beams, num_beam_groups=num_beam_groups,
+#         max_length=max_length, diversity_penalty=diversity_penalty
+#     )
+#     res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+#     return res
+
+# def generate_paraphrase(question):
+#     para_tokenizer, para_model = init_model()
+#     res = paraphrase(question, para_tokenizer, para_model)
+#     return res
+
+# print(generate_paraphrase("Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time 5since the Holocaust” to be Jewish in the United States."))
+
+'''
+Accepts a sentence or list of sentences and returns a list of all their paraphrases using GPT-4.
+'''
+
+from openai import OpenAI
+from dotenv import load_dotenv
+load_dotenv()
+import os
+
+key = os.getenv("OPENAI_API_KEY")
+
+# Initialize the OpenAI client
+client = OpenAI(
+    api_key=key  # Replace with your actual API key
+)
+
+# Function to paraphrase sentences using GPT-4
+def generate_paraphrase(sentences, model="gpt-4o", num_paraphrases=10, max_tokens=150, temperature=0.7):
+    # Ensure sentences is a list even if a single sentence is passed
+    if isinstance(sentences, str):
+        sentences = [sentences]
+
+    paraphrased_sentences_list = []
+
+    for sentence in sentences:
+        full_prompt = f"Paraphrase the following text: '{sentence}'"
+        try:
+            chat_completion = client.chat.completions.create(
+                messages=[
+                    {
+                        "role": "user",
+                        "content": full_prompt,
+                    }
+                ],
+                model=model,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                n=num_paraphrases  # Number of paraphrased sentences to generate
+            )
+            # Extract the paraphrased sentences from the response
+            paraphrased_sentences = [choice.message.content.strip() for choice in chat_completion.choices]
+            # Append paraphrased sentences to the list
+            paraphrased_sentences_list.extend(paraphrased_sentences)
+        except Exception as e:
+            print(f"Error paraphrasing sentence '{sentence}': {e}")
+
+    return paraphrased_sentences_list
+
+result = generate_paraphrase("Mayor Eric Adams did not attend the first candidate forum for the New York City mayoral race, but his record — and the criminal charges he faces — received plenty of attention on Saturday from the Democrats who are running to unseat him.")
+
+print(len(result))
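Note that the module-level `generate_paraphrase(...)` call at the bottom runs, and issues a billable API request, every time `paraphraser.py` is imported, e.g. by `app.py`. A common guard, sketched here as a suggestion rather than part of the commit, keeps it as a manual smoke test:

if __name__ == "__main__":
    # Only runs when the file is executed directly, not on import (suggested guard, not in the commit)
    result = generate_paraphrase("A short test sentence.")
    print(len(result))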
requirements.txt
CHANGED
@@ -14,4 +14,6 @@ nltk
 tenacity
 pandas
 graphviz==0.20.3
-gradio
+gradio==4.29.0
+openai
+python-dotenv
sampling_methods.py
CHANGED
@@ -1,55 +1,42 @@
-# import torch
-# import random
-
-# def sample_word(words, logits, sampling_technique='inverse_transform', temperature=1.0):
-#     if sampling_technique == 'inverse_transform':
-#         probs = torch.softmax(torch.tensor(logits), dim=-1)
-#         cumulative_probs = torch.cumsum(probs, dim=-1)
-#         random_prob = random.random()
-#         sampled_index = torch.where(cumulative_probs >= random_prob)[0][0]
-#     elif sampling_technique == 'exponential_minimum':
-#         probs = torch.softmax(torch.tensor(logits), dim=-1)
-#         exp_probs = torch.exp(-torch.log(probs))
-#         random_probs = torch.rand_like(exp_probs)
-#         sampled_index = torch.argmax(random_probs * exp_probs)
-#     elif sampling_technique == 'temperature':
-#         scaled_logits = torch.tensor(logits) / temperature
-#         probs = torch.softmax(scaled_logits, dim=-1)
-#         sampled_index = torch.multinomial(probs, 1).item()
-#     elif sampling_technique == 'greedy':
-#         sampled_index = torch.argmax(torch.tensor(logits)).item()
-#     else:
-#         raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")
-
-#     sampled_word = words[sampled_index]
-#     return sampled_word
-
 import torch
 import random
+from vocabulary_split import split_vocabulary, filter_logits
+# from transformers import AutoTokenizer, AutoModelForMaskedLM
+from masking_methods import tokenizer
+
+# Load tokenizer and model for masked language model
+# tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+# model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+
+
+# Get permissible vocabulary
+permissible, _ = split_vocabulary(seed=42)
+permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
 
 def sample_word(sentence, words, logits, sampling_technique='inverse_transform', temperature=1.0):
+    filtered_logits = filter_logits(torch.tensor(logits), permissible_indices)
+
     if sampling_technique == 'inverse_transform':
-        probs = torch.softmax(torch.tensor(logits), dim=-1)
+        probs = torch.softmax(filtered_logits / temperature, dim=-1)
         cumulative_probs = torch.cumsum(probs, dim=-1)
         random_prob = random.random()
         sampled_index = torch.where(cumulative_probs >= random_prob)[0][0]
     elif sampling_technique == 'exponential_minimum':
-        probs = torch.softmax(torch.tensor(logits), dim=-1)
+        probs = torch.softmax(filtered_logits / temperature, dim=-1)
         exp_probs = torch.exp(-torch.log(probs))
         random_probs = torch.rand_like(exp_probs)
         sampled_index = torch.argmax(random_probs * exp_probs)
     elif sampling_technique == 'temperature':
-        scaled_logits = torch.tensor(logits) / temperature
-        probs = torch.softmax(scaled_logits, dim=-1)
+        probs = torch.softmax(filtered_logits / temperature, dim=-1)
         sampled_index = torch.multinomial(probs, 1).item()
     elif sampling_technique == 'greedy':
-        sampled_index = torch.argmax(torch.tensor(logits)).item()
+        sampled_index = torch.argmax(filtered_logits).item()
     else:
         raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")
-
-    sampled_word = words[sampled_index]
-
+
+    sampled_word = tokenizer.decode([sampled_index])
+
     # Replace [MASK] with the sampled word
     filled_sentence = sentence.replace('[MASK]', sampled_word)
-
+
     return filled_sentence
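For reference, the 'inverse_transform' branch is standard inverse-CDF sampling: draw u ~ Uniform(0, 1) and take the first index whose cumulative probability reaches u. A standalone sketch with made-up logits:

import random
import torch

logits = torch.tensor([2.0, 1.0, 0.5])  # made-up logits
probs = torch.softmax(logits, dim=-1)
cdf = torch.cumsum(probs, dim=-1)       # monotone, ends at 1.0
u = random.random()
idx = int(torch.where(cdf >= u)[0][0])  # first bucket whose CDF covers u
print(idx)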
threeD_plot.py
ADDED
@@ -0,0 +1,137 @@
# import numpy as np
# import plotly.graph_objects as go
# from scipy.interpolate import griddata

# def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
#     detectability = np.array(detectability_val)
#     distortion = np.array(distortion_val)
#     euclidean = np.array(euclidean_val)

#     # Find the closest point to the origin
#     distances_to_origin = np.linalg.norm(np.array([distortion, detectability, euclidean]).T, axis=1)
#     closest_point_index = np.argmin(distances_to_origin)

#     # Determine the closest points to each axis
#     closest_to_x_axis = np.argmin(distortion)
#     closest_to_y_axis = np.argmin(detectability)
#     closest_to_z_axis = np.argmin(euclidean)

#     # Use the detected closest point as the "sweet spot"
#     sweet_spot_detectability = detectability[closest_point_index]
#     sweet_spot_distortion = distortion[closest_point_index]
#     sweet_spot_euclidean = euclidean[closest_point_index]

#     # Create a meshgrid from the data
#     x_grid, y_grid = np.meshgrid(np.linspace(min(detectability), max(detectability), 30),
#                                  np.linspace(min(distortion), max(distortion), 30))

#     # Interpolate z values (Euclidean distances) to fit the grid
#     z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='linear')

#     if z_grid is None:
#         raise ValueError("griddata could not generate a valid interpolation. Check your input data.")

#     # Create the 3D contour plot with the Plasma color scale
#     fig = go.Figure(data=go.Surface(
#         z=z_grid,
#         x=x_grid,
#         y=y_grid,
#         contours={
#             "z": {"show": True, "start": min(euclidean), "end": max(euclidean), "size": 0.1, "usecolormap": True}
#         },
#         colorscale='Plasma'
#     ))

#     # Add a marker for the sweet spot
#     fig.add_trace(go.Scatter3d(
#         x=[sweet_spot_detectability],
#         y=[sweet_spot_distortion],
#         z=[sweet_spot_euclidean],
#         mode='markers+text',
#         marker=dict(size=10, color='red', symbol='circle'),
#         text=["Sweet Spot"],
#         textposition="top center"
#     ))

#     # Set axis labels
#     fig.update_layout(
#         scene=dict(
#             xaxis_title='Detectability Score',
#             yaxis_title='Distortion Score',
#             zaxis_title='Euclidean Distance'
#         ),
#         margin=dict(l=0, r=0, b=0, t=0)
#     )

#     return fig


import numpy as np
import plotly.graph_objects as go
from scipy.interpolate import griddata

def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
    detectability = np.array(detectability_val)
    distortion = np.array(distortion_val)
    euclidean = np.array(euclidean_val)

    # Normalize the values to range [0, 1]
    norm_detectability = (detectability - min(detectability)) / (max(detectability) - min(detectability))
    norm_distortion = (distortion - min(distortion)) / (max(distortion) - min(distortion))
    norm_euclidean = (euclidean - min(euclidean)) / (max(euclidean) - min(euclidean))

    # Composite score: maximize detectability, minimize distortion and Euclidean distance
    # We subtract distortion and euclidean as we want them minimized.
    composite_score = norm_detectability - (norm_distortion + norm_euclidean)

    # Find the index of the maximum score (sweet spot)
    sweet_spot_index = np.argmax(composite_score)

    # Sweet spot values
    sweet_spot_detectability = detectability[sweet_spot_index]
    sweet_spot_distortion = distortion[sweet_spot_index]
    sweet_spot_euclidean = euclidean[sweet_spot_index]

    # Create a meshgrid from the data
    x_grid, y_grid = np.meshgrid(np.linspace(min(detectability), max(detectability), 30),
                                 np.linspace(min(distortion), max(distortion), 30))

    # Interpolate z values (Euclidean distances) to fit the grid
    z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='linear')

    if z_grid is None:
        raise ValueError("griddata could not generate a valid interpolation. Check your input data.")

    # Create the 3D contour plot with the Plasma color scale
    fig = go.Figure(data=go.Surface(
        z=z_grid,
        x=x_grid,
        y=y_grid,
        contours={
            "z": {"show": True, "start": min(euclidean), "end": max(euclidean), "size": 0.1, "usecolormap": True}
        },
        colorscale='Plasma'
    ))

    # Add a marker for the sweet spot
    fig.add_trace(go.Scatter3d(
        x=[sweet_spot_detectability],
        y=[sweet_spot_distortion],
        z=[sweet_spot_euclidean],
        mode='markers+text',
        marker=dict(size=10, color='red', symbol='circle'),
        text=["Sweet Spot"],
        textposition="top center"
    ))

    # Set axis labels
    fig.update_layout(
        scene=dict(
            xaxis_title='Detectability Score',
            yaxis_title='Distortion Score',
            zaxis_title='Euclidean Distance'
        ),
        margin=dict(l=0, r=0, b=0, t=0)
    )

    return fig
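A quick way to eyeball the surface with synthetic scores (random values; assumes the module is importable and that the points are spread enough for `griddata` to interpolate):

import numpy as np
from threeD_plot import gen_three_D_plot

rng = np.random.default_rng(0)
det, dist, euc = (rng.random(30).tolist() for _ in range(3))
fig = gen_three_D_plot(det, dist, euc)
fig.show()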
tree.py
CHANGED
@@ -1,341 +1,3 @@
-# import plotly.graph_objects as go
-# import textwrap
-# import re
-# from collections import defaultdict
-
-# def generate_subplot1(paraphrased_sentence, scheme_sentences, highlight_info):
-#     # Combine nodes into one list with appropriate labels
-#     nodes = [paraphrased_sentence] + scheme_sentences
-#     nodes[0] += ' L0'  # Paraphrased sentence is level 0
-#     for i in range(1, len(nodes)):
-#         nodes[i] += ' L1'  # Scheme sentences are level 1
-
-#     # Define the highlight_words function
-#     def highlight_words(sentence, color_map):
-#         for word, color in color_map.items():
-#             sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
-#         return sentence
-
-#     # Clean and wrap nodes, and highlight specified words globally
-#     cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
-#     global_color_map = dict(highlight_info)
-#     highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
-#     wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=50)) for node in highlighted_nodes]
-
-#     # Function to determine tree levels and create edges dynamically
-#     def get_levels_and_edges(nodes):
-#         levels = {}
-#         edges = []
-#         for i, node in enumerate(nodes):
-#             level = int(node.split()[-1][1])
-#             levels[i] = level
-
-#         # Add edges from L0 to all L1 nodes
-#         root_node = next(i for i, level in levels.items() if level == 0)
-#         for i, level in levels.items():
-#             if level == 1:
-#                 edges.append((root_node, i))
-
-#         return levels, edges
-
-#     # Get levels and dynamic edges
-#     levels, edges = get_levels_and_edges(nodes)
-#     max_level = max(levels.values(), default=0)
-
-#     # Calculate positions
-#     positions = {}
-#     level_heights = defaultdict(int)
-#     for node, level in levels.items():
-#         level_heights[level] += 1
-
-#     y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
-#     x_gap = 2
-#     l1_y_gap = 10
-
-#     for node, level in levels.items():
-#         if level == 1:
-#             positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
-#         else:
-#             positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
-#         y_offsets[level] += 1
-
-#     # Function to highlight words in a wrapped node string
-#     def color_highlighted_words(node, color_map):
-#         parts = re.split(r'(\{\{.*?\}\})', node)
-#         colored_parts = []
-#         for part in parts:
-#             match = re.match(r'\{\{(.*?)\}\}', part)
-#             if match:
-#                 word = match.group(1)
-#                 color = color_map.get(word, 'black')
-#                 colored_parts.append(f"<span style='color: {color};'>{word}</span>")
-#             else:
-#                 colored_parts.append(part)
-#         return ''.join(colored_parts)
-
-#     # Define the text for each edge
-#     edge_texts = [
-#         "Highest Entropy Masking",
-#         "Pseudo-random Masking",
-#         "Random Masking",
-#         "Greedy Sampling",
-#         "Temperature Sampling",
-#         "Exponential Minimum Sampling",
-#         "Inverse Transform Sampling",
-#         "Greedy Sampling",
-#         "Temperature Sampling",
-#         "Exponential Minimum Sampling",
-#         "Inverse Transform Sampling",
-#         "Greedy Sampling",
-#         "Temperature Sampling",
-#         "Exponential Minimum Sampling",
-#         "Inverse Transform Sampling"
-#     ]
-
-#     # Create figure
-#     fig1 = go.Figure()
-
-#     # Add nodes to the figure
-#     for i, node in enumerate(wrapped_nodes):
-#         colored_node = color_highlighted_words(node, global_color_map)
-#         x, y = positions[i]
-#         fig1.add_trace(go.Scatter(
-#             x=[-x],  # Reflect the x coordinate
-#             y=[y],
-#             mode='markers',
-#             marker=dict(size=10, color='blue'),
-#             hoverinfo='none'
-#         ))
-#         fig1.add_annotation(
-#             x=-x,  # Reflect the x coordinate
-#             y=y,
-#             text=colored_node,
-#             showarrow=False,
-#             xshift=15,
-#             align="center",
-#             font=dict(size=12),
-#             bordercolor='black',
-#             borderwidth=1,
-#             borderpad=2,
-#             bgcolor='white',
-#             width=300,
-#             height=120
-#         )
-
-#     # Add edges and text above each edge
-#     for i, edge in enumerate(edges):
-#         x0, y0 = positions[edge[0]]
-#         x1, y1 = positions[edge[1]]
-#         fig1.add_trace(go.Scatter(
-#             x=[-x0, -x1],  # Reflect the x coordinates
-#             y=[y0, y1],
-#             mode='lines',
-#             line=dict(color='black', width=1)
-#         ))
-
-#         # Calculate the midpoint of the edge
-#         mid_x = (-x0 + -x1) / 2
-#         mid_y = (y0 + y1) / 2
-
-#         # Adjust y position to shift text upwards
-#         text_y_position = mid_y + 0.8  # Increase this value to shift the text further upwards
-
-#         # Add text annotation above the edge
-#         fig1.add_annotation(
-#             x=mid_x,
-#             y=text_y_position,
-#             text=edge_texts[i],  # Use the text specific to this edge
-#             showarrow=False,
-#             font=dict(size=12),
-#             align="center"
-#         )
-
-#     fig1.update_layout(
-#         showlegend=False,
-#         margin=dict(t=20, b=20, l=20, r=20),
-#         xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
-#         yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
-#         width=1435,  # Adjusted width to accommodate more levels
-#         height=1000  # Adjusted height to accommodate more levels
-#     )
-
-#     return fig1
-
-
-
-# def generate_subplot2(scheme_sentences, sampled_sentence, highlight_info):
-#     # Combine nodes into one list with appropriate labels
-#     nodes = scheme_sentences + sampled_sentence
-#     para_len = len(scheme_sentences)
-
-#     # Reassign levels: L1 -> L0, L2 -> L1
-#     for i in range(para_len):
-#         nodes[i] += ' L0'  # Scheme sentences are now level 0
-#     for i in range(para_len, len(nodes)):
-#         nodes[i] += ' L1'  # Sampled sentences are now level 1
-
-#     # Define the highlight_words function
-#     def highlight_words(sentence, color_map):
-#         for word, color in color_map.items():
-#             sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
-#         return sentence
-
-#     # Clean and wrap nodes, and highlight specified words globally
-#     cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
-#     global_color_map = dict(highlight_info)
-#     highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
-#     wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=80)) for node in highlighted_nodes]
-
-#     # Function to determine tree levels and create edges dynamically
-#     def get_levels_and_edges(nodes):
-#         levels = {}
-#         edges = []
-#         for i, node in enumerate(nodes):
-#             level = int(node.split()[-1][1])
-#             levels[i] = level
-
-#         # Add edges from L0 to all L1 nodes
-#         l0_indices = [i for i, level in levels.items() if level == 0]
-#         l1_indices = [i for i, level in levels.items() if level == 1]
-
-#         # Ensure there are exactly 3 L0 nodes
-#         if len(l0_indices) < 3:
-#             raise ValueError("There should be exactly 3 L0 nodes to attach edges correctly.")
-
-#         # Split L1 nodes into 3 groups of 4 for attaching to L0 nodes
-#         for i, l1_node in enumerate(l1_indices):
-#             if i < 4:
-#                 edges.append((l0_indices[0], l1_node))  # Connect to the first L0 node
-#             elif i < 8:
-#                 edges.append((l0_indices[1], l1_node))  # Connect to the second L0 node
-#             else:
-#                 edges.append((l0_indices[2], l1_node))  # Connect to the third L0 node
-
-#         return levels, edges
-
-#     # Get levels and dynamic edges
|
217 |
-
# levels, edges = get_levels_and_edges(nodes)
|
218 |
-
|
219 |
-
# # Calculate positions
|
220 |
-
# positions = {}
|
221 |
-
# level_heights = defaultdict(int)
|
222 |
-
# for node, level in levels.items():
|
223 |
-
# level_heights[level] += 1
|
224 |
-
|
225 |
-
# y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
|
226 |
-
# x_gap = 2
|
227 |
-
# l1_y_gap = 10
|
228 |
-
|
229 |
-
# for node, level in levels.items():
|
230 |
-
# if level == 1:
|
231 |
-
# positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
|
232 |
-
# else:
|
233 |
-
# positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
|
234 |
-
# y_offsets[level] += 1
|
235 |
-
|
236 |
-
# # Function to highlight words in a wrapped node string
|
237 |
-
# def color_highlighted_words(node, color_map):
|
238 |
-
# parts = re.split(r'(\{\{.*?\}\})', node)
|
239 |
-
# colored_parts = []
|
240 |
-
# for part in parts:
|
241 |
-
# match = re.match(r'\{\{(.*?)\}\}', part)
|
242 |
-
# if match:
|
243 |
-
# word = match.group(1)
|
244 |
-
# color = color_map.get(word, 'black')
|
245 |
-
# colored_parts.append(f"<span style='color: {color};'>{word}</span>")
|
246 |
-
# else:
|
247 |
-
# colored_parts.append(part)
|
248 |
-
# return ''.join(colored_parts)
|
249 |
-
|
250 |
-
# # Define the text for each edge
|
251 |
-
# edge_texts = [
|
252 |
-
# "Highest Entropy Masking",
|
253 |
-
# "Pseudo-random Masking",
|
254 |
-
# "Random Masking",
|
255 |
-
# "Greedy Sampling",
|
256 |
-
# "Temperature Sampling",
|
257 |
-
# "Exponential Minimum Sampling",
|
258 |
-
# "Inverse Transform Sampling",
|
259 |
-
# "Greedy Sampling",
|
260 |
-
# "Temperature Sampling",
|
261 |
-
# "Exponential Minimum Sampling",
|
262 |
-
# "Inverse Transform Sampling",
|
263 |
-
# "Greedy Sampling",
|
264 |
-
# "Temperature Sampling",
|
265 |
-
# "Exponential Minimum Sampling",
|
266 |
-
# "Inverse Transform Sampling"
|
267 |
-
# ]
|
268 |
-
|
269 |
-
# # Create figure
|
270 |
-
# fig2 = go.Figure()
|
271 |
-
|
272 |
-
# # Add nodes to the figure
|
273 |
-
# for i, node in enumerate(wrapped_nodes):
|
274 |
-
# colored_node = color_highlighted_words(node, global_color_map)
|
275 |
-
# x, y = positions[i]
|
276 |
-
# fig2.add_trace(go.Scatter(
|
277 |
-
# x=[-x], # Reflect the x coordinate
|
278 |
-
# y=[y],
|
279 |
-
# mode='markers',
|
280 |
-
# marker=dict(size=10, color='blue'),
|
281 |
-
# hoverinfo='none'
|
282 |
-
# ))
|
283 |
-
# fig2.add_annotation(
|
284 |
-
# x=-x, # Reflect the x coordinate
|
285 |
-
# y=y,
|
286 |
-
# text=colored_node,
|
287 |
-
# showarrow=False,
|
288 |
-
# xshift=15,
|
289 |
-
# align="center",
|
290 |
-
# font=dict(size=12),
|
291 |
-
# bordercolor='black',
|
292 |
-
# borderwidth=1,
|
293 |
-
# borderpad=2,
|
294 |
-
# bgcolor='white',
|
295 |
-
# width=450,
|
296 |
-
# height=65
|
297 |
-
# )
|
298 |
-
|
299 |
-
# # Add edges and text above each edge
|
300 |
-
# for i, edge in enumerate(edges):
|
301 |
-
# x0, y0 = positions[edge[0]]
|
302 |
-
# x1, y1 = positions[edge[1]]
|
303 |
-
# fig2.add_trace(go.Scatter(
|
304 |
-
# x=[-x0, -x1], # Reflect the x coordinates
|
305 |
-
# y=[y0, y1],
|
306 |
-
# mode='lines',
|
307 |
-
# line=dict(color='black', width=1)
|
308 |
-
# ))
|
309 |
-
|
310 |
-
# # Calculate the midpoint of the edge
|
311 |
-
# mid_x = (-x0 + -x1) / 2
|
312 |
-
# mid_y = (y0 + y1) / 2
|
313 |
-
|
314 |
-
# # Adjust y position to shift text upwards
|
315 |
-
# text_y_position = mid_y + 0.8 # Increase this value to shift the text further upwards
|
316 |
-
|
317 |
-
# # Add text annotation above the edge
|
318 |
-
# fig2.add_annotation(A surprising aspect of tests, specifically self-testing soon after exposure to new material, is that they can significantly improve your ability to learn, apply, and maintain new knowledge.
|
319 |
-
# x=mid_x,
|
320 |
-
# y=text_y_position,
|
321 |
-
# text=edge_texts[i], # Use the text specific to this edge
|
322 |
-
# showarrow=False,
|
323 |
-
# font=dict(size=12),
|
324 |
-
# align="center"
|
325 |
-
# )
|
326 |
-
|
327 |
-
# fig2.update_layout(
|
328 |
-
# showlegend=False,
|
329 |
-
# margin=dict(t=20, b=20, l=20, r=20),
|
330 |
-
# xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
|
331 |
-
# yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
|
332 |
-
# width=1435, # Adjusted width to accommodate more levels
|
333 |
-
# height=1000 # Adjusted height to accommodate more levels
|
334 |
-
# )
|
335 |
-
|
336 |
-
# return fig2
|
337 |
-
|
338 |
-
|
339 |
import plotly.graph_objects as go
|
340 |
import textwrap
|
341 |
import re
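The commented-out code removed above laid the paraphrase tree out by hand: every node string carries a trailing ' L<n>' tag, get_levels_and_edges parses that tag back into a level, and edges connect the level-0 root to its level-1 children. A minimal runnable sketch of that level/edge derivation, using hypothetical placeholder sentences rather than anything from this repo:

def get_levels_and_edges(nodes):
    # The level is encoded as a trailing ' L<n>' suffix on each node string.
    levels = {i: int(node.split()[-1][1]) for i, node in enumerate(nodes)}
    # Connect the single level-0 root to every level-1 node.
    root = next(i for i, lvl in levels.items() if lvl == 0)
    edges = [(root, i) for i, lvl in levels.items() if lvl == 1]
    return levels, edges

nodes = ["Original sentence L0", "Paraphrase one L1", "Paraphrase two L1"]
levels, edges = get_levels_and_edges(nodes)
print(levels)  # {0: 0, 1: 1, 2: 1}
print(edges)   # [(0, 1), (0, 2)]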
vocabulary_split.py
ADDED
@@ -0,0 +1,57 @@
+import random
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+import torch
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
+def split_vocabulary(seed=42):
+    # Initialize the tokenizer and model
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+    model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
+
+    # Get the full vocabulary
+    vocab = list(tokenizer.get_vocab().items())
+
+    # Initialize the random number generator
+    random.seed(seed)
+
+    # Split the vocabulary into permissible and non-permissible buckets
+    permissible = {}
+    non_permissible = {}
+
+    for word, index in vocab:
+        if random.random() < 0.5:  # 50% chance of being permissible
+            permissible[word] = index
+        else:
+            non_permissible[word] = index
+
+    return permissible, non_permissible
+
+def get_logits_for_mask(model, tokenizer, sentence):
+    inputs = tokenizer(sentence, return_tensors="pt")
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    logits = outputs.logits
+    mask_token_logits = logits[0, mask_token_index, :]
+    return mask_token_logits.squeeze()
+
+def filter_logits(logits, permissible_indices):
+    filtered_logits = logits.clone()
+    if filtered_logits.dim() > 1:
+        filtered_logits = filtered_logits.squeeze()
+    if filtered_logits.shape != permissible_indices.shape:
+        permissible_indices = permissible_indices[:filtered_logits.shape[0]]
+    filtered_logits[~permissible_indices] = float('-inf')
+    return filtered_logits
+
+# Usage example
+permissible, non_permissible = split_vocabulary(seed=42)
+permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
+
+# When sampling:
+sentence = "The [MASK] is bright today."
+logits = get_logits_for_mask(model, tokenizer, sentence)
+filtered_logits = filter_logits(logits, permissible_indices)
+# Use filtered_logits for sampling
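The key property of split_vocabulary is that, given the same tokenizer, the split depends only on the RNG seed, so a detector running later can rebuild exactly the same permissible bucket. A self-contained toy sketch of that idea and of the masking filter_logits performs; the toy vocabulary and logits below are made up for illustration:

import random
import torch

def toy_split(vocab, seed=42):
    # Seeded 50/50 split: same seed, same bucket, every time.
    rng = random.Random(seed)
    return {w: i for w, i in vocab.items() if rng.random() < 0.5}

vocab = {f"tok{i}": i for i in range(10)}
assert toy_split(vocab) == toy_split(vocab)  # reproducible split

# Mask the complementary bucket before sampling, as filter_logits does:
logits = torch.randn(len(vocab))
permissible = torch.tensor([i in toy_split(vocab).values() for i in range(len(vocab))])
logits[~permissible] = float('-inf')
print(torch.softmax(logits, dim=0))  # non-permissible tokens get probability 0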
watermark_detector.py
ADDED
@@ -0,0 +1,75 @@
+import nltk
+from nltk.corpus import stopwords
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+from vocabulary_split import split_vocabulary, filter_logits
+import torch
+from lcs import find_common_subsequences
+from paraphraser import generate_paraphrase
+
+nltk.download('punkt', quiet=True)
+nltk.download('stopwords', quiet=True)
+
+tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+
+permissible, _ = split_vocabulary(seed=42)
+permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
+
+def get_non_melting_points(original_sentence):
+    paraphrased_sentences = generate_paraphrase(original_sentence)
+    common_subsequences = find_common_subsequences(original_sentence, paraphrased_sentences)
+    return common_subsequences
+
+def get_word_between_points(sentence, start_point, end_point):
+    words = nltk.word_tokenize(sentence)
+    stop_words = set(stopwords.words('english'))
+    start_index = sentence.index(start_point[1])
+    end_index = sentence.index(end_point[1])
+
+    for word in words[start_index+1:end_index]:
+        if word.lower() not in stop_words:
+            return word, words.index(word)
+    return None, None
+
+def get_logits_for_mask(sentence):
+    inputs = tokenizer(sentence, return_tensors="pt")
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    logits = outputs.logits
+    mask_token_logits = logits[0, mask_token_index, :]
+    return mask_token_logits.squeeze()
+
+def detect_watermark(sentence):
+    non_melting_points = get_non_melting_points(sentence)
+
+    if len(non_melting_points) < 2:
+        return False, "Not enough non-melting points found."
+
+    word_to_check, index = get_word_between_points(sentence, non_melting_points[0], non_melting_points[1])
+
+    if word_to_check is None:
+        return False, "No suitable word found between non-melting points."
+
+    words = nltk.word_tokenize(sentence)
+    masked_sentence = ' '.join(words[:index] + ['[MASK]'] + words[index+1:])
+
+    logits = get_logits_for_mask(masked_sentence)
+    filtered_logits = filter_logits(logits, permissible_indices)
+
+    top_predictions = filtered_logits.argsort()[-5:]
+    predicted_words = [tokenizer.decode([i]) for i in top_predictions]
+
+    if word_to_check in predicted_words:
+        return True, f"Watermark detected. The word '{word_to_check}' is in the permissible vocabulary."
+    else:
+        return False, f"No watermark detected. The word '{word_to_check}' is not in the permissible vocabulary."
+
+# Example usage
+# if __name__ == "__main__":
+#     test_sentence = "The quick brown fox jumps over the lazy dog."
+#     is_watermarked, message = detect_watermark(test_sentence)
+#     print(f"Is the sentence watermarked? {is_watermarked}")
+#     print(f"Detection message: {message}")
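detect_watermark reduces to one decision rule: mask the chosen word, filter the masked-LM logits down to the permissible half of the vocabulary, and report a watermark if the word reappears among the top five filtered predictions. (One caveat in the file as committed: get_word_between_points slices the token list with a character offset from sentence.index, so the word it picks between the two anchor points is only approximate.) A toy sketch of the decision rule, with a made-up six-word vocabulary and hand-picked logits standing in for the real BERT outputs:

import torch

vocab = ["fox", "dog", "cat", "car", "sun", "sky"]      # hypothetical vocabulary
logits = torch.tensor([2.0, 1.5, 0.5, -1.0, 0.0, 1.0])  # hypothetical mask logits
permissible = torch.tensor([True, True, False, False, True, True])

filtered = logits.clone()
filtered[~permissible] = float('-inf')  # as filter_logits does
top5 = filtered.argsort()[-5:]          # indices of the five highest filtered logits
predicted_words = [vocab[i] for i in top5]

word_to_check = "fox"
print(word_to_check in predicted_words)  # True -> reported as watermarked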