# aiisc-watermarking-model / euclidean_distance.py
# BheemaShankerNeyigapula's picture
# Upload folder using huggingface_hub
# ea6afa4 verified
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import euclidean_distances
class SentenceEuclideanDistanceCalculator:
    """
    Calculate and analyze the Euclidean distance between an original sentence
    and a collection of paraphrased sentences.

    Sentences are embedded with a SentenceTransformer model; the distance of
    every paraphrase embedding to the original embedding is stored under the
    keys "Sentence_1", "Sentence_2", ... (in input order), together with a
    min-max normalized copy of those distances.
    """

    # Model loaded when the caller does not supply one.
    DEFAULT_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

    def __init__(self, original_sentence, paraphrased_sentences, model=None):
        """
        Initialize the calculator and eagerly compute all distances.

        Args:
            original_sentence: The reference sentence.
            paraphrased_sentences: Iterable of paraphrases to compare against
                the reference. A single string is treated as one paraphrase;
                an empty iterable (or None) yields empty result dictionaries.
            model: Optional, already-loaded embedding model exposing
                ``encode(...)``. When omitted, ``DEFAULT_MODEL_NAME`` is
                loaded, which is the original behavior. Passing a shared
                instance avoids reloading the model for every calculator.
        """
        self.original_sentence = original_sentence
        self.paraphrased_sentences = paraphrased_sentences

        # Load the SentenceTransformer model only if none was injected.
        if model is None:
            model = SentenceTransformer(self.DEFAULT_MODEL_NAME)
        self.model = model

        # Precompute the original sentence embedding.
        self.original_embedding = self.model.encode(original_sentence, convert_to_tensor=True)

        # Calculate Euclidean distances and normalize them.
        self.euclidean_distances = self._calculate_all_metrics()
        self.normalized_euclidean = self._normalize_dict(self.euclidean_distances)

    @staticmethod
    def _to_numpy(embedding):
        """
        Return ``embedding`` as a NumPy array.

        Objects exposing ``detach()`` (tensor-like) are detached and moved to
        the CPU first, because an accelerator-resident tensor cannot be
        converted to a NumPy array directly.
        """
        if hasattr(embedding, "detach"):
            embedding = embedding.detach().cpu().numpy()
        return np.asarray(embedding)

    def _calculate_all_metrics(self):
        """
        Calculate Euclidean distance between the original and each paraphrased sentence.

        Returns:
            dict mapping "Sentence_<n>" (1-based, input order) to the distance
            of that paraphrase from the original sentence. Empty when there
            are no paraphrases.
        """
        sentences = self.paraphrased_sentences
        if sentences is None:
            return {}
        # A bare string would otherwise be encoded as ONE embedding and then
        # iterated component by component.
        sentences = [sentences] if isinstance(sentences, str) else list(sentences)
        if not sentences:
            # Nothing to encode; the pairwise call below rejects empty input.
            return {}

        paraphrase_embeddings = self.model.encode(sentences, convert_to_tensor=True)

        # Convert to host-side 2-D arrays before handing them to scikit-learn.
        original = self._to_numpy(self.original_embedding).reshape(1, -1)
        paraphrases = self._to_numpy(paraphrase_embeddings).reshape(len(sentences), -1)

        # One batched call: row 0 holds the distance to every paraphrase.
        distance_row = euclidean_distances(original, paraphrases)[0]
        return {f"Sentence_{idx + 1}": dist for idx, dist in enumerate(distance_row)}

    def _normalize_dict(self, metric_dict):
        """
        Normalize the values in a dictionary to be between 0 and 1.

        Min-max scaling is used. If all values are equal, every value maps to
        0. An empty dictionary is returned unchanged as a new empty dict.
        """
        if not metric_dict:
            # min()/max() of an empty array would raise ValueError.
            return {}

        values = np.array(list(metric_dict.values()))
        min_val, max_val = values.min(), values.max()

        if max_val > min_val:
            normalized_values = (values - min_val) / (max_val - min_val)
        else:
            # Degenerate range: avoid division by zero.
            normalized_values = np.zeros_like(values)

        return dict(zip(metric_dict.keys(), normalized_values))

    def plot_metrics(self):
        """
        Plot the normalized Euclidean distances in a graph.

        The x axis is the 0-based position of each sentence; the line color
        is drawn at random on every call.
        """
        keys = list(self.normalized_euclidean.keys())
        indices = np.arange(len(keys))

        plt.figure(figsize=(12, 6))
        plt.plot(indices, [self.normalized_euclidean[key] for key in keys], marker='o', color=np.random.rand(3,))
        plt.xlabel('Sentence Index')
        plt.ylabel('Normalized Euclidean Distance (0-1)')
        plt.title('Normalized Euclidean Distance')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    # Getter methods
    def get_normalized_metrics(self):
        """
        Get the normalized Euclidean distances as a dictionary.
        """
        return self.normalized_euclidean