File size: 2,950 Bytes
ea6afa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import euclidean_distances

class SentenceEuclideanDistanceCalculator:
    """
    Calculate and analyze the Euclidean distance between an original sentence
    and a collection of paraphrased sentences.

    Every sentence is embedded with a SentenceTransformer model. The distance
    from each paraphrase embedding to the original embedding is stored under
    the key ``"Sentence_<n>"`` (1-based, in input order) in
    ``euclidean_distances``; a min-max normalized copy of those values is
    stored in ``normalized_euclidean``.
    """

    # Name of the model that is loaded when the caller does not supply one.
    DEFAULT_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

    def __init__(self, original_sentence, paraphrased_sentences, model=None):
        """
        Initialize the calculator and eagerly compute all distances.

        Args:
            original_sentence: The reference sentence.
            paraphrased_sentences: Iterable of sentences to compare against
                the reference. A bare string is treated as one paraphrase.
            model: Optional, already loaded embedding model exposing
                ``encode(sentences, convert_to_tensor=True)``. When omitted,
                ``DEFAULT_MODEL_NAME`` is loaded, which is what happened
                unconditionally before this argument existed.
        """
        self.original_sentence = original_sentence
        self.paraphrased_sentences = paraphrased_sentences

        # Re-using a caller-supplied model avoids reloading the weights for
        # every calculator instance.
        if model is None:
            model = SentenceTransformer(self.DEFAULT_MODEL_NAME)
        self.model = model

        # Precompute the original sentence embedding (kept in the form the
        # model returns it).
        self.original_embedding = self.model.encode(original_sentence, convert_to_tensor=True)

        # Calculate Euclidean distances and normalize them.
        self.euclidean_distances = self._calculate_all_metrics()
        self.normalized_euclidean = self._normalize_dict(self.euclidean_distances)

    @staticmethod
    def _to_numpy(embedding):
        """
        Return ``embedding`` as a NumPy array.

        Objects exposing ``detach`` (e.g. torch tensors) are detached and
        moved to the CPU first, so the conversion does not depend on which
        device the model produced them on.
        """
        if hasattr(embedding, 'detach'):
            embedding = embedding.detach().cpu().numpy()
        return np.asarray(embedding)

    @staticmethod
    def _distances_from(reference, candidates):
        """
        Return the Euclidean distance from ``reference`` to each row of
        ``candidates`` as a 1-D NumPy array.
        """
        reference_row = np.asarray(reference).reshape(1, -1)
        candidate_rows = np.atleast_2d(np.asarray(candidates))
        return np.linalg.norm(candidate_rows - reference_row, axis=1)

    def _calculate_all_metrics(self):
        """
        Calculate Euclidean distance between the original and each paraphrased sentence.

        Returns:
            dict mapping ``"Sentence_<n>"`` (1-based) to the distance as a
            Python float. Empty when there are no paraphrased sentences.
        """
        sentences = self.paraphrased_sentences
        # A bare string would otherwise be iterated character by character.
        sentences = [sentences] if isinstance(sentences, str) else list(sentences)
        if not sentences:
            # Nothing to encode; skip the model call entirely.
            return {}

        paraphrase_embeddings = self._to_numpy(
            self.model.encode(sentences, convert_to_tensor=True)
        )
        original = self._to_numpy(self.original_embedding)

        # One vectorized computation instead of one library call per sentence.
        distances = self._distances_from(original, paraphrase_embeddings).tolist()
        return {f"Sentence_{idx}": dist for idx, dist in enumerate(distances, start=1)}

    def _normalize_dict(self, metric_dict):
        """
        Normalize the values in a dictionary to be between 0 and 1 (min-max).

        Keys and their order are preserved. An empty dictionary yields an
        empty dictionary; when all values are equal every value maps to 0.
        """
        if not metric_dict:
            # min()/max() of an empty array would raise ValueError.
            return {}

        values = np.array(list(metric_dict.values()), dtype=float)
        min_val, max_val = values.min(), values.max()

        if max_val > min_val:
            normalized_values = (values - min_val) / (max_val - min_val)
        else:
            # Zero range: avoid dividing by zero.
            normalized_values = np.zeros_like(values)
        return dict(zip(metric_dict.keys(), normalized_values.tolist()))

    def plot_metrics(self):
        """
        Plot the normalized Euclidean distances in a graph.

        The x axis is the 0-based position of each sentence; the line color
        is drawn at random on every call.
        """
        keys = list(self.normalized_euclidean.keys())
        indices = np.arange(len(keys))

        plt.figure(figsize=(12, 6))
        plt.plot(indices, [self.normalized_euclidean[key] for key in keys], marker='o', color=np.random.rand(3,))
        plt.xlabel('Sentence Index')
        plt.ylabel('Normalized Euclidean Distance (0-1)')
        plt.title('Normalized Euclidean Distance')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    # Getter methods
    def get_normalized_metrics(self):
        """
        Get the normalized Euclidean distances as a dictionary.
        """
        return self.normalized_euclidean