import torch
from torch.nn.functional import cosine_similarity
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import numpy as np


def get_concreteness(prompts, word2score):
    """Return the mean word-concreteness score for each prompt.

    word2score maps a word to a concreteness rating; each rating is divided by 10
    before averaging. Prompts that contain no rated words get a default score of 0.10.
    """
    scores = []
    for prompt in prompts:
        conc_scores = [word2score[w] / 10 for w in prompt.split() if w in word2score]
        if len(conc_scores) < 1:
            scores.append(0.10)
        else:
            scores.append(np.mean(conc_scores))
    return scores
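
# Example with illustrative ratings (any word-to-score mapping works):
#   word2score = {'apple': 9.5, 'idea': 1.8}
#   get_concreteness(['a red apple', 'an abstract idea'], word2score)
#   -> [0.95, 0.18]   (unrated words such as 'a' or 'red' are skipped)
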
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def compute_cosine_similarity(embeddings_1, embeddings_2):
    # Compute cosine similarity between embeddings_1 and embeddings_2
    similarities = cosine_similarity(embeddings_1, embeddings_2)

    return similarities

class SentenceDataset(Dataset):
    """Thin Dataset wrapper around a list of sentences."""

    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        return self.sentences[index]

class Collate_t5:
    """Collate function that prepends the T5 'summarize: ' prefix before tokenizing."""

    def __init__(self, tokenizer):
        self.t5_tokenizer = tokenizer

    def __call__(self, documents):
        batch = ['summarize: ' + s for s in documents]
        # Tokenize sentences
        encoded_inputs = self.t5_tokenizer(batch, return_tensors="pt",
                                           add_special_tokens=True, padding='longest')
        return documents, encoded_inputs
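
# Illustrative wiring for Collate_t5 (it is not used elsewhere in this file);
# the 't5-base' checkpoint is only an example:
#   t5_tokenizer = AutoTokenizer.from_pretrained('t5-base')
#   loader = DataLoader(SentenceDataset(texts), batch_size=8,
#                       collate_fn=Collate_t5(t5_tokenizer))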

class collate_cl:
    """Collate function that tokenizes a batch of raw sentences with padding and truncation."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        # Tokenize sentences
        encoded_inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        return encoded_inputs

class mpnet_embed_class:
    """Wraps a sentence-transformers MPNet encoder for batched cosine-similarity scoring."""

    def __init__(self, device='cuda', nli=True):
        self.device = device

        # nli=True loads the NLI-tuned MPNet; otherwise the general-purpose all-mpnet model.
        if nli:
            model = AutoModel.from_pretrained('sentence-transformers/nli-mpnet-base-v2')
            tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/nli-mpnet-base-v2')
        else:
            model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')
            tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')

        model.to(device)
        self.model = model
        self.tokenizer = tokenizer
        self.collate_fn = collate_cl(tokenizer)
    def get_mpnet_embed_batch(self, predictions, ground_truth, batch_size=10):
        """Embed two parallel lists of sentences and return their pairwise cosine similarities.

        predictions and ground_truth must have the same length; their batches are zipped together.
        """
        dataset_1 = SentenceDataset(predictions)
        dataset_2 = SentenceDataset(ground_truth)

        dataloader_1 = DataLoader(dataset_1, batch_size=batch_size, collate_fn=self.collate_fn, num_workers=1)
        dataloader_2 = DataLoader(dataset_2, batch_size=batch_size, collate_fn=self.collate_fn, num_workers=1)

        # Compute token embeddings
        embeddings_1 = []
        embeddings_2 = []

        with torch.no_grad():
            for count, (batch_1, batch_2) in enumerate(zip(dataloader_1, dataloader_2)):
                if count % 50 == 0:
                    print(f'{count} out of {len(dataloader_2)} batches')
                batch_1 = {key: value.to(self.device) for key, value in batch_1.items()}
                batch_2 = {key: value.to(self.device) for key, value in batch_2.items()}

                model_output_1 = self.model(**batch_1)
                model_output_2 = self.model(**batch_2)

                sentence_embeddings_1 = mean_pooling(model_output_1, batch_1['attention_mask'])
                sentence_embeddings_2 = mean_pooling(model_output_2, batch_2['attention_mask'])

                embeddings_1.append(sentence_embeddings_1)
                embeddings_2.append(sentence_embeddings_2)

        # Concatenate embeddings
        embeddings_1 = torch.cat(embeddings_1)
        embeddings_2 = torch.cat(embeddings_2)

        # Normalize embeddings
        embeddings_1 = torch.nn.functional.normalize(embeddings_1, p=2, dim=1)
        embeddings_2 = torch.nn.functional.normalize(embeddings_2, p=2, dim=1)

        # Compute cosine similarity
        similarities = compute_cosine_similarity(embeddings_1, embeddings_2)

        # # Average cosine similarity
        # average_similarity = torch.mean(similarities)

        return similarities
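

# Minimal usage sketch (placeholder sentences; GPU optional).
if __name__ == '__main__':
    # Any two parallel lists of strings can be scored this way.
    predictions = ['a dog runs across a grassy field', 'a plate of pasta on a table']
    references = ['a dog sprints through the grass', 'a bowl of noodles served for dinner']

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    embedder = mpnet_embed_class(device=device, nli=True)
    similarities = embedder.get_mpnet_embed_batch(predictions, references, batch_size=2)

    print('pairwise cosine similarities:', similarities.cpu().tolist())
    print('mean similarity:', similarities.mean().item())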