Spaces:

DeepLearning101
/

IE101TW

Running

App Files Files Community

IE101TW / loss /similarity_loss.py

DeepLearning101

Upload 6 files

fdc4786 12 months ago

raw

history blame

No virus

3.6 kB

	# -- coding: utf-8 --
	# @Time : 2022/03/23 16:55
	# @Author : Jianing Wang
	# @Email : lygwjn@gmail.com
	# @File : SimilarityLoss.py
	# !/usr/bin/env python
	# coding=utf-8

	import torch
	from torch import nn, Tensor
	from transformers.models.bert.modeling_bert import BertModel
	from transformers import BertTokenizer, BertConfig


	class CosineSimilarityLoss(nn.Module):
	"""
	CosineSimilarityLoss expects, that the InputExamples consists of two texts and a float label.

	It computes the vectors u = model(input_text[0]) and v = model(input_text[1]) and measures the cosine-similarity between the two.
	By default, it minimizes the following loss: \|\|input_label - cos_score_transformation(cosine_sim(u,v))\|\|_2.

	:param loss_fct: Which pytorch loss function should be used to compare the cosine_similartiy(u,v) with the input_label? By default, MSE: \|\|input_label - cosine_sim(u,v)\|\|_2
	:param cos_score_transformation: The cos_score_transformation function is applied on top of cosine_similarity. By default, the identify function is used (i.e. no change).


	"""
	def __init__(self, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity()):
	super(CosineSimilarityLoss, self).__init__()
	self.loss_fct = loss_fct
	self.cos_score_transformation = cos_score_transformation


	def forward(self, rep_a, rep_b, label: Tensor):
	# rep_a: [batch_size, hidden_dim]
	# rep_b: [batch_size, hidden_dim]
	output = self.cos_score_transformation(torch.cosine_similarity(rep_a, rep_b))
	# print(output) # tensor([0.9925, 0.5846], grad_fn=<DivBackward0>), tensor(0.1709, grad_fn=<MseLossBackward0>)
	return self.loss_fct(output, label.view(-1))



	if __name__ == "__main__":
	# configure for huggingface pre-trained language models
	config = BertConfig.from_pretrained("bert-base-cased")
	# tokenizer for huggingface pre-trained language models
	tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
	# pytorch_model.bin for huggingface pre-trained language models
	model = BertModel.from_pretrained("bert-base-cased")
	# obtain two batch of examples, each corresponding example is a pair
	examples1 = ["Beijing is one of the biggest city in China.", "Disney film is well seeing for us."]
	examples2 = ["Shanghai is the largest city in east of China.", "ACL 2021 will be held in line due to COVID-19."]
	label = [1, 0]
	# convert each example for feature
	# {"input_ids": xxx, "attention_mask": xxx, "token_tuype_ids": xxx}
	features1 = tokenizer(examples1, add_special_tokens=True, padding=True)
	features2 = tokenizer(examples2, add_special_tokens=True, padding=True)
	# padding and convert to feature batch
	max_seq_lem = 24
	features1 = {key: torch.Tensor([value + [0] * (max_seq_lem - len(value)) for value in values]).long() for key, values in features1.items()}
	features2 = {key: torch.Tensor([value + [0] * (max_seq_lem - len(value)) for value in values]).long() for key, values in features2.items()}
	label = torch.Tensor(label).long()
	# obtain sentence embedding by averaged pooling
	rep_a = model(**features1)[0] # [batch_size, max_seq_len, hidden_dim]
	rep_b = model(**features2)[0] # [batch_size, max_seq_len, hidden_dim]
	rep_a = torch.mean(rep_a, -1) # [batch_size, hidden_dim]
	rep_b = torch.mean(rep_b, -1) # [batch_size, hidden_dim]
	# obtain contrastive loss
	loss_fn = CosineSimilarityLoss()
	loss = loss_fn(rep_a=rep_a, rep_b=rep_b, label=label)
	print(loss) # tensor(0.1709, grad_fn=<SumBackward0>)