Spaces:

BridgeAI-Lab
/

SemF1

Sleeping

App Files Files Community

SemF1 / encoder_models.py

nbansal

Major refactoring and added test cases

a249916 5 months ago

raw

history blame

3.9 kB

	import abc
	from typing import List, Union

	from numpy.typing import NDArray
	from sentence_transformers import SentenceTransformer

	from type_aliases import ENCODER_DEVICE_TYPE


	class Encoder(abc.ABC):
	    """Common interface for models that turn sentences into embedding vectors."""

	    @abc.abstractmethod
	    def encode(self, prediction: List[str]) -> NDArray:
	        """
	        Convert a batch of sentences into their embedding matrix.

	        Args:
	            prediction (List[str]): Sentences to embed.

	        Returns:
	            NDArray: Sentence embeddings with shape (num_sentences, embedding_dim).

	        Raises:
	            NotImplementedError: When a subclass delegates to this base
	                implementation instead of supplying its own.
	        """
	        message = "Method 'encode' must be implemented in subclass."
	        raise NotImplementedError(message)


	class USE(Encoder):
	    """Placeholder encoder (presumably for Universal Sentence Encoder); not implemented yet."""

	    def __init__(self):
	        """Create the placeholder; no model is loaded."""

	    def encode(self, prediction: List[str]) -> NDArray:
	        """
	        Unimplemented encoding hook.

	        Args:
	            prediction (List[str]): List of sentences to encode.

	        Raises:
	            NotImplementedError: Always. Failing here is preferable to silently
	                returning None where callers expect an embedding array.
	        """
	        raise NotImplementedError("USE encoder is not implemented.")


	class SBertEncoder(Encoder):
	    """Encoder that delegates to a ``SentenceTransformer`` model."""

	    def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
	        """
	        Initialize SBertEncoder instance.

	        Args:
	            model_name (str): Name or path of the Sentence Transformer model.
	            device (ENCODER_DEVICE_TYPE): Device specification for encoding. A list
	                selects multi-process encoding across the listed devices; any other
	                value is passed to the model as the single device to encode on.
	            batch_size (int): Batch size for encoding.
	            verbose (bool): Whether to show a progress bar during single-device encoding.
	        """
	        self.model = SentenceTransformer(model_name)
	        self.device = device
	        self.batch_size = batch_size
	        self.verbose = verbose

	    def encode(self, prediction: List[str]) -> NDArray:
	        """
	        Encode a list of sentences into sentence embeddings.

	        Args:
	            prediction (List[str]): List of sentences to encode.

	        Returns:
	            NDArray: Array of sentence embeddings with shape (num_sentences, embedding_dim).
	        """

	        # SBert output is always Batch x Dim
	        if isinstance(self.device, list):
	            # Use multiprocess encoding for list of devices
	            pool = self.model.start_multi_process_pool(target_devices=self.device)
	            try:
	                embeddings = self.model.encode_multi_process(
	                    prediction,
	                    pool=pool,
	                    batch_size=self.batch_size,
	                )
	            finally:
	                # Always shut the pool down, even when encoding fails, so the
	                # started pool is not left running.
	                self.model.stop_multi_process_pool(pool)
	        else:
	            # Single device encoding
	            embeddings = self.model.encode(
	                prediction,
	                device=self.device,
	                batch_size=self.batch_size,
	                show_progress_bar=self.verbose,
	            )

	        return embeddings


	def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool) -> Encoder:
	    """
	    Build the encoder that corresponds to ``model_name``.

	    Args:
	        model_name (str): Model selector. The alias "use" is resolved to
	            "sentence-transformers/use-cmlm-multilingual"; every other value is
	            forwarded unchanged to SBertEncoder as the model name.
	        device (ENCODER_DEVICE_TYPE): Device specification handed to the encoder
	            (e.g., "cuda", 0 for GPU, "cpu").
	        batch_size (int): Batch size handed to the encoder.
	        verbose (bool): Verbosity flag handed to the encoder.

	    Returns:
	        Encoder: An SBertEncoder configured for the resolved model name.
	    """
	    # TODO: change this when changing the TF model
	    if model_name == "use":
	        resolved_name = "sentence-transformers/use-cmlm-multilingual"
	    else:
	        resolved_name = model_name

	    return SBertEncoder(resolved_name, device, batch_size, verbose)