import abc
from typing import List, Union

from numpy.typing import NDArray
from sentence_transformers import SentenceTransformer

from .type_aliases import ENCODER_DEVICE_TYPE


class Encoder(abc.ABC):
    def encode(self, prediction: List[str]) -> NDArray:
        """
        Abstract method to encode a list of sentences into sentence embeddings.

        Args:
            prediction (List[str]): List of sentences to encode.

        Returns:
            NDArray: Array of sentence embeddings with shape (num_sentences, embedding_dim).

        Raises:
            NotImplementedError: If the method is not implemented in the subclass.
        """
        raise NotImplementedError("Method 'encode' must be implemented in subclass.")

class SBertEncoder(Encoder):
    def __init__(self, model: SentenceTransformer, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
        """
        Initialize SBertEncoder instance.

        Args:
            model (SentenceTransformer): The Sentence Transformer model instance to use for encoding.
            device (Union[str, int, List[Union[str, int]]]): Device specification for encoding.
            batch_size (int): Batch size for encoding.
            verbose (bool): Whether to print verbose information during encoding.
        """
        self.model = model
        self.device = device
        self.batch_size = batch_size
        self.verbose = verbose

    def encode(self, prediction: List[str]) -> NDArray:
        """
        Encode a list of sentences into sentence embeddings.

        Args:
            prediction (List[str]): List of sentences to encode.

        Returns:
            NDArray: Array of sentence embeddings with shape (num_sentences, embedding_dim).
        """
        # SBert output is always Batch x Dim
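        # When `device` is a list (e.g. ["cuda:0", "cuda:1"]), sentence-transformers
        # spreads the sentences over one worker process per device and stitches the
        # per-chunk results back together, so both branches below return the same
        # (num_sentences, embedding_dim) array.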
        if isinstance(self.device, list):
            # Use multi-process encoding for a list of devices
            pool = self.model.start_multi_process_pool(target_devices=self.device)
            embeddings = self.model.encode_multi_process(prediction, pool=pool, batch_size=self.batch_size)
            self.model.stop_multi_process_pool(pool)
        else:
            # Single-device encoding
            embeddings = self.model.encode(
                prediction,
                device=self.device,
                batch_size=self.batch_size,
                show_progress_bar=self.verbose,
            )

        return embeddings

def get_encoder(
    sbert_model: SentenceTransformer,
    device: ENCODER_DEVICE_TYPE,
    batch_size: int,
    verbose: bool,
) -> Encoder:
    """
    Get an instance of SBertEncoder using the provided parameters.

    Args:
        sbert_model (SentenceTransformer): An instance of SentenceTransformer model to use for encoding.
        device (Union[str, int, List[Union[str, int]]]): Device specification for the encoder
            (e.g., "cuda", 0 for GPU, "cpu").
        batch_size (int): Batch size to use for encoding.
        verbose (bool): Whether to print verbose information during encoding.

    Returns:
        SBertEncoder: Encoder instance wrapping the provided SentenceTransformer model.

    Example:
        >>> model_name = "paraphrase-distilroberta-base-v1"
        >>> sbert_model = get_sbert_encoder(model_name)
        >>> device = get_gpu("cuda")
        >>> batch_size = 32
        >>> verbose = True
        >>> encoder = get_encoder(sbert_model, device, batch_size, verbose)
    """
    encoder = SBertEncoder(sbert_model, device, batch_size, verbose)
    return encoder

def get_sbert_encoder(model_name: str) -> SentenceTransformer:
    """
    Get an instance of SentenceTransformer encoder based on the specified model name.

    Args:
        model_name (str): Name of the model to instantiate. You can use any model on Huggingface/SentenceTransformer
            that is supported by SentenceTransformer.

    Returns:
        SentenceTransformer: Instance of the selected encoder based on the model_name.

    Raises:
        EnvironmentError: If an unsupported model_name is provided.
        RuntimeError: If there's an issue during instantiation of the encoder.
    """
    try:
        encoder = SentenceTransformer(model_name, trust_remote_code=True)
    except EnvironmentError as err:
        raise EnvironmentError(str(err)) from None
    except Exception as err:
        raise RuntimeError(str(err)) from None

    return encoder
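

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the module's public API. The checkpoint
# name "all-MiniLM-L6-v2" and the "cpu" device below are illustrative
# assumptions; substitute any SentenceTransformer checkpoint, a CUDA device,
# or a list of devices for multi-process encoding. Because of the relative
# import above, run this as a module (python -m <package>.<this_module>)
# rather than as a standalone script.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sentences = ["The cat sat on the mat.", "A quick brown fox jumps over the lazy dog."]

    # Load the checkpoint and wrap it in the encoder abstraction defined above.
    sbert_model = get_sbert_encoder("all-MiniLM-L6-v2")
    encoder = get_encoder(sbert_model, device="cpu", batch_size=32, verbose=False)

    # `encode` returns an array of shape (num_sentences, embedding_dim).
    embeddings = encoder.encode(sentences)
    print(embeddings.shape)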