SemF1 / encoder_models.py
nbansal's picture
Major refactoring and added test cases
a249916
raw
history blame
3.9 kB
import abc
from typing import List, Union
from numpy.typing import NDArray
from sentence_transformers import SentenceTransformer
from type_aliases import ENCODER_DEVICE_TYPE
class Encoder(abc.ABC):
    """Common interface for sentence encoders.

    Concrete encoders subclass this and provide their own ``encode``.
    """

    @abc.abstractmethod
    def encode(self, prediction: List[str]) -> NDArray:
        """
        Turn a batch of sentences into their embedding vectors.

        Args:
            prediction (List[str]): Sentences to embed.

        Returns:
            NDArray: Sentence embeddings with shape (num_sentences, embedding_dim).

        Raises:
            NotImplementedError: When a subclass delegates to this base
                implementation instead of supplying its own.
        """
        message = "Method 'encode' must be implemented in subclass."
        raise NotImplementedError(message)
class USE(Encoder):
    """Placeholder encoder that has no implementation yet.

    NOTE(review): the name presumably stands for Universal Sentence Encoder;
    confirm before wiring in a real model.
    """

    def __init__(self):
        """Create the placeholder; no model is loaded."""
        pass

    def encode(self, prediction: List[str]) -> NDArray:
        """
        Encode a list of sentences into sentence embeddings (not implemented).

        Args:
            prediction (List[str]): List of sentences to encode.

        Returns:
            NDArray: Never returns; see Raises.

        Raises:
            NotImplementedError: Always. Failing loudly here is preferable to
                silently returning ``None`` where an embedding array is
                expected.
        """
        raise NotImplementedError("USE encoder is not implemented yet.")
class SBertEncoder(Encoder):
    """Encoder backed by a ``SentenceTransformer`` model."""

    def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
        """
        Initialize SBertEncoder instance.

        Args:
            model_name (str): Name or path of the Sentence Transformer model.
            device (Union[str, int, List[Union[str, int]]]): Device specification for encoding.
                A list selects multi-process encoding across the listed devices;
                anything else is used as a single device.
            batch_size (int): Batch size for encoding.
            verbose (bool): Whether to show a progress bar during encoding.
                Only applied on the single-device path.
        """
        self.model = SentenceTransformer(model_name)
        self.device = device
        self.batch_size = batch_size
        self.verbose = verbose

    def encode(self, prediction: List[str]) -> NDArray:
        """
        Encode a list of sentences into sentence embeddings.

        Args:
            prediction (List[str]): List of sentences to encode.

        Returns:
            NDArray: Array of sentence embeddings with shape (num_sentences, embedding_dim).
        """
        # SBert output is always Batch x Dim
        if isinstance(self.device, list):
            # Use multiprocess encoding for list of devices
            pool = self.model.start_multi_process_pool(target_devices=self.device)
            try:
                embeddings = self.model.encode_multi_process(prediction, pool=pool, batch_size=self.batch_size)
            finally:
                # Always stop the pool, even when encoding fails, so the
                # worker processes are not left running.
                self.model.stop_multi_process_pool(pool)
        else:
            # Single device encoding
            embeddings = self.model.encode(
                prediction,
                device=self.device,
                batch_size=self.batch_size,
                show_progress_bar=self.verbose,
            )
        return embeddings
def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool) -> Encoder:
    """
    Build the encoder for the requested model.

    Args:
        model_name (str): Model to instantiate. The alias "use" is mapped to
            "sentence-transformers/use-cmlm-multilingual"; any other value is
            forwarded unchanged as the Sentence Transformer model name or path.
            NOTE(review): short aliases such as "pv1" / "stsb" are not resolved
            here; presumably the caller translates them first. Confirm.
        device (Union[str, int, List[Union[str, int]]]): Device specification for the encoder
            (e.g., "cuda", 0 for GPU, "cpu").
        batch_size (int): Batch size for encoding.
        verbose (bool): Verbosity flag forwarded to the encoder.

    Returns:
        Encoder: An SBertEncoder configured with the resolved model name.
    """
    # TODO: change this when changing the TF model (eventually: return USE())
    if model_name == "use":
        resolved_name = "sentence-transformers/use-cmlm-multilingual"
    else:
        resolved_name = model_name
    return SBertEncoder(resolved_name, device, batch_size, verbose)