import abc
from typing import List, Union

from numpy.typing import NDArray
from sentence_transformers import SentenceTransformer

from .type_aliases import ENCODER_DEVICE_TYPE


class Encoder(abc.ABC):
    def encode(self, prediction: List[str]) -> NDArray:
        """
        Abstract method to encode a list of sentences into sentence embeddings.

        Args:
            prediction (List[str]): List of sentences to encode.

        Returns:
            NDArray: Array of sentence embeddings with shape (num_sentences, embedding_dim).

        Raises:
            NotImplementedError: If the method is not implemented in the subclass.
        """
        raise NotImplementedError("Method 'encode' must be implemented in subclass.")

class SBertEncoder(Encoder):
    def __init__(self, model: SentenceTransformer, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
        """
        Initialize SBertEncoder instance.

        Args:
            model (SentenceTransformer): The Sentence Transformer model instance to use for encoding.
            device (Union[str, int, List[Union[str, int]]]): Device specification for encoding.
            batch_size (int): Batch size for encoding.
            verbose (bool): Whether to print verbose information during encoding.
        """
        self.model = model
        self.device = device
        self.batch_size = batch_size
        self.verbose = verbose

    def encode(self, prediction: List[str]) -> NDArray:
        """
        Encode a list of sentences into sentence embeddings.

        Args:
            prediction (List[str]): List of sentences to encode.

        Returns:
            NDArray: Array of sentence embeddings with shape (num_sentences, embedding_dim).
        """
        # SBert output is always Batch x Dim
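        # When `device` is a list (e.g. ["cuda:0", "cuda:1"]), sentence-transformers
        # spreads the sentences over one worker process per device and stitches the
        # per-chunk results back together, so both branches below return the same
        # (num_sentences, embedding_dim) array.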
        if isinstance(self.device, list):
            # Use multi-process encoding for a list of devices
            pool = self.model.start_multi_process_pool(target_devices=self.device)
            embeddings = self.model.encode_multi_process(prediction, pool=pool, batch_size=self.batch_size)
            self.model.stop_multi_process_pool(pool)
        else:
            # Single-device encoding
            embeddings = self.model.encode(
                prediction,
                device=self.device,
                batch_size=self.batch_size,
                show_progress_bar=self.verbose,
            )

        return embeddings

def get_encoder(
    sbert_model: SentenceTransformer,
    device: ENCODER_DEVICE_TYPE,
    batch_size: int,
    verbose: bool,
) -> Encoder:
    """
    Get an instance of SBertEncoder using the provided parameters.

    Args:
        sbert_model (SentenceTransformer): An instance of SentenceTransformer model to use for encoding.
        device (Union[str, int, List[Union[str, int]]]): Device specification for the encoder
            (e.g., "cuda", 0 for GPU, "cpu").
        batch_size (int): Batch size to use for encoding.
        verbose (bool): Whether to print verbose information during encoding.

    Returns:
        SBertEncoder: Encoder instance wrapping the provided SentenceTransformer model.

    Example:
        >>> model_name = "paraphrase-distilroberta-base-v1"
        >>> sbert_model = get_sbert_encoder(model_name)
        >>> device = get_gpu("cuda")
        >>> batch_size = 32
        >>> verbose = True
        >>> encoder = get_encoder(sbert_model, device, batch_size, verbose)
    """
    encoder = SBertEncoder(sbert_model, device, batch_size, verbose)
    return encoder

def get_sbert_encoder(model_name: str) -> SentenceTransformer:
    """
    Get an instance of SentenceTransformer encoder based on the specified model name.

    Args:
        model_name (str): Name of the model to instantiate. You can use any model on Huggingface/SentenceTransformer
            that is supported by SentenceTransformer.

    Returns:
        SentenceTransformer: Instance of the selected encoder based on the model_name.

    Raises:
        EnvironmentError: If an unsupported model_name is provided.
        RuntimeError: If there's an issue during instantiation of the encoder.
    """
    try:
        encoder = SentenceTransformer(model_name, trust_remote_code=True)
    except EnvironmentError as err:
        raise EnvironmentError(str(err)) from None
    except Exception as err:
        raise RuntimeError(str(err)) from None

    return encoder
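

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the module's public API. The checkpoint
# name "all-MiniLM-L6-v2" and the "cpu" device below are illustrative
# assumptions; substitute any SentenceTransformer checkpoint, a CUDA device,
# or a list of devices for multi-process encoding. Because of the relative
# import above, run this as a module (python -m <package>.<this_module>)
# rather than as a standalone script.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sentences = ["The cat sat on the mat.", "A quick brown fox jumps over the lazy dog."]

    # Load the checkpoint and wrap it in the encoder abstraction defined above.
    sbert_model = get_sbert_encoder("all-MiniLM-L6-v2")
    encoder = get_encoder(sbert_model, device="cpu", batch_size=32, verbose=False)

    # `encode` returns an array of shape (num_sentences, embedding_dim).
    embeddings = encoder.encode(sentences)
    print(embeddings.shape)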