Spaces:

BridgeAI-Lab
/

SemF1

Sleeping

App Files Files Community

nbansal committed on Jun 20, 2024

Commit

a249916

1 Parent(s): de5dcb7

Major refactoring and added test cases

Browse files

Files changed (6) hide show

.gitignore +1 -0
encoder_models.py +108 -0
semf1.py +74 -69
tests.py +179 -17
type_aliases.py +10 -0
utils.py +78 -10

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

encoder_models.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import abc
+from typing import List, Union
+from numpy.typing import NDArray
+from sentence_transformers import SentenceTransformer
+from type_aliases import ENCODER_DEVICE_TYPE
+class Encoder(abc.ABC):
+    """Common interface for models that turn sentences into sentence embeddings."""
+    @abc.abstractmethod
+    def encode(self, prediction: List[str]) -> NDArray:
+        """
+            Encode a list of sentences into sentence embeddings.
+            Concrete encoders must override this method.
+            Args:
+                prediction (List[str]): Sentences to encode.
+            Returns:
+                NDArray: Sentence embeddings, expected shape (num_sentences, embedding_dim).
+            Raises:
+                NotImplementedError: If a subclass delegates to this base implementation.
+        """
+        message = "Method 'encode' must be implemented in subclass."
+        raise NotImplementedError(message)
+class USE(Encoder):
+    """Placeholder for a Universal Sentence Encoder backend; nothing is implemented yet."""
+    def __init__(self):
+        # No model is loaded: this class is only a stub for now.
+        return None
+    def encode(self, prediction: List[str]) -> NDArray:
+        # Stub: no embeddings are produced, the call simply returns None.
+        return None
+class SBertEncoder(Encoder):
+    """Encoder backed by a `sentence_transformers.SentenceTransformer` model."""
+    def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
+        """
+            Initialize SBertEncoder instance.
+            Args:
+                model_name (str): Name or path of the Sentence Transformer model.
+                device (Union[str, int, List[Union[str, int]]]): Device specification for encoding.
+                    A list selects multi-process encoding with one worker per listed device.
+                batch_size (int): Batch size for encoding.
+                verbose (bool): Whether to show a progress bar during single-device encoding.
+        """
+        self.model = SentenceTransformer(model_name)
+        self.device = device
+        self.batch_size = batch_size
+        self.verbose = verbose
+    def encode(self, prediction: List[str]) -> NDArray:
+        """
+           Encode a list of sentences into sentence embeddings.
+           Args:
+               prediction (List[str]): List of sentences to encode.
+           Returns:
+               NDArray: Array of sentence embeddings with shape (num_sentences, embedding_dim).
+        """
+        # SBert output is always Batch x Dim
+        if isinstance(self.device, list):
+            # Use multiprocess encoding for list of devices
+            pool = self.model.start_multi_process_pool(target_devices=self.device)
+            try:
+                embeddings = self.model.encode_multi_process(prediction, pool=pool, batch_size=self.batch_size)
+            finally:
+                # Always shut the worker processes down, even if encoding fails,
+                # so a failed call does not leave orphaned processes behind.
+                self.model.stop_multi_process_pool(pool)
+        else:
+            # Single device encoding
+            embeddings = self.model.encode(
+                prediction,
+                device=self.device,
+                batch_size=self.batch_size,
+                show_progress_bar=self.verbose,
+            )
+        return embeddings
+def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool) -> Encoder:
+    """
+        Build the encoder instance for the specified model name.
+        Args:
+            model_name (str): Name of the model to instantiate.
+                The special value "use" is mapped to the
+                "sentence-transformers/use-cmlm-multilingual" checkpoint; any other value
+                is handed to SBertEncoder unchanged. This function itself does not
+                validate the name.
+            device (Union[str, int, List[Union[str, int]]]): Device specification for the encoder
+                (e.g., "cuda", 0 for GPU, "cpu").
+            batch_size (int): Batch size for encoding.
+            verbose (bool): Verbosity flag forwarded to the encoder.
+        Returns:
+            Encoder: An SBertEncoder instance (also for "use", until a dedicated USE encoder exists).
+        """
+    # TODO: change this when changing the TF model (a dedicated USE() encoder would be returned for "use")
+    resolved_name = "sentence-transformers/use-cmlm-multilingual" if model_name == "use" else model_name
+    return SBertEncoder(resolved_name, device, batch_size, verbose)

semf1.py CHANGED Viewed

@@ -14,21 +14,19 @@
 # TODO: Add test cases, Remove tokenize_sentences flag since it can be determined from the input itself.
 """Sem-F1 metric"""
-import abc
-import sys
-from typing import List, Optional, Tuple, Union
 import datasets
 import evaluate
 import nltk
 import numpy as np
 from numpy.typing import NDArray
-from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
-import torch
-from tqdm import tqdm
-from utils import is_list_of_strings_at_depth, Scores, slice_embeddings, flatten_list
 _CITATION = """\
 @inproceedings{bansal-etal-2022-sem,
@@ -123,80 +121,80 @@ Examples:
     [0.77, 0.56]
 """
-_PREDICTION_TYPE = Union[List[str], List[List[str]]]
-_REFERENCE_TYPE = Union[List[str], List[List[str]], List[List[List[str]]]]
-class Encoder(metaclass=abc.ABCMeta):
-    @abc.abstractmethod
-    def encode(self, prediction: List[str]) -> NDArray:
-        pass
-class USE(Encoder):
-    def __init__(self):
-        pass
-    def encode(self, prediction: List[str]) -> NDArray:
-        pass
-class SBertEncoder(Encoder):
-    def __init__(self, model_name: str, device: Union[str, int], batch_size: int):
-        self.model = SentenceTransformer(model_name)
-        self.device = device
-        self.batch_size = batch_size
-    def encode(self, prediction: List[str]) -> NDArray:
-        """Returns sentence embeddings of dim: Batch x Dim"""
-        # SBert output is always Batch x Dim
-        return self.model.encode(prediction, device=self.device, batch_size=self.batch_size)
-def _get_encoder(model_name: str, device: Union[str, int], batch_size: int) -> Encoder:
-    if model_name == "use":
-        return SBertEncoder(model_name, device, batch_size)
-        # return USE()  # TODO: This will change depending on PyTorch USE VS TF USE model
-    else:
-        return SBertEncoder(model_name, device, batch_size)
-def _compute_cosine_similarity(pred_embeds: NDArray, ref_embeds: NDArray) -> Tuple[float, float]:
-    cosine_scores = cosine_similarity(pred_embeds, ref_embeds)
-    precision_per_sentence_sim = np.max(cosine_scores, axis=-1)
-    recall_per_sentence_sim = np.max(cosine_scores, axis=0)
-    return np.mean(precision_per_sentence_sim).item(), np.mean(recall_per_sentence_sim).item()
-def _get_gpu(gpu: Union[bool, int]) -> Union[str, int]:
-    # Ensure gpu index is within the range of total available gpus
-    gpu_available = torch.cuda.is_available()
-    if gpu_available:
-        gpu_count = torch.cuda.device_count()
-        if isinstance(gpu, int) and gpu >= gpu_count:
-            raise ValueError(
-                f"There are {gpu_count} gpus available. Provide the correct gpu index. You provided: {gpu}"
-            )
-    # get the device
-    if gpu is False:
-        device = "cpu"
-    elif gpu is True and gpu_available:
-        device = 0  # by default run on device 0
-    elif isinstance(gpu, int):
-        device = gpu
-    else:  # This will never happen
-        raise ValueError(f"gpu must be bool or int. Provided value: {gpu}")
-    return device
-def _validate_input_format(
-        tokenize_sentences: bool,
-        multi_references: bool,
-        predictions: _PREDICTION_TYPE,
-        references: _REFERENCE_TYPE,
-):
     if tokenize_sentences and multi_references:
         condition = is_list_of_strings_at_depth(predictions, 1) and is_list_of_strings_at_depth(references, 2)
     elif not tokenize_sentences and multi_references:
@@ -215,7 +213,7 @@ class SemF1(evaluate.Metric):
     _MODEL_TYPE_TO_NAME = {
         "pv1": "paraphrase-distilroberta-base-v1",
         "stsb": "stsb-roberta-large",
-        "use": "sentence-transformers/use-cmlm-multilingual",  # TODO: check PyTorch USE VS TF USE
     }
     def _info(self):
@@ -275,7 +273,7 @@ class SemF1(evaluate.Metric):
     def _get_model_name(self, model_type: Optional[str] = None) -> str:
         if model_type is None:
-            model_type = "pv1"  # TODO: Change it to use
         if model_type not in self._MODEL_TYPE_TO_NAME.keys():
             raise ValueError(f"Provide a correct model_type.\n"
@@ -291,7 +289,6 @@ class SemF1(evaluate.Metric):
         # if not nltk.data.find("tokenizers/punkt"):  # TODO: check why it is not working
         #     pass
     def _compute(
             self,
             predictions,
@@ -299,8 +296,9 @@ class SemF1(evaluate.Metric):
             model_type: Optional[str] = None,
             tokenize_sentences: bool = True,
             multi_references: bool = False,
-            gpu: Union[bool, int] = False,
             batch_size: int = 32,
     ) -> List[Scores]:
         """
             Compute precision, recall, and F1 scores for given predictions and references.
@@ -308,10 +306,15 @@ class SemF1(evaluate.Metric):
             :param predictions
             :param references
             :param model_type: Type of model to use for encoding.
             :param tokenize_sentences: Flag to sentence tokenize the document.
             :param multi_references: Flag to indicate multiple references.
             :param gpu: GPU device to use.
             :param batch_size: Batch size for encoding.
             :return: List of Scores dataclass with precision, recall, and F1 scores.
         """
@@ -320,11 +323,13 @@ class SemF1(evaluate.Metric):
         _validate_input_format(tokenize_sentences, multi_references, predictions, references)
         # Get GPU
-        device = _get_gpu(gpu)
         # Get the encoder model
         model_name = self._get_model_name(model_type)
-        encoder = _get_encoder(model_name, device=device, batch_size=batch_size)
         # We'll handle the single reference and multi-reference case same way. So change the data format accordingly
         if not multi_references:

 # TODO: Add test cases, Remove tokenize_sentences flag since it can be determined from the input itself.
 """Sem-F1 metric"""
+from functools import partial
+from typing import List, Optional, Tuple
 import datasets
 import evaluate
 import nltk
 import numpy as np
 from numpy.typing import NDArray
 from sklearn.metrics.pairwise import cosine_similarity
+from encoder_models import get_encoder
+from type_aliases import DEVICE_TYPE, PREDICTION_TYPE, REFERENCE_TYPE
+from utils import is_nested_list_of_type, Scores, slice_embeddings, flatten_list, get_gpu
 _CITATION = """\
 @inproceedings{bansal-etal-2022-sem,
     [0.77, 0.56]
 """
+def _compute_cosine_similarity(pred_embeds: NDArray, ref_embeds: NDArray) -> Tuple[float, float]:
+    """
+        Derive precision and recall from the cosine similarities of two embedding sets.
+        Args:
+            pred_embeds (NDArray): Predicted embeddings (shape: [num_pred, embedding_dim]).
+            ref_embeds (NDArray): Reference embeddings (shape: [num_ref, embedding_dim]).
+        Returns:
+            Tuple[float, float]: (precision, recall) where
+                precision is the mean, over predictions, of the best-matching reference similarity and
+                recall is the mean, over references, of the best-matching prediction similarity.
+        """
+    # Pairwise similarity matrix: one row per prediction, one column per reference
+    similarity = cosine_similarity(pred_embeds, ref_embeds)
+    # Best match along each row (per prediction) and along each column (per reference)
+    best_per_prediction = similarity.max(axis=-1)
+    best_per_reference = similarity.max(axis=0)
+    return best_per_prediction.mean().item(), best_per_reference.mean().item()
+def _validate_input_format(
+        tokenize_sentences: bool,
+        multi_references: bool,
+        predictions: PREDICTION_TYPE,
+        references: REFERENCE_TYPE,
+):
+    """
+        Validate the format of predictions and references based on specified criteria.
+        Args:
+        - tokenize_sentences (bool): Flag indicating whether sentences should be tokenized.
+        - multi_references (bool): Flag indicating whether multiple references are provided.
+        - predictions (PREDICTION_TYPE): Predictions to validate.
+        - references (REFERENCE_TYPE): References to validate.
+        Raises:
+        - ValueError: If the format of predictions or references does not meet the specified criteria.
+        Validation Criteria:
+        The function validates predictions and references based on the following conditions:
+        1. If `tokenize_sentences` is True and `multi_references` is True:
+           - Predictions must be a list of strings (`is_list_of_strings_at_depth(predictions, 1)`).
+           - References must be a list of list of strings (`is_list_of_strings_at_depth(references, 2)`).
+        2. If `tokenize_sentences` is False and `multi_references` is True:
+           - Predictions must be a list of list of strings (`is_list_of_strings_at_depth(predictions, 2)`).
+           - References must be a list of list of list of strings (`is_list_of_strings_at_depth(references, 3)`).
+        3. If `tokenize_sentences` is True and `multi_references` is False:
+           - Predictions must be a list of strings (`is_list_of_strings_at_depth(predictions, 1)`).
+           - References must be a list of strings (`is_list_of_strings_at_depth(references, 1)`).
+        4. If `tokenize_sentences` is False and `multi_references` is False:
+           - Predictions must be a list of list of strings (`is_list_of_strings_at_depth(predictions, 2)`).
+           - References must be a list of list of strings (`is_list_of_strings_at_depth(references, 2)`).
+        The function checks these conditions and raises a ValueError if any condition is not met,
+        indicating that predictions or references are not in the valid input format.
+        Note:
+        - `PREDICTION_TYPE` and `REFERENCE_TYPE` are defined at the top of the file
+    """
+    is_list_of_strings_at_depth = partial(is_nested_list_of_type, element_type=str)
     if tokenize_sentences and multi_references:
         condition = is_list_of_strings_at_depth(predictions, 1) and is_list_of_strings_at_depth(references, 2)
     elif not tokenize_sentences and multi_references:
     _MODEL_TYPE_TO_NAME = {
         "pv1": "paraphrase-distilroberta-base-v1",
         "stsb": "stsb-roberta-large",
+        "use": "use",  # "sentence-transformers/use-cmlm-multilingual",  # TODO: check PyTorch USE VS TF USE
     }
     def _info(self):
     def _get_model_name(self, model_type: Optional[str] = None) -> str:
         if model_type is None:
+            model_type = "use"
         if model_type not in self._MODEL_TYPE_TO_NAME.keys():
             raise ValueError(f"Provide a correct model_type.\n"
         # if not nltk.data.find("tokenizers/punkt"):  # TODO: check why it is not working
         #     pass
     def _compute(
             self,
             predictions,
             model_type: Optional[str] = None,
             tokenize_sentences: bool = True,
             multi_references: bool = False,
+            gpu: DEVICE_TYPE = False,
             batch_size: int = 32,
+            verbose: bool = False,
     ) -> List[Scores]:
         """
             Compute precision, recall, and F1 scores for given predictions and references.
             :param predictions
             :param references
             :param model_type: Type of model to use for encoding.
+                Options: [pv1, stsb, use]
+                    pv1 - paraphrase-distilroberta-base-v1 (Default)
+                    stsb - stsb-roberta-large
+                    use - Universal Sentence Encoder
             :param tokenize_sentences: Flag to sentence tokenize the document.
             :param multi_references: Flag to indicate multiple references.
             :param gpu: GPU device to use.
             :param batch_size: Batch size for encoding.
+            :param verbose: Flag to indicate verbose output.
             :return: List of Scores dataclass with precision, recall, and F1 scores.
         """
         _validate_input_format(tokenize_sentences, multi_references, predictions, references)
         # Get GPU
+        device = get_gpu(gpu)
+        if verbose:
+            print(f"Using devices: {device}")
         # Get the encoder model
         model_name = self._get_model_name(model_type)
+        encoder = get_encoder(model_name, device=device, batch_size=batch_size, verbose=verbose)
         # We'll handle the single reference and multi-reference case same way. So change the data format accordingly
         if not multi_references:

tests.py CHANGED Viewed

@@ -1,17 +1,179 @@
-test_cases = [
-    {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
-    },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]

+import statistics
+import unittest
+import numpy as np
+import torch
+from sentence_transformers import SentenceTransformer
+from encoder_models import SBertEncoder, get_encoder
+from utils import get_gpu, slice_embeddings, is_nested_list_of_type, flatten_list, compute_f1, Scores
+class TestUtils(unittest.TestCase):
+    """Unit tests for the helper functions and the Scores dataclass in utils.py."""
+    def _assert_array_lists_equal(self, actual, expected):
+        """Assert that two flat lists of arrays have the same length and element-wise equal arrays."""
+        self.assertEqual(len(actual), len(expected))
+        for actual_array, expected_array in zip(actual, expected):
+            self.assertTrue(np.array_equal(actual_array, expected_array))
+    def test_get_gpu(self):
+        gpu_count = torch.cuda.device_count()
+        gpu_available = torch.cuda.is_available()
+        # Test single boolean input
+        self.assertEqual(get_gpu(True), 0 if gpu_available else "cpu")
+        self.assertEqual(get_gpu(False), "cpu")
+        # Test single string input
+        self.assertEqual(get_gpu("cpu"), "cpu")
+        self.assertEqual(get_gpu("gpu"), 0 if gpu_available else "cpu")
+        self.assertEqual(get_gpu("cuda"), 0 if gpu_available else "cpu")
+        # Test list input without integer indices (valid on any machine)
+        self.assertEqual(get_gpu([True, "cpu", "gpu"]), [0, "cpu"] if gpu_available else ["cpu", "cpu", "cpu"])
+        # Test list input with all integers (an empty list when no GPU is present)
+        self.assertEqual(get_gpu(list(range(gpu_count))),
+                         list(range(gpu_count)) if gpu_available else gpu_count * ["cpu"])
+        # Integer indices are only meaningful when at least one GPU exists, so the
+        # index-based expectations are checked only on GPU machines.
+        if gpu_available:
+            # Test single integer input
+            self.assertEqual(get_gpu(0), 0)
+            if gpu_count > 1:
+                self.assertEqual(get_gpu(1), 1)
+            # Test list input with unique elements
+            self.assertEqual(get_gpu([True, "cpu", 0]), [0, "cpu"])
+            # Test list input with duplicate elements
+            self.assertEqual(get_gpu([0, 0, "gpu"]), [0])
+            # Test list input with duplicate elements of different types
+            self.assertEqual(get_gpu([True, 0, "gpu"]), [0])
+            # The first out-of-range index must be rejected
+            with self.assertRaises(ValueError):
+                get_gpu(gpu_count)
+        with self.assertRaises(ValueError):
+            get_gpu("invalid")
+    def test_slice_embeddings(self):
+        embeddings = np.random.rand(10, 5)
+        num_sentences = [3, 2, 5]
+        expected_output = [embeddings[:3], embeddings[3:5], embeddings[5:]]
+        self._assert_array_lists_equal(slice_embeddings(embeddings, num_sentences), expected_output)
+        num_sentences_nested = [[2, 1], [3, 4]]
+        expected_output_nested = [[embeddings[:2], embeddings[2:3]], [embeddings[3:6], embeddings[6:]]]
+        # assertTrue(actual, expected) would treat `expected` as the failure message and
+        # never compare anything, so the nested result is compared group by group.
+        actual_output_nested = slice_embeddings(embeddings, num_sentences_nested)
+        self.assertEqual(len(actual_output_nested), len(expected_output_nested))
+        for actual_group, expected_group in zip(actual_output_nested, expected_output_nested):
+            self._assert_array_lists_equal(actual_group, expected_group)
+        with self.assertRaises(TypeError):
+            slice_embeddings(embeddings, "invalid")
+    def test_is_nested_list_of_type(self):
+        # Test case: Depth 0, single element matching element_type
+        self.assertTrue(is_nested_list_of_type("test", str, 0))
+        # Test case: Depth 0, single element not matching element_type
+        self.assertFalse(is_nested_list_of_type("test", int, 0))
+        # Test case: Depth 1, list of elements matching element_type
+        self.assertTrue(is_nested_list_of_type(["apple", "banana"], str, 1))
+        # Test case: Depth 1, list of elements not matching element_type
+        self.assertFalse(is_nested_list_of_type([1, 2, 3], str, 1))
+        # Test case: Depth 0 (Wrong), list of elements matching element_type
+        self.assertFalse(is_nested_list_of_type([1, 2, 3], str, 0))
+        # Depth 2
+        self.assertTrue(is_nested_list_of_type([[1, 2], [3, 4]], int, 2))
+        self.assertTrue(is_nested_list_of_type([['1', '2'], ['3', '4']], str, 2))
+        self.assertFalse(is_nested_list_of_type([[1, 2], ["a", "b"]], int, 2))
+        # Depth 3
+        self.assertFalse(is_nested_list_of_type([[[1], [2]], [[3], [4]]], list, 3))
+        self.assertTrue(is_nested_list_of_type([[[1], [2]], [[3], [4]]], int, 3))
+        with self.assertRaises(ValueError):
+            is_nested_list_of_type([1, 2], int, -1)
+    def test_flatten_list(self):
+        self.assertEqual(flatten_list([1, [2, 3], [[4], 5]]), [1, 2, 3, 4, 5])
+        self.assertEqual(flatten_list([]), [])
+        self.assertEqual(flatten_list([1, 2, 3]), [1, 2, 3])
+        self.assertEqual(flatten_list([[[[1]]]]), [1])
+    def test_compute_f1(self):
+        self.assertAlmostEqual(compute_f1(0.5, 0.5), 0.5)
+        self.assertAlmostEqual(compute_f1(1, 0), 0.0)
+        self.assertAlmostEqual(compute_f1(0, 1), 0.0)
+        self.assertAlmostEqual(compute_f1(1, 1), 1.0)
+    def test_scores(self):
+        scores = Scores(precision=0.8, recall=[0.7, 0.9])
+        self.assertAlmostEqual(scores.f1, compute_f1(0.8, statistics.fmean([0.7, 0.9])))
+class TestSBertEncoder(unittest.TestCase):
+    """Tests for SBertEncoder construction and for single- and multi-device encoding."""
+    def setUp(self, device=None):
+        # unittest calls setUp() without arguments; the multi-device test calls it
+        # again with an explicit device list to rebuild the encoder.
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
+        self.model_name = "stsb-roberta-large"
+        self.batch_size = 8
+        self.verbose = False
+        self.encoder = SBertEncoder(self.model_name, self.device, self.batch_size, self.verbose)
+    def test_initialization(self):
+        encoder = self.encoder
+        self.assertIsInstance(encoder.model, SentenceTransformer)
+        self.assertEqual(encoder.device, self.device)
+        self.assertEqual(encoder.batch_size, self.batch_size)
+        self.assertEqual(encoder.verbose, self.verbose)
+    def test_encode_single_device(self):
+        sentences = ["This is a test sentence.", "Here is another sentence."]
+        embeddings = self.encoder.encode(sentences)
+        embedding_dim = self.encoder.model.get_sentence_embedding_dimension()
+        self.assertIsInstance(embeddings, np.ndarray)
+        self.assertEqual(embeddings.shape[0], len(sentences))
+        self.assertEqual(embeddings.shape[1], embedding_dim)
+    def test_encode_multi_device(self):
+        # skipTest raises, so everything below runs only with two or more GPUs
+        if torch.cuda.device_count() < 2:
+            self.skipTest("Multi-GPU test requires at least 2 GPUs.")
+        self.setUp(["cuda:0", "cuda:1"])
+        sentences = ["This is a test sentence.", "Here is another sentence.", "This is a test sentence."]
+        embeddings = self.encoder.encode(sentences)
+        embedding_dim = self.encoder.model.get_sentence_embedding_dimension()
+        self.assertIsInstance(embeddings, np.ndarray)
+        self.assertEqual(embeddings.shape[0], 3)
+        self.assertEqual(embeddings.shape[1], embedding_dim)
+class TestGetEncoder(unittest.TestCase):
+    """Tests for the get_encoder factory."""
+    def _build(self, model_name):
+        """Create an encoder with the shared test settings; returns (encoder, device, batch_size, verbose)."""
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        batch_size = 8
+        verbose = False
+        return get_encoder(model_name, device, batch_size, verbose), device, batch_size, verbose
+    def test_get_sbert_encoder(self):
+        encoder, device, batch_size, verbose = self._build("stsb-roberta-large")
+        self.assertIsInstance(encoder, SBertEncoder)
+        self.assertEqual(encoder.device, device)
+        self.assertEqual(encoder.batch_size, batch_size)
+        self.assertEqual(encoder.verbose, verbose)
+    def test_get_use_encoder(self):
+        encoder, _, _, _ = self._build("use")
+        self.assertIsInstance(encoder, SBertEncoder)  # SBertEncoder is returned for "use" for now
+        # TODO: once the USE class is implemented, assert isinstance(encoder, USE) and
+        # check its model_name, device, batch_size and verbose attributes instead.
+if __name__ == '__main__':
+    unittest.main()

type_aliases.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from typing import List, Union
+from numpy.typing import NDArray
+# Sentence counts used to slice a batch of embeddings: a flat list of ints or a list of int lists.
+NumSentencesType = Union[List[int], List[List[int]]]
+# Result of slicing embeddings: a flat list of arrays or a list of array lists.
+EmbeddingSlicesType = Union[List[NDArray], List[List[NDArray]]]
+# Metric predictions: a list of strings or a list of string lists.
+PREDICTION_TYPE = Union[List[str], List[List[str]]]
+# Metric references: like predictions, with one optional extra nesting level.
+REFERENCE_TYPE = Union[List[str], List[List[str]], List[List[List[str]]]]
+# User-facing device specification accepted by utils.get_gpu (bool, str, int, or a list of str/int).
+DEVICE_TYPE = Union[bool, str, int, List[Union[str, int]]]
+# Device specification returned by utils.get_gpu and passed on to the encoders.
+ENCODER_DEVICE_TYPE = Union[str, int, List[Union[str, int]]]

utils.py CHANGED Viewed

@@ -1,13 +1,81 @@
-from dataclasses import dataclass
 import statistics
 import sys
 from typing import List, Union
 from numpy.typing import NDArray
-NumSentencesType = Union[List[int], List[List[int]]]
-EmbeddingSlicesType = Union[List[NDArray], List[List[NDArray]]]
 def slice_embeddings(embeddings: NDArray, num_sentences: NumSentencesType) -> EmbeddingSlicesType:
@@ -22,10 +90,10 @@ def slice_embeddings(embeddings: NDArray, num_sentences: NumSentencesType) -> Em
         result, _ = _slice_embeddings(0, num_sentences)
         return result
     elif isinstance(num_sentences, list) and all(
-        isinstance(sublist, list) and all(
-            isinstance(item, int) for item in sublist
-        )
-        for sublist in num_sentences
     ):
         nested_result = []
         start_idx = 0
@@ -38,11 +106,11 @@ def slice_embeddings(embeddings: NDArray, num_sentences: NumSentencesType) -> Em
         raise TypeError(f"Incorrect Type for {num_sentences=}")
-def is_list_of_strings_at_depth(obj, depth: int) -> bool:
     if depth == 0:
-        return isinstance(obj, str)
     elif depth > 0:
-        return isinstance(obj, list) and all(is_list_of_strings_at_depth(item, depth - 1) for item in obj)
     else:
         raise ValueError("Depth can't be negative")

 import statistics
 import sys
+from dataclasses import dataclass
 from typing import List, Union
+import torch
 from numpy.typing import NDArray
+from type_aliases import DEVICE_TYPE, ENCODER_DEVICE_TYPE, NumSentencesType, EmbeddingSlicesType
+def get_gpu(gpu: DEVICE_TYPE) -> ENCODER_DEVICE_TYPE:
+    """
+        Determine the correct GPU device based on the provided input. In the following, output 0 means CUDA device 0.
+        Args:
+            gpu (Union[bool, str, int, List[Union[str, int]]]): Input specifying the GPU device(s):
+                - bool: If True, returns 0 if CUDA is available, otherwise returns "cpu".
+                - str: Can be "cpu", "gpu", or "cuda" (case-insensitive). Returns 0 if CUDA is available
+                  and the input is not "cpu", otherwise returns "cpu".
+                - int: A non-negative GPU index. Returns the index if CUDA is available and the index
+                  is in range; returns "cpu" if CUDA is not available.
+                - List[Union[str, int]]: List containing combinations of the above. Each element is
+                  resolved individually; repeated GPU indices are dropped, "cpu" entries are kept.
+        Returns:
+            Union[str, int, List[Union[str, int]]]: Depending on the input type:
+                - str: Returns "cpu" if no GPU is available or the input is "cpu".
+                - int: Returns the GPU index if valid and CUDA is available.
+                - List[Union[str, int]]: Returns a list of strings and/or integers based on the input list.
+        Raises:
+            ValueError: If the input gpu type is not bool, str, or int.
+            ValueError: If a string input is not one of ["cpu", "gpu", "cuda"].
+            ValueError: If an integer input is negative, or is not smaller than the number of
+                available GPUs while CUDA is available.
+        Notes:
+            - This function checks CUDA availability using torch.cuda.is_available() and counts
+              available GPUs using torch.cuda.device_count().
+            - Case insensitivity is maintained for string inputs ("cpu", "gpu", "cuda").
+        """
+    gpu_available = torch.cuda.is_available()
+    gpu_count = torch.cuda.device_count()
+    correct_strs = ["cpu", "gpu", "cuda"]
+    def _get_single_device(gpu_item):
+        # bool is checked before int because bool is a subclass of int
+        if isinstance(gpu_item, bool):
+            return 0 if gpu_item and gpu_available else "cpu"
+        elif isinstance(gpu_item, str):
+            if gpu_item.lower() not in correct_strs:
+                raise ValueError(f"Wrong gpu type: {gpu_item}. Should be one of {correct_strs}")
+            return 0 if (gpu_item.lower() != "cpu") and gpu_available else "cpu"
+        elif isinstance(gpu_item, int):
+            # A negative index is never a valid device, regardless of the hardware
+            if gpu_item < 0:
+                raise ValueError(f"GPU index can't be negative. You provided: {gpu_item}")
+            # Without CUDA there is no index range to validate against: fall back to CPU
+            # instead of rejecting every index because gpu_count is 0.
+            if not gpu_available:
+                return "cpu"
+            if gpu_item >= gpu_count:
+                raise ValueError(
+                    f"There are {gpu_count} GPUs available. Provide a valid GPU index. You provided: {gpu_item}"
+                )
+            return gpu_item
+        else:
+            raise ValueError(f"Invalid gpu type: {type(gpu_item)}. Must be bool, str, or int.")
+    if isinstance(gpu, list):
+        # Keep the first occurrence of each GPU index; "cpu" entries are never de-duplicated
+        seen_indices = set()
+        result = []
+        for item in gpu:
+            device = _get_single_device(item)
+            if isinstance(device, int):
+                if device not in seen_indices:
+                    seen_indices.add(device)
+                    result.append(device)
+            else:
+                result.append(device)
+        return result
+    else:
+        return _get_single_device(gpu)
 def slice_embeddings(embeddings: NDArray, num_sentences: NumSentencesType) -> EmbeddingSlicesType:
         result, _ = _slice_embeddings(0, num_sentences)
         return result
     elif isinstance(num_sentences, list) and all(
+            isinstance(sublist, list) and all(
+                isinstance(item, int) for item in sublist
+            )
+            for sublist in num_sentences
     ):
         nested_result = []
         start_idx = 0
         raise TypeError(f"Incorrect Type for {num_sentences=}")
+def is_nested_list_of_type(lst_obj, element_type, depth: int) -> bool:
+    """
+        Check that an object is a list nested exactly `depth` levels deep whose leaves are `element_type`.
+        Args:
+            lst_obj: Object to inspect. At depth 0 it is the leaf itself.
+            element_type: Type (or tuple of types) every leaf must be an instance of.
+            depth (int): Expected nesting depth; 0 means `lst_obj` is a leaf, 1 a flat list, and so on.
+        Returns:
+            bool: True if every level down to `depth` is a list and every leaf matches `element_type`.
+        Raises:
+            ValueError: If a negative depth is reached.
+    """
+    if depth < 0:
+        raise ValueError("Depth can't be negative")
+    if depth == 0:
+        return isinstance(lst_obj, element_type)
+    if not isinstance(lst_obj, list):
+        return False
+    # Every child must satisfy the same check one level further down
+    for item in lst_obj:
+        if not is_nested_list_of_type(item, element_type, depth - 1):
+            return False
+    return True