Spaces:

BridgeAI-Lab
/

SemF1

Sleeping

App Files Files Community

nbansal committed on May 30, 2024

Commit

61ff8d5

1 Parent(s): dfd7508

Made it work with gpu and multi-references and some optimizations

Browse files

Files changed (2) hide show

requirements.txt +1 -0
semf1.py +173 -25

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 git+https://github.com/huggingface/evaluate@main
 scikit-learn
 sentence-transformers

 git+https://github.com/huggingface/evaluate@main
+nltk
 scikit-learn
 sentence-transformers

semf1.py CHANGED Viewed

@@ -11,20 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# TODO: Add test cases
 """SEM-F1 metric"""
 import abc
 import sys
-from typing import List, Optional, Tuple
 import datasets
 import evaluate
 import numpy as np
 from numpy.typing import NDArray
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 _CITATION = """\
 @inproceedings{bansal-etal-2022-sem,
@@ -103,18 +104,23 @@ class USE(Encoder):
 class SBertEncoder(Encoder):
-    def __init__(self, model_name: str):
         self.model = SentenceTransformer(model_name)
     def encode(self, prediction: List[str]) -> NDArray:
-        return self.model.encode(prediction)
-def _get_encoder(model_name: str):
     if model_name == "use":
-        return USE()
     else:
-        return SBertEncoder(model_name)
 def _compute_f1(p, r, eps=sys.float_info.epsilon):
@@ -140,7 +146,7 @@ class SemF1(evaluate.Metric):
     _MODEL_TYPE_TO_NAME = {
         "pv1": "paraphrase-distilroberta-base-v1",
         "stsb": "stsb-roberta-large",
-        "use": "use",
     }
     def _info(self):
@@ -151,19 +157,56 @@ class SemF1(evaluate.Metric):
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Sequence(datasets.Value("string", id="sequence"), id="predictions"),
-                'references': datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
-            }),
             # # Homepage of the module for documentation
             # Additional links to the codebase or references
             reference_urls=["https://aclanthology.org/2022.emnlp-main.49/"]
         )
     def _get_model_name(self, model_type: Optional[str] = None) -> str:
-        # TODO: make it work with USE as well
         if model_type is None:
-            model_type = "pv1"  # Change it to use
         if model_type not in self._MODEL_TYPE_TO_NAME.keys():
             raise ValueError(f"Provide a correct model_type.\n"
@@ -172,21 +215,126 @@ class SemF1(evaluate.Metric):
         return self._MODEL_TYPE_TO_NAME[model_type]
-    def _compute(self, predictions, references, model_type: Optional[str] = None):
         model_name = self._get_model_name(model_type)
-        encoder = _get_encoder(model_name)
         precisions = [0] * len(predictions)
         recalls = [0] * len(predictions)
         f1_scores = [0] * len(predictions)
-        for idx, (preds, refs) in enumerate(zip(predictions, references)):
-            pred_embeddings = encoder.encode(preds)
-            ref_embeddings = encoder.encode(refs)
-            p, r = _compute_cosine_similarity(pred_embeddings, ref_embeddings)
-            f1 = _compute_f1(p, r)
-            precisions[idx] = p
-            recalls[idx] = r
-            f1_scores[idx] = f1
         return {"precision": precisions, "recall": recalls, "f1": f1_scores}

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# TODO: Add test cases, Provide an option to pass batch size when computing the embeddings
 """SEM-F1 metric"""
 import abc
 import sys
+from typing import List, Optional, Tuple, Union
 import datasets
 import evaluate
+import nltk
 import numpy as np
 from numpy.typing import NDArray
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
+import torch
 _CITATION = """\
 @inproceedings{bansal-etal-2022-sem,
 class SBertEncoder(Encoder):
+    """Sentence encoder backed by a ``SentenceTransformer`` model.
+    Args:
+        model_name: Name handed to ``SentenceTransformer`` to load the model.
+        device: Device forwarded to ``SentenceTransformer.encode`` on every call.
+        batch_size: Batch size forwarded to ``SentenceTransformer.encode``. Defaults to 32
+            (the same default ``SemF1._compute`` uses) so callers that only pass
+            ``model_name`` and ``device`` keep working.
+    """
+    def __init__(self, model_name: str, device: Union[str, int], batch_size: int = 32):
         self.model = SentenceTransformer(model_name)
+        self.device = device
+        self.batch_size = batch_size
     def encode(self, prediction: List[str]) -> NDArray:
+        """Returns sentence embeddings of dim: Batch x Dim"""
+        # SBert output is always Batch x Dim
+        return self.model.encode(prediction, device=self.device, batch_size=self.batch_size)
+def _get_encoder(model_name: str, device: Union[str, int], batch_size: int = 32) -> Encoder:
+    """Build the sentence encoder for ``model_name``.
+    Args:
+        model_name: Model identifier; ``"use"`` is reserved for a dedicated USE encoder.
+        device: Device the encoder should run on.
+        batch_size: Batch size used when encoding. Defaults to 32 so that callers which
+            do not pass it still get a working encoder.
+    Returns:
+        An ``Encoder``; currently always an ``SBertEncoder``.
+    """
     if model_name == "use":
+        # TODO: This will change depending on PyTorch USE VS TF USE model (return USE()).
+        # batch_size must be forwarded here as well, exactly like the generic branch.
+        return SBertEncoder(model_name, device, batch_size)
     else:
+        return SBertEncoder(model_name, device, batch_size)
 def _compute_f1(p, r, eps=sys.float_info.epsilon):
     _MODEL_TYPE_TO_NAME = {
         "pv1": "paraphrase-distilroberta-base-v1",
         "stsb": "stsb-roberta-large",
+        "use": "sentence-transformers/use-cmlm-multilingual",  # TODO: check PyTorch USE VS TF USE
     }
     def _info(self):
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
+            features=[
+                # Multi References: False, Tokenize_Sentences = False
+                datasets.Features(
+                    {
+                        # predictions: List[List[str]] - List of predictions where prediction is a list of sentences
+                        "predictions": datasets.Sequence(datasets.Value("string", id="sequence"), id="predictions"),
+                        # references: List[List[str]] - List of references where each reference is a list of sentences
+                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
+                    }
+                ),
+                # Multi References: False, Tokenize_Sentences = True
+                datasets.Features(
+                    {
+                        # predictions: List[str] - List of predictions
+                        "predictions": datasets.Value("string", id="sequence"),
+                        # references: List[str] - List of documents
+                        "references": datasets.Value("string", id="sequence"),
+                    }
+                ),
+                # Multi References: True, Tokenize_Sentences = False
+                datasets.Features(
+                    {
+                        # predictions: List[List[str]] - List of predictions where prediction is a list of sentences
+                        "predictions": datasets.Sequence(datasets.Value("string", id="sequence"), id="predictions"),
+                        # references: List[List[List[str]]] - List of multi-references.
+                        #                                     So each "reference" is also a list (r1, r2, ...).
+                        #                                     Further, each ri's are also list of sentences.
+                        "references": datasets.Sequence(
+                            datasets.Sequence(datasets.Value("string", id="sequence"), id="ref"), id="references"),
+                    }
+                ),
+                # Multi References: True, Tokenize_Sentences = True
+                datasets.Features(
+                    {
+                        # predictions: List[str] - List of predictions
+                        "predictions": datasets.Value("string", id="sequence"),
+                        # references: List[List[str]] - List of multi-references.
+                        #                               So each "reference" is a list (r1, r2, ...) of documents.
+                        "references": datasets.Sequence(datasets.Value("string", id="ref"), id="references"),
+                    }
+                ),
+            ],
             # # Homepage of the module for documentation
             # Additional links to the codebase or references
             reference_urls=["https://aclanthology.org/2022.emnlp-main.49/"]
         )
     def _get_model_name(self, model_type: Optional[str] = None) -> str:
         if model_type is None:
+            model_type = "pv1"  # TODO: Change it to use
         if model_type not in self._MODEL_TYPE_TO_NAME.keys():
             raise ValueError(f"Provide a correct model_type.\n"
         return self._MODEL_TYPE_TO_NAME[model_type]
+    def _download_and_prepare(self, dl_manager):
+        """Optional: download external resources useful to compute the scores.
+        Makes sure the NLTK "punkt" sentence tokenizer data is available, downloading it
+        only when it cannot be found locally.
+        """
+        import nltk
+        # nltk.data.find() raises LookupError for a missing resource instead of returning a
+        # falsy value, so the lookup has to be wrapped rather than tested with `if not`.
+        try:
+            nltk.data.find("tokenizers/punkt")
+        except LookupError:
+            nltk.download("punkt", quiet=True)
+    def _compute(
+            self,
+            predictions,
+            references,
+            model_type: Optional[str] = None,
+            tokenize_sentences: bool = True,
+            gpu: Union[bool, int] = False,
+            batch_size: int = 32,
+    ):
+        """Compute SEM-F1 precision, recall and F1 for every prediction/reference pair.
+        Args:
+            predictions: One entry per example. A document string when ``tokenize_sentences``
+                is True, otherwise a list of sentences.
+            references: One entry per example, shaped like a prediction (single reference)
+                or a list of such entries (multi-reference). The mode is inferred from the
+                nesting of ``references[0]``.
+            model_type: Key resolved through ``_get_model_name``; ``None`` selects its default.
+            tokenize_sentences: If True, documents are split with ``nltk.tokenize.sent_tokenize``.
+            gpu: ``False`` runs on CPU, ``True`` runs on CUDA device 0, an int selects that
+                CUDA device index.
+            batch_size: Batch size forwarded to the encoder.
+        Returns:
+            Dict with keys "precision", "recall" and "f1", each a list with one score per
+            prediction.
+        Raises:
+            ValueError: If ``gpu`` is not a bool/int, names an unavailable device, or the
+                predictions are already sentence-split while ``tokenize_sentences`` is True.
+        """
+        # Resolve the device. bool is tested before int because bool is a subclass of int:
+        # otherwise gpu=True would be validated as index 1 and rejected on single-GPU machines.
+        gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
+        if isinstance(gpu, bool):
+            if gpu and gpu_count == 0:
+                raise ValueError("gpu=True was requested but no CUDA device is available.")
+            device = 0 if gpu else "cpu"  # by default run on device 0
+        elif isinstance(gpu, int):
+            # Ensure gpu index is within the range of total available gpus
+            if not 0 <= gpu < gpu_count:
+                raise ValueError(
+                    f"There are {gpu_count} gpus available. Provide the correct gpu index. You provided: {gpu}"
+                )
+            device = gpu
+        else:
+            raise ValueError(f"gpu must be bool or int. Provided value: {gpu}")
+        # Nothing to score: return early instead of failing on predictions[0] below
+        if len(predictions) == 0:
+            return {"precision": [], "recall": [], "f1": []}
+        # TODO: Also have a check on references to ensure they are also in correct format
+        # Ensure prediction documents are not already tokenized if tokenize_sentences is True
+        if not isinstance(predictions[0], str) and tokenize_sentences:
+            raise ValueError(f"Each prediction/reference should be a document i.e. when tokenize_sentences is True. "
+                             f"Currently, each prediction is of type {type(predictions[0])} ")
+        # Check single reference or multi-reference case
+        multi_references = False
+        if tokenize_sentences:
+            # references: List[List[reference]]
+            if isinstance(references[0], list) and isinstance(references[0][0], str):
+                multi_references = True
+        else:
+            # references: List[List[List[sentence]]]
+            if (
+                    isinstance(references[0], list) and
+                    isinstance(references[0][0], list) and
+                    isinstance(references[0][0][0], str)
+            ):
+                multi_references = True
+        # Get the encoder model
         model_name = self._get_model_name(model_type)
+        encoder = _get_encoder(model_name, device=device, batch_size=batch_size)
+        # Init output scores
         precisions = [0] * len(predictions)
         recalls = [0] * len(predictions)
         f1_scores = [0] * len(predictions)
+        # Compute Score in case of single reference
+        if not multi_references:
+            for idx, (pred, ref) in enumerate(zip(predictions, references)):
+                # Sentence Tokenize prediction and reference
+                if tokenize_sentences:
+                    ref = nltk.tokenize.sent_tokenize(ref)  # List[str]
+                    pred = nltk.tokenize.sent_tokenize(pred)  # List[str]
+                pred_sent_count = len(pred)
+                # Encode prediction and reference sentences in one call, then split them back
+                embeddings = encoder.encode(pred + ref)
+                pred_embeddings = embeddings[:pred_sent_count]
+                ref_embeddings = embeddings[pred_sent_count:]
+                p, r = _compute_cosine_similarity(pred_embeddings, ref_embeddings)
+                f1 = _compute_f1(p, r)
+                precisions[idx] = p
+                recalls[idx] = r
+                f1_scores[idx] = f1
+        else:
+            # Compute Score in case of multiple reference
+            for idx, (pred, refs) in enumerate(zip(predictions, references)):
+                # Sentence Tokenize prediction and reference
+                if tokenize_sentences:
+                    refs = [nltk.tokenize.sent_tokenize(ref) for ref in refs]  # List[List[str]]
+                    pred = nltk.tokenize.sent_tokenize(pred)  # List[str]
+                ref_count = len(refs)
+                pred_sent_count = len(pred)
+                # Offsets of each reference inside the flattened sentence list (leading 0 = start)
+                ref_sent_counts = [0] + [len(ref) for ref in refs]
+                cumsum_ref_sent_counts = np.cumsum(ref_sent_counts)
+                # Flatten in linear time (sum(refs, []) re-copies the accumulator on every step)
+                all_sentences = pred + [sentence for ref in refs for sentence in ref]
+                embeddings = encoder.encode(all_sentences)
+                pred_embeddings = embeddings[:pred_sent_count]
+                ref_embeddings = [
+                    embeddings[pred_sent_count + cumsum_ref_sent_counts[c_idx]:
+                               pred_sent_count + cumsum_ref_sent_counts[c_idx + 1]]
+                    for c_idx in range(ref_count)
+                ]
+                # Precision: Concatenate all the sentences in all the references
+                concat_ref_embeddings = np.concatenate(ref_embeddings, axis=0)
+                p, _ = _compute_cosine_similarity(pred_embeddings, concat_ref_embeddings)
+                # Recall: Compute individually for each reference
+                scores = [_compute_cosine_similarity(r_embeds, pred_embeddings) for r_embeds in ref_embeddings]
+                r = np.mean([r_scores for (r_scores, _) in scores]).item()
+                f1 = _compute_f1(p, r)
+                precisions[idx] = p  # TODO: check why idx says invalid type
+                recalls[idx] = r
+                f1_scores[idx] = f1
         return {"precision": precisions, "recall": recalls, "f1": f1_scores}