nbansal committed on
Commit
251bfda
·
1 Parent(s): 23122f2

Support other SentenceTransformer models as well and update the documentation accordingly

Browse files
Files changed (4) hide show
  1. README.md +11 -3
  2. encoder_models.py +19 -20
  3. semf1.py +12 -8
  4. tests.py +35 -24
README.md CHANGED
@@ -53,6 +53,10 @@ Sem-F1 also accepts multiple optional arguments:
53
  - `pv1` - [paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)
54
  - `stsb` - [stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)
55
  - `use` - [Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) (Default)
 
 
 
 
56
  - `tokenize_sentences (bool)`: Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
57
  - `multi_references (bool)`: Flag to indicate whether multiple references are provided. Default: False.
58
  - `gpu (Union[bool, str, int, List[Union[str, int]]])`: Whether to use GPU, CPU or multiple-processes for computation.
@@ -79,10 +83,14 @@ List of `Scores` dataclass corresponding to each sample -
79
  - `f1: float`: F1 score (between precision and average recall).
80
 
81
 
82
- ## Future Extensions
83
  Currently, we have only implemented the 3 encoders* that we experimented with in our
84
- [paper](https://aclanthology.org/2022.emnlp-main.49/). However, it can easily with extended for more models by simply
85
- extending the `Encoder` base class. (Refer to `encoder_models.py` file).
 
 
 
 
86
 
87
  `*` *In our paper, we used the TensorFlow [version](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
88
  of the USE model, however, in our current implementation, we used [PyTorch version](https://huggingface.co/sentence-transformers/use-cmlm-multilingual).*
 
53
  - `pv1` - [paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)
54
  - `stsb` - [stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)
55
  - `use` - [Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) (Default)
56
+
57
+ Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer
58
+ such as `all-mpnet-base-v2` or `roberta-base`
59
+
60
  - `tokenize_sentences (bool)`: Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
61
  - `multi_references (bool)`: Flag to indicate whether multiple references are provided. Default: False.
62
  - `gpu (Union[bool, str, int, List[Union[str, int]]])`: Whether to use GPU, CPU or multiple-processes for computation.
 
83
  - `f1: float`: F1 score (between precision and average recall).
84
 
85
 
86
+ ## Extensions
87
  Currently, we have only implemented the 3 encoders* that we experimented with in our
88
+ [paper](https://aclanthology.org/2022.emnlp-main.49/). Furthermore, you can use any model on
89
+ Huggingface/SentenceTransformer that is supported by SentenceTransformer such as `all-mpnet-base-v2` or `roberta-base`.
90
+
91
+ If you want to use your own encoder model, make sure that it is supported by `SentenceTransformer`. Or, if it's a
92
+ completely new architecture, support for it can easily be added by extending the `Encoder` base class (refer to
93
+ `encoder_models.py` file).
94
 
95
  `*` *In our paper, we used the TensorFlow [version](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
96
  of the USE model, however, in our current implementation, we used [PyTorch version](https://huggingface.co/sentence-transformers/use-cmlm-multilingual).*
encoder_models.py CHANGED
@@ -25,14 +25,6 @@ class Encoder(abc.ABC):
25
  raise NotImplementedError("Method 'encode' must be implemented in subclass.")
26
 
27
 
28
- class USE(Encoder):
29
- def __init__(self):
30
- pass
31
-
32
- def encode(self, prediction: List[str]) -> NDArray:
33
- pass
34
-
35
-
36
  class SBertEncoder(Encoder):
37
  def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
38
  """
@@ -44,7 +36,7 @@ class SBertEncoder(Encoder):
44
  batch_size (int): Batch size for encoding.
45
  verbose (bool): Whether to print verbose information during encoding.
46
  """
47
- self.model = SentenceTransformer(model_name)
48
  self.device = device
49
  self.batch_size = batch_size
50
  self.verbose = verbose
@@ -84,10 +76,13 @@ def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, v
84
 
85
  Args:
86
  model_name (str): Name of the model to instantiate
87
- Options: [pv1, stsb, use]
88
- pv1 - paraphrase-distilroberta-base-v1 (Default)
89
- stsb - stsb-roberta-large
90
- use - Universal Sentence Encoder
 
 
 
91
  device (Union[str, int, List[Union[str, int]]): Device specification for the encoder
92
  (e.g., "cuda", 0 for GPU, "cpu").
93
  batch_size (int): Batch size for encoding.
@@ -97,12 +92,16 @@ def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, v
97
  Encoder: Instance of the selected encoder based on the model_name.
98
 
99
  Raises:
100
- ValueError: If an unsupported model_name is provided.
101
  """
102
 
103
- # TODO: chnage this when changing the TF model
104
- if model_name == "use":
105
- return SBertEncoder("sentence-transformers/use-cmlm-multilingual", device, batch_size, verbose)
106
- # return USE()
107
- else:
108
- return SBertEncoder(model_name, device, batch_size, verbose)
 
 
 
 
 
25
  raise NotImplementedError("Method 'encode' must be implemented in subclass.")
26
 
27
 
 
 
 
 
 
 
 
 
28
  class SBertEncoder(Encoder):
29
  def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
30
  """
 
36
  batch_size (int): Batch size for encoding.
37
  verbose (bool): Whether to print verbose information during encoding.
38
  """
39
+ self.model = SentenceTransformer(model_name, trust_remote_code=True)
40
  self.device = device
41
  self.batch_size = batch_size
42
  self.verbose = verbose
 
76
 
77
  Args:
78
  model_name (str): Name of the model to instantiate
79
+ Options:
80
+ paraphrase-distilroberta-base-v1,
81
+ stsb-roberta-large,
82
+ sentence-transformers/use-cmlm-multilingual
83
+ Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by
84
+ SentenceTransformer.
85
+
86
  device (Union[str, int, List[Union[str, int]]): Device specification for the encoder
87
  (e.g., "cuda", 0 for GPU, "cpu").
88
  batch_size (int): Batch size for encoding.
 
92
  Encoder: Instance of the selected encoder based on the model_name.
93
 
94
  Raises:
95
+ EnvironmentError/RuntimeError: If an unsupported model_name is provided.
96
  """
97
 
98
+ try:
99
+ encoder = SBertEncoder(model_name, device, batch_size, verbose)
100
+ except EnvironmentError as err:
101
+ raise EnvironmentError(str(err)) from None
102
+ except Exception as err:
103
+ raise RuntimeError(str(err)) from None
104
+
105
+ return encoder
106
+
107
+
semf1.py CHANGED
@@ -62,9 +62,12 @@ Args:
62
  predictions (list): List of predictions. Format varies based on `tokenize_sentences` and `multi_references` flags.
63
  references (list): List of references. Format varies based on `tokenize_sentences` and `multi_references` flags.
64
  model_type (str): Model to use for encoding sentences. Options: ['pv1', 'stsb', 'use']
65
- pv1 - paraphrase-distilroberta-base-v1 (Default)
66
  stsb - stsb-roberta-large
67
- use - Universal Sentence Encoder
 
 
 
68
  tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
69
  multi_references (bool): Flag to indicate whether multiple references are provided. Default is False.
70
  gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
@@ -241,7 +244,7 @@ class SemF1(evaluate.Metric):
241
  _MODEL_TYPE_TO_NAME = {
242
  "pv1": "paraphrase-distilroberta-base-v1",
243
  "stsb": "stsb-roberta-large",
244
- "use": "use", # "sentence-transformers/use-cmlm-multilingual", # TODO: check PyTorch USE VS TF USE
245
  }
246
 
247
  def _info(self):
@@ -304,9 +307,7 @@ class SemF1(evaluate.Metric):
304
  model_type = "use"
305
 
306
  if model_type not in self._MODEL_TYPE_TO_NAME.keys():
307
- raise ValueError(f"Provide a correct model_type.\n"
308
- f"Options: {self._MODEL_TYPE_TO_NAME.keys()}\n"
309
- f"Currently provided: {model_type}")
310
 
311
  return self._MODEL_TYPE_TO_NAME[model_type]
312
 
@@ -335,9 +336,12 @@ class SemF1(evaluate.Metric):
335
  :param references
336
  :param model_type: Type of model to use for encoding.
337
  Options: [pv1, stsb, use]
338
- pv1 - paraphrase-distilroberta-base-v1 (Default)
339
  stsb - stsb-roberta-large
340
- use - Universal Sentence Encoder
 
 
 
341
  :param tokenize_sentences: Flag to sentence tokenize the document.
342
  :param multi_references: Flag to indicate multiple references.
343
  :param gpu: GPU device to use.
 
62
  predictions (list): List of predictions. Format varies based on `tokenize_sentences` and `multi_references` flags.
63
  references (list): List of references. Format varies based on `tokenize_sentences` and `multi_references` flags.
64
  model_type (str): Model to use for encoding sentences. Options: ['pv1', 'stsb', 'use']
65
+ pv1 - paraphrase-distilroberta-base-v1
66
  stsb - stsb-roberta-large
67
+ use - Universal Sentence Encoder (Default)
68
+ Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer such
69
+ as `all-mpnet-base-v2` or `roberta-base`
70
+
71
  tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
72
  multi_references (bool): Flag to indicate whether multiple references are provided. Default is False.
73
  gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
 
244
  _MODEL_TYPE_TO_NAME = {
245
  "pv1": "paraphrase-distilroberta-base-v1",
246
  "stsb": "stsb-roberta-large",
247
+ "use": "sentence-transformers/use-cmlm-multilingual",
248
  }
249
 
250
  def _info(self):
 
307
  model_type = "use"
308
 
309
  if model_type not in self._MODEL_TYPE_TO_NAME.keys():
310
+ return model_type
 
 
311
 
312
  return self._MODEL_TYPE_TO_NAME[model_type]
313
 
 
336
  :param references
337
  :param model_type: Type of model to use for encoding.
338
  Options: [pv1, stsb, use]
339
+ pv1 - paraphrase-distilroberta-base-v1
340
  stsb - stsb-roberta-large
341
+ use - Universal Sentence Encoder (Default)
342
+ Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by
343
+ SentenceTransformer.
344
+
345
  :param tokenize_sentences: Flag to sentence tokenize the document.
346
  :param multi_references: Flag to indicate multiple references.
347
  :param gpu: GPU device to use.
tests.py CHANGED
@@ -1,5 +1,6 @@
1
  import statistics
2
  import unittest
 
3
 
4
  import numpy as np
5
  import torch
@@ -153,32 +154,42 @@ class TestSBertEncoder(unittest.TestCase):
153
 
154
 
155
  class TestGetEncoder(unittest.TestCase):
156
- def test_get_sbert_encoder(self):
157
- model_name = "stsb-roberta-large"
158
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
159
- batch_size = 8
160
- verbose = False
161
 
162
- encoder = get_encoder(model_name, device, batch_size, verbose)
 
 
 
163
  self.assertIsInstance(encoder, SBertEncoder)
164
- self.assertEqual(encoder.device, device)
165
- self.assertEqual(encoder.batch_size, batch_size)
166
- self.assertEqual(encoder.verbose, verbose)
167
-
168
- def test_get_use_encoder(self):
169
- model_name = "use"
170
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
171
- batch_size = 8
172
- verbose = False
173
-
174
- encoder = get_encoder(model_name, device, batch_size, verbose)
175
- self.assertIsInstance(encoder, SBertEncoder) # SBertEncoder is returned for "use" for now
176
- # Uncomment below when implementing USE class
177
- # self.assertIsInstance(encoder, USE)
178
- # self.assertEqual(encoder.model_name, model_name)
179
- # self.assertEqual(encoder.device, device)
180
- # self.assertEqual(encoder.batch_size, batch_size)
181
- # self.assertEqual(encoder.verbose, verbose)
 
 
 
 
 
 
 
 
182
 
183
 
184
  class TestSemF1(unittest.TestCase):
 
1
  import statistics
2
  import unittest
3
+ from unittest.mock import patch, MagicMock
4
 
5
  import numpy as np
6
  import torch
 
154
 
155
 
156
  class TestGetEncoder(unittest.TestCase):
157
+ def setUp(self):
158
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
159
+ self.batch_size = 8
160
+ self.verbose = False
 
161
 
162
+ def _base_test(self, model_name):
163
+ encoder = get_encoder(model_name, self.device, self.batch_size, self.verbose)
164
+
165
+ # Assert
166
  self.assertIsInstance(encoder, SBertEncoder)
167
+ self.assertEqual(encoder.device, self.device)
168
+ self.assertEqual(encoder.batch_size, self.batch_size)
169
+ self.assertEqual(encoder.verbose, self.verbose)
170
+
171
+ def test_get_sbert_encoder(self):
172
+ model_name = "stsb-roberta-large"
173
+ self._base_test(model_name)
174
+
175
+ def test_sbert_model(self):
176
+ model_name = "all-mpnet-base-v2"
177
+ self._base_test(model_name)
178
+
179
+ def test_huggingface_model(self):
180
+ """Test Huggingface models which work with SBert library"""
181
+ model_name = "roberta-base"
182
+ self._base_test(model_name)
183
+
184
+ def test_get_encoder_environment_error(self): # This parameter is used when using patch decorator
185
+ model_name = "abc" # Wrong model_name
186
+ with self.assertRaises(EnvironmentError):
187
+ get_encoder(model_name, self.device, self.batch_size, self.verbose)
188
+
189
+ def test_get_encoder_other_exception(self):
190
+ model_name = "apple/OpenELM-270M" # This model is not supported by SentenceTransformer lib
191
+ with self.assertRaises(RuntimeError):
192
+ get_encoder(model_name, self.device, self.batch_size, self.verbose)
193
 
194
 
195
  class TestSemF1(unittest.TestCase):