Spaces:

evaluate-measurement
/

text_duplicates

Running

lvwerra HF staff committed on Sep 22, 2022

Commit

b93343f

1 Parent(s): ad4eeb1

Update Space (evaluate main: e4a27243)

Files changed (2) hide show

requirements.txt CHANGED Viewed

	@@ -1 +1 @@
1	- git+https://github.com/huggingface/evaluate.git@80448674f5447a9682afe051db243c4a13bfe4ff


1	+ git+https://github.com/huggingface/evaluate.git@e4a2724377909fe2aeb4357e3971e5a569673b39

text_duplicates.py CHANGED Viewed

@@ -14,6 +14,7 @@
 import hashlib
 from collections import Counter
 import datasets
@@ -57,18 +58,29 @@ def get_hash(example):
     return hashlib.md5(example.strip().encode("utf-8")).hexdigest()
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class TextDuplicates(evaluate.Measurement):
     """This measurement returns the duplicate strings contained in the input(s)."""
-    def _info(self):
-        # TODO: Specifies the evaluate.MeasurementInfo object
         return evaluate.MeasurementInfo(
             # This is the description that will appear on the modules page.
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features(
                 {
@@ -77,9 +89,9 @@ class TextDuplicates(evaluate.Measurement):
             ),
         )
-    def _compute(self, data, list_duplicates=False):
         """Returns the duplicates contained in the input data and the number of times they are repeated."""
-        if list_duplicates == True:
             logger.warning("This functionality can be memory-intensive for large datasets!")
             n_dedup = len(set([get_hash(d) for d in data]))
             c = Counter(data)

 import hashlib
 from collections import Counter
+from dataclasses import dataclass
 import datasets
     return hashlib.md5(example.strip().encode("utf-8")).hexdigest()
+@dataclass
+class TextDuplicatesConfig(evaluate.info.Config):
+    name: str = "default"
+    list_duplicates: bool = False
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class TextDuplicates(evaluate.Measurement):
     """This measurement returns the duplicate strings contained in the input(s)."""
+    CONFIG_CLASS = TextDuplicatesConfig
+    ALLOWED_CONFIG_NAMES = ["default"]
+    def _info(self, config):
         return evaluate.MeasurementInfo(
             # This is the description that will appear on the modules page.
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
+            config=config,
             # This defines the format of each prediction and reference
             features=datasets.Features(
                 {
             ),
         )
+    def _compute(self, data):
         """Returns the duplicates contained in the input data and the number of times they are repeated."""
+        if self.config.list_duplicates == True:
             logger.warning("This functionality can be memory-intensive for large datasets!")
             n_dedup = len(set([get_hash(d) for d in data]))
             c = Counter(data)