lvwerra HF staff committed on
Commit
b93343f
·
1 Parent(s): ad4eeb1

Update Space (evaluate main: e4a27243)

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -1
  2. text_duplicates.py +16 -4
requirements.txt CHANGED
@@ -1 +1 @@
1
- git+https://github.com/huggingface/evaluate.git@80448674f5447a9682afe051db243c4a13bfe4ff
 
1
+ git+https://github.com/huggingface/evaluate.git@e4a2724377909fe2aeb4357e3971e5a569673b39
text_duplicates.py CHANGED
@@ -14,6 +14,7 @@
14
 
15
  import hashlib
16
  from collections import Counter
 
17
 
18
  import datasets
19
 
@@ -57,18 +58,29 @@ def get_hash(example):
57
  return hashlib.md5(example.strip().encode("utf-8")).hexdigest()
58
 
59
 
 
 
 
 
 
 
 
 
60
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61
  class TextDuplicates(evaluate.Measurement):
62
  """This measurement returns the duplicate strings contained in the input(s)."""
63
 
64
- def _info(self):
65
- # TODO: Specifies the evaluate.MeasurementInfo object
 
 
66
  return evaluate.MeasurementInfo(
67
  # This is the description that will appear on the modules page.
68
  module_type="measurement",
69
  description=_DESCRIPTION,
70
  citation=_CITATION,
71
  inputs_description=_KWARGS_DESCRIPTION,
 
72
  # This defines the format of each prediction and reference
73
  features=datasets.Features(
74
  {
@@ -77,9 +89,9 @@ class TextDuplicates(evaluate.Measurement):
77
  ),
78
  )
79
 
80
- def _compute(self, data, list_duplicates=False):
81
  """Returns the duplicates contained in the input data and the number of times they are repeated."""
82
- if list_duplicates == True:
83
  logger.warning("This functionality can be memory-intensive for large datasets!")
84
  n_dedup = len(set([get_hash(d) for d in data]))
85
  c = Counter(data)
 
14
 
15
  import hashlib
16
  from collections import Counter
17
+ from dataclasses import dataclass
18
 
19
  import datasets
20
 
 
58
  return hashlib.md5(example.strip().encode("utf-8")).hexdigest()
59
 
60
 
61
+ @dataclass
62
+ class TextDuplicatesConfig(evaluate.info.Config):
63
+
64
+ name: str = "default"
65
+
66
+ list_duplicates: bool = False
67
+
68
+
69
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
70
  class TextDuplicates(evaluate.Measurement):
71
  """This measurement returns the duplicate strings contained in the input(s)."""
72
 
73
+ CONFIG_CLASS = TextDuplicatesConfig
74
+ ALLOWED_CONFIG_NAMES = ["default"]
75
+
76
+ def _info(self, config):
77
  return evaluate.MeasurementInfo(
78
  # This is the description that will appear on the modules page.
79
  module_type="measurement",
80
  description=_DESCRIPTION,
81
  citation=_CITATION,
82
  inputs_description=_KWARGS_DESCRIPTION,
83
+ config=config,
84
  # This defines the format of each prediction and reference
85
  features=datasets.Features(
86
  {
 
89
  ),
90
  )
91
 
92
+ def _compute(self, data):
93
  """Returns the duplicates contained in the input data and the number of times they are repeated."""
94
+ if self.config.list_duplicates == True:
95
  logger.warning("This functionality can be memory-intensive for large datasets!")
96
  n_dedup = len(set([get_hash(d) for d in data]))
97
  c = Counter(data)