d-matrix committed
Commit b97e015
1 Parent(s): 4fa9f3e

Adding initial files for perplexity metric

Files changed (4):
  1. README.md +27 -1
  2. app.py +6 -0
  3. perplexity.py +127 -0
  4. requirements.txt +5 -0
README.md CHANGED
@@ -8,6 +8,32 @@ sdk_version: 4.7.1
 app_file: app.py
 pinned: false
 license: apache-2.0
+tags:
+- evaluate
+- metric
+description: >-
+  Perplexity metric implemented by d-Matrix.
+  Perplexity (PPL) is one of the most common metrics for evaluating language models.
+  It is defined as the exponentiated average negative log-likelihood of a sequence, calculated with exponent base `e`.
+  For more information, see https://huggingface.co/docs/transformers/perplexity
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Metric Card for Perplexity
+
+
+## Metric Description
+
+Perplexity metric implemented by d-Matrix.
+Perplexity (PPL) is one of the most common metrics for evaluating language models.
+It is defined as the exponentiated average negative log-likelihood of a sequence, calculated with exponent base `e`.
+For more information, see https://huggingface.co/docs/transformers/perplexity
+
+## How to Use
+At minimum, this metric requires the model and text as inputs.
+```python
+>>> perplexity = evaluate.load("d-matrix/perplexity", module_type="metric")
+>>> input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]
+>>> results = perplexity.compute(model='distilgpt2', text=input_texts)
+>>> print(list(results.keys()))
+['loss', 'perplexity']
+```
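The optional `device` and `max_length` arguments documented in `perplexity.py` below can also be passed through `compute`. A minimal sketch, reusing the Space id and checkpoint from the README example (the chosen values are illustrative):

```python
import evaluate

# Load the metric from the d-Matrix Space, as in the README example.
perplexity = evaluate.load("d-matrix/perplexity", module_type="metric")

input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]

# `device` and `max_length` are the optional arguments documented in perplexity.py:
# force CPU execution and cap the sliding-window length at 512 tokens.
results = perplexity.compute(
    model="distilgpt2",
    text=input_texts,
    device="cpu",
    max_length=512,
)
print(results["loss"], results["perplexity"])
```

The returned dictionary carries the average negative log-likelihood and its exponentiated value, matching the `['loss', 'perplexity']` keys shown above.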
app.py ADDED
@@ -0,0 +1,6 @@
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+module = evaluate.load("d-matrix/perplexity")
+launch_gradio_widget(module)
perplexity.py ADDED
@@ -0,0 +1,127 @@
+import evaluate
+import datasets
+from typing import Union, Dict
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+from tqdm import tqdm
+
+_DESCRIPTION = """
+Perplexity metric implemented by d-Matrix.
+Perplexity (PPL) is one of the most common metrics for evaluating language models.
+It is defined as the exponentiated average negative log-likelihood of a sequence, calculated with exponent base `e`.
+For more information, see https://huggingface.co/docs/transformers/perplexity
+"""
+
+_KWARGS_DESCRIPTION = """
+Args:
+    model (Union[str, AutoModelForCausalLM]): model used for calculating perplexity.
+        NOTE: Perplexity can only be calculated for causal language models.
+        This includes models such as gpt2, causal variations of bert,
+        causal versions of t5, and more (the full list can be found
+        in the AutoModelForCausalLM documentation here:
+        https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )
+    text (list of str): input text, each separate text snippet is one list entry.
+    device (str): device to run on, defaults to 'cuda' when available.
+    max_length (int): maximum sequence length, defaults to 2048.
+Returns:
+    perplexity: dictionary containing the perplexity score and loss.
+Examples:
+    Example:
+        >>> from datasets import load_dataset
+        >>> perplexity = evaluate.load("dmx_perplexity", module_type="metric")
+        >>> input_texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:10]  # doctest: +SKIP
+        >>> results = perplexity.compute(model='distilgpt2',
+        ...                              text=input_texts)
+        >>> print(list(results.keys()))
+        ['loss', 'perplexity']
+        >>> print(results['loss'])  # doctest: +SKIP
+        3.8299286365509033
+        >>> print(results['perplexity'])  # doctest: +SKIP
+        46.05925369262695
+"""
+
+
+class DmxPerplexity(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            module_type="metric",
+            description=_DESCRIPTION,
+            citation="",
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "text": datasets.Value("string"),
+                }
+            ),
+            reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
+        )
+
+    def _compute(
+        self,
+        text,
+        model: Union[str, AutoModelForCausalLM],
+        device=None,
+        max_length=None,
+    ):
+        if device is not None:
+            assert device in [
+                "gpu",
+                "cpu",
+                "cuda",
+            ], "device should be either gpu, cpu or cuda."
+            if device == "gpu":
+                device = "cuda"
+        else:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        if isinstance(model, str):
+            tokenizer = AutoTokenizer.from_pretrained(model)
+            model = AutoModelForCausalLM.from_pretrained(model)
+
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path)
+
+        if max_length:
+            max_seq_len = max_length
+        elif hasattr(model.config, "max_position_embeddings"):
+            max_seq_len = model.config.max_position_embeddings
+        elif hasattr(model.config, "n_positions"):
+            max_seq_len = model.config.n_positions
+        else:
+            max_seq_len = 2048
+
+        model = model.to(device)
+        encodings = tokenizer("\n\n".join(text), return_tensors="pt")
+
+        stride = max_seq_len
+        seq_len = encodings.input_ids.size(1)
+
+        nlls = []
+        prev_end_loc = 0
+        for begin_loc in tqdm(range(0, seq_len, stride)):
+            end_loc = min(begin_loc + max_seq_len, seq_len)
+            trg_len = end_loc - prev_end_loc
+            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
+            target_ids = input_ids.clone()
+            target_ids[:, :-trg_len] = -100
+
+            with torch.no_grad():
+                outputs = model(input_ids, labels=target_ids)
+                if isinstance(outputs, Dict):
+                    neg_log_likelihood = outputs["loss"] * trg_len
+                else:
+                    neg_log_likelihood = outputs.loss * trg_len
+
+            nlls.append(neg_log_likelihood)
+
+            prev_end_loc = end_loc
+            if end_loc == seq_len:
+                break
+
+        loss = torch.stack(nlls).float().sum() / end_loc
+        ppl = torch.exp(loss)
+
+        return dict(
+            loss=loss.item(),
+            perplexity=ppl.item(),
+        )
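Since `model` is typed as `Union[str, AutoModelForCausalLM]`, a preloaded model object can be passed instead of a checkpoint name; in that branch the tokenizer is recovered from `model.config._name_or_path`. A minimal sketch, assuming the Space id from the README and an illustrative checkpoint:

```python
import evaluate
from transformers import AutoModelForCausalLM

# Load the metric and a causal LM up front (checkpoint name is illustrative).
perplexity = evaluate.load("d-matrix/perplexity", module_type="metric")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Passing the model object exercises the non-str branch of _compute:
# the tokenizer is re-created from model.config._name_or_path.
results = perplexity.compute(
    model=model,
    text=["The quick brown fox jumps over the lazy dog."],
)
print(results)  # {'loss': ..., 'perplexity': ...}
```

This avoids reloading the checkpoint on every call when the same model is evaluated repeatedly.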
requirements.txt ADDED
@@ -0,0 +1,5 @@
+evaluate
+transformers
+torch
+tqdm
+datasets