aimlnerd committed on
Commit
6eb192a
·
1 Parent(s): f273844
data/raw_data/annotations/Letter 0-1-ccf1b225-ann.json CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ transformers[torch]==4.36.1
2
+ numpy==1.26.3
3
+ scikit-learn==1.3.2
4
+ matplotlib==3.8.2
5
+ datasets==2.16.1
6
+ evaluate==0.4.1
7
+ accelerate==0.25.0
8
+ seqeval==1.2.2
9
+ pandas==2.1.4
source/services/ner/train/train.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Token classification (PyTorch)
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter7/section2_pt.ipynb
8
+
9
+ # Token classification (PyTorch)
10
+
11
+ Install the Transformers, Datasets, and Evaluate libraries to run this notebook.
12
+ """
13
+
14
# The notebook installed its dependencies with IPython shell magics. Those are
# a SyntaxError in a plain Python module, so the commands are kept here as
# setup instructions to run in a shell beforehand:
#   pip install datasets evaluate transformers[sentencepiece]
#   pip install accelerate

"""You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."""

from huggingface_hub import notebook_login

notebook_login()
22
+
23
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")

# NOTE: the bare expressions in this section were notebook cell outputs; in a
# plain script they are evaluated and their value is discarded.
raw_datasets

raw_datasets["train"][0]["tokens"]

raw_datasets["train"][0]["ner_tags"]

ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

# Human-readable tag names, indexed by the integer ids stored in "ner_tags".
label_names = ner_feature.feature.names
label_names

# Print the first training sentence with each tag name aligned under its word.
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
word_cells = []
label_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    # Each column is as wide as the longer of the two strings, plus one space.
    width = max(len(word), len(full_label)) + 1
    word_cells.append(word.ljust(width))
    label_cells.append(full_label.ljust(width))
line1 = "".join(word_cells)
line2 = "".join(label_cells)

print(line1)
print(line2)
51
+
52
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Notebook cell output; evaluated and discarded in a script.
tokenizer.is_fast

# Tokenize the first training sentence, which is already split into words.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

inputs.word_ids()
63
+
64
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels into one label per token.

    Args:
        labels: Word-level label ids, indexed by word position.
        word_ids: For each token, the index of the word it belongs to, or
            ``None`` for a special token.

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` for
        special tokens, the word's label for the first token of a word, and
        for the following tokens of the same word the word's label with odd
        ids incremented by one.
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            current_word = None
            new_labels.append(-100)
        elif word_id != current_word:
            # Start of a new word!
            current_word = word_id
            new_labels.append(labels[word_id])
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX (this relies on
            # B-XXX tags having odd ids, each followed by its I-XXX tag)
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels
85
+
86
# Compare the word-level tags of the first sentence with their token-level
# expansion.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))
90
+
91
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: A batch with a "tokens" entry (word lists) and a "ner_tags"
            entry (the matching word-level label ids).

    Returns:
        The tokenizer output for the batch, with a "labels" entry holding the
        label sequence produced by ``align_labels_with_tokens`` for each
        example.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(i) maps every token of example i back to its source word.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs
103
+
104
# Tokenize every split, dropping the original columns so only the tokenizer
# output and the aligned labels remain.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Collate the first two training examples to inspect the batched labels.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])
119
+
120
# The notebook ran `!pip install seqeval` here; shell magics are a SyntaxError
# in a plain Python module, so install it from a shell beforehand:
#   pip install seqeval

import evaluate

metric = evaluate.load("seqeval")

# Tag names (strings) of the first training sentence.
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

# Sanity-check the metric with a copy of the references where one tag is
# replaced by "O".
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])
133
+
134
+ import numpy as np
135
+
136
+
137
def compute_metrics(eval_preds):
    """Compute overall precision, recall, F1 and accuracy for an evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over its last axis; positions whose label is
            ``-100`` are ignored.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the ``overall_*`` entries returned by ``metric.compute``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [
        [label_names[l] for l in label if l != -100] for label in labels
    ]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(
        predictions=true_predictions, references=true_labels
    )
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
154
+
155
# Mappings between integer label ids and tag names, passed to the model below.
id2label = dict(enumerate(label_names))
label2id = {v: k for k, v in id2label.items()}

from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Notebook cell output; evaluated and discarded in a script.
model.config.num_labels
167
+
168
from huggingface_hub import notebook_login

notebook_login()

from transformers import TrainingArguments

# Evaluate and save a checkpoint at the end of every epoch; push_to_hub=True
# enables uploading to the Hugging Face Hub.
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

trainer.push_to_hub(commit_message="Training complete")
198
+
199
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

# Reload the model from the base checkpoint for the custom training loop.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

from transformers import get_scheduler

# The step count is computed after accelerator.prepare() so that it uses the
# length of the prepared dataloader.
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
240
+
241
from huggingface_hub import Repository, get_full_repo_name

model_name = "bert-finetuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

# Local working copy of the Hub repository; checkpoints are saved here and
# pushed from the training loop.
output_dir = "bert-finetuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)
249
+
250
def postprocess(predictions, labels):
    """Convert predicted and reference label ids into lists of tag names.

    Args:
        predictions: Tensor of predicted label ids.
        labels: Tensor of reference label ids; positions equal to ``-100``
            are dropped from both outputs.

    Returns:
        A ``(true_labels, true_predictions)`` tuple, in that order, each a
        list of lists of tag names.
    """
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [
        [label_names[l] for l in label if l != -100] for label in labels
    ]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions
261
+
262
+ from tqdm.auto import tqdm
263
+ import torch
264
+
265
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess() returns (true_labels, true_predictions) in that order;
        # unpacking them the other way round would feed the references to the
        # metric as predictions and swap precision with recall.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # blocking=False is passed so the push does not hold up the next epoch.
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )
317
+
318
# Save the final model once every process has finished training.
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")