aimlnerd committed on
Commit
02c62ed
·
1 Parent(s): a30f2dc

add custom training loop

Browse files
Files changed (1) hide show
  1. source/services/ner/train/train.py +39 -16
source/services/ner/train/train.py CHANGED
@@ -11,12 +11,24 @@ notebook_login()
11
 
12
  """
13
  import datasets
 
 
 
14
 
15
  dataset = datasets.load_dataset("json", data_files="data/ner_input_data/ner_dataset.json")
16
 
17
  # Convert ner_tag list of string to sequence of classlabels as expected by hugging face for target var https://discuss.huggingface.co/t/sequence-features-class-label-cast/44638/3
18
  def get_label_list(labels):
19
- # copied from https://github.com/huggingface/transformers/blob/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification/run_ner.py#L320
 
 
 
 
 
 
 
 
 
20
  unique_labels = set()
21
  for label in labels:
22
  unique_labels = unique_labels | set(label)
@@ -66,6 +78,17 @@ inputs.tokens()
66
  inputs.word_ids()
67
 
68
  def align_labels_with_tokens(labels, word_ids):
 
 
 
 
 
 
 
 
 
 
 
69
  new_labels = []
70
  current_word = None
71
  for word_id in word_ids:
@@ -93,6 +116,14 @@ print(labels)
93
  print(align_labels_with_tokens(labels, word_ids))
94
 
95
  def tokenize_and_align_labels(examples):
 
 
 
 
 
 
 
 
96
  tokenized_inputs = tokenizer(
97
  examples["tokens"], truncation=True, is_split_into_words=True
98
  )
@@ -111,8 +142,6 @@ tokenized_datasets = raw_datasets.map(
111
  remove_columns=raw_datasets["train"].column_names,
112
  )
113
 
114
- from transformers import DataCollatorForTokenClassification
115
-
116
  data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
117
 
118
  batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
@@ -122,8 +151,6 @@ for i in range(2):
122
  print(tokenized_datasets["train"][i]["labels"])
123
 
124
 
125
- import evaluate
126
-
127
  metric = evaluate.load("seqeval")
128
 
129
  labels = raw_datasets["train"][0]["ner_tags"]
@@ -134,7 +161,7 @@ predictions = labels.copy()
134
  predictions[2] = "O"
135
  metric.compute(predictions=[predictions], references=[labels])
136
 
137
- import numpy as np
138
 
139
 
140
  def compute_metrics(eval_preds):
@@ -158,8 +185,8 @@ def compute_metrics(eval_preds):
158
  id2label = {i: label for i, label in enumerate(label_names)}
159
  label2id = {v: k for k, v in id2label.items()}
160
 
161
- from transformers import AutoModelForTokenClassification
162
 
 
163
  model = AutoModelForTokenClassification.from_pretrained(
164
  model_checkpoint,
165
  id2label=id2label,
@@ -169,20 +196,16 @@ model = AutoModelForTokenClassification.from_pretrained(
169
  model.config.num_labels
170
 
171
 
172
-
173
- from transformers import TrainingArguments
174
-
175
  args = TrainingArguments(
176
  output_dir="source/services/ner/model/hf_tokenclassification/bert-finetuned-legalentity-ner",
177
  evaluation_strategy="epoch",
178
  save_strategy="epoch",
179
  learning_rate=2e-5,
180
- num_train_epochs=3,
181
  weight_decay=0.01,
182
  push_to_hub=True,
183
  )
184
 
185
- from transformers import Trainer
186
 
187
  trainer = Trainer(
188
  model=model,
@@ -196,7 +219,7 @@ trainer = Trainer(
196
  trainer.train()
197
 
198
  trainer.push_to_hub(commit_message="Training complete")
199
-
200
  from torch.utils.data import DataLoader
201
 
202
  train_dataloader = DataLoader(
@@ -241,11 +264,11 @@ lr_scheduler = get_scheduler(
241
 
242
  from huggingface_hub import Repository, get_full_repo_name
243
 
244
- model_name = "bert-finetuned-ner-accelerate"
245
  repo_name = get_full_repo_name(model_name)
246
  repo_name
247
 
248
- output_dir = "bert-finetuned-ner-accelerate"
249
  repo = Repository(output_dir, clone_from=repo_name)
250
 
251
  def postprocess(predictions, labels):
@@ -327,4 +350,4 @@ model_checkpoint = "aimlnerd/bert-finetuned-legalentity-ner"
327
  token_classifier = pipeline(
328
  "token-classification", model=model_checkpoint, aggregation_strategy="simple"
329
  )
330
- token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
 
11
 
12
  """
13
  import datasets
14
+ import evaluate
15
+ import numpy as np
16
+ from transformers import Trainer, AutoModelForTokenClassification, TrainingArguments, DataCollatorForTokenClassification
17
 
18
  dataset = datasets.load_dataset("json", data_files="data/ner_input_data/ner_dataset.json")
19
 
20
  # Convert ner_tag list of string to sequence of classlabels as expected by hugging face for target var https://discuss.huggingface.co/t/sequence-features-class-label-cast/44638/3
21
  def get_label_list(labels):
22
+ """Create the list of unique NER labels used to build the ClassLabel
23
+
24
+ Args:
25
+ labels (_type_): ner label column in the dataset
26
+
27
+ Returns:
28
+ _type_: unique NER labels
29
+ https://github.com/huggingface/transformers/blob/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification/run_ner.py#L320
30
+ """
31
+
32
  unique_labels = set()
33
  for label in labels:
34
  unique_labels = unique_labels | set(label)
 
78
  inputs.word_ids()
79
 
80
  def align_labels_with_tokens(labels, word_ids):
81
+ """Expand our label list to match the ##subtokens produced by tokenization, because tokenization adds ##subtokens.
82
+ Special tokens get a label of -100 (ignored in the loss function).
83
+ For tokens inside a word but not at the beginning, we replace the B- with I-
84
+
85
+ Args:
86
+ labels (_type_): labels column
87
+ word_ids (_type_): word_ids
88
+
89
+ Returns:
90
+ _type_: new labels
91
+ """
92
  new_labels = []
93
  current_word = None
94
  for word_id in word_ids:
 
116
  print(align_labels_with_tokens(labels, word_ids))
117
 
118
  def tokenize_and_align_labels(examples):
119
+ """Tokenize and handle ##subword tokens
120
+
121
+ Args:
122
+ examples (_type_): _description_
123
+
124
+ Returns:
125
+ _type_: _description_
126
+ """
127
  tokenized_inputs = tokenizer(
128
  examples["tokens"], truncation=True, is_split_into_words=True
129
  )
 
142
  remove_columns=raw_datasets["train"].column_names,
143
  )
144
 
 
 
145
  data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
146
 
147
  batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
 
151
  print(tokenized_datasets["train"][i]["labels"])
152
 
153
 
 
 
154
  metric = evaluate.load("seqeval")
155
 
156
  labels = raw_datasets["train"][0]["ner_tags"]
 
161
  predictions[2] = "O"
162
  metric.compute(predictions=[predictions], references=[labels])
163
 
164
+
165
 
166
 
167
  def compute_metrics(eval_preds):
 
185
  id2label = {i: label for i, label in enumerate(label_names)}
186
  label2id = {v: k for k, v in id2label.items()}
187
 
 
188
 
189
+ """ Uncomment to use the high-level Trainer from Hugging Face instead of the custom training loop
190
  model = AutoModelForTokenClassification.from_pretrained(
191
  model_checkpoint,
192
  id2label=id2label,
 
196
  model.config.num_labels
197
 
198
 
 
 
 
199
  args = TrainingArguments(
200
  output_dir="source/services/ner/model/hf_tokenclassification/bert-finetuned-legalentity-ner",
201
  evaluation_strategy="epoch",
202
  save_strategy="epoch",
203
  learning_rate=2e-5,
204
+ num_train_epochs=6,
205
  weight_decay=0.01,
206
  push_to_hub=True,
207
  )
208
 
 
209
 
210
  trainer = Trainer(
211
  model=model,
 
219
  trainer.train()
220
 
221
  trainer.push_to_hub(commit_message="Training complete")
222
+ """
223
  from torch.utils.data import DataLoader
224
 
225
  train_dataloader = DataLoader(
 
264
 
265
  from huggingface_hub import Repository, get_full_repo_name
266
 
267
+ model_name = "bert-finetuned-legalentity-ner-accelerate"
268
  repo_name = get_full_repo_name(model_name)
269
  repo_name
270
 
271
+ output_dir = "source/services/ner/model/hf_tokenclassification/bert-finetuned-legalentity-ner-accelerate"
272
  repo = Repository(output_dir, clone_from=repo_name)
273
 
274
  def postprocess(predictions, labels):
 
350
  token_classifier = pipeline(
351
  "token-classification", model=model_checkpoint, aggregation_strategy="simple"
352
  )
353
+ token_classifier("My name is James Bond and I work at MI6 in London.")