add custom training loop
source/services/ner/train/train.py
CHANGED
@@ -11,12 +11,24 @@ notebook_login()
 
 """
 import datasets
+import evaluate
+import numpy as np
+from transformers import Trainer, AutoModelForTokenClassification, TrainingArguments, DataCollatorForTokenClassification
 
 dataset = datasets.load_dataset("json", data_files="data/ner_input_data/ner_dataset.json")
 
 # Convert ner_tag list of string to sequence of classlabels as expected by hugging face for target var https://discuss.huggingface.co/t/sequence-features-class-label-cast/44638/3
 def get_label_list(labels):
-
+    """Create list of ner labels to create ClassLabel
+
+    Args:
+        labels (_type_): ner label column in the dataset
+
+    Returns:
+        _type_: unique NER labels
+        https://github.com/huggingface/transformers/blob/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification/run_ner.py#L320
+    """
+
     unique_labels = set()
     for label in labels:
         unique_labels = unique_labels | set(label)
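The hunk ends before get_label_list returns; in the run_ner.py script linked in the docstring, the unique labels are sorted and returned as a list, which then backs the ClassLabel cast described in the forum thread referenced above. A minimal sketch of that cast, assuming get_label_list returns the sorted list and using the dataset and column names from this diff:

from datasets import ClassLabel, Sequence

label_list = get_label_list(dataset["train"]["ner_tags"])
# Cast the string ner_tags column to a sequence of ClassLabels so labels are
# stored as class ids, as Hugging Face token-classification models expect
dataset = dataset.cast_column("ner_tags", Sequence(ClassLabel(names=label_list)))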
@@ -66,6 +78,17 @@ inputs.tokens()
 inputs.word_ids()
 
 def align_labels_with_tokens(labels, word_ids):
+    """Expand our label list to match the ##subtokens that tokenization adds.
+    Special tokens get a label of -100 (ignored in the loss function).
+    For tokens inside a word but not at its beginning, we replace the B- with I-.
+
+    Args:
+        labels (_type_): labels column
+        word_ids (_type_): word_ids
+
+    Returns:
+        _type_: new labels
+    """
     new_labels = []
     current_word = None
     for word_id in word_ids:
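The loop body falls outside this hunk. The docstring matches align_labels_with_tokens from the Hugging Face token-classification tutorial, whose standard body is sketched here; it assumes the tutorial's label scheme, where each B-X id (odd) is immediately followed by its I-X id (even):

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # First token of a new word, or a special token: take the word's label
            current_word = word_id
            new_labels.append(-100 if word_id is None else labels[word_id])
        elif word_id is None:
            # Special token ([CLS], [SEP], padding): ignored by the loss
            new_labels.append(-100)
        else:
            # Continuation subtoken of the same word: turn B-X into I-X
            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels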
@@ -93,6 +116,14 @@ print(labels)
 print(align_labels_with_tokens(labels, word_ids))
 
 def tokenize_and_align_labels(examples):
+    """Tokenize and handle ##subword tokens
+
+    Args:
+        examples (_type_): _description_
+
+    Returns:
+        _type_: _description_
+    """
     tokenized_inputs = tokenizer(
         examples["tokens"], truncation=True, is_split_into_words=True
     )
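The rest of the function is outside the hunk; the tutorial version, which this code appears to follow, builds the aligned labels per example (assuming a ner_tags column and a batched datasets.map call):

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        # word_ids(i) maps each subtoken of example i back to its source word
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs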
@@ -111,8 +142,6 @@ tokenized_datasets = raw_datasets.map(
     remove_columns=raw_datasets["train"].column_names,
 )
 
-from transformers import DataCollatorForTokenClassification
-
 data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
 
 batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
@@ -122,8 +151,6 @@ for i in range(2):
     print(tokenized_datasets["train"][i]["labels"])
 
 
-import evaluate
-
 metric = evaluate.load("seqeval")
 
 labels = raw_datasets["train"][0]["ner_tags"]
@@ -134,7 +161,7 @@ predictions = labels.copy()
 predictions[2] = "O"
 metric.compute(predictions=[predictions], references=[labels])
 
-
+
 
 
 def compute_metrics(eval_preds):
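compute_metrics itself falls outside the diff; the standard seqeval-based implementation from the tutorial, consistent with the numpy and evaluate imports added above, looks like this (label_names is assumed to hold the string labels, as in the id2label mapping below):

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Drop the -100 (ignored) positions and map ids back to label strings
    true_labels = [
        [label_names[l] for l in label if l != -100] for label in labels
    ]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }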
@@ -158,8 +185,8 @@ def compute_metrics(eval_preds):
 id2label = {i: label for i, label in enumerate(label_names)}
 label2id = {v: k for k, v in id2label.items()}
 
-from transformers import AutoModelForTokenClassification
 
+""" Uncomment to use the high-level Trainer from huggingface instead of the custom training loop
 model = AutoModelForTokenClassification.from_pretrained(
     model_checkpoint,
     id2label=id2label,
@@ -169,20 +196,16 @@ model = AutoModelForTokenClassification.from_pretrained(
 model.config.num_labels
 
 
-
-from transformers import TrainingArguments
-
 args = TrainingArguments(
     output_dir="source/services/ner/model/hf_tokenclassification/bert-finetuned-legalentity-ner",
     evaluation_strategy="epoch",
     save_strategy="epoch",
     learning_rate=2e-5,
-    num_train_epochs=
+    num_train_epochs=6,
     weight_decay=0.01,
     push_to_hub=True,
 )
 
-from transformers import Trainer
 
 trainer = Trainer(
     model=model,
@@ -196,7 +219,7 @@ trainer = Trainer(
 trainer.train()
 
 trainer.push_to_hub(commit_message="Training complete")
-
+"""
 from torch.utils.data import DataLoader
 
 train_dataloader = DataLoader(
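The custom training loop named in the commit message sits between this hunk and the next. A minimal Accelerate-style loop consistent with the DataLoader import here and the get_scheduler call below would look roughly like this; optimizer, lr_scheduler, num_train_epochs and eval_dataloader are assumed to be defined as in the tutorial:

from accelerate import Accelerator

accelerator = Accelerator()
# prepare() wraps the model, optimizer and dataloaders for the available device(s)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

for epoch in range(num_train_epochs):
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        accelerator.backward(outputs.loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()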
@@ -241,11 +264,11 @@ lr_scheduler = get_scheduler(
 
 from huggingface_hub import Repository, get_full_repo_name
 
-model_name = "bert-finetuned-ner-accelerate"
+model_name = "bert-finetuned-legalentity-ner-accelerate"
 repo_name = get_full_repo_name(model_name)
 repo_name
 
-output_dir = "bert-finetuned-ner-accelerate"
+output_dir = "source/services/ner/model/hf_tokenclassification/bert-finetuned-legalentity-ner-accelerate"
 repo = Repository(output_dir, clone_from=repo_name)
 
 def postprocess(predictions, labels):
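postprocess's body also falls outside the diff; in the tutorial it mirrors compute_metrics but returns label strings for seqeval's metric.add_batch during evaluation, roughly:

def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()
    # Strip the -100 (ignored) positions and map ids back to label strings
    true_labels = [
        [label_names[l] for l in label if l != -100] for label in labels
    ]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_predictions, true_labels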
@@ -327,4 +350,4 @@ model_checkpoint = "aimlnerd/bert-finetuned-legalentity-ner"
 token_classifier = pipeline(
     "token-classification", model=model_checkpoint, aggregation_strategy="simple"
 )
-token_classifier("My name is
+token_classifier("My name is James Bond and I work at MI6 in London.")
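With aggregation_strategy="simple", the transformers token-classification pipeline merges subword tokens into entity spans, so the call returns a list of dicts with entity_group, score, word, start and end keys; the entity_group values depend on the label names this model was fine-tuned with. For example:

result = token_classifier("My name is James Bond and I work at MI6 in London.")
for entity in result:
    # Each aggregated span carries the grouped label, confidence, surface text and character offsets
    print(entity["entity_group"], round(entity["score"], 3), entity["word"], entity["start"], entity["end"])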
|