tomaarsen HF staff committed on
Commit
9b77503
1 Parent(s): 8cf7e30

Upload train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train.py +87 -0
train.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import shutil
3
+ from datasets import load_dataset
4
+ from transformers import TrainingArguments
5
+ from span_marker import SpanMarkerModel, Trainer
6
+ from span_marker.model_card import SpanMarkerModelCardData
7
+ from huggingface_hub import upload_folder, upload_file
8
+
9
+
10
+ def main() -> None:
11
+ # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
12
+ dataset = load_dataset("DFKI-SLT/few-nerd", "supervised")
13
+ dataset = dataset.remove_columns("ner_tags")
14
+ dataset = dataset.rename_column("fine_ner_tags", "ner_tags")
15
+ labels = dataset["train"].features["ner_tags"].feature.names
16
+
17
+ # Initialize a SpanMarker model using a pretrained BERT-style encoder
18
+ encoder_id = "bert-base-multilingual-cased"
19
+ model_id = f"tomaarsen/span-marker-mbert-base-fewnerd-fine-super"
20
+ model = SpanMarkerModel.from_pretrained(
21
+ encoder_id,
22
+ labels=labels,
23
+ # SpanMarker hyperparameters:
24
+ model_max_length=256,
25
+ marker_max_length=128,
26
+ entity_max_length=8,
27
+ # Model card variables
28
+ model_card_data=SpanMarkerModelCardData(
29
+ model_id=model_id,
30
+ encoder_id=encoder_id,
31
+ dataset_name="FewNERD",
32
+ license="cc-by-sa-4.0",
33
+ language=["en", "multilingual"],
34
+ ),
35
+ )
36
+
37
+ # Prepare the 🤗 transformers training arguments
38
+ output_dir = Path("models") / model_id
39
+ args = TrainingArguments(
40
+ output_dir=output_dir,
41
+ run_name=model_id,
42
+ # Training Hyperparameters:
43
+ learning_rate=5e-5,
44
+ per_device_train_batch_size=16,
45
+ per_device_eval_batch_size=16,
46
+ num_train_epochs=3,
47
+ weight_decay=0.01,
48
+ warmup_ratio=0.1,
49
+ bf16=True, # Replace `bf16` with `fp16` if your hardware can't use bf16.
50
+ # Other Training parameters
51
+ logging_first_step=True,
52
+ logging_steps=50,
53
+ evaluation_strategy="steps",
54
+ save_strategy="steps",
55
+ eval_steps=3000,
56
+ save_total_limit=1,
57
+ dataloader_num_workers=4,
58
+ )
59
+
60
+ # Initialize the trainer using our model, training args & dataset, and train
61
+ trainer = Trainer(
62
+ model=model,
63
+ args=args,
64
+ train_dataset=dataset["train"],
65
+ eval_dataset=dataset["validation"],
66
+ )
67
+ trainer.train()
68
+
69
+ # Compute & save the metrics on the test set
70
+ metrics = trainer.evaluate(dataset["test"], metric_key_prefix="test")
71
+ trainer.save_metrics("test", metrics)
72
+
73
+ # Save the model & training script locally
74
+ trainer.save_model(output_dir / "checkpoint-final")
75
+ shutil.copy2(__file__, output_dir / "checkpoint-final" / "train.py")
76
+
77
+ # Upload everything to the Hub
78
+ breakpoint()
79
+ model.push_to_hub(model_id, private=True)
80
+ upload_folder(folder_path=output_dir / "runs", path_in_repo="runs", repo_id=model_id)
81
+ upload_file(path_or_fileobj=__file__, path_in_repo="train.py", repo_id=model_id)
82
+ upload_file(path_or_fileobj=output_dir / "all_results.json", path_in_repo="all_results.json", repo_id=model_id)
83
+ upload_file(path_or_fileobj=output_dir / "emissions.csv", path_in_repo="emissions.csv", repo_id=model_id)
84
+
85
+
86
+ if __name__ == "__main__":
87
+ main()