# https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb#scrollTo=VNZZs-r6iKAV
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset
from transformers import BertForMaskedLM
from transformers import BertTokenizerFast, BertTokenizer
from transformers import BertConfig
import torch
import time

# Check that a GPU is available
print(torch.cuda.is_available())

# Start the timer
start_time = time.time()

# Define a config
config = BertConfig(
    vocab_size=25000,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=2,
)

tokenizer = BertTokenizerFast.from_pretrained(
    "latin_WP_tokenizer",
)

# initialize from config to train from scratch
model = BertForMaskedLM(config=config)
print(f"There are {model.num_parameters()} parameters")

full_corpus_file = "03_full_latin_corpus_for_training.txt"

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=full_corpus_file,
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# initialize trainer
output_dir = "./Latin_BERT_training_2"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    # num_train_epochs=3,  # like the original BERT
    num_train_epochs=1,  # just one epoch
    per_device_train_batch_size=64,
    save_steps=10000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# now train
trainer.train()
trainer.save_model("./latin_BERT_2")

# End the timer
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Print the elapsed time
print(f"Elapsed time: {elapsed_time} seconds")
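
# Optional follow-up (a minimal sketch, not part of the original script): load the
# model saved by trainer.save_model() above and run a quick fill-mask sanity check.
# Assumes training has finished and "./latin_BERT_2" exists; the Latin prompt below
# is only an illustrative example, any line containing the mask token would do.
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./latin_BERT_2",          # directory written by trainer.save_model()
    tokenizer="latin_WP_tokenizer",  # same tokenizer used during training
)

# Ask the model to fill the blank in a well-known Latin sentence
mask = fill_mask.tokenizer.mask_token
for prediction in fill_mask(f"Gallia est omnis divisa in partes {mask}"):
    print(prediction["token_str"], round(prediction["score"], 3))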