from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import load_dataset

# Define model and tokenizer names
model_name = "facebook/bart-base"
tokenizer_name = model_name

# Load pre-trained model and tokenizer (the tokenizer is needed before preprocessing)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Load dataset (cnn_dailymail requires a config name, e.g. "3.0.0")
train_dataset = load_dataset("cnn_dailymail", "3.0.0", split="train")
val_dataset = load_dataset("cnn_dailymail", "3.0.0", split="validation")

# Preprocess data (example) - define your cleaning and tokenization functions here
def preprocess_function(examples):
    # With batched=True, `examples` is a dict of lists, not a list of dicts
    inputs = examples["article"]
    targets = examples["highlights"]
    # Tokenize inputs, and tokenize targets separately as labels
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Preprocess train and validation data
train_data = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
val_data = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

# Pad each batch dynamically and mask label padding out of the loss
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./outputs",          # any desired output directory
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,              # Adjust number of epochs for training
    save_steps=10_000,
    evaluation_strategy="epoch",
    logging_steps=500,
    push_to_hub=True,                # Set to True for direct upload to the Hub during training (requires a Hub login)
)

# Define Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

# Model is now trained and uploaded to the Hub if push_to_hub was True
# For manual upload after training, use the Hub API (refer to Hugging Face documentation)
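
# Minimal sketch of the manual upload mentioned above, assuming you are already
# logged in (e.g. via `huggingface-cli login` or `huggingface_hub.login()`).
# The repo id "your-username/bart-base-cnn" is a hypothetical placeholder,
# not a repository defined anywhere in this script.
trainer.push_to_hub()  # pushes the final checkpoint, tokenizer, and model card to the repo configured by TrainingArguments

# Alternatively, push the individual pieces yourself to a repo of your choosing:
# model.push_to_hub("your-username/bart-base-cnn")
# tokenizer.push_to_hub("your-username/bart-base-cnn")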