import gradio as gr
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Load the IMDB movie-review dataset (binary sentiment labels).
imdb = load_dataset("imdb")

# Tokenize the reviews with the DistilBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_imdb = imdb.map(preprocess_function, batched=True)

# Dynamically pad each batch to the length of its longest example.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# DistilBERT with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=0.01,  # a tiny fraction of an epoch, to keep the demo run short
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
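
# A minimal sketch of how the fine-tuned model could then be served with Gradio
# (this is an illustrative assumption, not part of the training script above):
# the `predict` helper and the LABEL_0/LABEL_1-to-sentiment mapping are made up
# for this example, since the model was trained without custom label names.
from transformers import pipeline

sentiment = pipeline("text-classification", model=model, tokenizer=tokenizer)

def predict(text):
    # The pipeline returns e.g. [{"label": "LABEL_1", "score": 0.98}].
    result = sentiment(text)[0]
    label = "positive" if result["label"] == "LABEL_1" else "negative"
    return f"{label} ({result['score']:.2f})"

demo = gr.Interface(fn=predict, inputs="text", outputs="text")
demo.launch()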