'teacher', (0, 13436) 'distily_tinyBenchmarks/logs/harness_benchmarks=tinyBenchmarks, learning_rate=0.0001, lr_scheduler_kwargs=__power___0.7___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8', (13824, 448) 'logis/teacher', (27648, 448) 'distily_tinyBenchmarks/logs/harness_benchmarks=tinyBenchmarks, learning_rate=0.0001, lr_scheduler_kwargs=__power___0.7___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8, warmup_ratio=0.1', (41472, 448) 'distily_tinyBenchmarks/logs/harness_benchmarks=tinyBenchmarks, learning_rate=5e-05, lr_scheduler_kwargs=__power___0.7___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8', (41984, 448) 'distily_tinyBenchmarks/logs/harness_benchmarks=tinyBenchmarks, learning_rate=5e-05, lr_scheduler_kwargs=__power___0.7___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8, warmup_ratio=0.1', (42496, 448) 'distily_tinyBenchmarks/logs/harness_benchmarks=tinyBenchmarks, learning_rate=0.0002, lr_scheduler_kwargs=__power___0.7___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8', (43008, 448) 'distily_tinyBenchmarks/logs/harness_benchmarks=tinyBenchmarks, learning_rate=0.0002, lr_scheduler_kwargs=__power___0.7___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8, warmup_ratio=0.1', (43520, 448)