model =gpt2 total batch size=40 train num epochs=10 fp16 =True max seq length =40 eval_acc = 0.6864357017028773 eval_loss = 2.5627794878823416