# Fine-tune a Hugging Face seq2seq model and generate text with it.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
# Load tokenizer and model.
# BUG FIX: the tokenizer must come from the same checkpoint as the model.
# The original code paired the "gpt2-large" tokenizer (GPT-2 byte-level BPE
# vocabulary) with a BART-based model, so the token ids it produced did not
# correspond to the model's embedding table, corrupting both training and
# generation. Load both from one checkpoint instead.
checkpoint = "Kaludi/chatgpt-gpt4-prompts-bart-large-cnn-samsum"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# from_tf=True converts the checkpoint's TensorFlow weights to PyTorch on load.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, from_tf=True)
# Assuming you have your own dataset for fine-tuning
# Replace this with loading your dataset as needed
# For example, you can use the datasets library for loading datasets
# See previous responses for an example of how to use datasets
# Define data collator for sequence-to-sequence modeling.
# DataCollatorForSeq2Seq dynamically pads inputs and labels per batch and,
# because the model is passed in, also prepares decoder_input_ids from labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Define training arguments.
# NOTE(review): no num_train_epochs or learning_rate is set, so Trainer
# defaults apply (presumably 3 epochs, 5e-5) — confirm these suit the dataset.
training_args = Seq2SeqTrainingArguments(
output_dir="./gpt4-text-gen",  # checkpoints and the final model land here
overwrite_output_dir=True,  # clobber any previous run in the same directory
per_device_train_batch_size=4,
save_steps=10_000,  # write a checkpoint every 10k optimizer steps
save_total_limit=2,  # keep only the 2 most recent checkpoints on disk
)
# Create Seq2SeqTrainer.
# NOTE(review): `your_training_dataset` is a placeholder that is defined
# nowhere in this file — running the script as-is raises NameError here.
# A real tokenized dataset (e.g. loaded via the `datasets` library, as the
# comments above suggest) must be assigned to this name before this point.
# No eval_dataset is provided, so no evaluation runs during training.
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=your_training_dataset, # Replace with your training dataset
)
# Train the model (runs the full fine-tuning loop per training_args above).
trainer.train()
# Persist the fine-tuned weights and their tokenizer side by side, so the
# directory can later be reloaded in one step with from_pretrained().
save_dir = "./gpt4-text-gen"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
# Generate text using the fine-tuned model.
input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors="pt")
# BUG FIX: top_k and top_p only take effect when sampling is enabled; without
# do_sample=True, generate() runs greedy decoding and silently ignores them.
# do_sample=True activates the top-k / nucleus sampling these args configure.
# (Also removed a stray " |" paste artifact that made the final line a
# syntax error in the original.)
output = model.generate(
    input_ids,
    max_length=100,  # hard cap on generated sequence length (in tokens)
    num_return_sequences=1,
    no_repeat_ngram_size=2,  # forbid repeating any 2-gram in the output
    do_sample=True,
    top_k=50,
    top_p=0.95,
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text: ", generated_text)