{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n", "import torch\n", "DEVICE = torch.device(\"cuda:0\")\n", "\n", "model_name_or_path = \"sberbank-ai/rugpt3small_based_on_gpt2\"\n", "tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)\n", "model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(DEVICE)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "with open('anekdoty.txt', 'r', encoding='utf-8') as file:\n", " text = file.read()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/polyakovk/venv_linux/lib/python3.11/site-packages/transformers/data/datasets/language_modeling.py:53: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import TextDataset, DataCollatorForLanguageModeling\n", "\n", "# Сохраним обучающие данные в .txt файл \n", "train_path = 'train_dataset.txt'\n", "with open(train_path, \"w\") as f:\n", " f.write(text)\n", "\n", "# Создание датасета\n", "train_dataset = TextDataset(tokenizer=tokenizer,file_path=train_path,block_size=32)\n", " \n", "# Создание даталодера (нарезает текст на оптимальные по длине куски)\n", "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from transformers import Trainer, TrainingArguments\n", "\n", "training_args = TrainingArguments(\n", " output_dir=\"./finetuned\",\n", " overwrite_output_dir=True,\n", " num_train_epochs=30,\n", " per_device_train_batch_size=32,\n", " per_device_eval_batch_size=16,\n", " warmup_steps=10,\n", " gradient_accumulation_steps=32,\n", " )\n", "\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " data_collator=data_collator,\n", " train_dataset=train_dataset,\n", " optimizers = (torch.optim.AdamW(model.parameters(),lr=0.001),None)\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "Step | Training Loss\n", "---|---\n"
],
"text/plain": [
"