daichira committed on
Commit b81eee6 · verified
1 Parent(s): f4b0c9b

Update README.md


Appended the full execution code (verified to reproduce) at the end of the README

Files changed (1)
  1. README.md +151 -0
README.md CHANGED
@@ -239,4 +239,155 @@ with open(f"{new_model_id}_output.jsonl", 'w', encoding='utf-8') as f:
 
 ---
 
+ ## Full Execution Code
+ ```python
+ # Install the required libraries
+ !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
+ !pip install --upgrade torch
+ !pip install --upgrade xformers
+ !pip install ipywidgets --upgrade
+
+ # Install Flash Attention 2 (only on GPUs with compute capability 8.0 or higher)
+ import torch
+ if torch.cuda.get_device_capability()[0] >= 8:
+     !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"
+
+ # Load the model and tokenizer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from unsloth import FastLanguageModel
+
+ # Model settings
+ max_seq_length = 1024
+ dtype = None  # auto-detect
+ load_in_4bit = True
+ model_id = "daichira/llm-jp-3-13b-finetune2"
+ new_model_id = "llm-jp-3-13b-itnew9"
+
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name=model_id,
+     dtype=dtype,
+     load_in_4bit=load_in_4bit,
+     trust_remote_code=True,
+ )
+
+ # Configure the model for SFT (attach LoRA adapters)
+ model = FastLanguageModel.get_peft_model(
+     model,
+     r=32,
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+     lora_alpha=32,
+     lora_dropout=0.05,
+     bias="none",
+     use_gradient_checkpointing="unsloth",
+     random_state=3407,
+     use_rslora=False,
+     loftq_config=None,
+     max_seq_length=max_seq_length,
+ )
+
+ # Hugging Face access token
+ HF_TOKEN = "your_token"
+
+ # Prepare the dataset: download it and split it into 30,000-row JSON chunks
+ !pip install datasets
+ import os
+ from datasets import load_dataset
+
+ dataset = load_dataset("DeL-TaiseiOzaki/Tengentoppa-sft-v1.0", split="train")
+ chunk_size = 30000
+ output_dir = "/content/tengentoppa_chunks"
+ os.makedirs(output_dir, exist_ok=True)
+
+ total_rows = len(dataset)
+ num_chunks = (total_rows + chunk_size - 1) // chunk_size
+
+ for i in range(num_chunks):
+     start_idx = i * chunk_size
+     end_idx = min(start_idx + chunk_size, total_rows)
+     chunk = dataset.select(range(start_idx, end_idx))
+     chunk_file = f"{output_dir}/tengentoppa_chunk_{i+1}.json"
+     chunk.to_json(chunk_file)
+     print(f"Saved chunk {i+1}/{num_chunks} to {chunk_file}")
+
+ print("All chunks have been saved!")
+
+ # Load one JSON chunk as the training dataset
+ json_path = "/content/tengentoppa_chunks/tengentoppa_chunk_3.json"
+ dataset = load_dataset("json", data_files=json_path)
+ print(dataset)
+
+ # Apply the prompt format (### 指示 = instruction, ### 回答 = answer)
+ prompt = """### 指示
+ {}
+ ### 回答
+ {}"""
+ EOS_TOKEN = tokenizer.eos_token
+
+ def formatting_prompts_func(examples):
+     input_text = examples["instruction"]
+     output_text = examples["output"]
+     return {"formatted_text": prompt.format(input_text, output_text) + EOS_TOKEN}
+
+ dataset = dataset.map(formatting_prompts_func, num_proc=4)
+
+ # Training configuration
+ from trl import SFTTrainer
+ from transformers import TrainingArguments
+ from unsloth import is_bfloat16_supported
+
+ trainer = SFTTrainer(
+     model=model,
+     tokenizer=tokenizer,
+     train_dataset=dataset["train"],
+     max_seq_length=max_seq_length,
+     dataset_text_field="formatted_text",
+     args=TrainingArguments(
+         per_device_train_batch_size=6,
+         gradient_accumulation_steps=4,
+         num_train_epochs=1,
+         logging_steps=50,
+         warmup_steps=500,
+         save_steps=500,
+         save_total_limit=2,
+         learning_rate=3e-4,
+         fp16=not is_bfloat16_supported(),
+         bf16=is_bfloat16_supported(),
+         group_by_length=True,
+         seed=3407,
+         output_dir="outputs",
+     ),
+ )
+
+ # Run training
+ torch.cuda.empty_cache()
+ trainer.train()
+
+ # Prepare for inference: load the evaluation tasks
+ import json
+ from tqdm import tqdm
+
+ with open("/content/elyza-tasks-100-TV_0.jsonl", "r") as f:
+     datasets = [json.loads(line) for line in f if line.strip().endswith("}")]
+
+ FastLanguageModel.for_inference(model)
+
+ results = []
+ for dt in tqdm(datasets):
+     input_text = dt["input"]
+     prompt = f"""### 指示\n{input_text}\n### 回答\n"""
+
+     inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
+     outputs = model.generate(**inputs, max_new_tokens=512, use_cache=True, do_sample=False, repetition_penalty=1.2)
+     # Keep only the text generated after "### 回答"
+     prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split('\n### 回答')[-1]
+
+     results.append({"task_id": dt["task_id"], "input": input_text, "output": prediction})
+
+ # Save the inference results
+ with open(f"{new_model_id}_output.jsonl", 'w', encoding='utf-8') as f:
+     for result in results:
+         json.dump(result, f, ensure_ascii=False)
+         f.write('\n')
+ ```
+
+
  ```
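
The listing above defines `HF_TOKEN` and `new_model_id` but stops after writing `{new_model_id}_output.jsonl`, so no upload step is shown. A minimal sketch of how the trained LoRA adapter and tokenizer could be published with those variables, assuming the standard `push_to_hub` API from transformers/PEFT and a hypothetical target repository name derived from `new_model_id`:

```python
# Hypothetical upload step (not shown in the listing): assumes the training
# script above has already run and that the target repo is "daichira/" + new_model_id.
repo_id = f"daichira/{new_model_id}"
model.push_to_hub(repo_id, token=HF_TOKEN, private=True)      # LoRA adapter weights
tokenizer.push_to_hub(repo_id, token=HF_TOKEN, private=True)  # tokenizer files
```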