Update README.md

# Model Card for Model ID

This is a full-parameter fine-tuned model based on `llm-jp/llm-jp-3-13B`.
See the base model details [here](https://huggingface.co/llm-jp/llm-jp-3-13b).

It was built for the `elyza-tasks-100-TV` task, which Matsuo Lab created for a class.

## Model Details

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [Yuto-24](https://github.com/Yuto-24/)
- **Model type:** Text Generation
- **Language(s) (NLP):** Japanese, English
- **License:** CC-BY-4.0
- **Finetuned from model:** [llm-jp/llm-jp-3-13B](https://huggingface.co/llm-jp/llm-jp-3-13b)

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** coming soon...

## Uses
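
The requirements and the script below reproduce the evaluation run: each `elyza-tasks-100-TV` task is answered with retrieval-augmented few-shot prompting, using BGE-M3 embeddings to pull the most similar ELYZA-tasks example into the system prompt.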

```txt:requirements.txt
numpy
torch>=2.3.0
datasets
transformers>=4.40.1
accelerate>=0.29.3
flash-attn>=2.5.8
FlagEmbedding
```
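
The diff elides the setup that defines `model`, `target_texts`, and `target_embeds` used by `retrieve` in the script below. Here is a minimal sketch of what that setup plausibly looks like; the `BAAI/bge-m3` checkpoint name and the use of `elyza/ELYZA-tasks-100` as the retrieval pool are assumptions inferred from the surrounding code, not taken from the diff:

~~~python
# Hypothetical reconstruction of the elided setup -- not part of the original script.
from datasets import load_dataset
from FlagEmbedding import BGEM3FlagModel

# BGEM3FlagModel.encode() returns a dict containing "dense_vecs",
# matching how `model` is used in retrieve() below.
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

# Retrieval pool: ELYZA-tasks-100 inputs, used later as few-shot examples.
elyza_tasks_datasets = load_dataset("elyza/ELYZA-tasks-100")
target_texts = list(elyza_tasks_datasets["test"]["input"])
target_embeds = model.encode(target_texts)["dense_vecs"]
~~~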

~~~python
import torch
import numpy as np

from datasets import Dataset, load_dataset
from FlagEmbedding import BGEM3FlagModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    # ... (additional imports elided in the diff) ...
)

# ... (model / target-embedding setup elided in the diff; see the sketch above) ...


def retrieve(input_text):
    input_texts = [input_text]
    input_embeds = model.encode(input_texts)["dense_vecs"]

    # similarity between the query and the pre-encoded task inputs
    similarity = input_embeds @ target_embeds.T
    most_similar_text = target_texts[np.argmax(similarity)]

    target_index = target_texts.index(most_similar_text)
    return target_index
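
# Illustrative call (hypothetical): map a new task input to the index of the
# most similar ELYZA-tasks-100 example.
# nearest_index = retrieve(task_input_text)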


class CallLLM:
    def __init__(self, model_name_or_path: str) -> None:
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            # ... (remaining arguments, and the rest of the class, elided in the diff) ...


# ... (intermediate code elided in the diff) ...
model_path_or_id = "Yuto-24/llm-jp-3-13B-Tengentoppa_magpie"  # from the diff's hunk context
llm = CallLLM(model_path_or_id)

# System prompt, kept in Japanese as used with the model (roughly: rules the
# assistant must follow, its role, instructions, and a few-shot examples section).
SYSTEM_PROMPT = """
# あなたが必ず従うべき事項

## 役割

あなたは誠実で優秀なアシスタントです。
質問に対し、簡潔に答えます。
ハルシネーションをしません。
必ず正しい情報のみを答えます。

## 指示

- 評価観点に沿った出力を作成します。
- ユーザから特別な指示が与えられている場合には、必ず従います。
- 具体例には評価観点が含まれていますが、あなたが考える「出力」のみを回答してください。
- 評価観点は、人間があなたの出力を評価するために利用します。
- 論理的にステップバイステップで考えてください。

## 具体例

```markdown
{examples}
```
""".strip()


# Template for one retrieved few-shot example:
# 入力 (input) / 評価観点 (evaluation criteria) / 出力 (output).
EXAMPLE_TEMPLATE = """
### 入力

{dataset_input}

### 評価観点

{dataset_eval_aspect}

### 出力

{dataset_answer}
""".strip()


# Load the task data.
# In the omnicampus dev environment, drag and drop the task jsonl into the
# left-hand pane before running.
import os
import json

datasets = []
with open(f"{os.path.dirname(os.path.abspath('__file__'))}/workspace/elyza-tasks-100-TV_0.jsonl", "r") as f:
    item = ""
    for line in f:
        line = line.strip()
        # ... (rest of the jsonl parsing loop elided in the diff) ...

# ... (elided in the diff: the generation call inside
#      `for data in tqdm(datasets, smoothing=0.0):` and its arguments) ...
        # stream=True,
    ).strip()
    # print("-----------------------------------------------------------------------------------------------------------------------------------")
    print(output.strip())
    print("===================================================================================================================================")
    # Strip everything up to the final "### 出力" / "**出力**:" ("Output") marker,
    # keeping only the model's answer section.
    print(re.sub(r"^[\s\S]*?### 出力", "", re.sub(r"^[\s\S]*?\*\*出力\*\*:", "", output)).strip())
    print("-----------------------------------------------------------------------------------------------------------------------------------")

    results.append({
        "task_id": data["task_id"],
        # (one line elided in the diff; the DataFrame code below reads result["input"])
        "output_org": output.strip(),
        "output": re.sub(r"^[\s\S]*?### 出力", "", output).strip(),
        "elyza_tasks_id": dataset_index,
        "dataset_input": elyza_tasks_datasets["test"]["input"][dataset_index],
        "dataset_eval_aspect": elyza_tasks_datasets["test"]["eval_aspect"][dataset_index],
        "dataset_answer": elyza_tasks_datasets["test"]["output"][dataset_index],
    })

# `results` now holds the answer for every task.

from pprint import pprint

import pandas as pd

# maximum number of columns to display
pd.set_option("display.max_columns", 0)
# maximum number of rows to display
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 550)


json4df = {
    "task_id": [],
    "input": [],
    "output": [],
    "output_org": [],
    # "elyza_tasks_id": [],
    # "dataset_input": [],
    # "dataset_eval_aspect": [],
    # "dataset_answer": [],
}

for result in results:
    json4df["task_id"].append(result["task_id"])
    json4df["input"].append(result["input"])
    json4df["output_org"].append(result["output_org"])
    json4df["output"].append(result["output"])

JSON_FILE_NAME = "llm-jp-3-13B-Tengentoppa-FPFT-magpie-FPFT-elyza-RAG_v2"

result4out = results.copy()
results  # (notebook-style display)


# This code keeps `input` and `eval_aspect` as well, but they are optional;
# only `task_id` and `output` are required.

import re
import sys
from os.path import dirname, abspath, join, isfile


result4out = results.copy()


WD = dirname(abspath("__file__"))
json_dir = join(
    WD,
    "..",
    "jsonl",
)


if JSON_FILE_NAME != "":
    file_path = join(json_dir, f"{JSON_FILE_NAME}.jsonl")
else:
    # merged_model_path is defined in an elided part of the script
    jsonl_id = re.sub(".*/", "", merged_model_path)
    file_path = join(json_dir, f"{jsonl_id}-outputs.jsonl")

assert not isfile(file_path), f"Error: File `{file_path}` already exists."

with open(file_path, "w", encoding="utf-8") as f:
    for result in result4out:
        # Drop the debugging fields before writing the submission file.
        result = {k: v for k, v in result.items()
                  if k not in ("elyza_tasks_id", "dataset_input", "dataset_eval_aspect", "dataset_answer")}
        json.dump(
            result, f, ensure_ascii=False
        )  # ensure_ascii=False for handling non-ASCII characters
        f.write("\n")
~~~
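
The writer above drops the `elyza_tasks_id` and `dataset_*` debugging fields from each record, so every line of the resulting `.jsonl` holds the `task_id` and `output` the evaluation requires (plus the optional `input` and `output_org`).
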
### Downstream Use [optional]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

- [DeL-TaiseiOzaki/Tengentoppa-sft-v1.0](https://huggingface.co/datasets/DeL-TaiseiOzaki/Tengentoppa-sft-v1.0)
- [llm-jp/magpie-sft-v1.0](https://huggingface.co/datasets/llm-jp/magpie-sft-v1.0)
- [ntotsuka123/clean3-ultraboros-20k-ja-filter](https://huggingface.co/datasets/ntotsuka123/clean3-ultraboros-20k-ja-filter)
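
The axolotl configs below read these corpora from local `data/general/*.jsonl` files. A sketch of producing one such export; only the target path comes from the configs, the conversion itself is an assumption:

~~~python
# Assumed export step (not shown in the card): materialize a Hub dataset as the
# local jsonl file referenced by the axolotl configs below.
from datasets import load_dataset

ds = load_dataset("llm-jp/magpie-sft-v1.0", split="train")
ds.to_json("data/general/magpie-sft-v1.0.jsonl", force_ascii=False)
~~~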

### Training Procedure

Training was run with axolotl, using the two YAML configs below (first stage, then second stage).

```yaml: For the first training
base_model: llm-jp/llm-jp-3-13b
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: false
strict: false

# domain_yyyymmdd
output_dir: outputs/matsuo/llm-jp/3/13B/FPFT_20241213

chat_template: chatml
default_system_message: あなたは、大塚商会の誠実で優秀なアシスタントです。

shuffle_merged_datasets: true
datasets:
  # # General
  # - path: data/general/magpie-sft-v1.0.jsonl
  #   ds_type: json
  #   type: chat_template
  #   chat_template: chatml
  #   field_messages: conversations
  #   message_field_role: role
  #   message_field_content: content
  #   roles:
  #     user:
  #       - user
  #     assistant:
  #       - assistant
  #     system:
  #       - system
  - path: data/general/Tengentoppa-sft-v1.0.jsonl
    ds_type: json
    type: alpaca
  # - path: data/general/clean3-ultraboros-20k-ja-filter_train.jsonl
  #   ds_type: json
  #   type: chat_template
  #   # chat_template: chatml
  #   field_messages: conversations
  #   message_field_role: role
  #   message_field_content: value
  #   roles:
  #     user:
  #       - human
  #     assistant:
  #       - gpt
  #     system:
  #       - system
  #   train_on_eos: turn

val_set_size: 0.05

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true

# warmup_steps: 100
warmup_ratio: 0.1
evals_per_epoch: 1
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed: deepspeed_configs/zero3.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  eos_token: <|im_end|>
```
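
With a standard axolotl install, each stage can be launched with something like `accelerate launch -m axolotl.cli.train <config>.yaml` (the config filename is arbitrary). The `deepspeed: deepspeed_configs/zero3.json` line means the run expects multi-GPU ZeRO-3 sharding, the usual setup for full-parameter training at 13B scale.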

```yaml: For the second training
base_model: outputs/matsuo/llm-jp/3/13B/FPFT_20241213
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: false
strict: false

# domain_yyyymmdd
output_dir: outputs/matsuo/llm-jp/3/13B/FPFT_20241215

chat_template: chatml
default_system_message: あなたは、大塚商会の誠実で優秀なアシスタントです。

shuffle_merged_datasets: true
datasets:
  - path: data/general/magpie-sft-v1.0.jsonl
    ds_type: json
    type: chat_template
    chat_template: chatml
    field_messages: conversations
    message_field_role: role
    message_field_content: content
    roles:
      user:
        - user
      assistant:
        - assistant
      system:
        - system
  # - path: data/general/Tengentoppa-sft-v1.0.jsonl
  #   ds_type: json
  #   type: alpaca
  - path: data/general/clean3-ultraboros-20k-ja-filter_train.jsonl
    ds_type: json
    type: chat_template
    chat_template: chatml
    field_messages: conversations
    message_field_role: role
    message_field_content: value
    roles:
      user:
        - human
      assistant:
        - gpt
      system:
        - system
    ## NOTE: Leaving the below empty will default to using the simple legacy tokenization strategy where only last message is trained on.
    # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
    roles_to_train: ["gpt", "assistant"]
    # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
    # - all: train on all EOS tokens
    # - turn: train on the EOS token at the end of each trainable turn
    # - last: train on the last EOS token in the conversation
    train_on_eos: last

val_set_size: 0.05

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true

# warmup_steps: 100
warmup_ratio: 0.1
evals_per_epoch: 1
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed: deepspeed_configs/zero3.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  eos_token: <|im_end|>
```
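
The second config chains off the first: its `base_model` is the first stage's `output_dir`, so stage one runs full-parameter SFT on Tengentoppa and stage two continues from that checkpoint on the magpie and ultraboros chat-format corpora with otherwise identical hyperparameters.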

## Evaluation

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

## Model Card Contact

[More Information Needed]