dh-mc committed on
Commit 3860729 · 1 Parent(s): fd78d87

initial code for Chinese/English translation

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +18 -0
  2. .gitignore +152 -0
  3. datasets/mac/mac-test.tsv +3 -0
  4. datasets/mac/mac-train.tsv +3 -0
  5. datasets/mac/mac.tsv +3 -0
  6. eval_modules/calc_repetitions.py +79 -0
  7. llama-factory/config/llama3_8b_lora_sft.yaml +46 -0
  8. llama-factory/config/qwen2_0.5b_lora_sft.yaml +42 -0
  9. llama-factory/config/qwen2_0.5b_lora_sft_unsloth.yaml +45 -0
  10. llama-factory/config/qwen2_1.5b_lora_sft.yaml +42 -0
  11. llama-factory/config/qwen2_1.5b_lora_sft_unsloth.yaml +45 -0
  12. llama-factory/config/qwen2_7b_lora_sft.yaml +45 -0
  13. llama-factory/config/qwen2_7b_lora_sft_unsloth.yaml +45 -0
  14. llama-factory/data/alpaca_mac.json +3 -0
  15. llama-factory/data/dataset_info.json +3 -0
  16. llama-factory/inference/qwen2_1.5b_lora_sft.yaml +4 -0
  17. llm_toolkit/chat.py +88 -0
  18. llm_toolkit/eval.py +67 -0
  19. llm_toolkit/eval_lf.py +110 -0
  20. llm_toolkit/llm_utils.py +165 -0
  21. llm_toolkit/translation_engine.py +130 -0
  22. llm_toolkit/translation_utils.py +420 -0
  23. llm_toolkit/tune.py +143 -0
  24. notebooks/00_Data_Analysis.ipynb +0 -0
  25. notebooks/01_Qwen2-0.5B_Unsloth.ipynb +0 -0
  26. notebooks/02_Qwen2-1.5B_Unsloth.ipynb +0 -0
  27. notebooks/03_Qwen2-0.5B_1.5B-4bit.ipynb +0 -0
  28. notebooks/04_tune-small-no-flash-attn.ipynb +0 -0
  29. notebooks/05_tune-small-with-flash-attn.ipynb +0 -0
  30. notebooks/06_tune-small-py3.11.ipynb +0 -0
  31. notebooks/07_tune-lf-py3.11.ipynb +0 -0
  32. notebooks/07r2_tune-lf-py3.11.ipynb +0 -0
  33. notebooks/08_eval-lf-py3.11.ipynb +0 -0
  34. requirements.txt +15 -1
  35. results/experiment-1-results.csv +3 -0
  36. results/experiment-2-results.csv +3 -0
  37. results/experiment-3-results.csv +3 -0
  38. results/mac-results-no-flash-attn.csv +3 -0
  39. results/mac-results-with-flash-attn.csv +3 -0
  40. results/mac-results.csv +3 -0
  41. results/mac-results_final.csv +3 -0
  42. results/mac-results_lf-r2.csv +3 -0
  43. results/mac-results_lf-r3.csv +3 -0
  44. results/mac-results_lf.csv +3 -0
  45. results/mac-results_py3.11.csv +3 -0
  46. results/mac-results_v3.csv +3 -0
  47. results/model_training_evaluation_times.csv +3 -0
  48. scripts/lf-api.sh +8 -0
  49. scripts/tune-large.sh +24 -0
  50. scripts/tune-lf.sh +9 -0
.gitattributes CHANGED
@@ -33,3 +33,21 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ results/mac-results.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_lf.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_lf-r3.csv filter=lfs diff=lfs merge=lfs -text
+ results/experiment-1-results.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results-no-flash-attn.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_lf-r2.csv filter=lfs diff=lfs merge=lfs -text
+ results/model_training_evaluation_times.csv filter=lfs diff=lfs merge=lfs -text
+ results/experiment-3-results.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_final.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_py3.11.csv filter=lfs diff=lfs merge=lfs -text
+ results/experiment-2-results.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results-with-flash-attn.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_v3.csv filter=lfs diff=lfs merge=lfs -text
+ llama-factory/data/alpaca_mac.json filter=lfs diff=lfs merge=lfs -text
+ llama-factory/data/dataset_info.json filter=lfs diff=lfs merge=lfs -text
+ datasets/mac/mac-test.tsv filter=lfs diff=lfs merge=lfs -text
+ datasets/mac/mac-train.tsv filter=lfs diff=lfs merge=lfs -text
+ datasets/mac/mac.tsv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,152 @@
+ *.out
+ *.log
+ */outputs/
+ */models/
+ */wandb/
+ */cs605-nlp-assignment-2*/
+ */augmented_data/
+ */inflaton/
+ */llama.cpp/
+ wandb/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ # *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # JetBrains
+ .idea
+
+ *.db
+
+ .DS_Store
+ /outputs
+ /models
+ /llama.cpp
+ /llama-factory/saves
+ /llama-factory/saves-1
datasets/mac/mac-test.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5663c7521eaf9942a9fea40b2950a46e37b761b22cc698eb6fe6b57bf70d0c4
+ size 253194
datasets/mac/mac-train.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:424f0adcb2727eec322acef12294f4efb10412fc0b0529887d28dddc5171af05
+ size 1031685
datasets/mac/mac.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93f3ab2ba07b67b0a3f9ff05291c1b6748851999cda050bc165f8dd259daa2aa
+ size 1289106
eval_modules/calc_repetitions.py ADDED
@@ -0,0 +1,79 @@
+ import os
+ import re
+ import math
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import matplotlib.ticker as mtick
+ import seaborn as sns
+ import nltk
+ import evaluate
+
+ meteor = evaluate.load("meteor")
+
+ print(f"loading: {__file__}")
+
+ # final version
+ pattern_excessive_whitespaces = re.compile(r"\s{5,}")
+ pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
+
+
+ def del_excessive_whitespaces(text, debug=False):
+     count = 0
+
+     if isinstance(text, str):
+         if debug:
+             print("----detect excessive whitespaces----")
+         count = len(text)
+         text = pattern_excessive_whitespaces.sub("", text)
+         count -= len(text)
+         if debug and count:
+             print(f"removed excessive whitespaces: {count}")
+     return text, count
+
+
+ # final version for repetition detection
+ def detect_text_repetitions(text, debug=False):
+     count = 0
+
+     if isinstance(text, str):
+         if debug:
+             print("----detect text repetitions----")
+         matches = pattern_text_repetitions.finditer(text)
+         for match in matches:
+             if debug:
+                 print(match)
+                 for groupNum in range(0, len(match.groups())):
+                     groupNum = groupNum + 1
+                     print(
+                         "Group {groupNum} found at {start}-{end}: `{group}`".format(
+                             groupNum=groupNum,
+                             start=match.start(groupNum),
+                             end=match.end(groupNum),
+                             group=match.group(groupNum),
+                         )
+                     )
+
+             start, end = match.span()
+             count += end - start
+
+     return count
+
+
+ def detect_repetitions(text, debug=False):
+     text, count_excessive_whitespaces = del_excessive_whitespaces(text, debug=debug)
+     count_text_repetitions = detect_text_repetitions(text, debug=debug)
+     total_repetitions = count_excessive_whitespaces + count_text_repetitions
+
+     result = (count_excessive_whitespaces, count_text_repetitions, total_repetitions)
+
+     if debug:
+         print(result)
+     return result
+
+
+ def detect_scores(text, debug=False):
+     newline_score, repetition_score, total_repetitions = detect_repetitions(
+         text, debug=debug
+     )
+     return pd.Series([newline_score, repetition_score, total_repetitions])
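
For reference, a minimal usage sketch of the repetition counters above (not part of the commit; the sample string is made up, and importing the module calls evaluate.load("meteor"), which may download metric data on first use):

# Hypothetical example; assumes the repo root is on PYTHONPATH and dependencies are installed.
from eval_modules.calc_repetitions import detect_repetitions

sample = "The cat sat on the mat. The cat sat on the mat. The cat sat on the mat."
whitespaces, repetitions, total = detect_repetitions(sample, debug=True)
print(whitespaces, repetitions, total)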
llama-factory/config/llama3_8b_lora_sft.yaml ADDED
@@ -0,0 +1,46 @@
+ ### model
+ model_name_or_path: gradientai/Llama-3-8B-Instruct-Gradient-1048k
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ # use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: llama3
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ # output_dir: saves/llama3-8b/lora/sft
+ output_dir: /Workspace/Users/donghao.huang@mastercard.com/lf-saves/llama3-8b/lora/sft/
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+ # resume_from_checkpoint: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: none
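
As a quick sanity check, configs like the one above can be inspected before launching a run; this mirrors the YAML dump done in scripts/tune-lf.sh at the end of this commit. A sketch, assuming PyYAML is installed and the path matches your checkout:

# Pretty-print a LLaMA-Factory config before training with it (illustrative only).
import json
import yaml

filename = "llama-factory/config/llama3_8b_lora_sft.yaml"
with open(filename) as f:
    config = yaml.safe_load(f)
print(f"{filename}:\n", json.dumps(config, indent=2))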
llama-factory/config/qwen2_0.5b_lora_sft.yaml ADDED
@@ -0,0 +1,42 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-0.5B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-0.5b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_0.5b_lora_sft # optional
llama-factory/config/qwen2_0.5b_lora_sft_unsloth.yaml ADDED
@@ -0,0 +1,45 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-0.5B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-0.5b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_0.5b_lora_sft # optional
llama-factory/config/qwen2_1.5b_lora_sft.yaml ADDED
@@ -0,0 +1,42 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-1.5b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_1.5b_lora_sft # optional
llama-factory/config/qwen2_1.5b_lora_sft_unsloth.yaml ADDED
@@ -0,0 +1,45 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-1.5b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_1.5b_lora_sft # optional
llama-factory/config/qwen2_7b_lora_sft.yaml ADDED
@@ -0,0 +1,45 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-7B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ # use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-7b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_7b_lora_sft # optional
llama-factory/config/qwen2_7b_lora_sft_unsloth.yaml ADDED
@@ -0,0 +1,45 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-7B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-7b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_7b_lora_sft # optional
llama-factory/data/alpaca_mac.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f03e62eb461c2204bbaef55f2de28ec115b1a5834b81f03b10f157551d5fe9f
+ size 2240344
llama-factory/data/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84bce610296ed7e729647e85d25576b6226d20ddf0bca4982fb1deb02de35911
+ size 13560
llama-factory/inference/qwen2_1.5b_lora_sft.yaml ADDED
@@ -0,0 +1,4 @@
+ model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+ adapter_name_or_path: saves/qwen2-1.5b/lora/sft/checkpoint-1680
+ template: chatml
+ finetuning_type: lora
llm_toolkit/chat.py ADDED
@@ -0,0 +1,88 @@
+ import os
+ import sys
+ from llamafactory.chat import ChatModel
+ from llamafactory.extras.misc import torch_gc
+
+ from dotenv import find_dotenv, load_dotenv
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from llm_toolkit.translation_engine import *
+ from llm_toolkit.translation_utils import *
+
+ model_name = os.getenv("MODEL_NAME")
+ load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
+ eval_base_model = os.getenv("EVAL_BASE_MODEL") == "true"
+ eval_fine_tuned = os.getenv("EVAL_FINE_TUNED") == "true"
+ save_fine_tuned_model = os.getenv("SAVE_FINE_TUNED") == "true"
+ num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS") or 0)
+ data_path = os.getenv("DATA_PATH")
+ results_path = os.getenv("RESULTS_PATH")
+
+ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+ dtype = (
+     None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+ )
+
+ print(
+     model_name,
+     load_in_4bit,
+     max_seq_length,
+     num_train_epochs,
+     dtype,
+     data_path,
+     results_path,
+     eval_base_model,
+     eval_fine_tuned,
+     save_fine_tuned_model,
+ )
+
+ adapter_name_or_path = (
+     sys.argv[1]
+     if len(sys.argv) > 1
+     else "llama-factory/saves/qwen2-0.5b/lora/sft/checkpoint-560"
+ )
+
+ args = dict(
+     model_name_or_path=model_name, # use bnb-4bit-quantized Llama-3-8B-Instruct model
+     adapter_name_or_path=adapter_name_or_path, # load the saved LoRA adapters
+     template="chatml", # same to the one in training
+     finetuning_type="lora", # same to the one in training
+     quantization_bit=4, # load 4-bit quantized model
+ )
+ chat_model = ChatModel(args)
+
+ messages = []
+ print(
+     "Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application."
+ )
+ while True:
+     query = input("\nUser: ")
+     if query.strip() == "exit":
+         break
+     if query.strip() == "clear":
+         messages = []
+         torch_gc()
+         print("History has been removed.")
+         continue
+
+     messages.append({"role": "user", "content": query})
+     print("Assistant: ", end="", flush=True)
+
+     response = ""
+     for new_text in chat_model.stream_chat(messages):
+         print(new_text, end="", flush=True)
+         response += new_text
+     print()
+     messages.append({"role": "assistant", "content": response})
+
+ torch_gc()
llm_toolkit/eval.py ADDED
@@ -0,0 +1,67 @@
+ import os
+ import sys
+ import torch
+ from dotenv import find_dotenv, load_dotenv
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from llm_toolkit.translation_engine import *
+ from llm_toolkit.translation_utils import *
+
+ model_name = os.getenv("MODEL_NAME")
+ adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
+ load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
+ data_path = os.getenv("DATA_PATH")
+ results_path = os.getenv("RESULTS_PATH")
+
+ print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ model, tokenizer = load_model(
+     model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
+ )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ datasets = load_translation_dataset(data_path, tokenizer)
+
+ print("Evaluating model: " + model_name)
+ predictions = eval_model(model, tokenizer, datasets["test"])
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ if adapter_name_or_path is not None:
+     model_name += "_" + adapter_name_or_path.split("/")[-1]
+
+ save_results(
+     model_name,
+     results_path,
+     datasets["test"],
+     predictions,
+     debug=True,
+ )
+
+ metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
+ print(metrics)
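
eval.py (like eval_lf.py and tune.py below) reads its settings from a .env / .env.example file that is not part of this diff. The sketch below shows plausible values for illustration only; the variable names come from the scripts, but the values are assumptions and should be adjusted to your own setup:

# Hypothetical environment for llm_toolkit/eval.py (values are examples, not from the commit).
import os

os.environ.setdefault("MODEL_NAME", "Qwen/Qwen2-0.5B-Instruct")
os.environ.setdefault("ADAPTER_NAME_OR_PATH", "llama-factory/saves/qwen2-0.5b/lora/sft/checkpoint-560")
os.environ.setdefault("LOAD_IN_4BIT", "true")
os.environ.setdefault("DATA_PATH", "datasets/mac/mac.tsv")
os.environ.setdefault("RESULTS_PATH", "results/mac-results.csv")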
llm_toolkit/eval_lf.py ADDED
@@ -0,0 +1,110 @@
+ import os
+ import sys
+ import torch
+ from dotenv import find_dotenv, load_dotenv
+ from llamafactory.chat import ChatModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from llm_toolkit.translation_utils import *
+
+ model_name = os.getenv("MODEL_NAME")
+ adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
+ load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
+ data_path = os.getenv("DATA_PATH")
+ results_path = os.getenv("RESULTS_PATH")
+
+ print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
+
+
+ def load_model(
+     model_name,
+     max_seq_length=2048,
+     dtype=torch.bfloat16,
+     load_in_4bit=False,
+     adapter_name_or_path=None,
+ ):
+     print(f"loading model: {model_name}")
+
+     if adapter_name_or_path:
+         template = "llama3" if "llama-3" in model_name.lower() else "chatml"
+
+         args = dict(
+             model_name_or_path=model_name,
+             adapter_name_or_path=adapter_name_or_path, # load the saved LoRA adapters
+             template=template, # same to the one in training
+             finetuning_type="lora", # same to the one in training
+             quantization_bit=4 if load_in_4bit else None, # load 4-bit quantized model
+         )
+         chat_model = ChatModel(args)
+         return chat_model.engine.model, chat_model.engine.tokenizer
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=load_in_4bit,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_use_double_quant=False,
+         bnb_4bit_compute_dtype=dtype,
+     )
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         quantization_config=bnb_config,
+         torch_dtype=dtype,
+         trust_remote_code=True,
+         device_map="auto",
+     )
+
+     return model, tokenizer
+
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ model, tokenizer = load_model(
+     model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
+ )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ datasets = load_translation_dataset(data_path, tokenizer)
+
+ print("Evaluating model: " + model_name)
+ predictions = eval_model(model, tokenizer, datasets["test"])
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ if adapter_name_or_path is not None:
+     model_name += "_" + adapter_name_or_path.split("/")[-1]
+
+ save_results(
+     model_name,
+     results_path,
+     datasets["test"],
+     predictions,
+     debug=True,
+ )
+
+ metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
+ print(metrics)
llm_toolkit/llm_utils.py ADDED
@@ -0,0 +1,165 @@
+ import os
+ import re
+ import sys
+ import torch
+ from llamafactory.chat import ChatModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
+
+
+ def load_model(
+     model_name,
+     max_seq_length=2048,
+     dtype=torch.bfloat16,
+     load_in_4bit=False,
+     adapter_name_or_path=None,
+ ):
+     print(f"loading model: {model_name}")
+
+     if adapter_name_or_path:
+         template = "llama3" if "llama-3" in model_name.lower() else "chatml"
+
+         args = dict(
+             model_name_or_path=model_name,
+             adapter_name_or_path=adapter_name_or_path, # load the saved LoRA adapters
+             template=template, # same to the one in training
+             finetuning_type="lora", # same to the one in training
+             quantization_bit=4 if load_in_4bit else None, # load 4-bit quantized model
+         )
+         chat_model = ChatModel(args)
+         return chat_model.engine.model, chat_model.engine.tokenizer
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=load_in_4bit,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_use_double_quant=False,
+         bnb_4bit_compute_dtype=dtype,
+     )
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         quantization_config=bnb_config,
+         torch_dtype=dtype,
+         trust_remote_code=True,
+         device_map="auto",
+     ) if load_in_4bit else AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype=dtype,
+         trust_remote_code=True,
+         device_map="auto",
+     )
+
+     return model, tokenizer
+
+ def test_model(model, tokenizer, prompt):
+     inputs = tokenizer(
+         [prompt],
+         return_tensors="pt",
+     ).to("cuda")
+
+     text_streamer = TextStreamer(tokenizer)
+
+     _ = model.generate(
+         **inputs, max_new_tokens=2048, streamer=text_streamer, use_cache=True
+     )
+
+
+ def extract_answer(text, debug=False):
+     if text:
+         # Remove the begin and end tokens
+         text = re.sub(
+             r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
+         )
+         if debug:
+             print("--------\nstep 1:", text)
+
+         text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
+         if debug:
+             print("--------\nstep 2:", text)
+
+         text = re.sub(
+             r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
+         )
+         if debug:
+             print("--------\nstep 3:", text)
+
+     return text
+
+ def eval_model(model, tokenizer, eval_dataset):
+     total = len(eval_dataset)
+     predictions = []
+     for i in tqdm(range(total)):
+         inputs = tokenizer(
+             eval_dataset["prompt"][i : i + 1],
+             return_tensors="pt",
+         ).to("cuda")
+
+         outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
+         decoded_output = tokenizer.batch_decode(outputs)
+         debug = i == 0
+         decoded_output = [
+             extract_answer(output, debug=debug) for output in decoded_output
+         ]
+         predictions.extend(decoded_output)
+
+     return predictions
+
+ def save_model(
+     model,
+     tokenizer,
+     include_gguf=True,
+     include_merged=True,
+     publish=True,
+ ):
+     try:
+         token = os.getenv("HF_TOKEN") or None
+         model_name = os.getenv("MODEL_NAME")
+
+         save_method = "lora"
+         quantization_method = "q5_k_m"
+
+         model_names = get_model_names(
+             model_name, save_method=save_method, quantization_method=quantization_method
+         )
+
+         model.save_pretrained(model_names["local"])
+         tokenizer.save_pretrained(model_names["local"])
+
+         if publish:
+             model.push_to_hub(
+                 model_names["hub"],
+                 token=token,
+             )
+             tokenizer.push_to_hub(
+                 model_names["hub"],
+                 token=token,
+             )
+
+         if include_merged:
+             model.save_pretrained_merged(
+                 model_names["local"] + "-merged", tokenizer, save_method=save_method
+             )
+             if publish:
+                 model.push_to_hub_merged(
+                     model_names["hub"] + "-merged",
+                     tokenizer,
+                     save_method="lora",
+                     token="",
+                 )
+
+         if include_gguf:
+             model.save_pretrained_gguf(
+                 model_names["local-gguf"],
+                 tokenizer,
+                 quantization_method=quantization_method,
+             )
+
+             if publish:
+                 model.push_to_hub_gguf(
+                     model_names["hub-gguf"],
+                     tokenizer,
+                     quantization_method=quantization_method,
+                     token=token,
+                 )
+     except Exception as e:
+         print(e)
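
A small, hedged example of the extract_answer helper above. The raw string is invented for illustration, and importing llm_utils pulls in llama-factory and transformers, so the full environment must be installed:

# Illustrative only: strip chat-template tokens from a generated completion.
from llm_toolkit.llm_utils import extract_answer

raw = "<|start_header_id|>assistant<|end_header_id|>\n\nThe weather is nice today.<|eot_id|>"
print(extract_answer(raw, debug=True))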
llm_toolkit/translation_engine.py ADDED
@@ -0,0 +1,130 @@
+ import os
+ import pandas as pd
+ import torch
+ from unsloth import FastLanguageModel, is_bfloat16_supported
+ from trl import SFTTrainer
+ from transformers import TrainingArguments, TextStreamer
+ from llm_toolkit.translation_utils import *
+ from llamafactory.chat import ChatModel
+
+ print(f"loading {__file__}")
+
+
+ def get_model_names(
+     model_name, save_method="merged_4bit_forced", quantization_method="q5_k_m"
+ ):
+     hub_model = model_name.split("/")[-1] + "-MAC-"
+     local_model = "models/" + hub_model
+
+     return {
+         "local": local_model + save_method,
+         "local-gguf": local_model + quantization_method,
+         "hub": hub_model + save_method,
+         "hub-gguf": hub_model + "gguf-" + quantization_method,
+     }
+
+
+ def load_model(
+     model_name,
+     max_seq_length=2048,
+     dtype=None,
+     load_in_4bit=False,
+     template="chatml",
+     adapter_name_or_path=None,
+ ):
+     print(f"loading model: {model_name}")
+
+     if adapter_name_or_path:
+         args = dict(
+             model_name_or_path=model_name,
+             adapter_name_or_path=adapter_name_or_path, # load the saved LoRA adapters
+             template=template, # same to the one in training
+             finetuning_type="lora", # same to the one in training
+             quantization_bit=4, # load 4-bit quantized model
+         )
+         chat_model = ChatModel(args)
+         return chat_model.engine.model, chat_model.engine.tokenizer
+
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name=model_name, # YOUR MODEL YOU USED FOR TRAINING
+         max_seq_length=max_seq_length,
+         dtype=dtype,
+         load_in_4bit=load_in_4bit,
+         trust_remote_code=True,
+     )
+     FastLanguageModel.for_inference(model)
+
+     return model, tokenizer
+
+
+ def test_model(model, tokenizer, prompt):
+     inputs = tokenizer(
+         [prompt],
+         return_tensors="pt",
+     ).to("cuda")
+
+     text_streamer = TextStreamer(tokenizer)
+
+     _ = model.generate(
+         **inputs, max_new_tokens=128, streamer=text_streamer, use_cache=True
+     )
+
+
+ def load_trainer(
+     model,
+     tokenizer,
+     dataset,
+     num_train_epochs,
+     max_seq_length=2048,
+     fp16=False,
+     bf16=False,
+     output_dir="./outputs",
+ ):
+     model = FastLanguageModel.get_peft_model(
+         model,
+         r=16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+         target_modules=[
+             "q_proj",
+             "k_proj",
+             "v_proj",
+             "o_proj",
+             "gate_proj",
+             "up_proj",
+             "down_proj",
+         ],
+         lora_alpha=16,
+         lora_dropout=0, # Supports any, but = 0 is optimized
+         bias="none", # Supports any, but = "none" is optimized
+         # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+         use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
+         random_state=3407,
+         use_rslora=False, # We support rank stabilized LoRA
+         loftq_config=None, # And LoftQ
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         tokenizer=tokenizer,
+         train_dataset=dataset,
+         dataset_text_field="text",
+         max_seq_length=max_seq_length,
+         dataset_num_proc=2,
+         packing=False, # Can make training 5x faster for short sequences.
+         args=TrainingArguments(
+             per_device_train_batch_size=2,
+             gradient_accumulation_steps=4,
+             warmup_steps=5,
+             num_train_epochs=num_train_epochs,
+             learning_rate=2e-4,
+             fp16=not is_bfloat16_supported(),
+             bf16=is_bfloat16_supported(),
+             logging_steps=100,
+             optim="adamw_8bit",
+             weight_decay=0.01,
+             lr_scheduler_type="linear",
+             seed=3407,
+             output_dir=output_dir,
+         ),
+     )
+
+     return trainer
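
For reference, a sketch of how the naming helper above behaves (the logic is pure string manipulation, but importing translation_engine also imports unsloth and llama-factory, so this assumes the full training environment; the model id is just an example):

# Hypothetical usage of get_model_names.
from llm_toolkit.translation_engine import get_model_names

names = get_model_names("Qwen/Qwen2-0.5B-Instruct")
# Expected keys: "local", "local-gguf", "hub", "hub-gguf",
# e.g. names["local"] == "models/Qwen2-0.5B-Instruct-MAC-merged_4bit_forced"
print(names)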
llm_toolkit/translation_utils.py ADDED
@@ -0,0 +1,420 @@
+ import os
+ import re
+ import pandas as pd
+ import evaluate
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from datasets import load_dataset
+ from langchain_openai import ChatOpenAI
+ from langchain_core.prompts import ChatPromptTemplate
+ from tqdm import tqdm
+
+ print(f"loading {__file__}")
+
+ bleu = evaluate.load("bleu")
+ rouge = evaluate.load("rouge")
+ meteor = evaluate.load("meteor")
+ accuracy = evaluate.load("accuracy")
+
+
+ def extract_answer(text, debug=False):
+     if text:
+         # Remove the begin and end tokens
+         text = re.sub(
+             r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
+         )
+         if debug:
+             print("--------\nstep 1:", text)
+
+         text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
+         if debug:
+             print("--------\nstep 2:", text)
+
+         text = re.sub(
+             r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
+         )
+         if debug:
+             print("--------\nstep 3:", text)
+
+     return text
+
+
+ def calc_metrics(references, predictions, debug=False):
+     assert len(references) == len(
+         predictions
+     ), f"lengths are difference: {len(references)} != {len(predictions)}"
+
+     predictions = [extract_answer(text) for text in predictions]
+
+     correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]
+     accuracy = sum(correct) / len(references)
+
+     results = {"accuracy": accuracy}
+     if debug:
+         correct_ids = [i for i, c in enumerate(correct) if c == 1]
+         results["correct_ids"] = correct_ids
+
+     results["meteor"] = meteor.compute(predictions=predictions, references=references)[
+         "meteor"
+     ]
+
+     results["bleu_scores"] = bleu.compute(
+         predictions=predictions, references=references, max_order=4
+     )
+     results["rouge_scores"] = rouge.compute(
+         predictions=predictions, references=references
+     )
+     return results
+
+
+ def save_results(model_name, results_path, dataset, predictions, debug=False):
+     if not os.path.exists(results_path):
+         # Get the directory part of the file path
+         dir_path = os.path.dirname(results_path)
+
+         # Create all directories in the path (if they don't exist)
+         os.makedirs(dir_path, exist_ok=True)
+         df = dataset.to_pandas()
+         df.drop(columns=["text", "prompt"], inplace=True)
+     else:
+         df = pd.read_csv(results_path, on_bad_lines="warn")
+
+     df[model_name] = predictions
+
+     if debug:
+         print(df.head(1))
+
+     df.to_csv(results_path, index=False)
+
+
+ def load_translation_dataset(data_path, tokenizer=None):
+     train_data_file = data_path.replace(".tsv", "-train.tsv")
+     test_data_file = data_path.replace(".tsv", "-test.tsv")
+
+     if not os.path.exists(train_data_file):
+         print("generating train/test data files")
+         dataset = load_dataset(
+             "csv", data_files=data_path, delimiter="\t", split="train"
+         )
+         print(len(dataset))
+         dataset = dataset.filter(lambda x: x["chinese"] and x["english"])
+
+         datasets = dataset.train_test_split(test_size=0.2)
+         print(len(dataset))
+
+         # Convert to pandas DataFrame
+         train_df = pd.DataFrame(datasets["train"])
+         test_df = pd.DataFrame(datasets["test"])
+
+         # Save to TSV
+         train_df.to_csv(train_data_file, sep="\t", index=False)
+         test_df.to_csv(test_data_file, sep="\t", index=False)
+
+     print("loading train/test data files")
+     datasets = load_dataset(
+         "csv",
+         data_files={"train": train_data_file, "test": test_data_file},
+         delimiter="\t",
+     )
+
+     if tokenizer:
+         translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"
+
+         def formatting_prompts_func(examples):
+             inputs = examples["chinese"]
+             outputs = examples["english"]
+
+             messages = [
+                 {
+                     "role": "system",
+                     "content": "You are an expert in translating Chinese to English.",
+                 },
+                 None,
+             ]
+
+             model_name = os.getenv("MODEL_NAME")
+
+             if "mistral" in model_name.lower():
+                 messages = messages[1:]
+
+             texts = []
+             prompts = []
+             for input, output in zip(inputs, outputs):
+                 prompt = translation_prompt.format(input)
+                 messages[-1] = {"role": "user", "content": prompt}
+
+                 prompt = tokenizer.apply_chat_template(
+                     messages, tokenize=False, add_generation_prompt=True
+                 )
+                 prompts.append(prompt)
+                 texts.append(prompt + output + tokenizer.eos_token)
+             return {"text": texts, "prompt": prompts}
+
+         datasets = datasets.map(
+             formatting_prompts_func,
+             batched=True,
+         )
+
+     print(datasets)
+     return datasets
+
+
+ def eval_model(model, tokenizer, eval_dataset):
+     total = len(eval_dataset)
+     predictions = []
+     for i in tqdm(range(total)):
+         inputs = tokenizer(
+             eval_dataset["prompt"][i : i + 1],
+             return_tensors="pt",
+         ).to("cuda")
+
+         outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
+         decoded_output = tokenizer.batch_decode(outputs)
+         debug = i == 0
+         decoded_output = [
+             extract_answer(output, debug=debug) for output in decoded_output
+         ]
+         predictions.extend(decoded_output)
+
+     return predictions
+
+
+ def save_model(
+     model,
+     tokenizer,
+     include_gguf=True,
+     include_merged=True,
+     publish=True,
+ ):
+     try:
+         token = os.getenv("HF_TOKEN") or None
+         model_name = os.getenv("MODEL_NAME")
+
+         save_method = "lora"
+         quantization_method = "q5_k_m"
+
+         model_names = get_model_names(
+             model_name, save_method=save_method, quantization_method=quantization_method
+         )
+
+         model.save_pretrained(model_names["local"])
+         tokenizer.save_pretrained(model_names["local"])
+
+         if publish:
+             model.push_to_hub(
+                 model_names["hub"],
+                 token=token,
+             )
+             tokenizer.push_to_hub(
+                 model_names["hub"],
+                 token=token,
+             )
+
+         if include_merged:
+             model.save_pretrained_merged(
+                 model_names["local"] + "-merged", tokenizer, save_method=save_method
+             )
+             if publish:
+                 model.push_to_hub_merged(
+                     model_names["hub"] + "-merged",
+                     tokenizer,
+                     save_method="lora",
+                     token="",
+                 )
+
+         if include_gguf:
+             model.save_pretrained_gguf(
+                 model_names["local-gguf"],
+                 tokenizer,
+                 quantization_method=quantization_method,
+             )
+
+             if publish:
+                 model.push_to_hub_gguf(
+                     model_names["hub-gguf"],
+                     tokenizer,
+                     quantization_method=quantization_method,
+                     token=token,
+                 )
+     except Exception as e:
+         print(e)
+
+
+ def get_metrics(df):
+     metrics_df = pd.DataFrame(df.columns.T)[2:]
+     metrics_df.rename(columns={0: "model"}, inplace=True)
+     metrics_df["model"] = metrics_df["model"].apply(lambda x: x.split("/")[-1])
+     metrics_df.reset_index(inplace=True)
+     metrics_df = metrics_df.drop(columns=["index"])
+
+     accuracy = []
+     meteor = []
+     bleu_1 = []
+     rouge_l = []
+     all_metrics = []
+     for col in df.columns[2:]:
+         metrics = calc_metrics(df["english"], df[col], debug=True)
+         print(f"{col}: {metrics}")
+
+         accuracy.append(metrics["accuracy"])
+         meteor.append(metrics["meteor"])
+         bleu_1.append(metrics["bleu_scores"]["bleu"])
+         rouge_l.append(metrics["rouge_scores"]["rougeL"])
+         all_metrics.append(metrics)
+
+     metrics_df["accuracy"] = accuracy
+     metrics_df["meteor"] = meteor
+     metrics_df["bleu_1"] = bleu_1
+     metrics_df["rouge_l"] = rouge_l
+     metrics_df["all_metrics"] = all_metrics
+
+     return metrics_df
+
+
+ def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
+     plt.figure(figsize=figsize)
+     df_melted = pd.melt(
+         metrics_df, id_vars="model", value_vars=["meteor", "bleu_1", "rouge_l"]
+     )
+
+     barplot = sns.barplot(x="variable", y="value", hue="model", data=df_melted)
+
+     # Set different hatches for each model
+     hatches = ["/", "\\", "|", "-", "+", "x", "o", "O", ".", "*", "//", "\\\\"]
+
+     # Create a dictionary to map models to hatches
+     model_hatches = {
+         model: hatches[i % len(hatches)]
+         for i, model in enumerate(metrics_df["model"].unique())
+     }
+
+     # Apply hatches based on the model
+     num_vars = len(df_melted["variable"].unique())
+     for i, bar in enumerate(barplot.patches):
+         model = df_melted["model"].iloc[i // num_vars]
+         bar.set_hatch(model_hatches[model])
+
+     # Manually update legend to match the bar hatches
+     handles, labels = barplot.get_legend_handles_labels()
+     for handle, model in zip(handles, metrics_df["model"].unique()):
+         handle.set_hatch(model_hatches[model])
+
+     barplot.set_xticklabels(["METEOR", "BLEU-1", "ROUGE-L"])
+     for p in barplot.patches:
+         if p.get_height() == 0:
+             continue
+         barplot.annotate(
+             f"{p.get_height():.2f}",
+             (p.get_x() + p.get_width() / 2.0, p.get_height()),
+             ha="center",
+             va="center",
+             xytext=(0, 10),
+             textcoords="offset points",
+         )
+
+     barplot.set(ylim=ylim, ylabel="Scores", xlabel="Metrics")
+     plt.legend(bbox_to_anchor=(0.5, -0.1), loc="upper center")
+     plt.show()
+
+
+ def plot_times(perf_df, ylim=0.421):
+     # Adjusted code to put "train-time" bars in red at the bottom
+
+     fig, ax1 = plt.subplots(figsize=(12, 10))
+
+     color_train = "tab:red"
+     color_eval = "orange"
+     ax1.set_xlabel("Models")
+     ax1.set_ylabel("Time (mins)")
+     ax1.set_xticks(range(len(perf_df["model"]))) # Set x-ticks positions
+     ax1.set_xticklabels(perf_df["model"], rotation=90)
+
+     # Plot "train-time" first so it's at the bottom
+     ax1.bar(
+         perf_df["model"],
+         perf_df["train-time(mins)"],
+         color=color_train,
+         label="train-time",
+     )
+
+     # Then, plot "eval-time" on top of "train-time"
+     ax1.bar(
+         perf_df["model"],
+         perf_df["eval-time(mins)"],
+         bottom=perf_df["train-time(mins)"],
+         color=color_eval,
+         label="eval-time",
+     )
+
+     ax1.tick_params(axis="y")
+     ax1.legend(loc="upper left")
+
+     if "meteor" in perf_df.columns:
+         ax2 = ax1.twinx()
+         color_meteor = "tab:blue"
+         ax2.set_ylabel("METEOR", color=color_meteor)
+         ax2.plot(
+             perf_df["model"],
+             perf_df["meteor"],
+             color=color_meteor,
+             marker="o",
+             label="meteor",
+         )
+         ax2.tick_params(axis="y", labelcolor=color_meteor)
+         ax2.legend(loc="upper right")
+         ax2.set_ylim(ax2.get_ylim()[0], ylim)
+
+     # Show numbers in bars
+     for p in ax1.patches:
+         height = p.get_height()
+         if height == 0: # Skip bars with height 0
+             continue
+         ax1.annotate(
+             f"{height:.2f}",
+             (p.get_x() + p.get_width() / 2.0, p.get_y() + height),
+             ha="center",
+             va="center",
+             xytext=(0, -10),
+             textcoords="offset points",
+         )
+
+     fig.tight_layout()
+     plt.show()
+
+
+ def translate_via_llm(text):
+     base_url = os.getenv("OPENAI_BASE_URL") or "http://localhost:8000/v1"
+     llm = ChatOpenAI(
+         model="gpt-4o",
+         temperature=0,
+         max_tokens=None,
+         timeout=None,
+         max_retries=2,
+         base_url=base_url,
+     )
+
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             (
+                 "human",
+                 "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{input}",
+             ),
+         ]
+     )
+
+     chain = prompt | llm
+     response = chain.invoke(
+         {
+             "input": text,
+         }
+     )
+     return response.content
+
+
+ def translate(text, cache_dict):
+     if text in cache_dict:
+         return cache_dict[text]
+     else:
+         translated_text = translate_via_llm(text)
+         cache_dict[text] = translated_text
+         return translated_text
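
The loader above expects a tab-separated file with "chinese" and "english" columns (the actual datasets/mac/*.tsv files are stored in Git LFS). A toy sketch of that schema, with made-up content, purely for illustration:

# Illustrative only: build a tiny TSV in the format load_translation_dataset expects.
import pandas as pd

toy = pd.DataFrame(
    {
        "chinese": ["你好，世界。"],
        "english": ["Hello, world."],
    }
)
toy.to_csv("toy-mac.tsv", sep="\t", index=False)  # pass "toy-mac.tsv" as data_path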
llm_toolkit/tune.py ADDED
@@ -0,0 +1,143 @@
+ import os
+ import sys
+ import torch
+ from dotenv import find_dotenv, load_dotenv
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from llm_toolkit.translation_engine import *
+ from llm_toolkit.translation_utils import *
+
+
+ model_name = os.getenv("MODEL_NAME")
+ load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
+ eval_base_model = os.getenv("EVAL_BASE_MODEL") == "true"
+ eval_fine_tuned = os.getenv("EVAL_FINE_TUNED") == "true"
+ save_fine_tuned_model = os.getenv("SAVE_FINE_TUNED") == "true"
+ num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS") or 0)
+ data_path = os.getenv("DATA_PATH")
+ results_path = os.getenv("RESULTS_PATH")
+
+ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+ dtype = (
+     None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+ )
+
+ print(
+     model_name,
+     load_in_4bit,
+     max_seq_length,
+     num_train_epochs,
+     dtype,
+     data_path,
+     results_path,
+     eval_base_model,
+     eval_fine_tuned,
+     save_fine_tuned_model,
+ )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ datasets = load_translation_dataset(data_path, tokenizer)
+
+ if eval_base_model:
+     print("Evaluating base model: " + model_name)
+     predictions = eval_model(model, tokenizer, datasets["test"])
+
+     # calc_metrics(datasets["test"]["english"], predictions, debug=True)
+
+     save_results(
+         model_name,
+         results_path,
+         datasets["test"],
+         predictions,
+         debug=True,
+     )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+
+ def is_bfloat16_supported():
+     return True
+
+
+ trainer = load_trainer(
+     model,
+     tokenizer,
+     datasets["train"],
+     num_train_epochs,
+     fp16=not is_bfloat16_supported(),
+     bf16=is_bfloat16_supported(),
+ )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(4) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ trainer_stats = trainer.train()
+
+ # @title Show final memory and time stats
+ used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+ used_percentage = round(used_memory / max_memory * 100, 3)
+ lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+ print(f"(5) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
+ print(
+     f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
+ )
+ print(f"Peak reserved memory = {used_memory} GB.")
+ print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+ print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+ print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
+
+ if eval_fine_tuned:
+     print("Evaluating fine-tuned model: " + model_name)
+     FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+     predictions = eval_model(model, tokenizer, datasets["test"])
+
+     # calc_metrics(datasets["test"]["english"], predictions, debug=True)
+
+     save_results(
+         model_name + "(finetuned)",
+         results_path,
+         datasets["test"],
+         predictions,
+         debug=True,
+     )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(6) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ if save_fine_tuned_model:
+     save_model(model, tokenizer)
notebooks/00_Data_Analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/01_Qwen2-0.5B_Unsloth.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/02_Qwen2-1.5B_Unsloth.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/03_Qwen2-0.5B_1.5B-4bit.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/04_tune-small-no-flash-attn.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/05_tune-small-with-flash-attn.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/06_tune-small-py3.11.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/07_tune-lf-py3.11.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/07r2_tune-lf-py3.11.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/08_eval-lf-py3.11.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1 +1,15 @@
- huggingface_hub==0.22.2
+ nltk==3.8.1
+ python-dotenv==1.0.1
+ black==24.4.0
+ evaluate==0.4.2
+ rouge_score==0.1.2
+ pytest==8.2.1
+ seaborn==0.13.2
+ scikit-learn==1.5.0
+ jupyter
+ ipywidgets
+ packaging
+ # triton
+ # xformers
+ langchain_openai==0.1.13
+ wandb==0.17.4
results/experiment-1-results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bfb0c7a3813e9c98c9245c9303b2fb95c1fd7d6a92dd4e0d9d3fe4e4d29a8849
+ size 2072299
results/experiment-2-results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1c99b9bb0c6539a9ff3c9198d730f110c5b6371cba803e1992802beb13e3600
+ size 2038783
results/experiment-3-results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0b8dcb783ed847422ca4f2000b5106742b992537f4b84da6b5ca0b4c22bf0dd
+ size 1427300
results/mac-results-no-flash-attn.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89144b0a3e727b326be559637312e353208a7e506b7c0c701ce8e4392e4cbb5e
+ size 2129451
results/mac-results-with-flash-attn.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c73be2c390511d0a59090b57c53f0a66c0d4c4648c209ef7155aa97ff73c0b9
+ size 1461478
results/mac-results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7eb1c66dd7162f27a969599ddb3695c3ac82a88bff15cd57d7ed00ca86ab19cd
+ size 2072299
results/mac-results_final.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aacf61087ae3b1fd622407c75d0a969b232517c7489841da722e0228bb69a310
+ size 2334006
results/mac-results_lf-r2.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25c14d76c8d71ecbce6bc83d641ec4f54f6c0e188fccfcfd8536758a12ed456a
+ size 2442353
results/mac-results_lf-r3.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ea9402ad5c87e3b7dcb570cf0a3c0bf33bef093c522d4d2ba6dbf633e21f035
+ size 531603
results/mac-results_lf.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5acc087808de5df6839cbf7b170094c6e63445aab4bea15e4be9564b905eb51
+ size 3236072
results/mac-results_py3.11.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4adb0922c02cc435858b4ba44b4cdaaee4afe6fcc8721a795d740c36d8d94c2c
+ size 1463058
results/mac-results_v3.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8bfe9ce9720d0cf67ba118d8b2d82f8f6c0bd0f763a8aa00fc1f43f58e544157
+ size 1683953
results/model_training_evaluation_times.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5691ccd7fafb765772c2e5da0ada82bd2f3532459dcfed8517565e7cc0d9f1a8
+ size 441
scripts/lf-api.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/sh
+
+ BASEDIR=$(dirname "$0")
+ cd $BASEDIR/../llama-factory
+ echo Current Directory:
+ pwd
+
+ API_PORT=8000 llamafactory-cli api $1
scripts/tune-large.sh ADDED
@@ -0,0 +1,24 @@
+ #!/bin/sh
+
+ BASEDIR=$(dirname "$0")
+ cd $BASEDIR
+ echo Current Directory:
+ pwd
+
+ nvidia-smi
+ uname -a
+ cat /etc/os-release
+ lscpu
+ grep MemTotal /proc/meminfo
+
+ # pip install -r requirements.txt
+ # FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --upgrade flash-attn
+
+ # export MODEL_NAME=unsloth/Qwen2-72B-Instruct-bnb-4bit
+ # echo Tuning $MODEL_NAME
+ # python tune.py
+
+ export MODEL_NAME=unsloth/llama-3-70b-Instruct-bnb-4bit
+ echo Tuning $MODEL_NAME
+ python tune.py
+
scripts/tune-lf.sh ADDED
@@ -0,0 +1,9 @@
+ #!/bin/sh
+
+ BASEDIR=$(dirname "$0")
+ cd $BASEDIR/../llama-factory
+ echo Current Directory:
+ pwd
+
+ YAML=$1 python -c 'import os, json, sys, yaml; filename=os.getenv("YAML"); y=yaml.safe_load(open(filename)) ; print(f"{filename}:\n", json.dumps(y, indent=2))'
+ llamafactory-cli train $1