dh-mc committed on
Commit 3860729 · 1 Parent(s): fd78d87

initial code for Chinese/English translation

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +18 -0
  2. .gitignore +152 -0
  3. datasets/mac/mac-test.tsv +3 -0
  4. datasets/mac/mac-train.tsv +3 -0
  5. datasets/mac/mac.tsv +3 -0
  6. eval_modules/calc_repetitions.py +79 -0
  7. llama-factory/config/llama3_8b_lora_sft.yaml +46 -0
  8. llama-factory/config/qwen2_0.5b_lora_sft.yaml +42 -0
  9. llama-factory/config/qwen2_0.5b_lora_sft_unsloth.yaml +45 -0
  10. llama-factory/config/qwen2_1.5b_lora_sft.yaml +42 -0
  11. llama-factory/config/qwen2_1.5b_lora_sft_unsloth.yaml +45 -0
  12. llama-factory/config/qwen2_7b_lora_sft.yaml +45 -0
  13. llama-factory/config/qwen2_7b_lora_sft_unsloth.yaml +45 -0
  14. llama-factory/data/alpaca_mac.json +3 -0
  15. llama-factory/data/dataset_info.json +3 -0
  16. llama-factory/inference/qwen2_1.5b_lora_sft.yaml +4 -0
  17. llm_toolkit/chat.py +88 -0
  18. llm_toolkit/eval.py +67 -0
  19. llm_toolkit/eval_lf.py +110 -0
  20. llm_toolkit/llm_utils.py +165 -0
  21. llm_toolkit/translation_engine.py +130 -0
  22. llm_toolkit/translation_utils.py +420 -0
  23. llm_toolkit/tune.py +143 -0
  24. notebooks/00_Data_Analysis.ipynb +0 -0
  25. notebooks/01_Qwen2-0.5B_Unsloth.ipynb +0 -0
  26. notebooks/02_Qwen2-1.5B_Unsloth.ipynb +0 -0
  27. notebooks/03_Qwen2-0.5B_1.5B-4bit.ipynb +0 -0
  28. notebooks/04_tune-small-no-flash-attn.ipynb +0 -0
  29. notebooks/05_tune-small-with-flash-attn.ipynb +0 -0
  30. notebooks/06_tune-small-py3.11.ipynb +0 -0
  31. notebooks/07_tune-lf-py3.11.ipynb +0 -0
  32. notebooks/07r2_tune-lf-py3.11.ipynb +0 -0
  33. notebooks/08_eval-lf-py3.11.ipynb +0 -0
  34. requirements.txt +15 -1
  35. results/experiment-1-results.csv +3 -0
  36. results/experiment-2-results.csv +3 -0
  37. results/experiment-3-results.csv +3 -0
  38. results/mac-results-no-flash-attn.csv +3 -0
  39. results/mac-results-with-flash-attn.csv +3 -0
  40. results/mac-results.csv +3 -0
  41. results/mac-results_final.csv +3 -0
  42. results/mac-results_lf-r2.csv +3 -0
  43. results/mac-results_lf-r3.csv +3 -0
  44. results/mac-results_lf.csv +3 -0
  45. results/mac-results_py3.11.csv +3 -0
  46. results/mac-results_v3.csv +3 -0
  47. results/model_training_evaluation_times.csv +3 -0
  48. scripts/lf-api.sh +8 -0
  49. scripts/tune-large.sh +24 -0
  50. scripts/tune-lf.sh +9 -0
.gitattributes CHANGED
@@ -33,3 +33,21 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ results/mac-results.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_lf.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_lf-r3.csv filter=lfs diff=lfs merge=lfs -text
+ results/experiment-1-results.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results-no-flash-attn.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_lf-r2.csv filter=lfs diff=lfs merge=lfs -text
+ results/model_training_evaluation_times.csv filter=lfs diff=lfs merge=lfs -text
+ results/experiment-3-results.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_final.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_py3.11.csv filter=lfs diff=lfs merge=lfs -text
+ results/experiment-2-results.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results-with-flash-attn.csv filter=lfs diff=lfs merge=lfs -text
+ results/mac-results_v3.csv filter=lfs diff=lfs merge=lfs -text
+ llama-factory/data/alpaca_mac.json filter=lfs diff=lfs merge=lfs -text
+ llama-factory/data/dataset_info.json filter=lfs diff=lfs merge=lfs -text
+ datasets/mac/mac-test.tsv filter=lfs diff=lfs merge=lfs -text
+ datasets/mac/mac-train.tsv filter=lfs diff=lfs merge=lfs -text
+ datasets/mac/mac.tsv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,152 @@
+ *.out
+ *.log
+ */outputs/
+ */models/
+ */wandb/
+ */cs605-nlp-assignment-2*/
+ */augmented_data/
+ */inflaton/
+ */llama.cpp/
+ wandb/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ # *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # JetBrains
+ .idea
+
+ *.db
+
+ .DS_Store
+ /outputs
+ /models
+ /llama.cpp
+ /llama-factory/saves
+ /llama-factory/saves-1
datasets/mac/mac-test.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5663c7521eaf9942a9fea40b2950a46e37b761b22cc698eb6fe6b57bf70d0c4
+ size 253194
datasets/mac/mac-train.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:424f0adcb2727eec322acef12294f4efb10412fc0b0529887d28dddc5171af05
+ size 1031685
datasets/mac/mac.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93f3ab2ba07b67b0a3f9ff05291c1b6748851999cda050bc165f8dd259daa2aa
+ size 1289106
eval_modules/calc_repetitions.py ADDED
@@ -0,0 +1,79 @@
+ import os
+ import re
+ import math
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import matplotlib.ticker as mtick
+ import seaborn as sns
+ import nltk
+ import evaluate
+
+ meteor = evaluate.load("meteor")
+
+ print(f"loading: {__file__}")
+
+ # final version
+ pattern_excessive_whitespaces = re.compile(r"\s{5,}")
+ pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
+
+
+ def del_excessive_whitespaces(text, debug=False):
+     count = 0
+
+     if isinstance(text, str):
+         if debug:
+             print("----detect excessive whitespaces----")
+         count = len(text)
+         text = pattern_excessive_whitespaces.sub("", text)
+         count -= len(text)
+         if debug and count:
+             print(f"removed excessive whitespaces: {count}")
+     return text, count
+
+
+ # final version for repetition detection
+ def detect_text_repetitions(text, debug=False):
+     count = 0
+
+     if isinstance(text, str):
+         if debug:
+             print("----detect text repetitions----")
+         matches = pattern_text_repetitions.finditer(text)
+         for match in matches:
+             if debug:
+                 print(match)
+                 for groupNum in range(0, len(match.groups())):
+                     groupNum = groupNum + 1
+                     print(
+                         "Group {groupNum} found at {start}-{end}: `{group}`".format(
+                             groupNum=groupNum,
+                             start=match.start(groupNum),
+                             end=match.end(groupNum),
+                             group=match.group(groupNum),
+                         )
+                     )
+
+             start, end = match.span()
+             count += end - start
+
+     return count
+
+
+ def detect_repetitions(text, debug=False):
+     text, count_excessive_whitespaces = del_excessive_whitespaces(text, debug=debug)
+     count_text_repetitions = detect_text_repetitions(text, debug=debug)
+     total_repetitions = count_excessive_whitespaces + count_text_repetitions
+
+     result = (count_excessive_whitespaces, count_text_repetitions, total_repetitions)
+
+     if debug:
+         print(result)
+     return result
+
+
+ def detect_scores(text, debug=False):
+     newline_score, repetition_score, total_repetitions = detect_repetitions(
+         text, debug=debug
+     )
+     return pd.Series([newline_score, repetition_score, total_repetitions])
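
For reference, a minimal usage sketch of the repetition counters above (not part of the commit; the sample string is made up, and importing the module calls evaluate.load("meteor"), which may download metric data on first use):

# Hypothetical example; assumes the repo root is on PYTHONPATH and dependencies are installed.
from eval_modules.calc_repetitions import detect_repetitions

sample = "The cat sat on the mat. The cat sat on the mat. The cat sat on the mat."
whitespaces, repetitions, total = detect_repetitions(sample, debug=True)
print(whitespaces, repetitions, total)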
llama-factory/config/llama3_8b_lora_sft.yaml ADDED
@@ -0,0 +1,46 @@
+ ### model
+ model_name_or_path: gradientai/Llama-3-8B-Instruct-Gradient-1048k
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ # use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: llama3
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ # output_dir: saves/llama3-8b/lora/sft
+ output_dir: /Workspace/Users/donghao.huang@mastercard.com/lf-saves/llama3-8b/lora/sft/
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+ # resume_from_checkpoint: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: none
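
As a quick sanity check, configs like the one above can be inspected before launching a run; this mirrors the YAML dump done in scripts/tune-lf.sh at the end of this commit. A sketch, assuming PyYAML is installed and the path matches your checkout:

# Pretty-print a LLaMA-Factory config before training with it (illustrative only).
import json
import yaml

filename = "llama-factory/config/llama3_8b_lora_sft.yaml"
with open(filename) as f:
    config = yaml.safe_load(f)
print(f"{filename}:\n", json.dumps(config, indent=2))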
llama-factory/config/qwen2_0.5b_lora_sft.yaml ADDED
@@ -0,0 +1,42 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-0.5B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-0.5b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_0.5b_lora_sft # optional
llama-factory/config/qwen2_0.5b_lora_sft_unsloth.yaml ADDED
@@ -0,0 +1,45 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-0.5B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-0.5b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_0.5b_lora_sft # optional
llama-factory/config/qwen2_1.5b_lora_sft.yaml ADDED
@@ -0,0 +1,42 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-1.5b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_1.5b_lora_sft # optional
llama-factory/config/qwen2_1.5b_lora_sft_unsloth.yaml ADDED
@@ -0,0 +1,45 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-1.5b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_1.5b_lora_sft # optional
llama-factory/config/qwen2_7b_lora_sft.yaml ADDED
@@ -0,0 +1,45 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-7B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ # use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-7b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_7b_lora_sft # optional
llama-factory/config/qwen2_7b_lora_sft_unsloth.yaml ADDED
@@ -0,0 +1,45 @@
+ ### model
+ model_name_or_path: Qwen/Qwen2-7B-Instruct
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_target: all
+ quantization_bit: 4 # use 4-bit QLoRA
+ loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
+ use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
+
+ ### dataset
+ dataset: alpaca_mac
+ template: chatml
+ cutoff_len: 1024
+ max_samples: 4528
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+
+ ### output
+ output_dir: saves/qwen2-7b/lora/sft
+ logging_steps: 10
+ save_steps: 560
+ plot_loss: true
+ overwrite_output_dir: true
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 6.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
+
+ ### eval
+ val_size: 0.01
+ per_device_eval_batch_size: 1
+ eval_strategy: steps
+ eval_steps: 560
+
+ report_to: wandb
+ run_name: qwen2_7b_lora_sft # optional
llama-factory/data/alpaca_mac.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f03e62eb461c2204bbaef55f2de28ec115b1a5834b81f03b10f157551d5fe9f
+ size 2240344
llama-factory/data/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84bce610296ed7e729647e85d25576b6226d20ddf0bca4982fb1deb02de35911
+ size 13560
llama-factory/inference/qwen2_1.5b_lora_sft.yaml ADDED
@@ -0,0 +1,4 @@
+ model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+ adapter_name_or_path: saves/qwen2-1.5b/lora/sft/checkpoint-1680
+ template: chatml
+ finetuning_type: lora
llm_toolkit/chat.py ADDED
@@ -0,0 +1,88 @@
+ import os
+ import sys
+ from llamafactory.chat import ChatModel
+ from llamafactory.extras.misc import torch_gc
+
+ from dotenv import find_dotenv, load_dotenv
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from llm_toolkit.translation_engine import *
+ from llm_toolkit.translation_utils import *
+
+ model_name = os.getenv("MODEL_NAME")
+ load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
+ eval_base_model = os.getenv("EVAL_BASE_MODEL") == "true"
+ eval_fine_tuned = os.getenv("EVAL_FINE_TUNED") == "true"
+ save_fine_tuned_model = os.getenv("SAVE_FINE_TUNED") == "true"
+ num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS") or 0)
+ data_path = os.getenv("DATA_PATH")
+ results_path = os.getenv("RESULTS_PATH")
+
+ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+ dtype = (
+     None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+ )
+
+ print(
+     model_name,
+     load_in_4bit,
+     max_seq_length,
+     num_train_epochs,
+     dtype,
+     data_path,
+     results_path,
+     eval_base_model,
+     eval_fine_tuned,
+     save_fine_tuned_model,
+ )
+
+ adapter_name_or_path = (
+     sys.argv[1]
+     if len(sys.argv) > 1
+     else "llama-factory/saves/qwen2-0.5b/lora/sft/checkpoint-560"
+ )
+
+ args = dict(
+     model_name_or_path=model_name, # use bnb-4bit-quantized Llama-3-8B-Instruct model
+     adapter_name_or_path=adapter_name_or_path, # load the saved LoRA adapters
+     template="chatml", # same to the one in training
+     finetuning_type="lora", # same to the one in training
+     quantization_bit=4, # load 4-bit quantized model
+ )
+ chat_model = ChatModel(args)
+
+ messages = []
+ print(
+     "Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application."
+ )
+ while True:
+     query = input("\nUser: ")
+     if query.strip() == "exit":
+         break
+     if query.strip() == "clear":
+         messages = []
+         torch_gc()
+         print("History has been removed.")
+         continue
+
+     messages.append({"role": "user", "content": query})
+     print("Assistant: ", end="", flush=True)
+
+     response = ""
+     for new_text in chat_model.stream_chat(messages):
+         print(new_text, end="", flush=True)
+         response += new_text
+     print()
+     messages.append({"role": "assistant", "content": response})
+
+ torch_gc()
llm_toolkit/eval.py ADDED
@@ -0,0 +1,67 @@
+ import os
+ import sys
+ import torch
+ from dotenv import find_dotenv, load_dotenv
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from llm_toolkit.translation_engine import *
+ from llm_toolkit.translation_utils import *
+
+ model_name = os.getenv("MODEL_NAME")
+ adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
+ load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
+ data_path = os.getenv("DATA_PATH")
+ results_path = os.getenv("RESULTS_PATH")
+
+ print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ model, tokenizer = load_model(
+     model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
+ )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ datasets = load_translation_dataset(data_path, tokenizer)
+
+ print("Evaluating model: " + model_name)
+ predictions = eval_model(model, tokenizer, datasets["test"])
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ if adapter_name_or_path is not None:
+     model_name += "_" + adapter_name_or_path.split("/")[-1]
+
+ save_results(
+     model_name,
+     results_path,
+     datasets["test"],
+     predictions,
+     debug=True,
+ )
+
+ metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
+ print(metrics)
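
eval.py (like eval_lf.py and tune.py below) reads its settings from a .env / .env.example file that is not part of this diff. The sketch below shows plausible values for illustration only; the variable names come from the scripts, but the values are assumptions and should be adjusted to your own setup:

# Hypothetical environment for llm_toolkit/eval.py (values are examples, not from the commit).
import os

os.environ.setdefault("MODEL_NAME", "Qwen/Qwen2-0.5B-Instruct")
os.environ.setdefault("ADAPTER_NAME_OR_PATH", "llama-factory/saves/qwen2-0.5b/lora/sft/checkpoint-560")
os.environ.setdefault("LOAD_IN_4BIT", "true")
os.environ.setdefault("DATA_PATH", "datasets/mac/mac.tsv")
os.environ.setdefault("RESULTS_PATH", "results/mac-results.csv")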
llm_toolkit/eval_lf.py ADDED
@@ -0,0 +1,110 @@
+ import os
+ import sys
+ import torch
+ from dotenv import find_dotenv, load_dotenv
+ from llamafactory.chat import ChatModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from llm_toolkit.translation_utils import *
+
+ model_name = os.getenv("MODEL_NAME")
+ adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
+ load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
+ data_path = os.getenv("DATA_PATH")
+ results_path = os.getenv("RESULTS_PATH")
+
+ print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
+
+
+ def load_model(
+     model_name,
+     max_seq_length=2048,
+     dtype=torch.bfloat16,
+     load_in_4bit=False,
+     adapter_name_or_path=None,
+ ):
+     print(f"loading model: {model_name}")
+
+     if adapter_name_or_path:
+         template = "llama3" if "llama-3" in model_name.lower() else "chatml"
+
+         args = dict(
+             model_name_or_path=model_name,
+             adapter_name_or_path=adapter_name_or_path, # load the saved LoRA adapters
+             template=template, # same to the one in training
+             finetuning_type="lora", # same to the one in training
+             quantization_bit=4 if load_in_4bit else None, # load 4-bit quantized model
+         )
+         chat_model = ChatModel(args)
+         return chat_model.engine.model, chat_model.engine.tokenizer
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=load_in_4bit,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_use_double_quant=False,
+         bnb_4bit_compute_dtype=dtype,
+     )
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         quantization_config=bnb_config,
+         torch_dtype=dtype,
+         trust_remote_code=True,
+         device_map="auto",
+     )
+
+     return model, tokenizer
+
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ model, tokenizer = load_model(
+     model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
+ )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ datasets = load_translation_dataset(data_path, tokenizer)
+
+ print("Evaluating model: " + model_name)
+ predictions = eval_model(model, tokenizer, datasets["test"])
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ if adapter_name_or_path is not None:
+     model_name += "_" + adapter_name_or_path.split("/")[-1]
+
+ save_results(
+     model_name,
+     results_path,
+     datasets["test"],
+     predictions,
+     debug=True,
+ )
+
+ metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
+ print(metrics)
llm_toolkit/llm_utils.py ADDED
@@ -0,0 +1,165 @@
+ import os
+ import re
+ import sys
+ import torch
+ from llamafactory.chat import ChatModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
+
+
+ def load_model(
+     model_name,
+     max_seq_length=2048,
+     dtype=torch.bfloat16,
+     load_in_4bit=False,
+     adapter_name_or_path=None,
+ ):
+     print(f"loading model: {model_name}")
+
+     if adapter_name_or_path:
+         template = "llama3" if "llama-3" in model_name.lower() else "chatml"
+
+         args = dict(
+             model_name_or_path=model_name,
+             adapter_name_or_path=adapter_name_or_path, # load the saved LoRA adapters
+             template=template, # same to the one in training
+             finetuning_type="lora", # same to the one in training
+             quantization_bit=4 if load_in_4bit else None, # load 4-bit quantized model
+         )
+         chat_model = ChatModel(args)
+         return chat_model.engine.model, chat_model.engine.tokenizer
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=load_in_4bit,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_use_double_quant=False,
+         bnb_4bit_compute_dtype=dtype,
+     )
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         quantization_config=bnb_config,
+         torch_dtype=dtype,
+         trust_remote_code=True,
+         device_map="auto",
+     ) if load_in_4bit else AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype=dtype,
+         trust_remote_code=True,
+         device_map="auto",
+     )
+
+     return model, tokenizer
+
+ def test_model(model, tokenizer, prompt):
+     inputs = tokenizer(
+         [prompt],
+         return_tensors="pt",
+     ).to("cuda")
+
+     text_streamer = TextStreamer(tokenizer)
+
+     _ = model.generate(
+         **inputs, max_new_tokens=2048, streamer=text_streamer, use_cache=True
+     )
+
+
+ def extract_answer(text, debug=False):
+     if text:
+         # Remove the begin and end tokens
+         text = re.sub(
+             r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
+         )
+         if debug:
+             print("--------\nstep 1:", text)
+
+         text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
+         if debug:
+             print("--------\nstep 2:", text)
+
+         text = re.sub(
+             r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
+         )
+         if debug:
+             print("--------\nstep 3:", text)
+
+     return text
+
+ def eval_model(model, tokenizer, eval_dataset):
+     total = len(eval_dataset)
+     predictions = []
+     for i in tqdm(range(total)):
+         inputs = tokenizer(
+             eval_dataset["prompt"][i : i + 1],
+             return_tensors="pt",
+         ).to("cuda")
+
+         outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
+         decoded_output = tokenizer.batch_decode(outputs)
+         debug = i == 0
+         decoded_output = [
+             extract_answer(output, debug=debug) for output in decoded_output
+         ]
+         predictions.extend(decoded_output)
+
+     return predictions
+
+ def save_model(
+     model,
+     tokenizer,
+     include_gguf=True,
+     include_merged=True,
+     publish=True,
+ ):
+     try:
+         token = os.getenv("HF_TOKEN") or None
+         model_name = os.getenv("MODEL_NAME")
+
+         save_method = "lora"
+         quantization_method = "q5_k_m"
+
+         model_names = get_model_names(
+             model_name, save_method=save_method, quantization_method=quantization_method
+         )
+
+         model.save_pretrained(model_names["local"])
+         tokenizer.save_pretrained(model_names["local"])
+
+         if publish:
+             model.push_to_hub(
+                 model_names["hub"],
+                 token=token,
+             )
+             tokenizer.push_to_hub(
+                 model_names["hub"],
+                 token=token,
+             )
+
+         if include_merged:
+             model.save_pretrained_merged(
+                 model_names["local"] + "-merged", tokenizer, save_method=save_method
+             )
+             if publish:
+                 model.push_to_hub_merged(
+                     model_names["hub"] + "-merged",
+                     tokenizer,
+                     save_method="lora",
+                     token="",
+                 )
+
+         if include_gguf:
+             model.save_pretrained_gguf(
+                 model_names["local-gguf"],
+                 tokenizer,
+                 quantization_method=quantization_method,
+             )
+
+             if publish:
+                 model.push_to_hub_gguf(
+                     model_names["hub-gguf"],
+                     tokenizer,
+                     quantization_method=quantization_method,
+                     token=token,
+                 )
+     except Exception as e:
+         print(e)
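
A small, hedged example of the extract_answer helper above. The raw string is invented for illustration, and importing llm_utils pulls in llama-factory and transformers, so the full environment must be installed:

# Illustrative only: strip chat-template tokens from a generated completion.
from llm_toolkit.llm_utils import extract_answer

raw = "<|start_header_id|>assistant<|end_header_id|>\n\nThe weather is nice today.<|eot_id|>"
print(extract_answer(raw, debug=True))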
llm_toolkit/translation_engine.py ADDED
@@ -0,0 +1,130 @@
+ import os
+ import pandas as pd
+ import torch
+ from unsloth import FastLanguageModel, is_bfloat16_supported
+ from trl import SFTTrainer
+ from transformers import TrainingArguments, TextStreamer
+ from llm_toolkit.translation_utils import *
+ from llamafactory.chat import ChatModel
+
+ print(f"loading {__file__}")
+
+
+ def get_model_names(
+     model_name, save_method="merged_4bit_forced", quantization_method="q5_k_m"
+ ):
+     hub_model = model_name.split("/")[-1] + "-MAC-"
+     local_model = "models/" + hub_model
+
+     return {
+         "local": local_model + save_method,
+         "local-gguf": local_model + quantization_method,
+         "hub": hub_model + save_method,
+         "hub-gguf": hub_model + "gguf-" + quantization_method,
+     }
+
+
+ def load_model(
+     model_name,
+     max_seq_length=2048,
+     dtype=None,
+     load_in_4bit=False,
+     template="chatml",
+     adapter_name_or_path=None,
+ ):
+     print(f"loading model: {model_name}")
+
+     if adapter_name_or_path:
+         args = dict(
+             model_name_or_path=model_name,
+             adapter_name_or_path=adapter_name_or_path, # load the saved LoRA adapters
+             template=template, # same to the one in training
+             finetuning_type="lora", # same to the one in training
+             quantization_bit=4, # load 4-bit quantized model
+         )
+         chat_model = ChatModel(args)
+         return chat_model.engine.model, chat_model.engine.tokenizer
+
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name=model_name, # YOUR MODEL YOU USED FOR TRAINING
+         max_seq_length=max_seq_length,
+         dtype=dtype,
+         load_in_4bit=load_in_4bit,
+         trust_remote_code=True,
+     )
+     FastLanguageModel.for_inference(model)
+
+     return model, tokenizer
+
+
+ def test_model(model, tokenizer, prompt):
+     inputs = tokenizer(
+         [prompt],
+         return_tensors="pt",
+     ).to("cuda")
+
+     text_streamer = TextStreamer(tokenizer)
+
+     _ = model.generate(
+         **inputs, max_new_tokens=128, streamer=text_streamer, use_cache=True
+     )
+
+
+ def load_trainer(
+     model,
+     tokenizer,
+     dataset,
+     num_train_epochs,
+     max_seq_length=2048,
+     fp16=False,
+     bf16=False,
+     output_dir="./outputs",
+ ):
+     model = FastLanguageModel.get_peft_model(
+         model,
+         r=16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+         target_modules=[
+             "q_proj",
+             "k_proj",
+             "v_proj",
+             "o_proj",
+             "gate_proj",
+             "up_proj",
+             "down_proj",
+         ],
+         lora_alpha=16,
+         lora_dropout=0, # Supports any, but = 0 is optimized
+         bias="none", # Supports any, but = "none" is optimized
+         # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+         use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
+         random_state=3407,
+         use_rslora=False, # We support rank stabilized LoRA
+         loftq_config=None, # And LoftQ
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         tokenizer=tokenizer,
+         train_dataset=dataset,
+         dataset_text_field="text",
+         max_seq_length=max_seq_length,
+         dataset_num_proc=2,
+         packing=False, # Can make training 5x faster for short sequences.
+         args=TrainingArguments(
+             per_device_train_batch_size=2,
+             gradient_accumulation_steps=4,
+             warmup_steps=5,
+             num_train_epochs=num_train_epochs,
+             learning_rate=2e-4,
+             fp16=not is_bfloat16_supported(),
+             bf16=is_bfloat16_supported(),
+             logging_steps=100,
+             optim="adamw_8bit",
+             weight_decay=0.01,
+             lr_scheduler_type="linear",
+             seed=3407,
+             output_dir=output_dir,
+         ),
+     )
+
+     return trainer
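
For reference, a sketch of how the naming helper above behaves (the logic is pure string manipulation, but importing translation_engine also imports unsloth and llama-factory, so this assumes the full training environment; the model id is just an example):

# Hypothetical usage of get_model_names.
from llm_toolkit.translation_engine import get_model_names

names = get_model_names("Qwen/Qwen2-0.5B-Instruct")
# Expected keys: "local", "local-gguf", "hub", "hub-gguf",
# e.g. names["local"] == "models/Qwen2-0.5B-Instruct-MAC-merged_4bit_forced"
print(names)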
llm_toolkit/translation_utils.py ADDED
@@ -0,0 +1,420 @@
+ import os
+ import re
+ import pandas as pd
+ import evaluate
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from datasets import load_dataset
+ from langchain_openai import ChatOpenAI
+ from langchain_core.prompts import ChatPromptTemplate
+ from tqdm import tqdm
+
+ print(f"loading {__file__}")
+
+ bleu = evaluate.load("bleu")
+ rouge = evaluate.load("rouge")
+ meteor = evaluate.load("meteor")
+ accuracy = evaluate.load("accuracy")
+
+
+ def extract_answer(text, debug=False):
+     if text:
+         # Remove the begin and end tokens
+         text = re.sub(
+             r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
+         )
+         if debug:
+             print("--------\nstep 1:", text)
+
+         text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
+         if debug:
+             print("--------\nstep 2:", text)
+
+         text = re.sub(
+             r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
+         )
+         if debug:
+             print("--------\nstep 3:", text)
+
+     return text
+
+
+ def calc_metrics(references, predictions, debug=False):
+     assert len(references) == len(
+         predictions
+     ), f"lengths are difference: {len(references)} != {len(predictions)}"
+
+     predictions = [extract_answer(text) for text in predictions]
+
+     correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]
+     accuracy = sum(correct) / len(references)
+
+     results = {"accuracy": accuracy}
+     if debug:
+         correct_ids = [i for i, c in enumerate(correct) if c == 1]
+         results["correct_ids"] = correct_ids
+
+     results["meteor"] = meteor.compute(predictions=predictions, references=references)[
+         "meteor"
+     ]
+
+     results["bleu_scores"] = bleu.compute(
+         predictions=predictions, references=references, max_order=4
+     )
+     results["rouge_scores"] = rouge.compute(
+         predictions=predictions, references=references
+     )
+     return results
+
+
+ def save_results(model_name, results_path, dataset, predictions, debug=False):
+     if not os.path.exists(results_path):
+         # Get the directory part of the file path
+         dir_path = os.path.dirname(results_path)
+
+         # Create all directories in the path (if they don't exist)
+         os.makedirs(dir_path, exist_ok=True)
+         df = dataset.to_pandas()
+         df.drop(columns=["text", "prompt"], inplace=True)
+     else:
+         df = pd.read_csv(results_path, on_bad_lines="warn")
+
+     df[model_name] = predictions
+
+     if debug:
+         print(df.head(1))
+
+     df.to_csv(results_path, index=False)
+
+
+ def load_translation_dataset(data_path, tokenizer=None):
+     train_data_file = data_path.replace(".tsv", "-train.tsv")
+     test_data_file = data_path.replace(".tsv", "-test.tsv")
+
+     if not os.path.exists(train_data_file):
+         print("generating train/test data files")
+         dataset = load_dataset(
+             "csv", data_files=data_path, delimiter="\t", split="train"
+         )
+         print(len(dataset))
+         dataset = dataset.filter(lambda x: x["chinese"] and x["english"])
+
+         datasets = dataset.train_test_split(test_size=0.2)
+         print(len(dataset))
+
+         # Convert to pandas DataFrame
+         train_df = pd.DataFrame(datasets["train"])
+         test_df = pd.DataFrame(datasets["test"])
+
+         # Save to TSV
+         train_df.to_csv(train_data_file, sep="\t", index=False)
+         test_df.to_csv(test_data_file, sep="\t", index=False)
+
+     print("loading train/test data files")
+     datasets = load_dataset(
+         "csv",
+         data_files={"train": train_data_file, "test": test_data_file},
+         delimiter="\t",
+     )
+
+     if tokenizer:
+         translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"
+
+         def formatting_prompts_func(examples):
+             inputs = examples["chinese"]
+             outputs = examples["english"]
+
+             messages = [
+                 {
+                     "role": "system",
+                     "content": "You are an expert in translating Chinese to English.",
+                 },
+                 None,
+             ]
+
+             model_name = os.getenv("MODEL_NAME")
+
+             if "mistral" in model_name.lower():
+                 messages = messages[1:]
+
+             texts = []
+             prompts = []
+             for input, output in zip(inputs, outputs):
+                 prompt = translation_prompt.format(input)
+                 messages[-1] = {"role": "user", "content": prompt}
+
+                 prompt = tokenizer.apply_chat_template(
+                     messages, tokenize=False, add_generation_prompt=True
+                 )
+                 prompts.append(prompt)
+                 texts.append(prompt + output + tokenizer.eos_token)
+             return {"text": texts, "prompt": prompts}
+
+         datasets = datasets.map(
+             formatting_prompts_func,
+             batched=True,
+         )
+
+     print(datasets)
+     return datasets
+
+
+ def eval_model(model, tokenizer, eval_dataset):
+     total = len(eval_dataset)
+     predictions = []
+     for i in tqdm(range(total)):
+         inputs = tokenizer(
+             eval_dataset["prompt"][i : i + 1],
+             return_tensors="pt",
+         ).to("cuda")
+
+         outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
+         decoded_output = tokenizer.batch_decode(outputs)
+         debug = i == 0
+         decoded_output = [
+             extract_answer(output, debug=debug) for output in decoded_output
+         ]
+         predictions.extend(decoded_output)
+
+     return predictions
+
+
+ def save_model(
+     model,
+     tokenizer,
+     include_gguf=True,
+     include_merged=True,
+     publish=True,
+ ):
+     try:
+         token = os.getenv("HF_TOKEN") or None
+         model_name = os.getenv("MODEL_NAME")
+
+         save_method = "lora"
+         quantization_method = "q5_k_m"
+
+         model_names = get_model_names(
+             model_name, save_method=save_method, quantization_method=quantization_method
+         )
+
+         model.save_pretrained(model_names["local"])
+         tokenizer.save_pretrained(model_names["local"])
+
+         if publish:
+             model.push_to_hub(
+                 model_names["hub"],
+                 token=token,
+             )
+             tokenizer.push_to_hub(
+                 model_names["hub"],
+                 token=token,
+             )
+
+         if include_merged:
+             model.save_pretrained_merged(
+                 model_names["local"] + "-merged", tokenizer, save_method=save_method
+             )
+             if publish:
+                 model.push_to_hub_merged(
+                     model_names["hub"] + "-merged",
+                     tokenizer,
+                     save_method="lora",
+                     token="",
+                 )
+
+         if include_gguf:
+             model.save_pretrained_gguf(
+                 model_names["local-gguf"],
+                 tokenizer,
+                 quantization_method=quantization_method,
+             )
+
+             if publish:
+                 model.push_to_hub_gguf(
+                     model_names["hub-gguf"],
+                     tokenizer,
+                     quantization_method=quantization_method,
+                     token=token,
+                 )
+     except Exception as e:
+         print(e)
+
+
+ def get_metrics(df):
+     metrics_df = pd.DataFrame(df.columns.T)[2:]
+     metrics_df.rename(columns={0: "model"}, inplace=True)
+     metrics_df["model"] = metrics_df["model"].apply(lambda x: x.split("/")[-1])
+     metrics_df.reset_index(inplace=True)
+     metrics_df = metrics_df.drop(columns=["index"])
+
+     accuracy = []
+     meteor = []
+     bleu_1 = []
+     rouge_l = []
+     all_metrics = []
+     for col in df.columns[2:]:
+         metrics = calc_metrics(df["english"], df[col], debug=True)
+         print(f"{col}: {metrics}")
+
+         accuracy.append(metrics["accuracy"])
+         meteor.append(metrics["meteor"])
+         bleu_1.append(metrics["bleu_scores"]["bleu"])
+         rouge_l.append(metrics["rouge_scores"]["rougeL"])
+         all_metrics.append(metrics)
+
+     metrics_df["accuracy"] = accuracy
+     metrics_df["meteor"] = meteor
+     metrics_df["bleu_1"] = bleu_1
+     metrics_df["rouge_l"] = rouge_l
+     metrics_df["all_metrics"] = all_metrics
+
+     return metrics_df
+
+
+ def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
+     plt.figure(figsize=figsize)
+     df_melted = pd.melt(
+         metrics_df, id_vars="model", value_vars=["meteor", "bleu_1", "rouge_l"]
+     )
+
+     barplot = sns.barplot(x="variable", y="value", hue="model", data=df_melted)
+
+     # Set different hatches for each model
+     hatches = ["/", "\\", "|", "-", "+", "x", "o", "O", ".", "*", "//", "\\\\"]
+
+     # Create a dictionary to map models to hatches
+     model_hatches = {
+         model: hatches[i % len(hatches)]
+         for i, model in enumerate(metrics_df["model"].unique())
+     }
+
+     # Apply hatches based on the model
+     num_vars = len(df_melted["variable"].unique())
+     for i, bar in enumerate(barplot.patches):
+         model = df_melted["model"].iloc[i // num_vars]
+         bar.set_hatch(model_hatches[model])
+
+     # Manually update legend to match the bar hatches
+     handles, labels = barplot.get_legend_handles_labels()
+     for handle, model in zip(handles, metrics_df["model"].unique()):
+         handle.set_hatch(model_hatches[model])
+
+     barplot.set_xticklabels(["METEOR", "BLEU-1", "ROUGE-L"])
+     for p in barplot.patches:
+         if p.get_height() == 0:
+             continue
+         barplot.annotate(
+             f"{p.get_height():.2f}",
+             (p.get_x() + p.get_width() / 2.0, p.get_height()),
+             ha="center",
+             va="center",
+             xytext=(0, 10),
+             textcoords="offset points",
+         )
+
+     barplot.set(ylim=ylim, ylabel="Scores", xlabel="Metrics")
+     plt.legend(bbox_to_anchor=(0.5, -0.1), loc="upper center")
+     plt.show()
+
+
+ def plot_times(perf_df, ylim=0.421):
+     # Adjusted code to put "train-time" bars in red at the bottom
+
+     fig, ax1 = plt.subplots(figsize=(12, 10))
+
+     color_train = "tab:red"
+     color_eval = "orange"
+     ax1.set_xlabel("Models")
+     ax1.set_ylabel("Time (mins)")
+     ax1.set_xticks(range(len(perf_df["model"]))) # Set x-ticks positions
+     ax1.set_xticklabels(perf_df["model"], rotation=90)
+
+     # Plot "train-time" first so it's at the bottom
+     ax1.bar(
+         perf_df["model"],
+         perf_df["train-time(mins)"],
+         color=color_train,
+         label="train-time",
+     )
+
+     # Then, plot "eval-time" on top of "train-time"
+     ax1.bar(
+         perf_df["model"],
+         perf_df["eval-time(mins)"],
+         bottom=perf_df["train-time(mins)"],
+         color=color_eval,
+         label="eval-time",
+     )
+
+     ax1.tick_params(axis="y")
+     ax1.legend(loc="upper left")
+
+     if "meteor" in perf_df.columns:
+         ax2 = ax1.twinx()
+         color_meteor = "tab:blue"
+         ax2.set_ylabel("METEOR", color=color_meteor)
+         ax2.plot(
+             perf_df["model"],
+             perf_df["meteor"],
+             color=color_meteor,
+             marker="o",
+             label="meteor",
+         )
+         ax2.tick_params(axis="y", labelcolor=color_meteor)
+         ax2.legend(loc="upper right")
+         ax2.set_ylim(ax2.get_ylim()[0], ylim)
+
+     # Show numbers in bars
+     for p in ax1.patches:
+         height = p.get_height()
+         if height == 0: # Skip bars with height 0
+             continue
+         ax1.annotate(
+             f"{height:.2f}",
+             (p.get_x() + p.get_width() / 2.0, p.get_y() + height),
+             ha="center",
+             va="center",
+             xytext=(0, -10),
+             textcoords="offset points",
+         )
+
+     fig.tight_layout()
+     plt.show()
+
+
+ def translate_via_llm(text):
+     base_url = os.getenv("OPENAI_BASE_URL") or "http://localhost:8000/v1"
+     llm = ChatOpenAI(
+         model="gpt-4o",
+         temperature=0,
+         max_tokens=None,
+         timeout=None,
+         max_retries=2,
+         base_url=base_url,
+     )
+
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             (
+                 "human",
+                 "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{input}",
+             ),
+         ]
+     )
+
+     chain = prompt | llm
+     response = chain.invoke(
+         {
+             "input": text,
+         }
+     )
+     return response.content
+
+
+ def translate(text, cache_dict):
+     if text in cache_dict:
+         return cache_dict[text]
+     else:
+         translated_text = translate_via_llm(text)
+         cache_dict[text] = translated_text
+         return translated_text
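
The loader above expects a tab-separated file with "chinese" and "english" columns (the actual datasets/mac/*.tsv files are stored in Git LFS). A toy sketch of that schema, with made-up content, purely for illustration:

# Illustrative only: build a tiny TSV in the format load_translation_dataset expects.
import pandas as pd

toy = pd.DataFrame(
    {
        "chinese": ["你好，世界。"],
        "english": ["Hello, world."],
    }
)
toy.to_csv("toy-mac.tsv", sep="\t", index=False)  # pass "toy-mac.tsv" as data_path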
llm_toolkit/tune.py ADDED
@@ -0,0 +1,143 @@
+ import os
+ import sys
+ import torch
+ from dotenv import find_dotenv, load_dotenv
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from llm_toolkit.translation_engine import *
+ from llm_toolkit.translation_utils import *
+
+
+ model_name = os.getenv("MODEL_NAME")
+ load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
+ eval_base_model = os.getenv("EVAL_BASE_MODEL") == "true"
+ eval_fine_tuned = os.getenv("EVAL_FINE_TUNED") == "true"
+ save_fine_tuned_model = os.getenv("SAVE_FINE_TUNED") == "true"
+ num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS") or 0)
+ data_path = os.getenv("DATA_PATH")
+ results_path = os.getenv("RESULTS_PATH")
+
+ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+ dtype = (
+     None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+ )
+
+ print(
+     model_name,
+     load_in_4bit,
+     max_seq_length,
+     num_train_epochs,
+     dtype,
+     data_path,
+     results_path,
+     eval_base_model,
+     eval_fine_tuned,
+     save_fine_tuned_model,
+ )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ datasets = load_translation_dataset(data_path, tokenizer)
+
+ if eval_base_model:
+     print("Evaluating base model: " + model_name)
+     predictions = eval_model(model, tokenizer, datasets["test"])
+
+     # calc_metrics(datasets["test"]["english"], predictions, debug=True)
+
+     save_results(
+         model_name,
+         results_path,
+         datasets["test"],
+         predictions,
+         debug=True,
+     )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+
+ def is_bfloat16_supported():
+     return True
+
+
+ trainer = load_trainer(
+     model,
+     tokenizer,
+     datasets["train"],
+     num_train_epochs,
+     fp16=not is_bfloat16_supported(),
+     bf16=is_bfloat16_supported(),
+ )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(4) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ trainer_stats = trainer.train()
+
+ # @title Show final memory and time stats
+ used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+ used_percentage = round(used_memory / max_memory * 100, 3)
+ lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+ print(f"(5) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
+ print(
+     f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
+ )
+ print(f"Peak reserved memory = {used_memory} GB.")
+ print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+ print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+ print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
+
+ if eval_fine_tuned:
+     print("Evaluating fine-tuned model: " + model_name)
+     FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+     predictions = eval_model(model, tokenizer, datasets["test"])
+
+     # calc_metrics(datasets["test"]["english"], predictions, debug=True)
+
+     save_results(
+         model_name + "(finetuned)",
+         results_path,
+         datasets["test"],
+         predictions,
+         debug=True,
+     )
+
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+ print(f"(6) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+ print(f"{start_gpu_memory} GB of memory reserved.")
+
+ if save_fine_tuned_model:
+     save_model(model, tokenizer)
notebooks/00_Data_Analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/01_Qwen2-0.5B_Unsloth.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/02_Qwen2-1.5B_Unsloth.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/03_Qwen2-0.5B_1.5B-4bit.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/04_tune-small-no-flash-attn.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/05_tune-small-with-flash-attn.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/06_tune-small-py3.11.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/07_tune-lf-py3.11.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/07r2_tune-lf-py3.11.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/08_eval-lf-py3.11.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1 +1,15 @@
- huggingface_hub==0.22.2
+ nltk==3.8.1
+ python-dotenv==1.0.1
+ black==24.4.0
+ evaluate==0.4.2
+ rouge_score==0.1.2
+ pytest==8.2.1
+ seaborn==0.13.2
+ scikit-learn==1.5.0
+ jupyter
+ ipywidgets
+ packaging
+ # triton
+ # xformers
+ langchain_openai==0.1.13
+ wandb==0.17.4
results/experiment-1-results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bfb0c7a3813e9c98c9245c9303b2fb95c1fd7d6a92dd4e0d9d3fe4e4d29a8849
+ size 2072299
results/experiment-2-results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1c99b9bb0c6539a9ff3c9198d730f110c5b6371cba803e1992802beb13e3600
+ size 2038783
results/experiment-3-results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0b8dcb783ed847422ca4f2000b5106742b992537f4b84da6b5ca0b4c22bf0dd
+ size 1427300
results/mac-results-no-flash-attn.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89144b0a3e727b326be559637312e353208a7e506b7c0c701ce8e4392e4cbb5e
+ size 2129451
results/mac-results-with-flash-attn.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c73be2c390511d0a59090b57c53f0a66c0d4c4648c209ef7155aa97ff73c0b9
+ size 1461478
results/mac-results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7eb1c66dd7162f27a969599ddb3695c3ac82a88bff15cd57d7ed00ca86ab19cd
+ size 2072299
results/mac-results_final.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aacf61087ae3b1fd622407c75d0a969b232517c7489841da722e0228bb69a310
+ size 2334006
results/mac-results_lf-r2.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25c14d76c8d71ecbce6bc83d641ec4f54f6c0e188fccfcfd8536758a12ed456a
+ size 2442353
results/mac-results_lf-r3.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ea9402ad5c87e3b7dcb570cf0a3c0bf33bef093c522d4d2ba6dbf633e21f035
+ size 531603
results/mac-results_lf.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5acc087808de5df6839cbf7b170094c6e63445aab4bea15e4be9564b905eb51
+ size 3236072
results/mac-results_py3.11.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4adb0922c02cc435858b4ba44b4cdaaee4afe6fcc8721a795d740c36d8d94c2c
+ size 1463058
results/mac-results_v3.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8bfe9ce9720d0cf67ba118d8b2d82f8f6c0bd0f763a8aa00fc1f43f58e544157
+ size 1683953
results/model_training_evaluation_times.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5691ccd7fafb765772c2e5da0ada82bd2f3532459dcfed8517565e7cc0d9f1a8
+ size 441
scripts/lf-api.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/sh
+
+ BASEDIR=$(dirname "$0")
+ cd $BASEDIR/../llama-factory
+ echo Current Directory:
+ pwd
+
+ API_PORT=8000 llamafactory-cli api $1
scripts/tune-large.sh ADDED
@@ -0,0 +1,24 @@
+ #!/bin/sh
+
+ BASEDIR=$(dirname "$0")
+ cd $BASEDIR
+ echo Current Directory:
+ pwd
+
+ nvidia-smi
+ uname -a
+ cat /etc/os-release
+ lscpu
+ grep MemTotal /proc/meminfo
+
+ # pip install -r requirements.txt
+ # FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --upgrade flash-attn
+
+ # export MODEL_NAME=unsloth/Qwen2-72B-Instruct-bnb-4bit
+ # echo Tuning $MODEL_NAME
+ # python tune.py
+
+ export MODEL_NAME=unsloth/llama-3-70b-Instruct-bnb-4bit
+ echo Tuning $MODEL_NAME
+ python tune.py
+
scripts/tune-lf.sh ADDED
@@ -0,0 +1,9 @@
+ #!/bin/sh
+
+ BASEDIR=$(dirname "$0")
+ cd $BASEDIR/../llama-factory
+ echo Current Directory:
+ pwd
+
+ YAML=$1 python -c 'import os, json, sys, yaml; filename=os.getenv("YAML"); y=yaml.safe_load(open(filename)) ; print(f"{filename}:\n", json.dumps(y, indent=2))'
+ llamafactory-cli train $1