machine-translation

Build error

App Files Community

dh-mc committed on Jul 28

Commit

c73d190

•

1 Parent(s): 07320d0

ready for gpu cluster

Browse files

Files changed (32) hide show

llm_toolkit/chat.py +0 -88
llm_toolkit/eval.py +44 -24
llm_toolkit/eval_lf.py +0 -110
llm_toolkit/llm_utils.py +41 -0
llm_toolkit/translation_engine.py +0 -130
llm_toolkit/translation_utils.py +2 -2
llm_toolkit/tune.py +0 -143
notebooks/00_Data_Analysis.ipynb +0 -0
notebooks/01_Qwen2-0.5B_Unsloth.ipynb +0 -0
notebooks/02_Qwen2-1.5B_Unsloth.ipynb +0 -0
notebooks/03_Qwen2-0.5B_1.5B-4bit.ipynb +0 -0
notebooks/04_tune-small-no-flash-attn.ipynb +0 -0
notebooks/05_tune-small-with-flash-attn.ipynb +0 -0
notebooks/06_tune-small-py3.11.ipynb +0 -0
notebooks/07_tune-lf-py3.11.ipynb +0 -0
notebooks/07r2_tune-lf-py3.11.ipynb +0 -0
notebooks/08_eval-lf-py3.11.ipynb +0 -0
results/experiment-1-results.csv +0 -3
results/experiment-2-results.csv +0 -3
results/experiment-3-results.csv +0 -3
results/mac-results-no-flash-attn.csv +0 -3
results/mac-results-with-flash-attn.csv +0 -3
results/mac-results.csv +0 -3
results/mac-results_final.csv +0 -3
results/mac-results_lf-r2.csv +0 -3
results/mac-results_lf-r3.csv +0 -3
results/mac-results_lf.csv +0 -3
results/mac-results_py3.11.csv +0 -3
results/mac-results_v3.csv +0 -3
results/model_training_evaluation_times.csv +0 -3
scripts/eval-mac.sh +10 -5
scripts/eval-model.sh +10 -0

llm_toolkit/chat.py DELETED Viewed

@@ -1,88 +0,0 @@
-import os
-import sys
-from llamafactory.chat import ChatModel
-from llamafactory.extras.misc import torch_gc
-from dotenv import find_dotenv, load_dotenv
-found_dotenv = find_dotenv(".env")
-if len(found_dotenv) == 0:
-    found_dotenv = find_dotenv(".env.example")
-print(f"loading env vars from: {found_dotenv}")
-load_dotenv(found_dotenv, override=False)
-path = os.path.dirname(found_dotenv)
-print(f"Adding {path} to sys.path")
-sys.path.append(path)
-from llm_toolkit.translation_engine import *
-from llm_toolkit.translation_utils import *
-model_name = os.getenv("MODEL_NAME")
-load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
-eval_base_model = os.getenv("EVAL_BASE_MODEL") == "true"
-eval_fine_tuned = os.getenv("EVAL_FINE_TUNED") == "true"
-save_fine_tuned_model = os.getenv("SAVE_FINE_TUNED") == "true"
-num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS") or 0)
-data_path = os.getenv("DATA_PATH")
-results_path = os.getenv("RESULTS_PATH")
-max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
-dtype = (
-    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-)
-print(
-    model_name,
-    load_in_4bit,
-    max_seq_length,
-    num_train_epochs,
-    dtype,
-    data_path,
-    results_path,
-    eval_base_model,
-    eval_fine_tuned,
-    save_fine_tuned_model,
-)
-adapter_name_or_path = (
-    sys.argv[1]
-    if len(sys.argv) > 1
-    else "llama-factory/saves/qwen2-0.5b/lora/sft/checkpoint-560"
-)
-args = dict(
-    model_name_or_path=model_name,  # use bnb-4bit-quantized Llama-3-8B-Instruct model
-    adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
-    template="chatml",  # same to the one in training
-    finetuning_type="lora",  # same to the one in training
-    quantization_bit=4,  # load 4-bit quantized model
-)
-chat_model = ChatModel(args)
-messages = []
-print(
-    "Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application."
-)
-while True:
-    query = input("\nUser: ")
-    if query.strip() == "exit":
-        break
-    if query.strip() == "clear":
-        messages = []
-        torch_gc()
-        print("History has been removed.")
-        continue
-    messages.append({"role": "user", "content": query})
-    print("Assistant: ", end="", flush=True)
-    response = ""
-    for new_text in chat_model.stream_chat(messages):
-        print(new_text, end="", flush=True)
-        response += new_text
-    print()
-    messages.append({"role": "assistant", "content": response})
-torch_gc()

llm_toolkit/eval.py CHANGED Viewed

@@ -17,6 +17,9 @@ sys.path.append(path)
 from llm_toolkit.llm_utils import *
 from llm_toolkit.translation_utils import *
 model_name = os.getenv("MODEL_NAME")
 adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
 load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
@@ -25,21 +28,26 @@ results_path = os.getenv("RESULTS_PATH")
 print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
 model, tokenizer = load_model(
     model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
 )
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
 datasets = load_translation_dataset(data_path, tokenizer)
@@ -51,25 +59,37 @@ if len(sys.argv) > 1:
 print_row_details(datasets["test"].to_pandas(), indices=[0, -1])
-print("Evaluating model: " + model_name)
-predictions = eval_model(model, tokenizer, datasets["test"])
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
 if adapter_name_or_path is not None:
-    model_name += "_" + adapter_name_or_path.split("/")[-1]
-save_results(
     model_name,
-    results_path,
     datasets["test"],
-    predictions,
-    debug=True,
 )
-metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
-print(metrics)

 from llm_toolkit.llm_utils import *
 from llm_toolkit.translation_utils import *
+device = check_gpu()
+is_cuda = torch.cuda.is_available()
 model_name = os.getenv("MODEL_NAME")
 adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
 load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
 print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
+if is_cuda:
+    torch.cuda.empty_cache()
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    print(f"(0) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+    print(f"{start_gpu_memory} GB of memory reserved.")
+    torch.cuda.empty_cache()
 model, tokenizer = load_model(
     model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
 )
+if is_cuda:
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+    print(f"{start_gpu_memory} GB of memory reserved.")
 datasets = load_translation_dataset(data_path, tokenizer)
 print_row_details(datasets["test"].to_pandas(), indices=[0, -1])
+def on_repetition_penalty_step_completed(model_name, predictions):
+    save_results(
+        model_name,
+        results_path,
+        datasets["test"],
+        predictions,
+    )
+    metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
+    print(f"{model_name} metrics: {metrics}")
 if adapter_name_or_path is not None:
+    model_name += "/" + adapter_name_or_path.split("/")[-1]
+evaluate_model_with_repetition_penalty(
+    model,
+    tokenizer,
     model_name,
     datasets["test"],
+    on_repetition_penalty_step_completed,
+    start_repetition_penalty=1.0,
+    end_repetition_penalty=1.3,
+    step_repetition_penalty=0.02,
+    device=device,
 )
+if is_cuda:
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+    print(f"{start_gpu_memory} GB of memory reserved.")

llm_toolkit/eval_lf.py DELETED Viewed

@@ -1,110 +0,0 @@
-import os
-import sys
-import torch
-from dotenv import find_dotenv, load_dotenv
-from llamafactory.chat import ChatModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-found_dotenv = find_dotenv(".env")
-if len(found_dotenv) == 0:
-    found_dotenv = find_dotenv(".env.example")
-print(f"loading env vars from: {found_dotenv}")
-load_dotenv(found_dotenv, override=False)
-path = os.path.dirname(found_dotenv)
-print(f"Adding {path} to sys.path")
-sys.path.append(path)
-from llm_toolkit.translation_utils import *
-model_name = os.getenv("MODEL_NAME")
-adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
-load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
-data_path = os.getenv("DATA_PATH")
-results_path = os.getenv("RESULTS_PATH")
-print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
-def load_model(
-    model_name,
-    max_seq_length=2048,
-    dtype=torch.bfloat16,
-    load_in_4bit=False,
-    adapter_name_or_path=None,
-):
-    print(f"loading model: {model_name}")
-    if adapter_name_or_path:
-        template = "llama3" if "llama-3" in model_name.lower() else "chatml"
-        args = dict(
-            model_name_or_path=model_name,
-            adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
-            template=template,  # same to the one in training
-            finetuning_type="lora",  # same to the one in training
-            quantization_bit=4 if load_in_4bit else None,  # load 4-bit quantized model
-        )
-        chat_model = ChatModel(args)
-        return chat_model.engine.model, chat_model.engine.tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=load_in_4bit,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=False,
-        bnb_4bit_compute_dtype=dtype,
-    )
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        torch_dtype=dtype,
-        trust_remote_code=True,
-        device_map="auto",
-    )
-    return model, tokenizer
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
-model, tokenizer = load_model(
-    model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
-)
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
-datasets = load_translation_dataset(data_path, tokenizer)
-print("Evaluating model: " + model_name)
-predictions = eval_model(model, tokenizer, datasets["test"])
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
-if adapter_name_or_path is not None:
-    model_name += "_" + adapter_name_or_path.split("/")[-1]
-save_results(
-    model_name,
-    results_path,
-    datasets["test"],
-    predictions,
-    debug=True,
-)
-metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
-print(metrics)

llm_toolkit/llm_utils.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import re
 import torch
 from transformers import (
     AutoModelForCausalLM,
@@ -197,6 +198,46 @@ def eval_model(
     return predictions
 def save_model(
     model,
     tokenizer,

 import os
 import re
+import numpy as np
 import torch
 from transformers import (
     AutoModelForCausalLM,
     return predictions
+def evaluate_model_with_repetition_penalty(
+    model,
+    tokenizer,
+    model_name,
+    dataset,
+    on_repetition_penalty_step_completed,
+    start_repetition_penalty=1.0,
+    end_repetition_penalty=1.3,
+    step_repetition_penalty=0.02,
+    device="cuda",
+):
+    print(f"Evaluating model: {model_name} on {device}")
+    for repetition_penalty in np.arange(
+        start_repetition_penalty,
+        end_repetition_penalty + step_repetition_penalty / 2,
+        step_repetition_penalty,
+    ):
+        # round to 2 decimal places
+        repetition_penalty = round(repetition_penalty, 2)
+        print(f"*** Evaluating with repetition_penalty: {repetition_penalty}")
+        predictions = eval_model(
+            model,
+            tokenizer,
+            dataset,
+            device=device,
+            repetition_penalty=repetition_penalty,
+        )
+        model_name_with_rp = f"{model_name}/rpp-{repetition_penalty:.2f}"
+        try:
+            on_repetition_penalty_step_completed(
+                model_name_with_rp,
+                predictions,
+            )
+        except Exception as e:
+            print(e)
 def save_model(
     model,
     tokenizer,

llm_toolkit/translation_engine.py DELETED Viewed

@@ -1,130 +0,0 @@
-import os
-import pandas as pd
-import torch
-from unsloth import FastLanguageModel, is_bfloat16_supported
-from trl import SFTTrainer
-from transformers import TrainingArguments, TextStreamer
-from llm_toolkit.translation_utils import *
-from llamafactory.chat import ChatModel
-print(f"loading {__file__}")
-def get_model_names(
-    model_name, save_method="merged_4bit_forced", quantization_method="q5_k_m"
-):
-    hub_model = model_name.split("/")[-1] + "-MAC-"
-    local_model = "models/" + hub_model
-    return {
-        "local": local_model + save_method,
-        "local-gguf": local_model + quantization_method,
-        "hub": hub_model + save_method,
-        "hub-gguf": hub_model + "gguf-" + quantization_method,
-    }
-def load_model(
-    model_name,
-    max_seq_length=2048,
-    dtype=None,
-    load_in_4bit=False,
-    template="chatml",
-    adapter_name_or_path=None,
-):
-    print(f"loading model: {model_name}")
-    if adapter_name_or_path:
-        args = dict(
-            model_name_or_path=model_name,
-            adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
-            template=template,  # same to the one in training
-            finetuning_type="lora",  # same to the one in training
-            quantization_bit=4,  # load 4-bit quantized model
-        )
-        chat_model = ChatModel(args)
-        return chat_model.engine.model, chat_model.engine.tokenizer
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=model_name,  # YOUR MODEL YOU USED FOR TRAINING
-        max_seq_length=max_seq_length,
-        dtype=dtype,
-        load_in_4bit=load_in_4bit,
-        trust_remote_code=True,
-    )
-    FastLanguageModel.for_inference(model)
-    return model, tokenizer
-def test_model(model, tokenizer, prompt):
-    inputs = tokenizer(
-        [prompt],
-        return_tensors="pt",
-    ).to("cuda")
-    text_streamer = TextStreamer(tokenizer)
-    _ = model.generate(
-        **inputs, max_new_tokens=128, streamer=text_streamer, use_cache=True
-    )
-def load_trainer(
-    model,
-    tokenizer,
-    dataset,
-    num_train_epochs,
-    max_seq_length=2048,
-    fp16=False,
-    bf16=False,
-    output_dir="./outputs",
-):
-    model = FastLanguageModel.get_peft_model(
-        model,
-        r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
-        target_modules=[
-            "q_proj",
-            "k_proj",
-            "v_proj",
-            "o_proj",
-            "gate_proj",
-            "up_proj",
-            "down_proj",
-        ],
-        lora_alpha=16,
-        lora_dropout=0,  # Supports any, but = 0 is optimized
-        bias="none",  # Supports any, but = "none" is optimized
-        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
-        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
-        random_state=3407,
-        use_rslora=False,  # We support rank stabilized LoRA
-        loftq_config=None,  # And LoftQ
-    )
-    trainer = SFTTrainer(
-        model=model,
-        tokenizer=tokenizer,
-        train_dataset=dataset,
-        dataset_text_field="text",
-        max_seq_length=max_seq_length,
-        dataset_num_proc=2,
-        packing=False,  # Can make training 5x faster for short sequences.
-        args=TrainingArguments(
-            per_device_train_batch_size=2,
-            gradient_accumulation_steps=4,
-            warmup_steps=5,
-            num_train_epochs=num_train_epochs,
-            learning_rate=2e-4,
-            fp16=not is_bfloat16_supported(),
-            bf16=is_bfloat16_supported(),
-            logging_steps=100,
-            optim="adamw_8bit",
-            weight_decay=0.01,
-            lr_scheduler_type="linear",
-            seed=3407,
-            output_dir=output_dir,
-        ),
-    )
-    return trainer

llm_toolkit/translation_utils.py CHANGED Viewed

@@ -159,14 +159,14 @@ def load_translation_dataset(data_path, tokenizer=None):
     return datasets
-def eval_model(model, tokenizer, eval_dataset):
     total = len(eval_dataset)
     predictions = []
     for i in tqdm(range(total)):
         inputs = tokenizer(
             eval_dataset["prompt"][i : i + 1],
             return_tensors="pt",
-        ).to("cuda")
         outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
         decoded_output = tokenizer.batch_decode(outputs)

     return datasets
+def eval_model(model, tokenizer, eval_dataset, device="cuda"):
     total = len(eval_dataset)
     predictions = []
     for i in tqdm(range(total)):
         inputs = tokenizer(
             eval_dataset["prompt"][i : i + 1],
             return_tensors="pt",
+        ).to(device)
         outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
         decoded_output = tokenizer.batch_decode(outputs)

llm_toolkit/tune.py DELETED Viewed

@@ -1,143 +0,0 @@
-import os
-import sys
-import torch
-from dotenv import find_dotenv, load_dotenv
-found_dotenv = find_dotenv(".env")
-if len(found_dotenv) == 0:
-    found_dotenv = find_dotenv(".env.example")
-print(f"loading env vars from: {found_dotenv}")
-load_dotenv(found_dotenv, override=False)
-path = os.path.dirname(found_dotenv)
-print(f"Adding {path} to sys.path")
-sys.path.append(path)
-from llm_toolkit.translation_engine import *
-from llm_toolkit.translation_utils import *
-model_name = os.getenv("MODEL_NAME")
-load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
-eval_base_model = os.getenv("EVAL_BASE_MODEL") == "true"
-eval_fine_tuned = os.getenv("EVAL_FINE_TUNED") == "true"
-save_fine_tuned_model = os.getenv("SAVE_FINE_TUNED") == "true"
-num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS") or 0)
-data_path = os.getenv("DATA_PATH")
-results_path = os.getenv("RESULTS_PATH")
-max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
-dtype = (
-    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-)
-print(
-    model_name,
-    load_in_4bit,
-    max_seq_length,
-    num_train_epochs,
-    dtype,
-    data_path,
-    results_path,
-    eval_base_model,
-    eval_fine_tuned,
-    save_fine_tuned_model,
-)
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
-model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
-datasets = load_translation_dataset(data_path, tokenizer)
-if eval_base_model:
-    print("Evaluating base model: " + model_name)
-    predictions = eval_model(model, tokenizer, datasets["test"])
-    # calc_metrics(datasets["test"]["english"], predictions, debug=True)
-    save_results(
-        model_name,
-        results_path,
-        datasets["test"],
-        predictions,
-        debug=True,
-    )
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
-def is_bfloat16_supported():
-    return True
-trainer = load_trainer(
-    model,
-    tokenizer,
-    datasets["train"],
-    num_train_epochs,
-    fp16=not is_bfloat16_supported(),
-    bf16=is_bfloat16_supported(),
-)
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(4) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
-trainer_stats = trainer.train()
-# @title Show final memory and time stats
-used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
-used_percentage = round(used_memory / max_memory * 100, 3)
-lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
-print(f"(5) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
-print(
-    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
-)
-print(f"Peak reserved memory = {used_memory} GB.")
-print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
-print(f"Peak reserved memory % of max memory = {used_percentage} %.")
-print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
-if eval_fine_tuned:
-    print("Evaluating fine-tuned model: " + model_name)
-    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
-    predictions = eval_model(model, tokenizer, datasets["test"])
-    # calc_metrics(datasets["test"]["english"], predictions, debug=True)
-    save_results(
-        model_name + "(finetuned)",
-        results_path,
-        datasets["test"],
-        predictions,
-        debug=True,
-    )
-gpu_stats = torch.cuda.get_device_properties(0)
-start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
-max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-print(f"(6) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
-print(f"{start_gpu_memory} GB of memory reserved.")
-if save_fine_tuned_model:
-    save_model(model, tokenizer)

notebooks/00_Data_Analysis.ipynb DELETED Viewed