Spaces:

inflaton-ai
/

logical-reasoning

Build error

App Files Files Community

dh-mc committed on Jul 5, 2024

Commit

36cb2cb

1 Parent(s): 1323386

llama-factory finetuning on Google Colab

Browse files

Files changed (17) hide show

.gitignore +1 -0
competition/03_EDA_en.ipynb +0 -0
config/qwen2_0.5b_lora_sft.yaml +0 -39
config/qwen2_1.5b_lora_sft.yaml +0 -39
config/qwen2_7b_lora_sft.yaml +0 -39
data/alpaca_mac.json +0 -0
data/dataset_info.json +0 -568
datasets/mac/mac-test.tsv +0 -0
llama-factory/config/llama3_8b_lora_sft.yaml +2 -2
llama-factory/config/qwen2_7b_lora_sft.yaml +13 -6
llm_toolkit/eval_mac.py +44 -1
llm_toolkit/translation_engine.py +1 -156
llm_toolkit/translation_utils.py +156 -1
notebooks/00_fine-tune-with-colab.ipynb +0 -0
requirements.txt +3 -1
results/mac-results_lf.csv +2 -2
scripts/tune-lf.sh +1 -0

.gitignore CHANGED Viewed

@@ -148,3 +148,4 @@ dmypy.json
 /models
 /llama.cpp
 /huggingface_tokenizers_cache

 /models
 /llama.cpp
 /huggingface_tokenizers_cache
+/llama-factory/huggingface_tokenizers_cache

competition/03_EDA_en.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

config/qwen2_0.5b_lora_sft.yaml DELETED Viewed

@@ -1,39 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen2-0.5B-Instruct
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-### dataset
-dataset: alpaca_mac
-template: chatml
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-### output
-output_dir: saves/qwen2-0.5b/lora/sft
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 10.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560

config/qwen2_1.5b_lora_sft.yaml DELETED Viewed

@@ -1,39 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen2-1.5B-Instruct
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-### dataset
-dataset: alpaca_mac
-template: chatml
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-### output
-output_dir: saves/qwen2-1.5b/lora/sft
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 10.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560

config/qwen2_7b_lora_sft.yaml DELETED Viewed

@@ -1,39 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen2-7B-Instruct
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-### dataset
-dataset: alpaca_mac
-template: chatml
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-### output
-output_dir: saves/qwen2-7b/lora/sft
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 10.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560

data/alpaca_mac.json DELETED Viewed

The diff for this file is too large to render. See raw diff

data/dataset_info.json DELETED Viewed

@@ -1,568 +0,0 @@
-{
-  "alpaca_mac": {
-    "file_name": "alpaca_mac.json"
-  },
-  "identity": {
-    "file_name": "identity.json"
-  },
-  "alpaca_en_demo": {
-    "file_name": "alpaca_en_demo.json"
-  },
-  "alpaca_zh_demo": {
-    "file_name": "alpaca_zh_demo.json"
-  },
-  "glaive_toolcall_en_demo": {
-    "file_name": "glaive_toolcall_en_demo.json",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversations",
-      "tools": "tools"
-    }
-  },
-  "glaive_toolcall_zh_demo": {
-    "file_name": "glaive_toolcall_zh_demo.json",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversations",
-      "tools": "tools"
-    }
-  },
-  "mllm_demo": {
-    "file_name": "mllm_demo.json",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "messages",
-      "images": "images"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "user",
-      "assistant_tag": "assistant"
-    }
-  },
-  "alpaca_en": {
-    "hf_hub_url": "llamafactory/alpaca_en",
-    "ms_hub_url": "llamafactory/alpaca_en"
-  },
-  "alpaca_zh": {
-    "hf_hub_url": "llamafactory/alpaca_zh",
-    "ms_hub_url": "llamafactory/alpaca_zh"
-  },
-  "alpaca_gpt4_en": {
-    "hf_hub_url": "llamafactory/alpaca_gpt4_en",
-    "ms_hub_url": "llamafactory/alpaca_gpt4_en"
-  },
-  "alpaca_gpt4_zh": {
-    "hf_hub_url": "llamafactory/alpaca_gpt4_zh",
-    "ms_hub_url": "llamafactory/alpaca_gpt4_zh"
-  },
-  "glaive_toolcall_en": {
-    "hf_hub_url": "llamafactory/glaive_toolcall_en",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversations",
-      "tools": "tools"
-    }
-  },
-  "glaive_toolcall_zh": {
-    "hf_hub_url": "llamafactory/glaive_toolcall_zh",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversations",
-      "tools": "tools"
-    }
-  },
-  "lima": {
-    "hf_hub_url": "llamafactory/lima",
-    "formatting": "sharegpt"
-  },
-  "guanaco": {
-    "hf_hub_url": "JosephusCheung/GuanacoDataset",
-    "ms_hub_url": "AI-ModelScope/GuanacoDataset"
-  },
-  "belle_2m": {
-    "hf_hub_url": "BelleGroup/train_2M_CN",
-    "ms_hub_url": "AI-ModelScope/train_2M_CN"
-  },
-  "belle_1m": {
-    "hf_hub_url": "BelleGroup/train_1M_CN",
-    "ms_hub_url": "AI-ModelScope/train_1M_CN"
-  },
-  "belle_0.5m": {
-    "hf_hub_url": "BelleGroup/train_0.5M_CN",
-    "ms_hub_url": "AI-ModelScope/train_0.5M_CN"
-  },
-  "belle_dialog": {
-    "hf_hub_url": "BelleGroup/generated_chat_0.4M",
-    "ms_hub_url": "AI-ModelScope/generated_chat_0.4M"
-  },
-  "belle_math": {
-    "hf_hub_url": "BelleGroup/school_math_0.25M",
-    "ms_hub_url": "AI-ModelScope/school_math_0.25M"
-  },
-  "belle_multiturn": {
-    "script_url": "belle_multiturn",
-    "formatting": "sharegpt"
-  },
-  "ultra_chat": {
-    "script_url": "ultra_chat",
-    "formatting": "sharegpt"
-  },
-  "open_platypus": {
-    "hf_hub_url": "garage-bAInd/Open-Platypus",
-    "ms_hub_url": "AI-ModelScope/Open-Platypus"
-  },
-  "codealpaca": {
-    "hf_hub_url": "sahil2801/CodeAlpaca-20k",
-    "ms_hub_url": "AI-ModelScope/CodeAlpaca-20k"
-  },
-  "alpaca_cot": {
-    "hf_hub_url": "QingyiSi/Alpaca-CoT",
-    "ms_hub_url": "AI-ModelScope/Alpaca-CoT"
-  },
-  "openorca": {
-    "hf_hub_url": "Open-Orca/OpenOrca",
-    "ms_hub_url": "AI-ModelScope/OpenOrca",
-    "columns": {
-      "prompt": "question",
-      "response": "response",
-      "system": "system_prompt"
-    }
-  },
-  "slimorca": {
-    "hf_hub_url": "Open-Orca/SlimOrca",
-    "formatting": "sharegpt"
-  },
-  "mathinstruct": {
-    "hf_hub_url": "TIGER-Lab/MathInstruct",
-    "ms_hub_url": "AI-ModelScope/MathInstruct",
-    "columns": {
-      "prompt": "instruction",
-      "response": "output"
-    }
-  },
-  "firefly": {
-    "hf_hub_url": "YeungNLP/firefly-train-1.1M",
-    "columns": {
-      "prompt": "input",
-      "response": "target"
-    }
-  },
-  "wikiqa": {
-    "hf_hub_url": "wiki_qa",
-    "columns": {
-      "prompt": "question",
-      "response": "answer"
-    }
-  },
-  "webqa": {
-    "hf_hub_url": "suolyer/webqa",
-    "ms_hub_url": "AI-ModelScope/webqa",
-    "columns": {
-      "prompt": "input",
-      "response": "output"
-    }
-  },
-  "webnovel": {
-    "hf_hub_url": "zxbsmk/webnovel_cn",
-    "ms_hub_url": "AI-ModelScope/webnovel_cn"
-  },
-  "nectar_sft": {
-    "hf_hub_url": "AstraMindAI/SFT-Nectar",
-    "ms_hub_url": "AI-ModelScope/SFT-Nectar"
-  },
-  "deepctrl": {
-    "ms_hub_url": "deepctrl/deepctrl-sft-data"
-  },
-  "adgen": {
-    "hf_hub_url": "HasturOfficial/adgen",
-    "ms_hub_url": "AI-ModelScope/adgen",
-    "columns": {
-      "prompt": "content",
-      "response": "summary"
-    }
-  },
-  "sharegpt_hyper": {
-    "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k",
-    "formatting": "sharegpt"
-  },
-  "sharegpt4": {
-    "hf_hub_url": "shibing624/sharegpt_gpt4",
-    "ms_hub_url": "AI-ModelScope/sharegpt_gpt4",
-    "formatting": "sharegpt"
-  },
-  "ultrachat_200k": {
-    "hf_hub_url": "HuggingFaceH4/ultrachat_200k",
-    "ms_hub_url": "AI-ModelScope/ultrachat_200k",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "messages"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "user",
-      "assistant_tag": "assistant"
-    }
-  },
-  "agent_instruct": {
-    "hf_hub_url": "THUDM/AgentInstruct",
-    "ms_hub_url": "ZhipuAI/AgentInstruct",
-    "formatting": "sharegpt"
-  },
-  "lmsys_chat": {
-    "hf_hub_url": "lmsys/lmsys-chat-1m",
-    "ms_hub_url": "AI-ModelScope/lmsys-chat-1m",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversation"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "human",
-      "assistant_tag": "assistant"
-    }
-  },
-  "evol_instruct": {
-    "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
-    "ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
-    "formatting": "sharegpt"
-  },
-  "glaive_toolcall_100k": {
-    "hf_hub_url": "hiyouga/glaive-function-calling-v2-sharegpt",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversations",
-      "tools": "tools"
-    }
-  },
-  "cosmopedia": {
-    "hf_hub_url": "HuggingFaceTB/cosmopedia",
-    "columns": {
-      "prompt": "prompt",
-      "response": "text"
-    }
-  },
-  "stem_zh": {
-    "hf_hub_url": "hfl/stem_zh_instruction"
-  },
-  "ruozhiba_gpt4": {
-    "hf_hub_url": "hfl/ruozhiba_gpt4_turbo"
-  },
-  "neo_sft": {
-    "hf_hub_url": "m-a-p/neo_sft_phase2",
-    "formatting": "sharegpt"
-  },
-  "magpie_pro_300k": {
-    "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered",
-    "formatting": "sharegpt"
-  },
-  "web_instruct": {
-    "hf_hub_url": "TIGER-Lab/WebInstructSub",
-    "columns": {
-      "prompt": "question",
-      "response": "answer"
-    }
-  },
-  "llava_1k_en": {
-    "hf_hub_url": "BUAADreamer/llava-en-zh-2k",
-    "subset": "en",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "messages",
-      "images": "images"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "user",
-      "assistant_tag": "assistant"
-    }
-  },
-  "llava_1k_zh": {
-    "hf_hub_url": "BUAADreamer/llava-en-zh-2k",
-    "subset": "zh",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "messages",
-      "images": "images"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "user",
-      "assistant_tag": "assistant"
-    }
-  },
-  "llava_150k_en": {
-    "hf_hub_url": "BUAADreamer/llava-en-zh-300k",
-    "subset": "en",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "messages",
-      "images": "images"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "user",
-      "assistant_tag": "assistant"
-    }
-  },
-  "llava_150k_zh": {
-    "hf_hub_url": "BUAADreamer/llava-en-zh-300k",
-    "subset": "zh",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "messages",
-      "images": "images"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "user",
-      "assistant_tag": "assistant"
-    }
-  },
-  "mllm_pt_demo": {
-    "hf_hub_url": "BUAADreamer/mllm_pt_demo",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "messages",
-      "images": "images"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "user",
-      "assistant_tag": "assistant"
-    }
-  },
-  "oasst_de": {
-    "hf_hub_url": "mayflowergmbh/oasst_de"
-  },
-  "dolly_15k_de": {
-    "hf_hub_url": "mayflowergmbh/dolly-15k_de"
-  },
-  "alpaca-gpt4_de": {
-    "hf_hub_url": "mayflowergmbh/alpaca-gpt4_de"
-  },
-  "openschnabeltier_de": {
-    "hf_hub_url": "mayflowergmbh/openschnabeltier_de"
-  },
-  "evol_instruct_de": {
-    "hf_hub_url": "mayflowergmbh/evol-instruct_de"
-  },
-  "dolphin_de": {
-    "hf_hub_url": "mayflowergmbh/dolphin_de"
-  },
-  "booksum_de": {
-    "hf_hub_url": "mayflowergmbh/booksum_de"
-  },
-  "airoboros_de": {
-    "hf_hub_url": "mayflowergmbh/airoboros-3.0_de"
-  },
-  "ultrachat_de": {
-    "hf_hub_url": "mayflowergmbh/ultra-chat_de"
-  },
-  "dpo_en_demo": {
-    "file_name": "dpo_en_demo.json",
-    "ranking": true,
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversations",
-      "chosen": "chosen",
-      "rejected": "rejected"
-    }
-  },
-  "dpo_zh_demo": {
-    "file_name": "dpo_zh_demo.json",
-    "ranking": true,
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversations",
-      "chosen": "chosen",
-      "rejected": "rejected"
-    }
-  },
-  "dpo_mix_en": {
-    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
-    "subset": "en",
-    "ranking": true,
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversations",
-      "chosen": "chosen",
-      "rejected": "rejected"
-    }
-  },
-  "dpo_mix_zh": {
-    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
-    "subset": "zh",
-    "ranking": true,
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "conversations",
-      "chosen": "chosen",
-      "rejected": "rejected"
-    }
-  },
-  "ultrafeedback": {
-    "hf_hub_url": "llamafactory/ultrafeedback_binarized",
-    "ms_hub_url": "llamafactory/ultrafeedback_binarized",
-    "ranking": true,
-    "columns": {
-      "prompt": "instruction",
-      "chosen": "chosen",
-      "rejected": "rejected"
-    }
-  },
-  "orca_pairs": {
-    "hf_hub_url": "Intel/orca_dpo_pairs",
-    "ranking": true,
-    "columns": {
-      "prompt": "question",
-      "chosen": "chosen",
-      "rejected": "rejected",
-      "system": "system"
-    }
-  },
-  "hh_rlhf_en": {
-    "script_url": "hh_rlhf_en",
-    "ranking": true,
-    "columns": {
-      "prompt": "instruction",
-      "chosen": "chosen",
-      "rejected": "rejected",
-      "history": "history"
-    }
-  },
-  "nectar_rm": {
-    "hf_hub_url": "AstraMindAI/RLAIF-Nectar",
-    "ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
-    "ranking": true
-  },
-  "orca_dpo_de": {
-    "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
-    "ranking": true
-  },
-  "kto_en_demo": {
-    "file_name": "kto_en_demo.json",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "messages",
-      "kto_tag": "label"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "user",
-      "assistant_tag": "assistant"
-    }
-  },
-  "kto_mix_en": {
-    "hf_hub_url": "argilla/kto-mix-15k",
-    "formatting": "sharegpt",
-    "columns": {
-      "messages": "completion",
-      "kto_tag": "label"
-    },
-    "tags": {
-      "role_tag": "role",
-      "content_tag": "content",
-      "user_tag": "user",
-      "assistant_tag": "assistant"
-    }
-  },
-  "ultrafeedback_kto": {
-    "hf_hub_url": "argilla/ultrafeedback-binarized-preferences-cleaned-kto",
-    "ms_hub_url": "AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto",
-    "columns": {
-      "prompt": "prompt",
-      "response": "completion",
-      "kto_tag": "label"
-    }
-  },
-  "wiki_demo": {
-    "file_name": "wiki_demo.txt",
-    "columns": {
-      "prompt": "text"
-    }
-  },
-  "c4_demo": {
-    "file_name": "c4_demo.json",
-    "columns": {
-      "prompt": "text"
-    }
-  },
-  "refinedweb": {
-    "hf_hub_url": "tiiuae/falcon-refinedweb",
-    "columns": {
-      "prompt": "content"
-    }
-  },
-  "redpajama_v2": {
-    "hf_hub_url": "togethercomputer/RedPajama-Data-V2",
-    "columns": {
-      "prompt": "raw_content"
-    },
-    "subset": "default"
-  },
-  "wikipedia_en": {
-    "hf_hub_url": "olm/olm-wikipedia-20221220",
-    "ms_hub_url": "AI-ModelScope/olm-wikipedia-20221220",
-    "columns": {
-      "prompt": "text"
-    }
-  },
-  "wikipedia_zh": {
-    "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
-    "ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered",
-    "columns": {
-      "prompt": "completion"
-    }
-  },
-  "pile": {
-    "hf_hub_url": "monology/pile-uncopyrighted",
-    "ms_hub_url": "AI-ModelScope/pile",
-    "columns": {
-      "prompt": "text"
-    }
-  },
-  "skypile": {
-    "hf_hub_url": "Skywork/SkyPile-150B",
-    "ms_hub_url": "AI-ModelScope/SkyPile-150B",
-    "columns": {
-      "prompt": "text"
-    }
-  },
-  "fineweb": {
-    "hf_hub_url": "HuggingFaceFW/fineweb",
-    "columns": {
-      "prompt": "text"
-    }
-  },
-  "fineweb_edu": {
-    "hf_hub_url": "HuggingFaceFW/fineweb-edu",
-    "columns": {
-      "prompt": "text"
-    }
-  },
-  "the_stack": {
-    "hf_hub_url": "bigcode/the-stack",
-    "ms_hub_url": "AI-ModelScope/the-stack",
-    "columns": {
-      "prompt": "content"
-    }
-  },
-  "starcoder_python": {
-    "hf_hub_url": "bigcode/starcoderdata",
-    "ms_hub_url": "AI-ModelScope/starcoderdata",
-    "columns": {
-      "prompt": "content"
-    },
-    "folder": "python"
-  }
-}

datasets/mac/mac-test.tsv CHANGED Viewed

The diff for this file is too large to render. See raw diff

llama-factory/config/llama3_8b_lora_sft.yaml CHANGED Viewed

@@ -8,7 +8,7 @@ finetuning_type: lora
 lora_target: all
 quantization_bit: 4                     # use 4-bit QLoRA
 loraplus_lr_ratio: 16.0                 # use LoRA+ with lambda=16.0
-# use_unsloth: true                       # use UnslothAI's LoRA optimization for 2x faster training
 ### dataset
 dataset: alpaca_mac
@@ -20,7 +20,7 @@ preprocessing_num_workers: 16
 ### output
 # output_dir: saves/llama3-8b/lora/sft
-output_dir: /Workspace/Users/donghao.huang@mastercard.com/lf-saves/llama3-8b/lora/sft/
 logging_steps: 10
 save_steps: 100
 plot_loss: true

 lora_target: all
 quantization_bit: 4                     # use 4-bit QLoRA
 loraplus_lr_ratio: 16.0                 # use LoRA+ with lambda=16.0
+use_unsloth: true                       # use UnslothAI's LoRA optimization for 2x faster training
 ### dataset
 dataset: alpaca_mac
 ### output
 # output_dir: saves/llama3-8b/lora/sft
+output_dir: /content/llama3-8b/
 logging_steps: 10
 save_steps: 100
 plot_loss: true

llama-factory/config/qwen2_7b_lora_sft.yaml CHANGED Viewed

@@ -6,34 +6,41 @@ stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: all
 ### dataset
 dataset: alpaca_mac
 template: chatml
 cutoff_len: 1024
-max_samples: 4528
 overwrite_cache: true
 preprocessing_num_workers: 16
 ### output
-output_dir: saves/qwen2-7b/lora/sft
 logging_steps: 10
-save_steps: 560
 plot_loss: true
 overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 1.0e-4
-num_train_epochs: 10.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.01
 per_device_eval_batch_size: 1
 eval_strategy: steps
-eval_steps: 560

 do_train: true
 finetuning_type: lora
 lora_target: all
+quantization_bit: 4                     # use 4-bit QLoRA
+loraplus_lr_ratio: 16.0                 # use LoRA+ with lambda=16.0
+# use_unsloth: true                       # use UnslothAI's LoRA optimization for 2x faster training
 ### dataset
 dataset: alpaca_mac
 template: chatml
 cutoff_len: 1024
+max_samples: 50
 overwrite_cache: true
 preprocessing_num_workers: 16
 ### output
+output_dir: /content/qwen2-7b/
 logging_steps: 10
+save_steps: 10
 plot_loss: true
 overwrite_output_dir: true
+# resume_from_checkpoint: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 1.0e-4
+num_train_epochs: 6.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
 ### eval
+val_size: 0.02
 per_device_eval_batch_size: 1
 eval_strategy: steps
+eval_steps: 10
+report_to: wandb
+run_name: qwen2_7b_mac_colab # optional

llm_toolkit/eval_mac.py CHANGED Viewed

@@ -2,6 +2,8 @@ import os
 import sys
 import torch
 from dotenv import find_dotenv, load_dotenv
 found_dotenv = find_dotenv(".env")
@@ -14,7 +16,6 @@ path = os.path.dirname(found_dotenv)
 print(f"Adding {path} to sys.path")
 sys.path.append(path)
-from llm_toolkit.translation_engine import *
 from llm_toolkit.translation_utils import *
 model_name = os.getenv("MODEL_NAME")
@@ -25,6 +26,48 @@ results_path = os.getenv("RESULTS_PATH")
 print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
 gpu_stats = torch.cuda.get_device_properties(0)
 start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
 max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

 import sys
 import torch
 from dotenv import find_dotenv, load_dotenv
+from llamafactory.chat import ChatModel
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 found_dotenv = find_dotenv(".env")
 print(f"Adding {path} to sys.path")
 sys.path.append(path)
 from llm_toolkit.translation_utils import *
 model_name = os.getenv("MODEL_NAME")
 print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
+def load_model(
+    model_name,
+    max_seq_length=2048,
+    dtype=None,
+    load_in_4bit=False,
+    adapter_name_or_path=None,
+):
+    print(f"loading model: {model_name}")
+    if adapter_name_or_path:
+        template = "llama3" if "llama-3" in model_name.lower() else "chatml"
+        args = dict(
+            model_name_or_path=model_name,
+            adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
+            template=template,  # same as the one used in training
+            finetuning_type="lora",  # same as the one used in training
+            quantization_bit=4,  # load 4-bit quantized model
+        )
+        chat_model = ChatModel(args)
+        return chat_model.engine.model, chat_model.engine.tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=False,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        quantization_config=bnb_config,
+        # attn_implementation="flash_attention_2",
+        trust_remote_code=True,
+        device_map="auto",
+    )
+    return model, tokenizer
 gpu_stats = torch.cuda.get_device_properties(0)
 start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
 max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

llm_toolkit/translation_engine.py CHANGED Viewed

@@ -1,11 +1,9 @@
 import os
 import pandas as pd
-from datasets import load_dataset
 import torch
 from unsloth import FastLanguageModel, is_bfloat16_supported
 from trl import SFTTrainer
 from transformers import TrainingArguments, TextStreamer
-from tqdm import tqdm
 from llm_toolkit.translation_utils import *
 from llamafactory.chat import ChatModel
@@ -36,7 +34,7 @@ def load_model(
 ):
     print(f"loading model: {model_name}")
-    if adapter_name_or_path is not None:
         args = dict(
             model_name_or_path=model_name,
             adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
@@ -130,156 +128,3 @@ def load_trainer(
     )
     return trainer
-def load_translation_dataset(data_path, tokenizer=None):
-    train_data_file = data_path.replace(".tsv", "-train.tsv")
-    test_data_file = data_path.replace(".tsv", "-test.tsv")
-    if not os.path.exists(train_data_file):
-        print("generating train/test data files")
-        dataset = load_dataset(
-            "csv", data_files=data_path, delimiter="\t", split="train"
-        )
-        print(len(dataset))
-        dataset = dataset.filter(lambda x: x["chinese"] and x["english"])
-        datasets = dataset.train_test_split(test_size=0.2)
-        print(len(dataset))
-        # Convert to pandas DataFrame
-        train_df = pd.DataFrame(datasets["train"])
-        test_df = pd.DataFrame(datasets["test"])
-        # Save to TSV
-        train_df.to_csv(train_data_file, sep="\t", index=False)
-        test_df.to_csv(test_data_file, sep="\t", index=False)
-    print("loading train/test data files")
-    datasets = load_dataset(
-        "csv",
-        data_files={"train": train_data_file, "test": test_data_file},
-        delimiter="\t",
-    )
-    if tokenizer:
-        translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"
-        def formatting_prompts_func(examples):
-            inputs = examples["chinese"]
-            outputs = examples["english"]
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are an expert in translating Chinese to English.",
-                },
-                None,
-            ]
-            model_name = os.getenv("MODEL_NAME")
-            if "mistral" in model_name.lower():
-                messages = messages[1:]
-            texts = []
-            prompts = []
-            for input, output in zip(inputs, outputs):
-                prompt = translation_prompt.format(input)
-                messages[-1] = {"role": "user", "content": prompt}
-                prompt = tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True
-                )
-                prompts.append(prompt)
-                texts.append(prompt + output + tokenizer.eos_token)
-            return {"text": texts, "prompt": prompts}
-        datasets = datasets.map(
-            formatting_prompts_func,
-            batched=True,
-        )
-    print(datasets)
-    return datasets
-def eval_model(model, tokenizer, eval_dataset):
-    total = len(eval_dataset)
-    predictions = []
-    for i in tqdm(range(total)):
-        inputs = tokenizer(
-            eval_dataset["prompt"][i : i + 1],
-            return_tensors="pt",
-        ).to("cuda")
-        outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
-        decoded_output = tokenizer.batch_decode(outputs)
-        debug = i == 0
-        decoded_output = [
-            extract_answer(output, debug=debug) for output in decoded_output
-        ]
-        predictions.extend(decoded_output)
-    return predictions
-def save_model(
-    model,
-    tokenizer,
-    include_gguf=True,
-    include_merged=True,
-    publish=True,
-):
-    try:
-        token = os.getenv("HF_TOKEN") or None
-        model_name = os.getenv("MODEL_NAME")
-        save_method = "lora"
-        quantization_method = "q5_k_m"
-        model_names = get_model_names(
-            model_name, save_method=save_method, quantization_method=quantization_method
-        )
-        model.save_pretrained(model_names["local"])
-        tokenizer.save_pretrained(model_names["local"])
-        if publish:
-            model.push_to_hub(
-                model_names["hub"],
-                token=token,
-            )
-            tokenizer.push_to_hub(
-                model_names["hub"],
-                token=token,
-            )
-        if include_merged:
-            model.save_pretrained_merged(
-                model_names["local"] + "-merged", tokenizer, save_method=save_method
-            )
-            if publish:
-                model.push_to_hub_merged(
-                    model_names["hub"] + "-merged",
-                    tokenizer,
-                    save_method="lora",
-                    token="",
-                )
-        if include_gguf:
-            model.save_pretrained_gguf(
-                model_names["local-gguf"],
-                tokenizer,
-                quantization_method=quantization_method,
-            )
-            if publish:
-                model.push_to_hub_gguf(
-                    model_names["hub-gguf"],
-                    tokenizer,
-                    quantization_method=quantization_method,
-                    token=token,
-                )
-    except Exception as e:
-        print(e)

 import os
 import pandas as pd
 import torch
 from unsloth import FastLanguageModel, is_bfloat16_supported
 from trl import SFTTrainer
 from transformers import TrainingArguments, TextStreamer
 from llm_toolkit.translation_utils import *
 from llamafactory.chat import ChatModel
 ):
     print(f"loading model: {model_name}")
+    if adapter_name_or_path:
         args = dict(
             model_name_or_path=model_name,
             adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
     )
     return trainer

llm_toolkit/translation_utils.py CHANGED Viewed

@@ -4,10 +4,12 @@ import pandas as pd
 import evaluate
 import seaborn as sns
 import matplotlib.pyplot as plt
 from langchain_openai import ChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
 bleu = evaluate.load("bleu")
 rouge = evaluate.load("rouge")
@@ -85,6 +87,159 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
     df.to_csv(results_path, index=False)
 def get_metrics(df):
     metrics_df = pd.DataFrame(df.columns.T)[2:]
     metrics_df.rename(columns={0: "model"}, inplace=True)

 import evaluate
 import seaborn as sns
 import matplotlib.pyplot as plt
+from datasets import load_dataset
 from langchain_openai import ChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
+from tqdm import tqdm
+print(f"loading {__file__}")
 bleu = evaluate.load("bleu")
 rouge = evaluate.load("rouge")
     df.to_csv(results_path, index=False)
+def load_translation_dataset(data_path, tokenizer=None):
+    """Load the Chinese-English translation data as train/test splits.
+
+    The split file names are derived from ``data_path`` by replacing ``.tsv``
+    with ``-train.tsv`` / ``-test.tsv``. When either split file is missing,
+    ``data_path`` is read as a tab-separated file, rows with an empty
+    ``chinese`` or ``english`` value are dropped, and a new 80/20 split is
+    written to disk before loading.
+
+    Args:
+        data_path: Path to the source TSV file.
+        tokenizer: Optional tokenizer providing ``apply_chat_template`` and
+            ``eos_token``. When given, ``prompt`` and ``text`` columns are
+            added to every split.
+
+    Returns:
+        The value returned by ``load_dataset`` for the ``train`` and ``test``
+        files, after the optional prompt-formatting ``map``.
+    """
+    train_data_file = data_path.replace(".tsv", "-train.tsv")
+    test_data_file = data_path.replace(".tsv", "-test.tsv")
+    # Regenerate when either split is missing; checking only the train file
+    # would let the load below fail on an absent test file.
+    if not (os.path.exists(train_data_file) and os.path.exists(test_data_file)):
+        print("generating train/test data files")
+        dataset = load_dataset(
+            "csv", data_files=data_path, delimiter="\t", split="train"
+        )
+        print(len(dataset))
+        dataset = dataset.filter(lambda x: x["chinese"] and x["english"])
+        datasets = dataset.train_test_split(test_size=0.2)
+        print(len(dataset))
+        # Convert to pandas DataFrame
+        train_df = pd.DataFrame(datasets["train"])
+        test_df = pd.DataFrame(datasets["test"])
+        # Save to TSV
+        train_df.to_csv(train_data_file, sep="\t", index=False)
+        test_df.to_csv(test_data_file, sep="\t", index=False)
+    print("loading train/test data files")
+    datasets = load_dataset(
+        "csv",
+        data_files={"train": train_data_file, "test": test_data_file},
+        delimiter="\t",
+    )
+    if tokenizer:
+        translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"
+        system_message = {
+            "role": "system",
+            "content": "You are an expert in translating Chinese to English.",
+        }
+        # MODEL_NAME may be unset; fall back to "" so the check below cannot
+        # raise AttributeError on None.
+        model_name = os.getenv("MODEL_NAME") or ""
+        # The system message is omitted for Mistral models (presumably their
+        # chat template does not accept a system role - TODO confirm).
+        use_system_message = "mistral" not in model_name.lower()
+        def formatting_prompts_func(examples):
+            # Build the chat-formatted prompt and the full training text
+            # (prompt + reference translation + EOS) for each row of a batch.
+            texts = []
+            prompts = []
+            for source_text, target_text in zip(
+                examples["chinese"], examples["english"]
+            ):
+                user_message = {
+                    "role": "user",
+                    "content": translation_prompt.format(source_text),
+                }
+                if use_system_message:
+                    messages = [system_message, user_message]
+                else:
+                    messages = [user_message]
+                prompt = tokenizer.apply_chat_template(
+                    messages, tokenize=False, add_generation_prompt=True
+                )
+                prompts.append(prompt)
+                texts.append(prompt + target_text + tokenizer.eos_token)
+            return {"text": texts, "prompt": prompts}
+        datasets = datasets.map(
+            formatting_prompts_func,
+            batched=True,
+        )
+    print(datasets)
+    return datasets
+def eval_model(model, tokenizer, eval_dataset):
+    """Generate one prediction per prompt in ``eval_dataset``.
+
+    Each entry of the ``prompt`` column is tokenized on its own, moved to
+    CUDA, passed to ``model.generate`` and decoded; the decoded text is then
+    reduced with ``extract_answer``.
+
+    Args:
+        model: Model exposing ``generate``.
+        tokenizer: Tokenizer used for encoding prompts and ``batch_decode``.
+        eval_dataset: Dataset with a ``prompt`` column.
+
+    Returns:
+        A list with the extracted answer for every prompt, in dataset order.
+    """
+    # Read the column once instead of re-reading it on every iteration.
+    prompts = eval_dataset["prompt"]
+    predictions = []
+    for i, prompt in enumerate(tqdm(prompts)):
+        inputs = tokenizer(
+            [prompt],
+            return_tensors="pt",
+        ).to("cuda")
+        outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
+        decoded_output = tokenizer.batch_decode(outputs)
+        # Only the first sample is run through extract_answer in debug mode.
+        debug = i == 0
+        predictions.extend(
+            extract_answer(output, debug=debug) for output in decoded_output
+        )
+    return predictions
+def save_model(
+    model,
+    tokenizer,
+    include_gguf=True,
+    include_merged=True,
+    publish=True,
+):
+    """Save the fine-tuned model locally and optionally push it to the Hub.
+
+    Target names come from ``get_model_names`` using the ``MODEL_NAME``
+    environment variable; the Hub token is read from ``HF_TOKEN``. Saving is
+    best-effort: any exception is printed and not re-raised.
+
+    Args:
+        model: Model to save; must provide the ``save_pretrained*`` and
+            ``push_to_hub*`` methods used below.
+        tokenizer: Tokenizer saved and pushed alongside the model.
+        include_gguf: Also save (and, if publishing, push) a GGUF export.
+        include_merged: Also save (and, if publishing, push) the merged model.
+        publish: Push every saved artifact to the Hub.
+    """
+    try:
+        token = os.getenv("HF_TOKEN") or None
+        model_name = os.getenv("MODEL_NAME")
+        save_method = "lora"
+        quantization_method = "q5_k_m"
+        model_names = get_model_names(
+            model_name, save_method=save_method, quantization_method=quantization_method
+        )
+        model.save_pretrained(model_names["local"])
+        tokenizer.save_pretrained(model_names["local"])
+        if publish:
+            model.push_to_hub(
+                model_names["hub"],
+                token=token,
+            )
+            tokenizer.push_to_hub(
+                model_names["hub"],
+                token=token,
+            )
+        if include_merged:
+            model.save_pretrained_merged(
+                model_names["local"] + "-merged", tokenizer, save_method=save_method
+            )
+            if publish:
+                # Use the same token and save method as every other call here;
+                # this call previously passed an empty token string.
+                model.push_to_hub_merged(
+                    model_names["hub"] + "-merged",
+                    tokenizer,
+                    save_method=save_method,
+                    token=token,
+                )
+        if include_gguf:
+            model.save_pretrained_gguf(
+                model_names["local-gguf"],
+                tokenizer,
+                quantization_method=quantization_method,
+            )
+            if publish:
+                model.push_to_hub_gguf(
+                    model_names["hub-gguf"],
+                    tokenizer,
+                    quantization_method=quantization_method,
+                    token=token,
+                )
+    except Exception as e:
+        # Deliberate best-effort: report the failure without aborting the run.
+        print(e)
 def get_metrics(df):
     metrics_df = pd.DataFrame(df.columns.T)[2:]
     metrics_df.rename(columns={0: "model"}, inplace=True)

notebooks/00_fine-tune-with-colab.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-huggingface_hub==0.23.0
 nltk==3.8.1
 python-dotenv==1.0.1
 black==24.4.0
@@ -10,5 +10,7 @@ scikit-learn==1.5.0
 jupyter
 ipywidgets
 packaging
 # triton
 # xformers

+huggingface_hub==0.23.2
 nltk==3.8.1
 python-dotenv==1.0.1
 black==24.4.0
 jupyter
 ipywidgets
 packaging
+langchain_openai==0.1.13
+wandb==0.17.4
 # triton
 # xformers

results/mac-results_lf.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c5acc087808de5df6839cbf7b170094c6e63445aab4bea15e4be9564b905eb51
-size 3236072

 version https://git-lfs.github.com/spec/v1
+oid sha256:20e21280e557b2e3292a686267318a757c1ed8f370da290df4f1825c98c51152
+size 11580

scripts/tune-lf.sh CHANGED Viewed

@@ -5,4 +5,5 @@ cd $BASEDIR/../llama-factory
 echo Current Directory:
 pwd
 llamafactory-cli train $1

 echo Current Directory:
 pwd
+YAML=$1 python -c 'import os, json, sys, yaml; filename=os.getenv("YAML"); y=yaml.safe_load(open(filename)) ; print(f"{filename}:\n", json.dumps(y, indent=2))'
 llamafactory-cli train $1