dh-mc committed on
Commit
54b1b8a
1 Parent(s): 3860729
.gitignore CHANGED
@@ -1,3 +1,4 @@
+*.run
 *.out
 *.log
 */outputs/

README.md CHANGED
@@ -10,4 +10,8 @@ pinned: false
 license: mit
 ---
 
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+
+```
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+```

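For context on the chatbot the README describes, a minimal sketch of a Gradio app backed by the Hugging Face Inference API might look like the following. The model id and generation settings are illustrative assumptions, not taken from this repository's code.

```python
# Minimal sketch only: the model id, max_tokens, and message handling below are
# assumptions for illustration, not this Space's actual app.py.
import gradio as gr
from huggingface_hub import InferenceClient

client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")  # hypothetical model id


def respond(message, history):
    # Rebuild the conversation in the OpenAI-style format chat_completion expects.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    result = client.chat_completion(messages, max_tokens=512)
    return result.choices[0].message.content


gr.ChatInterface(respond).launch()
```
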
llama-factory/config/llama3_8b_lora_sft.yaml DELETED
@@ -1,46 +0,0 @@
-### model
-model_name_or_path: gradientai/Llama-3-8B-Instruct-Gradient-1048k
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-quantization_bit: 4 # use 4-bit QLoRA
-loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
-# use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
-
-### dataset
-dataset: alpaca_mac
-template: llama3
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-# output_dir: saves/llama3-8b/lora/sft
-output_dir: /Workspace/Users/donghao.huang@mastercard.com/lf-saves/llama3-8b/lora/sft/
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-# resume_from_checkpoint: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 6.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560
-
-report_to: none

llama-factory/config/qwen2_0.5b_lora_sft.yaml DELETED
@@ -1,42 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen2-0.5B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: alpaca_mac
-template: chatml
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/qwen2-0.5b/lora/sft
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 6.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560
-
-report_to: wandb
-run_name: qwen2_0.5b_lora_sft # optional

llama-factory/config/qwen2_0.5b_lora_sft_unsloth.yaml DELETED
@@ -1,45 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen2-0.5B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-quantization_bit: 4 # use 4-bit QLoRA
-loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
-use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
-
-### dataset
-dataset: alpaca_mac
-template: chatml
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/qwen2-0.5b/lora/sft
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 6.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560
-
-report_to: wandb
-run_name: qwen2_0.5b_lora_sft # optional

llama-factory/config/qwen2_1.5b_lora_sft.yaml DELETED
@@ -1,42 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen2-1.5B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: alpaca_mac
-template: chatml
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/qwen2-1.5b/lora/sft
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 6.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560
-
-report_to: wandb
-run_name: qwen2_1.5b_lora_sft # optional

llama-factory/config/qwen2_1.5b_lora_sft_unsloth.yaml DELETED
@@ -1,45 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen2-1.5B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-quantization_bit: 4 # use 4-bit QLoRA
-loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
-use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
-
-### dataset
-dataset: alpaca_mac
-template: chatml
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/qwen2-1.5b/lora/sft
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 6.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560
-
-report_to: wandb
-run_name: qwen2_1.5b_lora_sft # optional

llama-factory/config/qwen2_7b_lora_sft.yaml DELETED
@@ -1,45 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen2-7B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-quantization_bit: 4 # use 4-bit QLoRA
-loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
-# use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
-
-### dataset
-dataset: alpaca_mac
-template: chatml
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/qwen2-7b/lora/sft
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 6.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560
-
-report_to: wandb
-run_name: qwen2_7b_lora_sft # optional

llama-factory/config/qwen2_7b_lora_sft_unsloth.yaml DELETED
@@ -1,45 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen2-7B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-quantization_bit: 4 # use 4-bit QLoRA
-loraplus_lr_ratio: 16.0 # use LoRA+ with lambda=16.0
-use_unsloth: true # use UnslothAI's LoRA optimization for 2x faster training
-
-### dataset
-dataset: alpaca_mac
-template: chatml
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/qwen2-7b/lora/sft
-logging_steps: 10
-save_steps: 560
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 6.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560
-
-report_to: wandb
-run_name: qwen2_7b_lora_sft # optional

llama-factory/data/alpaca_mac.json DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6f03e62eb461c2204bbaef55f2de28ec115b1a5834b81f03b10f157551d5fe9f
-size 2240344
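The deleted alpaca_mac.json lives in Git LFS, so its records are not visible in this diff. For orientation only, the Alpaca-style datasets that LLaMA-Factory consumes (referenced as `dataset: alpaca_mac` in the configs above) conventionally use records shaped like the sketch below; the field values are placeholders, not actual data from this file.

```python
# Hypothetical record shape for an Alpaca-style dataset; the real contents of
# alpaca_mac.json are stored in Git LFS and are not shown in this commit.
example_record = {
    "instruction": "A task description for the model.",
    "input": "Optional additional context for the task.",
    "output": "The reference response the model should learn to produce.",
}
```
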
llama-factory/data/dataset_info.json DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:84bce610296ed7e729647e85d25576b6226d20ddf0bca4982fb1deb02de35911
-size 13560

llama-factory/inference/qwen2_1.5b_lora_sft.yaml DELETED
@@ -1,4 +0,0 @@
-model_name_or_path: Qwen/Qwen2-1.5B-Instruct
-adapter_name_or_path: saves/qwen2-1.5b/lora/sft/checkpoint-1680
-template: chatml
-finetuning_type: lora

llm_toolkit/llm_utils.py CHANGED
@@ -1,22 +1,39 @@
 import os
 import re
-import sys
 import torch
-from llamafactory.chat import ChatModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    TextStreamer,
+)
+from tqdm import tqdm
+
+
+def get_template(model_name):
+    model_name = model_name.lower()
+    if "llama" in model_name:
+        return "llama3"
+    if "internlm" in model_name:
+        return "intern2"
+    if "glm" in model_name:
+        return "glm4"
+    return "chatml"
 
 
 def load_model(
     model_name,
-    max_seq_length=2048,
     dtype=torch.bfloat16,
     load_in_4bit=False,
     adapter_name_or_path=None,
+    using_llama_factory=False,
 ):
-    print(f"loading model: {model_name}")
+    print(f"loading model: {model_name} with adapter: {adapter_name_or_path}")
 
-    if adapter_name_or_path:
-        template = "llama3" if "llama-3" in model_name.lower() else "chatml"
+    if using_llama_factory:
+        from llamafactory.chat import ChatModel
+
+        template = get_template(model_name)
 
     args = dict(
         model_name_or_path=model_name,
@@ -26,6 +43,10 @@ def load_model(
         quantization_bit=4 if load_in_4bit else None, # load 4-bit quantized model
     )
     chat_model = ChatModel(args)
+    if os.getenv("RESIZE_TOKEN_EMBEDDINGS") == "true":
+        chat_model.engine.model.resize_token_embeddings(
+            len(chat_model.engine.tokenizer), pad_to_multiple_of=32
+        )
     return chat_model.engine.model, chat_model.engine.tokenizer
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -36,26 +57,59 @@ def load_model(
         bnb_4bit_compute_dtype=dtype,
     )
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        torch_dtype=dtype,
-        trust_remote_code=True,
-        device_map="auto",
-    ) if load_in_4bit else AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=dtype,
-        trust_remote_code=True,
-        device_map="auto",
+    model = (
+        AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=bnb_config,
+            torch_dtype=dtype,
+            trust_remote_code=True,
+            device_map="auto",
+        )
+        if load_in_4bit
+        else AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=dtype,
+            trust_remote_code=True,
+            device_map="auto",
+        )
     )
 
+    if adapter_name_or_path:
+        adapter_name = model.load_adapter(adapter_name_or_path)
+        model.active_adapters = adapter_name
+
+    if not tokenizer.pad_token:
+        print("Adding pad token to tokenizer for model: ", model_name)
+        tokenizer.add_special_tokens({"pad_token": "<pad>"})
+        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)
+
     return model, tokenizer
 
-def test_model(model, tokenizer, prompt):
+
+def check_gpu():
+    # torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
+    is_cuda = torch.cuda.is_available()
+
+    # If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
+    if is_cuda:
+        device = torch.device("cuda")
+        print("CUDA is available, we have found ", torch.cuda.device_count(), " GPU(s)")
+        print(torch.cuda.get_device_name(0))
+        print("CUDA version: " + torch.version.cuda)
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+        print("MPS is available")
+    else:
+        device = torch.device("cpu")
+        print("GPU/MPS not available, CPU used")
+    return device
+
+
+def test_model(model, tokenizer, prompt, device="cuda"):
     inputs = tokenizer(
         [prompt],
         return_tensors="pt",
-    ).to("cuda")
+    ).to(device)
 
     text_streamer = TextStreamer(tokenizer)
 
@@ -68,7 +122,10 @@ def extract_answer(text, debug=False):
     if text:
         # Remove the begin and end tokens
         text = re.sub(
-            r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
+            r".*?(assistant|\[/INST\]).+?\b",
+            "",
+            text,
+            flags=re.DOTALL | re.MULTILINE,
         )
         if debug:
             print("--------\nstep 1:", text)
@@ -83,27 +140,63 @@
         if debug:
             print("--------\nstep 3:", text)
 
+    text = text.split("。")[0].strip()
+    if debug:
+        print("--------\nstep 4:", text)
+
+    text = re.sub(
+        r"^Response:.+?\b",
+        "",
+        text,
+        flags=re.DOTALL | re.MULTILINE,
+    )
+    if debug:
+        print("--------\nstep 5:", text)
+
     return text
 
-def eval_model(model, tokenizer, eval_dataset):
+
+def eval_model(
+    model,
+    tokenizer,
+    eval_dataset,
+    device="cuda",
+    max_new_tokens=4096,
+    repetition_penalty=1.0,
+    batch_size=1,
+):
     total = len(eval_dataset)
     predictions = []
-    for i in tqdm(range(total)):
-        inputs = tokenizer(
-            eval_dataset["prompt"][i : i + 1],
-            return_tensors="pt",
-        ).to("cuda")
-
-        outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
-        decoded_output = tokenizer.batch_decode(outputs)
-        debug = i == 0
-        decoded_output = [
-            extract_answer(output, debug=debug) for output in decoded_output
-        ]
-        predictions.extend(decoded_output)
+
+    model.eval()
+
+    with torch.no_grad():
+        for i in tqdm(range(0, total, batch_size)):  # Iterate in batches
+            batch_end = min(i + batch_size, total)  # Ensure not to exceed dataset
+            batch_prompts = eval_dataset["prompt"][i:batch_end]
+            inputs = tokenizer(
+                batch_prompts,
+                return_tensors="pt",
+                padding=True,  # Ensure all inputs in the batch have the same length
+            ).to(device)
+
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                repetition_penalty=repetition_penalty,
+                use_cache=False,
+            )
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            decoded_output = tokenizer.batch_decode(
+                outputs, skip_special_tokens=True
+            )  # Skip special tokens for clean output
+            if i == 0:
+                print("Batch output:", decoded_output)
+            predictions.extend(decoded_output)
 
     return predictions
 
+
 def save_model(
     model,
     tokenizer,
@@ -163,3 +256,10 @@ def save_model(
         )
     except Exception as e:
         print(e)
+
+
+def print_row_details(df, indices=[0]):
+    for index in indices:
+        for col in df.columns:
+            print("-" * 50)
+            print(f"{col}: {df[col].iloc[index]}")

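To show how the refactored helpers fit together, here is a hedged usage sketch. The model id, dataset file, and generation settings are illustrative assumptions and do not come from this commit.

```python
# Illustrative driver for the refactored llm_utils helpers; the paths, model id,
# and parameters below are placeholders, not values from this repository.
from datasets import load_dataset

# Assumes the repository root is on PYTHONPATH so llm_toolkit is importable.
from llm_toolkit.llm_utils import check_gpu, eval_model, load_model

device = check_gpu()  # picks cuda, mps, or cpu

# load_model returns a (model, tokenizer) pair; 4-bit loading is optional.
model, tokenizer = load_model(
    "Qwen/Qwen2-1.5B-Instruct",
    load_in_4bit=True,
    adapter_name_or_path=None,  # e.g. a local LoRA checkpoint directory
)

# eval_model expects a dataset with a "prompt" column.
eval_dataset = load_dataset("json", data_files="data/eval.json", split="train")
predictions = eval_model(
    model,
    tokenizer,
    eval_dataset,
    device=device,
    max_new_tokens=256,
    batch_size=4,
)
print(predictions[:3])
```
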
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+huggingface_hub==0.24.2
 nltk==3.8.1
 python-dotenv==1.0.1
 black==24.4.0
@@ -9,7 +10,10 @@ scikit-learn==1.5.0
 jupyter
 ipywidgets
 packaging
-# triton
-# xformers
 langchain_openai==0.1.13
-wandb==0.17.4
+wandb==0.17.4
+transformers==4.43.3
+sentencepiece==0.2.0
+einops==0.8.0
+accelerate==0.32.1
+peft==0.11.1

scripts/lf-api.sh DELETED
@@ -1,8 +0,0 @@
-#!/bin/sh
-
-BASEDIR=$(dirname "$0")
-cd $BASEDIR/../llama-factory
-echo Current Directory:
-pwd
-
-API_PORT=8000 llamafactory-cli api $1

scripts/tune-large.sh DELETED
@@ -1,24 +0,0 @@
-#!/bin/sh
-
-BASEDIR=$(dirname "$0")
-cd $BASEDIR
-echo Current Directory:
-pwd
-
-nvidia-smi
-uname -a
-cat /etc/os-release
-lscpu
-grep MemTotal /proc/meminfo
-
-# pip install -r requirements.txt
-# FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --upgrade flash-attn
-
-# export MODEL_NAME=unsloth/Qwen2-72B-Instruct-bnb-4bit
-# echo Tuning $MODEL_NAME
-# python tune.py
-
-export MODEL_NAME=unsloth/llama-3-70b-Instruct-bnb-4bit
-echo Tuning $MODEL_NAME
-python tune.py
-

scripts/tune-lf.sh DELETED
@@ -1,9 +0,0 @@
-#!/bin/sh
-
-BASEDIR=$(dirname "$0")
-cd $BASEDIR/../llama-factory
-echo Current Directory:
-pwd
-
-YAML=$1 python -c 'import os, json, sys, yaml; filename=os.getenv("YAML"); y=yaml.safe_load(open(filename)) ; print(f"{filename}:\n", json.dumps(y, indent=2))'
-llamafactory-cli train $1

scripts/tune-medium.sh DELETED
@@ -1,27 +0,0 @@
-#!/bin/sh
-
-BASEDIR=$(dirname "$0")
-cd $BASEDIR
-echo Current Directory:
-pwd
-
-nvidia-smi
-uname -a
-cat /etc/os-release
-lscpu
-grep MemTotal /proc/meminfo
-
-# pip install -r requirements.txt
-# FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --upgrade flash-attn
-
-export MODEL_NAME=unsloth/Qwen2-7B-Instruct
-echo Tuning $MODEL_NAME
-python llm_toolkit/tune.py
-
-export MODEL_NAME=unsloth/mistral-7b-instruct-v0.3
-echo Tuning $MODEL_NAME
-python llm_toolkit/tune.py
-
-export MODEL_NAME=gradientai/Llama-3-8B-Instruct-Gradient-1048k
-echo Tuning $MODEL_NAME
-python llm_toolkit/tune.py

scripts/tune-small-2.sh DELETED
@@ -1,14 +0,0 @@
-#!/bin/sh
-
-BASEDIR=$(dirname "$0")
-cd $BASEDIR/..
-echo Current Directory:
-pwd
-
-export MODEL_NAME=unsloth/Qwen2-0.5B-Instruct
-echo Tuning $MODEL_NAME
-python llm_toolkit/tune.py
-
-export MODEL_NAME=unsloth/Qwen2-1.5B-Instruct
-echo Tuning $MODEL_NAME
-python llm_toolkit/tune.py

scripts/tune-small.sh DELETED
@@ -1,14 +0,0 @@
-#!/bin/sh
-
-BASEDIR=$(dirname "$0")
-cd $BASEDIR/..
-echo Current Directory:
-pwd
-
-export MODEL_NAME=unsloth/Qwen2-0.5B-Instruct-bnb-4bit
-echo Tuning $MODEL_NAME
-python llm_toolkit/tune.py
-
-export MODEL_NAME=unsloth/Qwen2-1.5B-Instruct-bnb-4bit
-echo Tuning $MODEL_NAME
-python llm_toolkit/tune.py