Spaces:
Runtime error
Runtime error
menouar
committed on
Commit
·
b8758c8
1
Parent(s):
e75ffde
Update the generated Notebook to push properly to HF
Browse files
- app.py +8 -8
- utils/__init__.py +2 -3
- utils/components_creator.py +8 -9
- utils/notebook_generator.py +47 -30
app.py
CHANGED
@@ -68,8 +68,8 @@ def change_model_selection(model_id):
|
|
68 |
return None
|
69 |
|
70 |
|
71 |
-
def
|
72 |
-
return gr.
|
73 |
|
74 |
|
75 |
def check_valid_input(value):
|
@@ -190,12 +190,12 @@ def generate_code(components: dict[Component, Any]):
|
|
190 |
|
191 |
create_merge_lora_cells(notebook['cells'], output_dir)
|
192 |
|
193 |
-
|
194 |
|
195 |
if push_to_hub:
|
196 |
if not should_login:
|
197 |
create_login_hf_cells(notebook['cells'])
|
198 |
-
|
199 |
|
200 |
file_name = f"{finetuning_notebook}.ipynb"
|
201 |
|
@@ -287,8 +287,8 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(text_size='lg', font=["monospace"],
|
|
287 |
with centered_column():
|
288 |
output_dir_cmp, push_to_hub_cmp = add_outputs()
|
289 |
all_components.update({output_dir_cmp, push_to_hub_cmp})
|
290 |
-
|
291 |
-
all_components.update({
|
292 |
with centered_column():
|
293 |
all_components.update(add_outputs1())
|
294 |
|
@@ -318,9 +318,9 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(text_size='lg', font=["monospace"],
|
|
318 |
)
|
319 |
|
320 |
push_to_hub_cmp.change(
|
321 |
-
fn=
|
322 |
inputs=push_to_hub_cmp,
|
323 |
-
outputs=
|
324 |
)
|
325 |
|
326 |
demo.launch(allowed_paths=["/"])
|
|
|
68 |
return None
|
69 |
|
70 |
|
71 |
+
def handle_push_to_hub(value):
|
72 |
+
return gr.Textbox(visible=value)
|
73 |
|
74 |
|
75 |
def check_valid_input(value):
|
|
|
190 |
|
191 |
create_merge_lora_cells(notebook['cells'], output_dir)
|
192 |
|
193 |
+
merge_model_cells(notebook['cells'], output_dir)
|
194 |
|
195 |
if push_to_hub:
|
196 |
if not should_login:
|
197 |
create_login_hf_cells(notebook['cells'])
|
198 |
+
push_to_hub_cells(notebook['cells'], output_dir)
|
199 |
|
200 |
file_name = f"{finetuning_notebook}.ipynb"
|
201 |
|
|
|
287 |
with centered_column():
|
288 |
output_dir_cmp, push_to_hub_cmp = add_outputs()
|
289 |
all_components.update({output_dir_cmp, push_to_hub_cmp})
|
290 |
+
repo_name_cmp = add_hf_repo_cmp()
|
291 |
+
all_components.update({repo_name_cmp})
|
292 |
with centered_column():
|
293 |
all_components.update(add_outputs1())
|
294 |
|
|
|
318 |
)
|
319 |
|
320 |
push_to_hub_cmp.change(
|
321 |
+
fn=handle_push_to_hub,
|
322 |
inputs=push_to_hub_cmp,
|
323 |
+
outputs=repo_name_cmp
|
324 |
)
|
325 |
|
326 |
demo.launch(allowed_paths=["/"])
|
utils/__init__.py
CHANGED
@@ -36,9 +36,8 @@ LR_SCHEDULER_TYPE_ID = "lr_scheduler_type"
|
|
36 |
OUTPUT_DIR_ID = "output_dir"
|
37 |
|
38 |
PUSH_TO_HUB_ID = "push_to_hub"
|
39 |
-
|
40 |
-
|
41 |
-
PUSH_TYPES_ONLY_MODEL = "Push only the Model and Tokenizer"
|
42 |
|
43 |
REPORT_TO_ID = "report_to"
|
44 |
|
|
|
36 |
OUTPUT_DIR_ID = "output_dir"
|
37 |
|
38 |
PUSH_TO_HUB_ID = "push_to_hub"
|
39 |
+
|
40 |
+
REPOSITORY_NAME_ID = "repo_id"
|
|
|
41 |
|
42 |
REPORT_TO_ID = "report_to"
|
43 |
|
utils/components_creator.py
CHANGED
@@ -181,15 +181,14 @@ def add_outputs() -> (Component, Component):
|
|
181 |
return output_dir, push_to_hub
|
182 |
|
183 |
|
184 |
-
def
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
return push_type
|
193 |
|
194 |
|
195 |
def add_outputs1() -> Set[Component]:
|
|
|
181 |
return output_dir, push_to_hub
|
182 |
|
183 |
|
184 |
+
def add_hf_repo_cmp() -> Component:
|
185 |
+
repo_name = gr.Textbox(label="HF Repo name",
|
186 |
+
placeholder="username/your_repository",
|
187 |
+
info="Hugging Face repository to be created.",
|
188 |
+
interactive=True,
|
189 |
+
visible=False,
|
190 |
+
elem_id=REPOSITORY_NAME_ID)
|
191 |
+
return repo_name
|
|
|
192 |
|
193 |
|
194 |
def add_outputs1() -> Set[Component]:
|
utils/notebook_generator.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Optional
|
|
2 |
|
3 |
import nbformat as nbf
|
4 |
|
5 |
-
from utils import FTDataSet
|
6 |
|
7 |
|
8 |
def create_install_libraries_cells(cells: list):
|
@@ -74,7 +74,7 @@ def create_login_hf_cells(cells: list, should_login: bool = False, model_name: O
|
|
74 |
from huggingface_hub import login
|
75 |
|
76 |
login(
|
77 |
-
token='
|
78 |
add_to_git_credential=True
|
79 |
)
|
80 |
"""
|
@@ -148,6 +148,7 @@ bnb_config = BitsAndBytesConfig(
|
|
148 |
model = AutoModelForCausalLM.from_pretrained(
|
149 |
model_id,
|
150 |
device_map="auto",
|
|
|
151 |
{flash_attention_str}
|
152 |
torch_dtype=torch.bfloat16,
|
153 |
quantization_config=bnb_config
|
@@ -246,7 +247,7 @@ def create_training_args_cells(cells: list, epochs, max_steps, logging_steps, pe
|
|
246 |
from transformers import TrainingArguments
|
247 |
|
248 |
args = TrainingArguments(
|
249 |
-
output_dir="{output_dir}",
|
250 |
num_train_epochs={epochs},
|
251 |
per_device_train_batch_size={per_device_train_batch_size},
|
252 |
gradient_accumulation_steps={gradient_accumulation_steps},
|
@@ -319,7 +320,7 @@ def create_start_training_cells(cells: list, epochs, max_steps, push_to_hub, out
|
|
319 |
f"""### Starting Training and Saving Model/Tokenizer
|
320 |
|
321 |
We start training the model by calling the `train()` method on the trainer instance. This will start the training
|
322 |
-
loop and train the model for `{epoch_str}`. The model will be automatically saved the output directory({output_dir})
|
323 |
{save_txt}
|
324 |
|
325 |
""")
|
@@ -331,11 +332,9 @@ model.config.use_cache = False
|
|
331 |
# start training
|
332 |
trainer.train()
|
333 |
|
334 |
-
# save the model
|
335 |
trainer.save_model()
|
336 |
|
337 |
-
# save tokenizer
|
338 |
-
tokenizer.save_pretrained("{output_dir}")
|
339 |
"""
|
340 |
code_cell = nbf.v4.new_code_cell(code)
|
341 |
cells.append(text_cell)
|
@@ -375,7 +374,7 @@ from peft import AutoPeftModelForCausalLM
|
|
375 |
|
376 |
# Load Peft model on CPU
|
377 |
model = AutoPeftModelForCausalLM.from_pretrained(
|
378 |
-
"{output_dir}",
|
379 |
torch_dtype=torch.float16,
|
380 |
low_cpu_mem_usage=True
|
381 |
)
|
@@ -383,48 +382,66 @@ model = AutoPeftModelForCausalLM.from_pretrained(
|
|
383 |
# Merge LoRA and base model and save
|
384 |
merged_model = model.merge_and_unload()
|
385 |
merged_model.save_pretrained("{output_dir}", safe_serialization=True, max_shard_size="2GB")
|
|
|
386 |
"""
|
387 |
code_cell = nbf.v4.new_code_cell(code)
|
388 |
cells.append(text_cell)
|
389 |
cells.append(code_cell)
|
390 |
|
391 |
|
392 |
-
def
|
393 |
text_cell = nbf.v4.new_markdown_cell(
|
394 |
-
"
|
395 |
|
396 |
code = f"""
|
397 |
-
|
|
|
|
|
|
|
|
|
|
|
398 |
|
399 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
400 |
"""
|
401 |
|
402 |
-
|
|
|
|
|
403 |
|
404 |
-
|
|
|
|
|
|
|
|
|
405 |
|
406 |
# Instantiate the HfApi class
|
407 |
api = HfApi()
|
408 |
|
409 |
# Your Hugging Face repository
|
410 |
-
repo_name = "
|
411 |
|
412 |
# Create a repository on the Hugging Face Hub
|
413 |
-
api.create_repo(token=HfFolder.get_token(),
|
414 |
-
|
415 |
-
# Path to your local folder
|
416 |
-
folder_path = "{output_dir}"
|
417 |
-
|
418 |
-
# Create a repository object
|
419 |
-
repo = Repository(local_dir=folder_path, clone_from=repo_name)
|
420 |
|
421 |
-
|
422 |
-
|
|
|
|
|
|
|
423 |
"""
|
424 |
-
|
425 |
-
|
426 |
-
code_cell = nbf.v4.new_code_cell(code)
|
427 |
-
else:
|
428 |
-
code_cell = nbf.v4.new_code_cell(code_all)
|
429 |
-
cells.append(text_cell)
|
430 |
cells.append(code_cell)
|
|
|
2 |
|
3 |
import nbformat as nbf
|
4 |
|
5 |
+
from utils import FTDataSet
|
6 |
|
7 |
|
8 |
def create_install_libraries_cells(cells: list):
|
|
|
74 |
from huggingface_hub import login
|
75 |
|
76 |
login(
|
77 |
+
token='HF_TOKEN',
|
78 |
add_to_git_credential=True
|
79 |
)
|
80 |
"""
|
|
|
148 |
model = AutoModelForCausalLM.from_pretrained(
|
149 |
model_id,
|
150 |
device_map="auto",
|
151 |
+
trust_remote_code=True,
|
152 |
{flash_attention_str}
|
153 |
torch_dtype=torch.bfloat16,
|
154 |
quantization_config=bnb_config
|
|
|
247 |
from transformers import TrainingArguments
|
248 |
|
249 |
args = TrainingArguments(
|
250 |
+
output_dir="temp_{output_dir}",
|
251 |
num_train_epochs={epochs},
|
252 |
per_device_train_batch_size={per_device_train_batch_size},
|
253 |
gradient_accumulation_steps={gradient_accumulation_steps},
|
|
|
320 |
f"""### Starting Training and Saving Model/Tokenizer
|
321 |
|
322 |
We start training the model by calling the `train()` method on the trainer instance. This will start the training
|
323 |
+
loop and train the model for `{epoch_str}`. The model will be automatically saved the output directory(temp_{output_dir})
|
324 |
{save_txt}
|
325 |
|
326 |
""")
|
|
|
332 |
# start training
|
333 |
trainer.train()
|
334 |
|
335 |
+
# save the PEFT model
|
336 |
trainer.save_model()
|
337 |
|
|
|
|
|
338 |
"""
|
339 |
code_cell = nbf.v4.new_code_cell(code)
|
340 |
cells.append(text_cell)
|
|
|
374 |
|
375 |
# Load Peft model on CPU
|
376 |
model = AutoPeftModelForCausalLM.from_pretrained(
|
377 |
+
"temp_{output_dir}",
|
378 |
torch_dtype=torch.float16,
|
379 |
low_cpu_mem_usage=True
|
380 |
)
|
|
|
382 |
# Merge LoRA and base model and save
|
383 |
merged_model = model.merge_and_unload()
|
384 |
merged_model.save_pretrained("{output_dir}", safe_serialization=True, max_shard_size="2GB")
|
385 |
+
tokenizer.save_pretrained("{output_dir}")
|
386 |
"""
|
387 |
code_cell = nbf.v4.new_code_cell(code)
|
388 |
cells.append(text_cell)
|
389 |
cells.append(code_cell)
|
390 |
|
391 |
|
392 |
+
def merge_model_cells(cells: list, output_dir):
|
393 |
text_cell = nbf.v4.new_markdown_cell(
|
394 |
+
f"### Copy all result folders from 'temp_{output_dir}' to '{output_dir}'.")
|
395 |
|
396 |
code = f"""
|
397 |
+
import os
|
398 |
+
import shutil
|
399 |
+
|
400 |
+
# Specify the source folder and the destination folder
|
401 |
+
source_folder = "temp_{output_dir}"
|
402 |
+
destination_folder = "{output_dir}"
|
403 |
|
404 |
+
# Create the destination folder if it doesn't exist
|
405 |
+
os.makedirs(destination_folder, exist_ok=True)
|
406 |
+
|
407 |
+
# Iterate over the files and subfolders in the source folder
|
408 |
+
for item in os.listdir(source_folder):
|
409 |
+
item_path = os.path.join(source_folder, item)
|
410 |
+
|
411 |
+
# Check if it's a subfolder (and not a file)
|
412 |
+
if os.path.isdir(item_path):
|
413 |
+
# Specify the destination path
|
414 |
+
destination_path = os.path.join(destination_folder, item)
|
415 |
+
|
416 |
+
# Copy the subfolder to the destination folder
|
417 |
+
shutil.copytree(item_path, destination_path)
|
418 |
"""
|
419 |
|
420 |
+
code_cell = nbf.v4.new_code_cell(code)
|
421 |
+
cells.append(text_cell)
|
422 |
+
cells.append(code_cell)
|
423 |
|
424 |
+
|
425 |
+
def push_to_hub_cells(cells: list, output_dir):
|
426 |
+
text = f"Push '{output_dir}' to your Hugging Face account."
|
427 |
+
code = f"""
|
428 |
+
from huggingface_hub import HfApi, HfFolder, Repository
|
429 |
|
430 |
# Instantiate the HfApi class
|
431 |
api = HfApi()
|
432 |
|
433 |
# Your Hugging Face repository
|
434 |
+
repo_name = "{output_dir}"
|
435 |
|
436 |
# Create a repository on the Hugging Face Hub
|
437 |
+
repo = api.create_repo(token=HfFolder.get_token(), repo_type="model", repo_id=repo_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
|
439 |
+
api.upload_folder(
|
440 |
+
folder_path="{output_dir}",
|
441 |
+
repo_id=repo.repo_id,
|
442 |
+
repo_type="model",
|
443 |
+
)
|
444 |
"""
|
445 |
+
code_cell = nbf.v4.new_code_cell(code)
|
446 |
+
cells.append(nbf.v4.new_markdown_cell(text))
|
|
|
|
|
|
|
|
|
447 |
cells.append(code_cell)
|