menouar committed
Commit b8758c8 · 1 Parent(s): e75ffde

Update the generated Notebook to push properly to HF

app.py CHANGED
@@ -68,8 +68,8 @@ def change_model_selection(model_id):
     return None
 
 
-def display_push_type(value):
-    return gr.Radio(visible=value)
+def handle_push_to_hub(value):
+    return gr.Textbox(visible=value)
 
 
 def check_valid_input(value):
@@ -190,12 +190,12 @@ def generate_code(components: dict[Component, Any]):
 
     create_merge_lora_cells(notebook['cells'], output_dir)
 
-    push_type_value = get_value(components, PUSH_TYPE_ID)
+    merge_model_cells(notebook['cells'], output_dir)
 
     if push_to_hub:
         if not should_login:
             create_login_hf_cells(notebook['cells'])
-        push_merged_model_cells(notebook['cells'], output_dir, push_type_value)
+        push_to_hub_cells(notebook['cells'], output_dir)
 
     file_name = f"{finetuning_notebook}.ipynb"
 
@@ -287,8 +287,8 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(text_size='lg', font=["monospace"],
         with centered_column():
             output_dir_cmp, push_to_hub_cmp = add_outputs()
             all_components.update({output_dir_cmp, push_to_hub_cmp})
-            push_type_cmp = add_push_type_cmp()
-            all_components.update({push_type_cmp})
+            repo_name_cmp = add_hf_repo_cmp()
+            all_components.update({repo_name_cmp})
         with centered_column():
             all_components.update(add_outputs1())
 
@@ -318,9 +318,9 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(text_size='lg', font=["monospace"],
     )
 
     push_to_hub_cmp.change(
-        fn=display_push_type,
+        fn=handle_push_to_hub,
         inputs=push_to_hub_cmp,
-        outputs=push_type_cmp
+        outputs=repo_name_cmp
     )
 
demo.launch(allowed_paths=["/"])
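
For readers unfamiliar with the pattern: in recent Gradio versions, returning a component constructor with only `visible=` set from an event callback updates just that property on the existing component, which is how `handle_push_to_hub` reveals the repo-name box. A minimal self-contained sketch of the same wiring (the labels and the Checkbox choice are illustrative, not taken from app.py):

import gradio as gr

def handle_push_to_hub(value):
    # Returning a partially-specified Textbox updates only the
    # properties that are set -- here, its visibility.
    return gr.Textbox(visible=value)

with gr.Blocks() as demo:
    push_to_hub = gr.Checkbox(label="Push to Hub")  # illustrative component
    repo_name = gr.Textbox(label="HF Repo name", visible=False)
    # Show the repo-name box only while the checkbox is ticked.
    push_to_hub.change(fn=handle_push_to_hub, inputs=push_to_hub, outputs=repo_name)

demo.launch()
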
utils/__init__.py CHANGED
@@ -36,9 +36,8 @@ LR_SCHEDULER_TYPE_ID = "lr_scheduler_type"
 OUTPUT_DIR_ID = "output_dir"
 
 PUSH_TO_HUB_ID = "push_to_hub"
-PUSH_TYPE_ID = "push_type"
-PUSH_TYPES_ALL = "Push all the outputs"
-PUSH_TYPES_ONLY_MODEL = "Push only the Model and Tokenizer"
+
+REPOSITORY_NAME_ID = "repo_id"
 
 REPORT_TO_ID = "report_to"
 
utils/components_creator.py CHANGED
@@ -181,15 +181,14 @@ def add_outputs() -> (Component, Component):
     return output_dir, push_to_hub
 
 
-def add_push_type_cmp() -> Component:
-    push_type = gr.Radio([PUSH_TYPES_ONLY_MODEL, PUSH_TYPES_ALL],
-                         label="Output Push Option",
-                         info="Select whether to push only the Model and Tokenizer or all the outputs.",
-                         interactive=True,
-                         visible=False,
-                         value=PUSH_TYPES_ALL,
-                         elem_id=PUSH_TYPE_ID)
-    return push_type
+def add_hf_repo_cmp() -> Component:
+    repo_name = gr.Textbox(label="HF Repo name",
+                           placeholder="username/your_repository",
+                           info="Hugging Face repository to be created.",
+                           interactive=True,
+                           visible=False,
+                           elem_id=REPOSITORY_NAME_ID)
+    return repo_name
 
 
 def add_outputs1() -> Set[Component]:
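
Inside `generate_code`, component values arrive as a `dict[Component, Any]`, and the removed `get_value(components, PUSH_TYPE_ID)` call in the app.py diff above shows how values are read back by `elem_id`. `get_value` itself is repo code this diff does not show; a plausible sketch of such a lookup, assuming each component is registered with a unique `elem_id`:

from typing import Any

from gradio.components import Component

def get_value(components: dict[Component, Any], elem_id: str) -> Any:
    # Scan the component->value mapping for the entry whose
    # component carries the requested elem_id.
    for component, value in components.items():
        if component.elem_id == elem_id:
            return value
    return None
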
utils/notebook_generator.py CHANGED
@@ -2,7 +2,7 @@ from typing import Optional
 
 import nbformat as nbf
 
-from utils import FTDataSet, PUSH_TYPES_ONLY_MODEL
+from utils import FTDataSet
 
 
 def create_install_libraries_cells(cells: list):
@@ -74,7 +74,7 @@ def create_login_hf_cells(cells: list, should_login: bool = False, model_name: O
 from huggingface_hub import login
 
 login(
-  token='Your_HF_TOKEN'
+  token='HF_TOKEN',
   add_to_git_credential=True
 )
 """
@@ -148,6 +148,7 @@ bnb_config = BitsAndBytesConfig(
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
+    trust_remote_code=True,
     {flash_attention_str}
     torch_dtype=torch.bfloat16,
     quantization_config=bnb_config
@@ -246,7 +247,7 @@ def create_training_args_cells(cells: list, epochs, max_steps, logging_steps, pe
 from transformers import TrainingArguments
 
 args = TrainingArguments(
-    output_dir="{output_dir}",
+    output_dir="temp_{output_dir}",
     num_train_epochs={epochs},
     per_device_train_batch_size={per_device_train_batch_size},
     gradient_accumulation_steps={gradient_accumulation_steps},
@@ -319,7 +320,7 @@ def create_start_training_cells(cells: list, epochs, max_steps, push_to_hub, out
         f"""### Starting Training and Saving Model/Tokenizer
 
 We start training the model by calling the `train()` method on the trainer instance. This will start the training
-loop and train the model for `{epoch_str}`. The model will be automatically saved the output directory({output_dir})
+loop and train the model for `{epoch_str}`. The model will be automatically saved the output directory(temp_{output_dir})
 {save_txt}
 
 """)
@@ -331,11 +332,9 @@ model.config.use_cache = False
 # start training
 trainer.train()
 
-# save the model
+# save the PEFT model
 trainer.save_model()
 
-# save tokenizer
-tokenizer.save_pretrained("{output_dir}")
 """
     code_cell = nbf.v4.new_code_cell(code)
     cells.append(text_cell)
@@ -375,7 +374,7 @@ from peft import AutoPeftModelForCausalLM
 
 # Load Peft model on CPU
 model = AutoPeftModelForCausalLM.from_pretrained(
-    "{output_dir}",
+    "temp_{output_dir}",
     torch_dtype=torch.float16,
     low_cpu_mem_usage=True
 )
@@ -383,48 +382,66 @@ model = AutoPeftModelForCausalLM.from_pretrained(
 # Merge LoRA and base model and save
 merged_model = model.merge_and_unload()
 merged_model.save_pretrained("{output_dir}", safe_serialization=True, max_shard_size="2GB")
+tokenizer.save_pretrained("{output_dir}")
 """
     code_cell = nbf.v4.new_code_cell(code)
     cells.append(text_cell)
     cells.append(code_cell)
 
 
-def push_merged_model_cells(cells: list, output_dir, push_type_value):
+def merge_model_cells(cells: list, output_dir):
     text_cell = nbf.v4.new_markdown_cell(
-        """### Push the Merged model as well as the Tokenizer to HF hub""")
+        f"### Copy all result folders from 'temp_{output_dir}' to '{output_dir}'.")
 
     code = f"""
-merged_model.push_to_hub("{output_dir}", use_temp_dir=False)
+import os
+import shutil
+
+# Specify the source folder and the destination folder
+source_folder = "temp_{output_dir}"
+destination_folder = "{output_dir}"
 
-tokenizer.push_to_hub("{output_dir}", use_temp_dir=False)
+# Create the destination folder if it doesn't exist
+os.makedirs(destination_folder, exist_ok=True)
+
+# Iterate over the files and subfolders in the source folder
+for item in os.listdir(source_folder):
+    item_path = os.path.join(source_folder, item)
+
+    # Check if it's a subfolder (and not a file)
+    if os.path.isdir(item_path):
+        # Specify the destination path
+        destination_path = os.path.join(destination_folder, item)
+
+        # Copy the subfolder to the destination folder
+        shutil.copytree(item_path, destination_path)
 """
 
-    code_all = f"""
+    code_cell = nbf.v4.new_code_cell(code)
+    cells.append(text_cell)
+    cells.append(code_cell)
 
-from huggingface_hub import HfApi, HfFolder
+
+def push_to_hub_cells(cells: list, output_dir):
+    text = f"Push '{output_dir}' to your Hugging Face account."
+    code = f"""
+from huggingface_hub import HfApi, HfFolder, Repository
 
 # Instantiate the HfApi class
 api = HfApi()
 
 # Your Hugging Face repository
-repo_name = "Menouar/test"
+repo_name = "{output_dir}"
 
 # Create a repository on the Hugging Face Hub
-api.create_repo(token=HfFolder.get_token(), name=repo_name, repo_type="model")
-
-# Path to your local folder
-folder_path = "{output_dir}"
-
-# Create a repository object
-repo = Repository(local_dir=folder_path, clone_from=repo_name)
+repo = api.create_repo(token=HfFolder.get_token(), repo_type="model", repo_id=repo_name)
 
-# Commit and push your changes
-repo.git_add(commit_message="Initial commit", git_push=True)
+api.upload_folder(
+    folder_path="{output_dir}",
+    repo_id=repo.repo_id,
+    repo_type="model",
+)
 """
-
-    if push_type_value == PUSH_TYPES_ONLY_MODEL:
-        code_cell = nbf.v4.new_code_cell(code)
-    else:
-        code_cell = nbf.v4.new_code_cell(code_all)
-    cells.append(text_cell)
+    code_cell = nbf.v4.new_code_cell(code)
+    cells.append(nbf.v4.new_markdown_cell(text))
     cells.append(code_cell)
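
Taken together, `merge_model_cells` and `push_to_hub_cells` emit notebook cells that, once the `{output_dir}` placeholders are filled in, reduce to the flow below. A standalone sketch, with "my-model" standing in for the user's output_dir (an illustrative name, not from the commit):

import os
import shutil

from huggingface_hub import HfApi, HfFolder

source_folder = "temp_my-model"    # where TrainingArguments wrote its outputs
destination_folder = "my-model"    # where the merged model and tokenizer live

# Bring checkpoint/log subfolders over next to the merged weights.
os.makedirs(destination_folder, exist_ok=True)
for item in os.listdir(source_folder):
    item_path = os.path.join(source_folder, item)
    if os.path.isdir(item_path):
        shutil.copytree(item_path, os.path.join(destination_folder, item))

api = HfApi()

# create_repo returns a RepoUrl whose .repo_id is the fully qualified
# "username/name" identifier expected by upload_folder.
repo = api.create_repo(token=HfFolder.get_token(),
                       repo_type="model",
                       repo_id=destination_folder)

# Push the whole folder to the new repo as a single commit.
api.upload_folder(folder_path=destination_folder,
                  repo_id=repo.repo_id,
                  repo_type="model")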