menouar committed
Commit b8758c8 · 1 Parent(s): e75ffde

Update the generated Notebook to push properly to HF

app.py CHANGED
@@ -68,8 +68,8 @@ def change_model_selection(model_id):
     return None
 
 
-def display_push_type(value):
-    return gr.Radio(visible=value)
+def handle_push_to_hub(value):
+    return gr.Textbox(visible=value)
 
 
 def check_valid_input(value):
@@ -190,12 +190,12 @@ def generate_code(components: dict[Component, Any]):
 
     create_merge_lora_cells(notebook['cells'], output_dir)
 
-    push_type_value = get_value(components, PUSH_TYPE_ID)
+    merge_model_cells(notebook['cells'], output_dir)
 
     if push_to_hub:
         if not should_login:
             create_login_hf_cells(notebook['cells'])
-        push_merged_model_cells(notebook['cells'], output_dir, push_type_value)
+        push_to_hub_cells(notebook['cells'], output_dir)
 
     file_name = f"{finetuning_notebook}.ipynb"
 
@@ -287,8 +287,8 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(text_size='lg', font=["monospace"],
         with centered_column():
             output_dir_cmp, push_to_hub_cmp = add_outputs()
             all_components.update({output_dir_cmp, push_to_hub_cmp})
-            push_type_cmp = add_push_type_cmp()
-            all_components.update({push_type_cmp})
+            repo_name_cmp = add_hf_repo_cmp()
+            all_components.update({repo_name_cmp})
         with centered_column():
             all_components.update(add_outputs1())
 
@@ -318,9 +318,9 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(text_size='lg', font=["monospace"],
     )
 
     push_to_hub_cmp.change(
-        fn=display_push_type,
+        fn=handle_push_to_hub,
         inputs=push_to_hub_cmp,
-        outputs=push_type_cmp
+        outputs=repo_name_cmp
     )
 
demo.launch(allowed_paths=["/"])
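
For readers unfamiliar with the pattern: in recent Gradio versions, returning a component constructor with only `visible=` set from an event callback updates just that property on the existing component, which is how `handle_push_to_hub` reveals the repo-name box. A minimal self-contained sketch of the same wiring (the labels and the Checkbox choice are illustrative, not taken from app.py):

import gradio as gr

def handle_push_to_hub(value):
    # Returning a partially-specified Textbox updates only the
    # properties that are set -- here, its visibility.
    return gr.Textbox(visible=value)

with gr.Blocks() as demo:
    push_to_hub = gr.Checkbox(label="Push to Hub")  # illustrative component
    repo_name = gr.Textbox(label="HF Repo name", visible=False)
    # Show the repo-name box only while the checkbox is ticked.
    push_to_hub.change(fn=handle_push_to_hub, inputs=push_to_hub, outputs=repo_name)

demo.launch()
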
utils/__init__.py CHANGED
@@ -36,9 +36,8 @@ LR_SCHEDULER_TYPE_ID = "lr_scheduler_type"
 OUTPUT_DIR_ID = "output_dir"
 
 PUSH_TO_HUB_ID = "push_to_hub"
-PUSH_TYPE_ID = "push_type"
-PUSH_TYPES_ALL = "Push all the outputs"
-PUSH_TYPES_ONLY_MODEL = "Push only the Model and Tokenizer"
+
+REPOSITORY_NAME_ID = "repo_id"
 
 REPORT_TO_ID = "report_to"
 
utils/components_creator.py CHANGED
@@ -181,15 +181,14 @@ def add_outputs() -> (Component, Component):
     return output_dir, push_to_hub
 
 
-def add_push_type_cmp() -> Component:
-    push_type = gr.Radio([PUSH_TYPES_ONLY_MODEL, PUSH_TYPES_ALL],
-                         label="Output Push Option",
-                         info="Select whether to push only the Model and Tokenizer or all the outputs.",
-                         interactive=True,
-                         visible=False,
-                         value=PUSH_TYPES_ALL,
-                         elem_id=PUSH_TYPE_ID)
-    return push_type
+def add_hf_repo_cmp() -> Component:
+    repo_name = gr.Textbox(label="HF Repo name",
+                           placeholder="username/your_repository",
+                           info="Hugging Face repository to be created.",
+                           interactive=True,
+                           visible=False,
+                           elem_id=REPOSITORY_NAME_ID)
+    return repo_name
 
 
 def add_outputs1() -> Set[Component]:
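
Inside `generate_code`, component values arrive as a `dict[Component, Any]`, and the removed `get_value(components, PUSH_TYPE_ID)` call in the app.py diff above shows how values are read back by `elem_id`. `get_value` itself is repo code this diff does not show; a plausible sketch of such a lookup, assuming each component is registered with a unique `elem_id`:

from typing import Any

from gradio.components import Component

def get_value(components: dict[Component, Any], elem_id: str) -> Any:
    # Scan the component->value mapping for the entry whose
    # component carries the requested elem_id.
    for component, value in components.items():
        if component.elem_id == elem_id:
            return value
    return None
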
utils/notebook_generator.py CHANGED
@@ -2,7 +2,7 @@ from typing import Optional
 
 import nbformat as nbf
 
-from utils import FTDataSet, PUSH_TYPES_ONLY_MODEL
+from utils import FTDataSet
 
 
 def create_install_libraries_cells(cells: list):
@@ -74,7 +74,7 @@ def create_login_hf_cells(cells: list, should_login: bool = False, model_name: O
 from huggingface_hub import login
 
 login(
-  token='Your_HF_TOKEN'
+  token='HF_TOKEN',
   add_to_git_credential=True
 )
 """
@@ -148,6 +148,7 @@ bnb_config = BitsAndBytesConfig(
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
+    trust_remote_code=True,
     {flash_attention_str}
     torch_dtype=torch.bfloat16,
     quantization_config=bnb_config
@@ -246,7 +247,7 @@ def create_training_args_cells(cells: list, epochs, max_steps, logging_steps, pe
 from transformers import TrainingArguments
 
 args = TrainingArguments(
-    output_dir="{output_dir}",
+    output_dir="temp_{output_dir}",
     num_train_epochs={epochs},
     per_device_train_batch_size={per_device_train_batch_size},
     gradient_accumulation_steps={gradient_accumulation_steps},
@@ -319,7 +320,7 @@ def create_start_training_cells(cells: list, epochs, max_steps, push_to_hub, out
         f"""### Starting Training and Saving Model/Tokenizer
 
 We start training the model by calling the `train()` method on the trainer instance. This will start the training
-loop and train the model for `{epoch_str}`. The model will be automatically saved the output directory({output_dir})
+loop and train the model for `{epoch_str}`. The model will be automatically saved the output directory(temp_{output_dir})
 {save_txt}
 
 """)
@@ -331,11 +332,9 @@ model.config.use_cache = False
 # start training
 trainer.train()
 
-# save the model
+# save the PEFT model
 trainer.save_model()
 
-# save tokenizer
-tokenizer.save_pretrained("{output_dir}")
 """
     code_cell = nbf.v4.new_code_cell(code)
     cells.append(text_cell)
@@ -375,7 +374,7 @@ from peft import AutoPeftModelForCausalLM
 
 # Load Peft model on CPU
 model = AutoPeftModelForCausalLM.from_pretrained(
-    "{output_dir}",
+    "temp_{output_dir}",
     torch_dtype=torch.float16,
     low_cpu_mem_usage=True
 )
@@ -383,48 +382,66 @@ model = AutoPeftModelForCausalLM.from_pretrained(
 # Merge LoRA and base model and save
 merged_model = model.merge_and_unload()
 merged_model.save_pretrained("{output_dir}", safe_serialization=True, max_shard_size="2GB")
+tokenizer.save_pretrained("{output_dir}")
 """
     code_cell = nbf.v4.new_code_cell(code)
     cells.append(text_cell)
     cells.append(code_cell)
 
 
-def push_merged_model_cells(cells: list, output_dir, push_type_value):
+def merge_model_cells(cells: list, output_dir):
     text_cell = nbf.v4.new_markdown_cell(
-        """### Push the Merged model as well as the Tokenizer to HF hub""")
+        f"### Copy all result folders from 'temp_{output_dir}' to '{output_dir}'.")
 
     code = f"""
-merged_model.push_to_hub("{output_dir}", use_temp_dir=False)
+import os
+import shutil
+
+# Specify the source folder and the destination folder
+source_folder = "temp_{output_dir}"
+destination_folder = "{output_dir}"
 
-tokenizer.push_to_hub("{output_dir}", use_temp_dir=False)
+# Create the destination folder if it doesn't exist
+os.makedirs(destination_folder, exist_ok=True)
+
+# Iterate over the files and subfolders in the source folder
+for item in os.listdir(source_folder):
+    item_path = os.path.join(source_folder, item)
+
+    # Check if it's a subfolder (and not a file)
+    if os.path.isdir(item_path):
+        # Specify the destination path
+        destination_path = os.path.join(destination_folder, item)
+
+        # Copy the subfolder to the destination folder
+        shutil.copytree(item_path, destination_path)
 """
 
-    code_all = f"""
+    code_cell = nbf.v4.new_code_cell(code)
+    cells.append(text_cell)
+    cells.append(code_cell)
 
-from huggingface_hub import HfApi, HfFolder
+
+def push_to_hub_cells(cells: list, output_dir):
+    text = f"Push '{output_dir}' to your Hugging Face account."
+    code = f"""
+from huggingface_hub import HfApi, HfFolder, Repository
 
 # Instantiate the HfApi class
 api = HfApi()
 
 # Your Hugging Face repository
-repo_name = "Menouar/test"
+repo_name = "{output_dir}"
 
 # Create a repository on the Hugging Face Hub
-api.create_repo(token=HfFolder.get_token(), name=repo_name, repo_type="model")
-
-# Path to your local folder
-folder_path = "{output_dir}"
-
-# Create a repository object
-repo = Repository(local_dir=folder_path, clone_from=repo_name)
+repo = api.create_repo(token=HfFolder.get_token(), repo_type="model", repo_id=repo_name)
 
-# Commit and push your changes
-repo.git_add(commit_message="Initial commit", git_push=True)
+api.upload_folder(
+    folder_path="{output_dir}",
+    repo_id=repo.repo_id,
+    repo_type="model",
+)
 """
-
-    if push_type_value == PUSH_TYPES_ONLY_MODEL:
-        code_cell = nbf.v4.new_code_cell(code)
-    else:
-        code_cell = nbf.v4.new_code_cell(code_all)
-    cells.append(text_cell)
+    code_cell = nbf.v4.new_code_cell(code)
+    cells.append(nbf.v4.new_markdown_cell(text))
     cells.append(code_cell)
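
Taken together, `merge_model_cells` and `push_to_hub_cells` emit notebook cells that, once the `{output_dir}` placeholders are filled in, reduce to the flow below. A standalone sketch, with "my-model" standing in for the user's output_dir (an illustrative name, not from the commit):

import os
import shutil

from huggingface_hub import HfApi, HfFolder

source_folder = "temp_my-model"    # where TrainingArguments wrote its outputs
destination_folder = "my-model"    # where the merged model and tokenizer live

# Bring checkpoint/log subfolders over next to the merged weights.
os.makedirs(destination_folder, exist_ok=True)
for item in os.listdir(source_folder):
    item_path = os.path.join(source_folder, item)
    if os.path.isdir(item_path):
        shutil.copytree(item_path, os.path.join(destination_folder, item))

api = HfApi()

# create_repo returns a RepoUrl whose .repo_id is the fully qualified
# "username/name" identifier expected by upload_folder.
repo = api.create_repo(token=HfFolder.get_token(),
                       repo_type="model",
                       repo_id=destination_folder)

# Push the whole folder to the new repo as a single commit.
api.upload_folder(folder_path=destination_folder,
                  repo_id=repo.repo_id,
                  repo_type="model")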