menouar committed
Commit 784cc97
1 Parent(s): 77e4f87

Improve generated Notebook

Files changed (1):
  utils/notebook_generator.py  +22 -7
utils/notebook_generator.py CHANGED

@@ -2,7 +2,7 @@ from typing import Optional
 
 import nbformat as nbf
 
-from utils import FTDataSet, falcon
+from utils import FTDataSet, falcon, gemma
 
 
 def create_install_libraries_cells(cells: list):
@@ -136,6 +136,21 @@ def create_model_cells(cells: list, model_id: str, version: str, flash_attention
     auto_model_import = "FalconForCausalLM"
     trust_code = ""
 
+    chat_ml = """
+# Set chat template to OAI chatML
+model, tokenizer = setup_chat_format(model, tokenizer)
+"""
+    note = f"""
+> **Note:** For `{model_id}`, we will not use `setup_chat_format`. Instead, we will directly use this tokenizer, [philschmid/gemma-tokenizer-chatml](https://huggingface.co/philschmid/gemma-tokenizer-chatml), to fine-tune `{model_id}` with ChatML.
+"""
+    tokenizer_id = f"{model_id}-{version}"
+    if model_id == gemma.name:
+        tokenizer_id = "philschmid/gemma-tokenizer-chatml"
+        chat_ml = ""
+    else:
+        note = ""
+
+
     code = f"""
 import torch
 from transformers import AutoTokenizer, {auto_model_import}, BitsAndBytesConfig
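To make the new branch concrete: for every model except Gemma, the generated cell keeps calling `setup_chat_format`, while for Gemma it loads a tokenizer that already carries the ChatML template. A minimal sketch of the two emitted paths, assuming `setup_chat_format` is trl's helper and using an illustrative Falcon model id:

from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import setup_chat_format

# Default path (non-Gemma): load the model's own tokenizer, then graft
# the ChatML template and its special tokens onto model and tokenizer.
model_id = "tiiuae/falcon-7b"  # illustrative
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model, tokenizer = setup_chat_format(model, tokenizer)

# Gemma path: skip setup_chat_format and use a tokenizer that already
# ships with the ChatML chat template baked in.
gemma_tokenizer = AutoTokenizer.from_pretrained("philschmid/gemma-tokenizer-chatml")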
@@ -160,15 +175,13 @@ model = {auto_model_import}.from_pretrained(
     quantization_config=bnb_config
 )
 
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained("{tokenizer_id}")
 tokenizer.padding_side = "{pad_side}"
 {pad_value_str}
-
-# Set chat template to OAI chatML
-model, tokenizer = setup_chat_format(model, tokenizer)
+{chat_ml}
 """
 
-    text_1 = """
+    text_1 = f"""
 This process involves two key steps:
 
 1. **LLM Quantization:**
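The `bnb_config` passed to `from_pretrained` above is built earlier in the same generated cell. For orientation, a typical 4-bit `BitsAndBytesConfig` looks like the following; the exact values this generator emits may differ:

import torch
from transformers import BitsAndBytesConfig

# Illustrative 4-bit quantization config (QLoRA-style settings).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)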
@@ -194,7 +207,9 @@ a 24GB GPU for fine-tuning.
 
 * Sets the tokenizer's chat template, which defines the format used to convert input data into a chat-like
 structure. The default template is `chatml` from OpenAI.
-"""
+
+{note}
+"""
 
     code_cell = nbf.v4.new_code_cell(code)
     text_cell1 = nbf.v4.new_markdown_cell(text_1)
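The `code_cell` and `text_cell1` objects created here are standard nbformat cells; appended to a notebook and written out, they become the generated .ipynb. A minimal sketch with an illustrative file name and cell sources:

import nbformat as nbf

# Assemble a notebook from generated code/markdown strings and write it to disk.
nb = nbf.v4.new_notebook()
text_cell1 = nbf.v4.new_markdown_cell("This process involves two key steps: ...")
code_cell = nbf.v4.new_code_cell("import torch  # generated model-loading code")
nb["cells"] = [text_cell1, code_cell]

with open("generated_notebook.ipynb", "w") as f:
    nbf.write(nb, f)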
 