nxphi47 committed
Commit 28561ce
1 Parent(s): c54a8eb

Update README.md

Files changed (1): README.md (+14, -3)
README.md CHANGED
@@ -250,8 +250,11 @@ def seallm_7b_v25_tokenize_multi_turns(tokenizer, conversations, add_assistant_p
     """
     TURN_TEMPLATE = "<|im_start|>{role}\n{content}<eos>\n"
     TURN_PREFIX = "<|im_start|>{role}\n"
+    TURN_SUFFIX = "<eos>\n"
+    TURN_SUFFIX_TAKE = "<eos>"
     sample = None
     assistant_prefix_len = None
+    assistant_suffix_len = None
     for turn_id, turn in enumerate(conversations):
         prompt = TURN_TEMPLATE.format(role=turn['role'], content=turn['content'])
         turn_sample = tokenizer(
@@ -261,7 +264,12 @@ def seallm_7b_v25_tokenize_multi_turns(tokenizer, conversations, add_assistant_p
         if turn['role'] == 'assistant':
             if assistant_prefix_len is None:
                 assistant_prefix_len = len(tokenizer.encode(TURN_PREFIX.format(role=turn['role']), add_special_tokens=False))
-            turn_sample['token_type_ids'][assistant_prefix_len:] = [1] * (len(turn_sample['input_ids']) - assistant_prefix_len)
+            if assistant_suffix_len is None:
+                assistant_suffix_len = (
+                    len(tokenizer.encode(TURN_SUFFIX.format(role=turn['role']), add_special_tokens=False)) -
+                    len(tokenizer.encode(TURN_SUFFIX_TAKE, add_special_tokens=False))
+                )
+            turn_sample['token_type_ids'][assistant_prefix_len:-assistant_suffix_len] = [1] * (len(turn_sample['input_ids']) - assistant_prefix_len - assistant_suffix_len)
         if sample is None:
             sample = turn_sample
         else:
@@ -282,9 +290,12 @@ def seallm_7b_v25_tokenize_multi_turns(tokenizer, conversations, add_assistant_p
 
 # ! testing
 sample = seallm_7b_v25_tokenize_multi_turns(tokenizer, conversations)
-print(tokenizer.convert_ids_to_tokens(sample['input_ids']))
-print(sample['token_type_ids'])
+tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'])
+pairs = [(x, y) for x, y in zip(tokens, sample['token_type_ids'])]
+print(pairs)
 
+# source and special tokens are masked out (token_type 0); only the assistant response with <eos> is trained (token_type 1)
+# [('<bos>', 0), ('<', 0), ('|', 0), ..., ('assistant', 0), ('\n', 0), ('Hi', 1), ('▁there', 1), (',', 1), ('▁how', 1), ('▁can', 1), ('▁I', 1), ('▁help', 1), ('?', 1), ('<eos>', 1), ('\n', 0), ('<', 0), ...
 
 ```
 
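
The substantive fix in this commit is the mask boundary. The old slice `[assistant_prefix_len:]` marked everything to the end of the turn as trainable, including the newline that `TURN_TEMPLATE` appends after `<eos>`; the new `assistant_suffix_len` arithmetic stops the mask right after `<eos>`. A minimal sketch to sanity-check that arithmetic, assuming the `SeaLLMs/SeaLLM-7B-v2.5` checkpoint (any SentencePiece-style tokenizer with a literal `<eos>` token should behave the same):

```python
# Hedged sanity check for the suffix arithmetic; the checkpoint name is an
# assumption, substitute the tokenizer you actually use.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("SeaLLMs/SeaLLM-7B-v2.5")  # assumed checkpoint

suffix_len = (
    len(tokenizer.encode("<eos>\n", add_special_tokens=False))
    - len(tokenizer.encode("<eos>", add_special_tokens=False))
)
# Expected: 1, i.e. only the trailing "\n" after <eos> stays masked out,
# so the slice [assistant_prefix_len:-suffix_len] keeps <eos> trainable.
print(suffix_len)
```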
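Downstream, `token_type_ids` built this way can act directly as a loss mask. A minimal sketch, assuming the standard PyTorch/Hugging Face convention of `-100` as the ignored label index; `build_labels` is a hypothetical helper, not part of the commit:

```python
import torch

def build_labels(sample, ignore_index=-100):
    """Hypothetical helper: turn token_type_ids into causal-LM labels.

    Positions with token_type_id 1 (the assistant response plus its <eos>)
    keep their token id; everything else (source turns, role headers, the
    trailing newline) is excluded from the loss via ignore_index.
    """
    labels = [
        tok if typ == 1 else ignore_index
        for tok, typ in zip(sample['input_ids'], sample['token_type_ids'])
    ]
    return torch.tensor(labels)
```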