eson committed on
Commit
8e0e4e9
1 Parent(s): 6f9d07b
app.py CHANGED
@@ -18,6 +18,7 @@
 - why does baichuan have 20,000+ single-character tokens?
 - OOV
 - feedback placement
+- the gpt4 / gpt3.5 overlap tokens are buggy.
 
 
 plots
utils/log_util.py CHANGED
@@ -3,7 +3,9 @@ import logging
 
 logging.basicConfig(
     format='%(asctime)s - %(filename)s - %(levelname)s - %(process)d - %(thread)d - %(message)s',
+    level=logging.INFO,
     datefmt="%Y-%m-%d %H:%M:%S",
+
 )
 
 logger = logging.getLogger(__name__)
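
The added `level=logging.INFO` line matters here: `logging.basicConfig` leaves the root logger at its default WARNING level unless `level` is passed, so the `logger.info(...)` calls introduced below would otherwise be silently dropped. A minimal self-contained sketch of the behavior:

```python
import logging

# Same config as utils/log_util.py; without level=logging.INFO the root
# logger stays at its default WARNING level and INFO records are filtered out.
logging.basicConfig(
    format="%(asctime)s - %(filename)s - %(levelname)s - %(process)d - %(thread)d - %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger(__name__)
logger.info("visible")       # emitted only because level=logging.INFO is set
logger.debug("not visible")  # DEBUG is still below the configured level
```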
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -2,6 +2,7 @@
 
 import tiktoken
 from tiktoken import Encoding
+from utils.log_util import logger
 
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 tokenizer.vocab_size = tokenizer.n_vocab
@@ -22,16 +23,21 @@ def convert_ids_to_tokens(self, tokens):
 def get_vocab(self):
     """Returns vocab as a dict"""
     vocab = {}
+    key_error_list = []
+    unicode_decode_error_list = []
     for i in range(self.vocab_size):
         try:
             token_byte = self.convert_ids_to_tokens([i])[0]
             token_str = token_byte.decode("utf-8")
             vocab[token_str] = i
-        except KeyError:
-            print("gpt_35_turbo decode KeyError", i)
-        except UnicodeDecodeError:
-            print("gpt_35_turbo decode UnicodeDecodeError", i, str(token_byte))
+        except KeyError:  # ids 100256, 100261-100275
+            key_error_list.append(i)
+        except UnicodeDecodeError:  # very many of these
+            unicode_decode_error_list.append((i, str(token_byte)))
+
     # vocab.update(self.added_tokens_encoder)
+    logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
+    logger.info(f"gpt_35_turbo {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
     return vocab
 
 
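For context, the same error-collection pattern can be reproduced standalone with nothing but tiktoken; this is an illustrative sketch, not the module's actual code (`enc`, `key_errors`, and `decode_errors` are names invented here). `decode_single_token_bytes` raises `KeyError` for ids with no vocabulary entry (reserved special-token slots), and `.decode("utf-8")` raises `UnicodeDecodeError` for tokens whose bytes are not valid UTF-8 on their own, which is common because BPE tokens often split multi-byte characters:

```python
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

vocab = {}
key_errors = []     # ids with no vocabulary entry (reserved special-token slots)
decode_errors = []  # ids whose bytes are not valid UTF-8 on their own

for i in range(enc.n_vocab):
    try:
        token_bytes = enc.decode_single_token_bytes(i)  # KeyError for unknown ids
        vocab[token_bytes.decode("utf-8")] = i          # UnicodeDecodeError for partial UTF-8
    except KeyError:
        key_errors.append(i)
    except UnicodeDecodeError:
        decode_errors.append((i, token_bytes))

print(len(vocab), len(key_errors), len(decode_errors))
```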
vocab/gpt_4/__init__.py CHANGED
@@ -22,16 +22,21 @@ def convert_ids_to_tokens(self, tokens):
 def get_vocab(self):
     """Returns vocab as a dict"""
     vocab = {}
+    key_error_list = []
+    unicode_decode_error_list = []
     for i in range(self.vocab_size):
         try:
             token_byte = self.convert_ids_to_tokens([i])[0]
             token_str = token_byte.decode("utf-8")
             vocab[token_str] = i
-        except KeyError:
-            print("gpt_35_turbo decode KeyError", i)
-        except UnicodeDecodeError:
-            print("gpt_35_turbo decode UnicodeDecodeError", i, str(token_byte))
+        except KeyError:  # ids 100256, 100261-100275
+            key_error_list.append(i)
+        except UnicodeDecodeError:  # very many of these
+            unicode_decode_error_list.append((i, str(token_byte)))
+
     # vocab.update(self.added_tokens_encoder)
+    logger.info(f"gpt-4 {len(key_error_list)} KeyError: {key_error_list}")
+    logger.info(f"gpt-4 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
     return vocab
 
 
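
Assuming `get_vocab` and `convert_ids_to_tokens` are bound onto tiktoken's `Encoding` class elsewhere in these modules (the binding is not part of this diff), a hypothetical caller would now see the failures in the log instead of stdout noise:

```python
# Hypothetical usage; assumes vocab.gpt_4 binds get_vocab onto Encoding,
# which this diff does not show.
from vocab.gpt_4 import tokenizer

vocab = tokenizer.get_vocab()  # {token_str: token_id} for decodable tokens only
print(len(vocab))              # KeyError/UnicodeDecodeError ids are logged, not raised
```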