xu-song committed on
Commit
79b95c3
·
1 Parent(s): 819cf7f
Files changed (4) hide show
  1. app.py +6 -3
  2. util.py +8 -11
  3. utils/byte_util.py +0 -0
  4. utils/log_util.py +10 -0
app.py CHANGED
@@ -16,7 +16,7 @@
16
  - 词典支持下载
17
  - 中文字词统计,是否要包括 _ G 等字符
18
  - baichuan的单字数量怎么两万多个?
19
- - gpt4
20
 
21
 
22
  plots
@@ -35,7 +35,6 @@ table
35
  """
36
 
37
  import gradio as gr
38
-
39
  from vocab import all_tokenizers
40
  from util import *
41
 
@@ -48,6 +47,10 @@ examples = [
48
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
49
  ]
50
 
 
 
 
 
51
 
52
  def example_fn(example_idx):
53
  return examples[example_idx]
@@ -138,7 +141,7 @@ with gr.Blocks(css="style.css") as demo:
138
  # )
139
  # https://www.onlinewebfonts.com/icon/418591
140
  gr.Image("images/VS.svg", scale=1, show_label=False,
141
- show_download_button=True, container=False,
142
  show_share_button=False)
143
  with gr.Column(scale=6):
144
  with gr.Group():
 
16
  - 词典支持下载
17
  - 中文字词统计,是否要包括 _ G 等字符
18
  - baichuan的单字数量怎么两万多个?
19
+ - OOV
20
 
21
 
22
  plots
 
35
  """
36
 
37
  import gradio as gr
 
38
  from vocab import all_tokenizers
39
  from util import *
40
 
 
47
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
48
  ]
49
 
50
+ # jieba.enable_parallel() # flask中没办法parallel
51
+
52
+
53
+
54
 
55
  def example_fn(example_idx):
56
  return examples[example_idx]
 
141
  # )
142
  # https://www.onlinewebfonts.com/icon/418591
143
  gr.Image("images/VS.svg", scale=1, show_label=False,
144
+ show_download_button=False, container=False,
145
  show_share_button=False)
146
  with gr.Column(scale=6):
147
  with gr.Group():
util.py CHANGED
@@ -1,19 +1,16 @@
1
-
2
-
3
  import gradio as gr
4
  import json
5
  import pandas as pd
6
  from vocab import load_tokener
7
  from utils.zh_util import iter_vocab
8
-
9
-
10
 
11
 
12
  def tokenize(text, tokenizer_type, color_num=5, update=True):
13
  """
14
  TODO: cache tokenizer
15
  """
16
- print(f"入参:tokenize, {text}, {tokenizer_type}")
17
  pos_tokens = []
18
  tokenizer = load_tokener(tokenizer_type)
19
  encoding = tokenizer.encode(text)
@@ -31,7 +28,9 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
31
  token_str = token.decode("utf-8")
32
  except:
33
  token_str = token.decode("utf-8", errors="ignore")
34
- print("decode_error", tokenizer_type, token, token_str)
 
 
35
 
36
  token_bytes = token
37
  json_dumps = json.dumps(token_str)
@@ -54,7 +53,7 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
54
  )
55
 
56
  table_df = pd.DataFrame(table)
57
- print(f"Tokenization[{tokenizer_type}]: {table}")
58
  # print(table_df)
59
 
60
  if update:
@@ -82,16 +81,14 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
82
  vocab2 = tokenizer2.get_vocab()
83
  overlap_tokens = vocab1.keys() & vocab2.keys()
84
  overlap_token_size = len(overlap_tokens)
85
- print(f"OverlapTokens: {tokenizer_type_1}, {tokenizer_type_2} {list(overlap_tokens)[:10]}")
86
  return overlap_token_size, overlap_token_size
87
 
88
 
89
-
90
-
91
  def test_coding():
92
  bytes1 = b'\xe4\xb8\xad'
93
  print(bytes1) # b'\xe4\xb8\xad'
94
 
95
 
96
  if __name__ == "__main__":
97
- print(basic_count("internlm_chat_7b"))
 
 
 
1
  import gradio as gr
2
  import json
3
  import pandas as pd
4
  from vocab import load_tokener
5
  from utils.zh_util import iter_vocab
6
+ from utils.log_util import logger
 
7
 
8
 
9
  def tokenize(text, tokenizer_type, color_num=5, update=True):
10
  """
11
  TODO: cache tokenizer
12
  """
13
+ logger.info("[param]:" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
14
  pos_tokens = []
15
  tokenizer = load_tokener(tokenizer_type)
16
  encoding = tokenizer.encode(text)
 
28
  token_str = token.decode("utf-8")
29
  except:
30
  token_str = token.decode("utf-8", errors="ignore")
31
+ logger.info("[decode_error]: " + json.dumps(
32
+ {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
33
+ ensure_ascii=False))
34
 
35
  token_bytes = token
36
  json_dumps = json.dumps(token_str)
 
53
  )
54
 
55
  table_df = pd.DataFrame(table)
56
+ logger.info(f"[Tokens {tokenizer_type}]: {table[:2]}")
57
  # print(table_df)
58
 
59
  if update:
 
81
  vocab2 = tokenizer2.get_vocab()
82
  overlap_tokens = vocab1.keys() & vocab2.keys()
83
  overlap_token_size = len(overlap_tokens)
84
+ logger.info(f"[OverlapTokens of {tokenizer_type_1} {tokenizer_type_2}]: {list(overlap_tokens)[:10]}")
85
  return overlap_token_size, overlap_token_size
86
 
87
 
 
 
88
  def test_coding():
89
  bytes1 = b'\xe4\xb8\xad'
90
  print(bytes1) # b'\xe4\xb8\xad'
91
 
92
 
93
  if __name__ == "__main__":
94
+ print(basic_count("internlm_chat_7b"))
utils/byte_util.py ADDED
File without changes
utils/log_util.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+
4
+ logging.basicConfig(
5
+ format='%(asctime)s - %(filename)s - %(levelname)s - %(process)d - %(thread)d - %(message)s',
6
+ datefmt="%Y-%m-%d %H:%M:%S",
7
+ )
8
+
9
+ logger = logging.getLogger(__name__)
10
+ logger.setLevel(logging.INFO)