update
Browse files
- app.py +6 -3
- util.py +8 -11
- utils/byte_util.py +0 -0
- utils/log_util.py +10 -0
app.py
CHANGED
@@ -16,7 +16,7 @@
|
|
16 |
- 词典支持下载
|
17 |
- 中文字词统计,是否要包括 _ G 等字符
|
18 |
- baichuan的单字数量怎么两万多个?
|
19 |
-
-
|
20 |
|
21 |
|
22 |
plots
|
@@ -35,7 +35,6 @@ table
|
|
35 |
"""
|
36 |
|
37 |
import gradio as gr
|
38 |
-
|
39 |
from vocab import all_tokenizers
|
40 |
from util import *
|
41 |
|
@@ -48,6 +47,10 @@ examples = [
|
|
48 |
["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
|
49 |
]
|
50 |
|
|
|
|
|
|
|
|
|
51 |
|
52 |
def example_fn(example_idx):
|
53 |
return examples[example_idx]
|
@@ -138,7 +141,7 @@ with gr.Blocks(css="style.css") as demo:
|
|
138 |
# )
|
139 |
# https://www.onlinewebfonts.com/icon/418591
|
140 |
gr.Image("images/VS.svg", scale=1, show_label=False,
|
141 |
-
show_download_button=
|
142 |
show_share_button=False)
|
143 |
with gr.Column(scale=6):
|
144 |
with gr.Group():
|
|
|
16 |
- 词典支持下载
|
17 |
- 中文字词统计,是否要包括 _ G 等字符
|
18 |
- baichuan的单字数量怎么两万多个?
|
19 |
+
- OOV
|
20 |
|
21 |
|
22 |
plots
|
|
|
35 |
"""
|
36 |
|
37 |
import gradio as gr
|
|
|
38 |
from vocab import all_tokenizers
|
39 |
from util import *
|
40 |
|
|
|
47 |
["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
|
48 |
]
|
49 |
|
50 |
+
# jieba.enable_parallel() # flask中没办法parallel
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
|
55 |
def example_fn(example_idx):
|
56 |
return examples[example_idx]
|
|
|
141 |
# )
|
142 |
# https://www.onlinewebfonts.com/icon/418591
|
143 |
gr.Image("images/VS.svg", scale=1, show_label=False,
|
144 |
+
show_download_button=False, container=False,
|
145 |
show_share_button=False)
|
146 |
with gr.Column(scale=6):
|
147 |
with gr.Group():
|
util.py
CHANGED
@@ -1,19 +1,16 @@
|
|
1 |
-
|
2 |
-
|
3 |
import gradio as gr
|
4 |
import json
|
5 |
import pandas as pd
|
6 |
from vocab import load_tokener
|
7 |
from utils.zh_util import iter_vocab
|
8 |
-
|
9 |
-
|
10 |
|
11 |
|
12 |
def tokenize(text, tokenizer_type, color_num=5, update=True):
|
13 |
"""
|
14 |
TODO: cache tokenizer
|
15 |
"""
|
16 |
-
|
17 |
pos_tokens = []
|
18 |
tokenizer = load_tokener(tokenizer_type)
|
19 |
encoding = tokenizer.encode(text)
|
@@ -31,7 +28,9 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
|
|
31 |
token_str = token.decode("utf-8")
|
32 |
except:
|
33 |
token_str = token.decode("utf-8", errors="ignore")
|
34 |
-
|
|
|
|
|
35 |
|
36 |
token_bytes = token
|
37 |
json_dumps = json.dumps(token_str)
|
@@ -54,7 +53,7 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
|
|
54 |
)
|
55 |
|
56 |
table_df = pd.DataFrame(table)
|
57 |
-
|
58 |
# print(table_df)
|
59 |
|
60 |
if update:
|
@@ -82,16 +81,14 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
|
|
82 |
vocab2 = tokenizer2.get_vocab()
|
83 |
overlap_tokens = vocab1.keys() & vocab2.keys()
|
84 |
overlap_token_size = len(overlap_tokens)
|
85 |
-
|
86 |
return overlap_token_size, overlap_token_size
|
87 |
|
88 |
|
89 |
-
|
90 |
-
|
91 |
def test_coding():
|
92 |
bytes1 = b'\xe4\xb8\xad'
|
93 |
print(bytes1) # b'\xe4\xb8\xad'
|
94 |
|
95 |
|
96 |
if __name__ == "__main__":
|
97 |
-
print(basic_count("internlm_chat_7b"))
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
import pandas as pd
|
4 |
from vocab import load_tokener
|
5 |
from utils.zh_util import iter_vocab
|
6 |
+
from utils.log_util import logger
|
|
|
7 |
|
8 |
|
9 |
def tokenize(text, tokenizer_type, color_num=5, update=True):
|
10 |
"""
|
11 |
TODO: cache tokenizer
|
12 |
"""
|
13 |
+
logger.info("[param]:" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
|
14 |
pos_tokens = []
|
15 |
tokenizer = load_tokener(tokenizer_type)
|
16 |
encoding = tokenizer.encode(text)
|
|
|
28 |
token_str = token.decode("utf-8")
|
29 |
except:
|
30 |
token_str = token.decode("utf-8", errors="ignore")
|
31 |
+
logger.info("[decode_error]: " + json.dumps(
|
32 |
+
{"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
|
33 |
+
ensure_ascii=False))
|
34 |
|
35 |
token_bytes = token
|
36 |
json_dumps = json.dumps(token_str)
|
|
|
53 |
)
|
54 |
|
55 |
table_df = pd.DataFrame(table)
|
56 |
+
logger.info(f"[Tokens {tokenizer_type}]: {table[:2]}")
|
57 |
# print(table_df)
|
58 |
|
59 |
if update:
|
|
|
81 |
vocab2 = tokenizer2.get_vocab()
|
82 |
overlap_tokens = vocab1.keys() & vocab2.keys()
|
83 |
overlap_token_size = len(overlap_tokens)
|
84 |
+
logger.info(f"[OverlapTokens of {tokenizer_type_1} {tokenizer_type_2}]: {list(overlap_tokens)[:10]}")
|
85 |
return overlap_token_size, overlap_token_size
|
86 |
|
87 |
|
|
|
|
|
88 |
def test_coding():
|
89 |
bytes1 = b'\xe4\xb8\xad'
|
90 |
print(bytes1) # b'\xe4\xb8\xad'
|
91 |
|
92 |
|
93 |
if __name__ == "__main__":
|
94 |
+
print(basic_count("internlm_chat_7b"))
|
utils/byte_util.py
ADDED
File without changes
|
utils/log_util.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import logging
|
3 |
+
|
4 |
+
logging.basicConfig(
|
5 |
+
format='%(asctime)s - %(filename)s - %(levelname)s - %(process)d - %(thread)d - %(message)s',
|
6 |
+
datefmt="%Y-%m-%d %H:%M:%S",
|
7 |
+
)
|
8 |
+
|
9 |
+
logger = logging.getLogger(__name__)
|
10 |
+
logger.setLevel(logging.INFO)
|