|
import json
from collections import Counter, defaultdict
|
|
|
all_zh_words = defaultdict(int) |
|
for model_name in [ |
|
"gpt2", |
|
"gpt2_chinese", |
|
"chinese_llama_lora_7b", |
|
"bert_chinese", |
|
"moss", |
|
"bloom", |
|
"bloomz_6b4_zh", |
|
"gpt_nexo_20b", |
|
"gpt_neox_chinese_v1", |
|
"glm_chinese", |
|
"chatglm" |
|
]: |
|
zh_word_set = set() |
|
for line in open(model_name + "_vocab.zh.jsonl", "r", encoding="utf-8"): |
|
item = json.loads(line) |
|
token = item["token"] |
|
if item["type"] in ["中文单字", "中文多字"]: |
|
zh_word_set.add(token.strip()) |
|
|
|
for word in zh_word_set: |
|
all_zh_words[word] += 1 |
|
|
|
sorted_keywords = sorted(all_zh_words.items(), key=lambda kv: kv[1], reverse=True) |
|
|
|
with open("vocab.freq.zh.txt", "w", encoding="utf-8") as f_out: |
|
for word, count in sorted_keywords: |
|
f_out.write("%s\t%d\n" % (word, count)) |
|
|