File size: 2,642 Bytes
3daa9d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import sys
import traceback
import pickle
import os
import concurrent.futures
from tqdm import tqdm
import time
from font_dataset.font import load_fonts
import cv2
# Multiplier applied to the per-font image count of CJK fonts.
cjk_ratio = 3

# Per-font image counts for the train / val / test splits.
train_cnt, val_cnt, test_cnt = 100, 5, 30

# The same counts scaled for CJK fonts.
train_cnt_cjk, val_cnt_cjk, test_cnt_cjk = (
    int(base_cnt * cjk_ratio) for base_cnt in (train_cnt, val_cnt, test_cnt)
)

# Root directory holding one sub-directory per split; created on import.
dataset_path = "./dataset/font_img"
os.makedirs(dataset_path, exist_ok=True)

# Timestamped log file name and the list of font paths excluded at runtime.
unqualified_log_file_name = "unqualified_font_{}.txt".format(time.time())
runtime_exclusion_list = list()

# The font list plus the predicate deciding whether a font is excluded.
fonts, exclusion_rule = load_fonts()
def generate_dataset(dataset_type: str, cnt: int):
    """Check the cached image/label pairs of one dataset split for damage.

    Despite its name, this function generates nothing: for every font in the
    module-level ``fonts`` list it looks at the expected files
    ``font_{i}_img_{j}.jpg`` and ``font_{i}_img_{j}.bin`` inside
    ``<dataset_path>/<dataset_type>`` and

    * reports pairs where either file is missing (nothing is deleted), and
    * reports and deletes pairs whose image cannot be decoded or whose label
      cannot be unpickled.

    Fonts rejected by ``exclusion_rule`` or listed in
    ``runtime_exclusion_list`` are skipped.

    Args:
        dataset_type: Name of the split; used as the sub-directory of
            ``dataset_path`` and in the progress messages.
        cnt: Number of images expected per font. Fonts whose ``language`` is
            ``"CJK"`` are expected to have ``cnt * cjk_ratio`` images.
    """
    dataset_base_dir = os.path.join(dataset_path, dataset_type)
    os.makedirs(dataset_base_dir, exist_ok=True)

    def _generate_single(args):
        """Validate one ``(font index, image index, font)`` work item."""
        i, j, font = args
        print(
            f"Checking {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}",
            end="\r",
        )

        if exclusion_rule(font):
            print(f"Excluded font: {font.path}")
            return
        if font.path in runtime_exclusion_list:
            print(f"Excluded font: {font.path}")
            return

        image_file_path = os.path.join(dataset_base_dir, f"font_{i}_img_{j}.jpg")
        label_file_path = os.path.join(dataset_base_dir, f"font_{i}_img_{j}.bin")

        # detect cache
        if not (os.path.exists(image_file_path) and os.path.exists(label_file_path)):
            print(
                f"Missing {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}"
            )
            # An incomplete pair cannot be validated. Falling through used to
            # raise FileNotFoundError from the cleanup below and abort the
            # whole scan, so stop here instead.
            return

        # detect broken
        try:
            # cv2.imread does not raise on an unreadable file; it returns
            # None, so that case has to be turned into an error explicitly.
            if cv2.imread(image_file_path) is None:
                raise ValueError(f"cannot decode image: {image_file_path}")
            # NOTE(review): unpickling executes arbitrary code; this assumes
            # the label files were written by this project - confirm.
            with open(label_file_path, "rb") as f:
                pickle.load(f)
        except Exception:
            # Deliberately broad: a damaged pickle can raise many different
            # exception types, and any failure means the pair is unusable.
            print(
                f"Broken {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}"
            )
            # Drop both halves of the pair; tolerate a file that vanished
            # in the meantime.
            for stale_path in (image_file_path, label_file_path):
                if os.path.exists(stale_path):
                    os.remove(stale_path)

    # One work item per expected image of every font.
    work_list = []
    for i, font in enumerate(fonts):
        if font.language == "CJK":
            # int() mirrors how the module-level *_cnt_cjk values are derived.
            true_cnt = int(cnt * cjk_ratio)
        else:
            true_cnt = cnt
        work_list.extend((i, j, font) for j in range(true_cnt))

    for work_item in tqdm(work_list):
        _generate_single(work_item)
# Run the check over every split, each with its own per-font image count.
for _split_name, _split_cnt in (
    ("train", train_cnt),
    ("val", val_cnt),
    ("test", test_cnt),
):
    generate_dataset(_split_name, _split_cnt)
|