# YuzuMarker.FontDetection / font_ds_detect_broken.py
# Commit 3daa9d7 by gyrojeff: "feat: add broken detection script"
# (Repository web-viewer metadata: no virus detected, file size 2.64 kB.)
import sys
import traceback
import pickle
import os
import concurrent.futures
from tqdm import tqdm
import time
from font_dataset.font import load_fonts
import cv2
# Multiplier applied to the per-font sample count when a font's language is
# "CJK" (see generate_dataset).
cjk_ratio = 3
# Per-font sample counts passed to generate_dataset for each split.
train_cnt = 100
val_cnt = 5
test_cnt = 30
# The same counts scaled by cjk_ratio.
# NOTE(review): not referenced in the visible part of this script;
# generate_dataset recomputes the scaled count itself.
train_cnt_cjk = int(train_cnt * cjk_ratio)
val_cnt_cjk = int(val_cnt * cjk_ratio)
test_cnt_cjk = int(test_cnt * cjk_ratio)
# Root directory of the image dataset; each split lives in a sub-directory
# named after it. Created at import time if it does not exist yet.
dataset_path = "./dataset/font_img"
os.makedirs(dataset_path, exist_ok=True)
# Timestamped log file name.
# NOTE(review): never written to in the visible part of this script.
unqualified_log_file_name = f"unqualified_font_{time.time()}.txt"
# Font paths to skip in addition to those rejected by exclusion_rule.
# NOTE(review): nothing visible here ever appends to this list.
runtime_exclusion_list = []
# load_fonts() yields the font collection plus a predicate; fonts for which
# exclusion_rule(font) is truthy are skipped by generate_dataset.
fonts, exclusion_rule = load_fonts()
def _remove_if_exists(path):
    """Delete ``path``; do nothing if the file is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def generate_dataset(dataset_type: str, cnt: int):
    """Scan one dataset split for missing or broken samples and delete them.

    For every font in the module-level ``fonts`` collection, and for every
    expected sample index of that font, the image file
    ``font_{i}_img_{j}.jpg`` and the label file ``font_{i}_img_{j}.bin``
    inside ``<dataset_path>/<dataset_type>`` are checked:

    * If either file is missing, the sample is reported as "Missing" and any
      leftover half of the pair is deleted.
    * If the image cannot be decoded by ``cv2.imread`` or the label cannot be
      unpickled, the sample is reported as "Broken" and both files are
      deleted.

    Fonts rejected by ``exclusion_rule`` or listed in
    ``runtime_exclusion_list`` are skipped.

    Args:
        dataset_type: Name of the split; also the sub-directory name under
            ``dataset_path`` (created if absent).
        cnt: Number of samples expected per font. Fonts whose ``language`` is
            ``"CJK"`` are expected to have ``int(cnt * cjk_ratio)`` samples.

    Returns:
        None. Results are printed to stdout and bad files are removed from
        disk.
    """
    dataset_base_dir = os.path.join(dataset_path, dataset_type)
    os.makedirs(dataset_base_dir, exist_ok=True)

    def _generate_single(args):
        """Validate the sample described by ``args = (i, j, font)``."""
        i, j, font = args
        # Transient status line, overwritten in place thanks to end="\r".
        print(
            f"Checking {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}",
            end="\r",
        )

        if exclusion_rule(font):
            print(f"Excluded font: {font.path}")
            return
        if font.path in runtime_exclusion_list:
            print(f"Excluded font: {font.path}")
            return

        image_file_name = f"font_{i}_img_{j}.jpg"
        label_file_name = f"font_{i}_img_{j}.bin"
        image_file_path = os.path.join(dataset_base_dir, image_file_name)
        label_file_path = os.path.join(dataset_base_dir, label_file_name)

        # Missing sample: nothing to validate. Clean up a possible orphaned
        # half of the pair and stop here, instead of falling through to the
        # integrity check (which used to crash on os.remove of a file that
        # does not exist).
        if not (
            os.path.exists(image_file_path) and os.path.exists(label_file_path)
        ):
            print(
                f"Missing {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}"
            )
            _remove_if_exists(image_file_path)
            _remove_if_exists(label_file_path)
            return

        # Integrity check of both files.
        try:
            # cv2.imread signals an unreadable image by returning None rather
            # than raising, so the result has to be inspected explicitly.
            image_ok = cv2.imread(image_file_path) is not None
            # NOTE: pickle.load can execute arbitrary code embedded in the
            # file; only run this script on label files produced by this
            # project.
            with open(label_file_path, "rb") as f:
                pickle.load(f)
        except Exception:
            # Deliberately broad: a corrupted pickle can raise many different
            # exception types, and any of them means the sample is unusable.
            intact = False
        else:
            intact = image_ok

        if not intact:
            print(
                f"Broken {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}"
            )
            _remove_if_exists(image_file_path)
            _remove_if_exists(label_file_path)

    # Build the full list of (font index, sample index, font) work items.
    work_list = []
    for i, font in enumerate(fonts):
        # int() keeps this consistent with the module-level *_cnt_cjk values
        # and keeps range() valid should cjk_ratio ever be fractional.
        true_cnt = int(cnt * cjk_ratio) if font.language == "CJK" else cnt
        work_list.extend((i, j, font) for j in range(true_cnt))

    for work_item in tqdm(work_list):
        _generate_single(work_item)
# Check every split in turn, each with its configured per-font sample count.
for _split_name, _split_cnt in (
    ("train", train_cnt),
    ("val", val_cnt),
    ("test", test_cnt),
):
    generate_dataset(_split_name, _split_cnt)