|
import os |
|
from multiprocessing import Pool |
|
|
|
from mmengine.logging import MMLogger |
|
from scenedetect import ContentDetector, detect |
|
from tqdm import tqdm |
|
|
|
from opensora.utils.misc import get_timestamp |
|
|
|
from .utils import check_mp4_integrity, clone_folder_structure, iterate_files, split_video |
|
|
|
|
|
# Clip-cutting settings; they are bundled into ``cfg`` below and forwarded
# as keyword arguments to ``split_video``.
target_fps = 30
shorter_size = 512
min_seconds = 1
max_seconds = 5

# Sanity check: the allowed clip-duration window must be non-empty.
assert max_seconds > min_seconds

cfg = {
    "target_fps": target_fps,
    "min_seconds": min_seconds,
    "max_seconds": max_seconds,
    "shorter_size": shorter_size,
}
|
|
|
|
|
def process_folder(root_src, root_dst):
    """Detect scenes in every ``.mp4`` under ``root_src`` and cut clips into ``root_dst``.

    A log file named ``<basename(root_dst)>_<timestamp>.log`` is written next to
    ``root_dst`` (i.e. in ``dirname(root_dst)``).

    Args:
        root_src: Root folder that is searched for source ``.mp4`` files.
        root_dst: Root folder that receives the clips; each clip is saved under
            the same relative sub-folder its source video has below ``root_src``.
    """
    folder_path_log = os.path.dirname(root_dst)
    log_name = os.path.basename(root_dst)
    timestamp = get_timestamp()
    log_path = os.path.join(folder_path_log, f"{log_name}_{timestamp}.log")

    # The logger is created before ``clone_folder_structure`` runs, so the
    # directory holding the log file may not exist yet (e.g. ``clips/`` when
    # ``root_dst`` is ``clips/split_0``). Create it up front; an empty dirname
    # means the current directory and needs no creation.
    if folder_path_log:
        os.makedirs(folder_path_log, exist_ok=True)
    logger = MMLogger.get_instance(log_name, log_file=log_path)

    # Presumably mirrors the directory tree of ``root_src`` under ``root_dst``
    # (helper defined in ``.utils``) -- TODO confirm.
    clone_folder_structure(root_src, root_dst)

    # Sorted so that videos are processed in a deterministic order.
    mp4_list = sorted(x for x in iterate_files(root_src) if x.endswith(".mp4"))

    # Wrap the list itself (not ``enumerate(...)``) so tqdm can read its
    # length and show a total / ETA.
    for sample_path in tqdm(mp4_list):
        folder_src = os.path.dirname(sample_path)
        folder_dst = os.path.join(root_dst, os.path.relpath(folder_src, root_src))

        # Skip source videos that fail the integrity check.
        if not check_mp4_integrity(sample_path, logger=logger):
            continue

        # Detect scene boundaries in the source video.
        scene_list = detect(sample_path, ContentDetector(), start_in_scene=True)

        # Cut the video into clips according to the detected scenes and ``cfg``.
        save_path_list = split_video(sample_path, scene_list, save_dir=folder_dst, **cfg, logger=logger)

        # Run the integrity check on every produced clip as well; the return
        # value is ignored here, the logger is passed along to the helper.
        for x in save_path_list:
            check_mp4_integrity(x, logger=logger)
|
|
|
|
|
def scene_detect(
    root_src="./data/your_dataset/raw_videos",
    root_dst="./data/your_dataset/clips",
):
    """Detect & cut scenes using a single process.

    Args:
        root_src: Folder containing the raw videos.
        root_dst: Folder that receives the cut clips.

    Expected dataset structure (with the default arguments):
        data/
            your_dataset/
                raw_videos/
                    xxx.mp4
                    yyy.mp4

    This function results in:
        data/
            your_dataset/
                raw_videos/
                    xxx.mp4
                    yyy.mp4
                clips/
                    xxx_scene-0.mp4
                    yyy_scene-0.mp4
                    yyy_scene-1.mp4
    """
    process_folder(root_src, root_dst)
|
|
|
|
|
def scene_detect_mp(
    root_src="./data/your_dataset/raw_videos",
    root_dst="./data/your_dataset/clips",
    splits=("split_0", "split_1"),
):
    """Detect & cut scenes using multiple processes, one per split folder.

    Args:
        root_src: Folder containing one sub-folder of raw videos per split.
        root_dst: Folder that receives one sub-folder of clips per split.
        splits: Names of the sub-folders of ``root_src`` to process. One worker
            process is started per entry. If empty, nothing is done.

    Expected dataset structure (with the default arguments):
        data/
            your_dataset/
                raw_videos/
                    split_0/
                        xxx.mp4
                        yyy.mp4
                    split_1/
                        xxx.mp4
                        yyy.mp4

    This function results in:
        data/
            your_dataset/
                raw_videos/
                    split_0/
                        xxx.mp4
                        yyy.mp4
                    split_1/
                        xxx.mp4
                        yyy.mp4
                clips/
                    split_0/
                        xxx_scene-0.mp4
                        yyy_scene-0.mp4
                    split_1/
                        xxx_scene-0.mp4
                        yyy_scene-0.mp4
                        yyy_scene-1.mp4
    """
    splits = list(splits)
    if not splits:
        # Nothing to process; also avoids ``Pool(processes=0)``, which raises
        # ValueError.
        return

    # One (source folder, destination folder) pair per split.
    folder_pairs = [(os.path.join(root_src, x), os.path.join(root_dst, x)) for x in splits]

    with Pool(processes=len(splits)) as pool:
        pool.starmap(process_folder, folder_pairs)
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the single-process pipeline.
    scene_detect()
|
|
|
|