''' This file is to prepare the dataset in csv file following the format required by Opne-SORA ''' import os, sys, shutil import json import csv # Import files from the local folder root_path = os.path.abspath('.') sys.path.append(root_path) # from curation_pipeline.prepare_bridge_v1 import read_bridge_v1 # from curation_pipeline.prepare_bridge_v2 import read_bridge_v2 def iter_dataset(dataset_path): lists = [] for sub_folder_name in os.listdir(dataset_path): sub_folder_path = os.path.join(dataset_path, sub_folder_name) # Check number of frames max_length = len(os.listdir(sub_folder_path)) for check_idx in range(max_length): if not os.path.exists(os.path.join(sub_folder_path, 'im_' + str(check_idx) + '.jpg')): # Should be sequentially exists break num_frames = check_idx # Read the text txt_path = os.path.join(sub_folder_path, "lang.txt") f = open(txt_path, "r") lang_prompt = f.readline() lists.append([sub_folder_path, lang_prompt, num_frames, 480, 640]) # break return lists if __name__ == "__main__": v1_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v1_raw" v2_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v2_raw" store_name = "Bridge_raw.csv" if os.path.exists(store_name): os.remove(store_name) # Execute full_lists = [["path", "text", "num_frames", "height", "width"]] v1_lists = iter_dataset(v1_dataset_path) full_lists.extend(v1_lists) v2_lists = iter_dataset(v2_dataset_path) full_lists.extend(v2_lists) print("Full length is ", len(full_lists)) # Store as csv file with open(store_name, 'w') as outfile: write = csv.writer(outfile) write.writerows(full_lists) # with open('output.jsonl', 'w') as outfile: # for entry in JSON_file: # json.dump(entry, outfile) # outfile.write('\n')