''' This script is used to prepare the Bridge V2 dataset '''
import os, sys, shutil


def read_bridge_v2(dataset_path, train_store_path, test_store_path, test_dataset_lists, copyfile=True):
    """Walk the raw Bridge V2 tree and collect (and optionally copy) trajectory image folders.

    Expected layout under ``dataset_path``::

        <scene>/<task>/<order>/<time>/raw/traj_group0/<traj>/images0/im_<k>.jpg

    Each valid trajectory must also contain ``policy_out.pkl`` and ``lang.txt``.

    Args:
        dataset_path: Root directory of the raw Bridge V2 dataset.
        train_store_path: Destination directory for training trajectories.
        test_store_path: Destination directory for test trajectories.
        test_dataset_lists: Paths (relative to ``dataset_path``) that belong to
            the test split; everything else goes to the training split.
        copyfile: When True (the usual case), copy each valid trajectory into a
            sequentially numbered folder under the chosen store path.

    Returns:
        List of every image-folder path found (recorded even when copying is off
        or a copy later fails the sanity check).
    """
    start_idx = 0
    target_lists = []
    prefix_len = len(dataset_path) + 1  # strip "<dataset_path>/" to get the split-relative path

    # Iterate all the folders inside: scene -> task -> order -> time -> trajectory
    for scene_name in sorted(os.listdir(dataset_path)):
        print("We are reading scene ", scene_name)
        scene_dir = os.path.join(dataset_path, scene_name)
        for task_name in sorted(os.listdir(scene_dir)):
            task_dir = os.path.join(scene_dir, task_name)
            for order_name in sorted(os.listdir(task_dir)):
                order_dir = os.path.join(task_dir, order_name)
                for time_clock in sorted(os.listdir(order_dir)):
                    if time_clock == "lmdb":
                        continue  # Skip lmdb folder
                    time_dir = os.path.join(order_dir, time_clock, "raw", "traj_group0")
                    if not os.path.exists(time_dir):
                        print("time_dir does not exist for ", time_dir)
                        continue
                    for traj_name in sorted(os.listdir(time_dir)):
                        traj_path = os.path.join(time_dir, traj_name)
                        if not os.path.isdir(traj_path):
                            print("traj_path does not exist for ", traj_path)
                            continue

                        # Require policy_out.pkl; it is copied along since there may
                        # also be valuable information there
                        policy_out_file_path = os.path.join(traj_path, "policy_out.pkl")
                        if not os.path.exists(policy_out_file_path):
                            continue

                        # Require the language-annotation txt file
                        lang_txt_file_path = os.path.join(traj_path, "lang.txt")
                        if not os.path.exists(lang_txt_file_path):
                            continue

                        for img_name in sorted(os.listdir(traj_path)):
                            if img_name != "images0":  # Only consider one camera angle
                                continue
                            img_folder_path = os.path.join(traj_path, img_name)
                            if not os.path.isdir(img_folder_path):
                                print("img_folder_path does not exist for ", img_folder_path)
                                continue

                            ################################ Main Process ################################
                            target_lists.append(img_folder_path)

                            if copyfile:
                                print("img_folder_path[prefix_len:] is ", img_folder_path[prefix_len:])
                                if img_folder_path[prefix_len:] in test_dataset_lists:
                                    # Store to test set
                                    target_dir = os.path.join(test_store_path, str(start_idx))
                                else:
                                    # This is training set
                                    target_dir = os.path.join(train_store_path, str(start_idx))

                                # Now we can copy the folder to our destination
                                # BUGFIX: log the actual destination (the original always
                                # printed the train path, even for test-split copies)
                                print("Copy " + str(img_folder_path) + " to " + str(target_dir))
                                shutil.copytree(img_folder_path, target_dir)

                                # Sanity check: frames must exist sequentially as
                                # im_0.jpg .. im_{N-1}.jpg in the copied folder
                                length = len(os.listdir(target_dir))
                                status = True
                                for check_idx in range(length):
                                    if not os.path.exists(os.path.join(target_dir, 'im_' + str(check_idx) + '.jpg')):
                                        status = False
                                        break
                                if not status:
                                    # If they didn't have sequential files we need, we will
                                    # remove and begin again without updating start_idx
                                    print("This file cannot pass the sanity check. We will remove it!")
                                    shutil.rmtree(target_dir)
                                    continue

                                # Move other auxiliary files
                                shutil.copy(policy_out_file_path, os.path.join(target_dir, "policy_out.pkl"))
                                shutil.copy(lang_txt_file_path, os.path.join(target_dir, "lang.txt"))

                                # Update the idx
                                start_idx += 1
                                print("We have ", start_idx)

    # Return a list of file path
    return target_lists


if __name__ == "__main__":
    dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v2"
    train_store_path = "../sanity_check/bridge_v2_raw"
    test_store_path = "../sanity_check/bridge_v2_test_raw"
    test_dataset_predefined_path = "test_path_v2.txt"

    # Make dir if needed (always start from a clean destination)
    if os.path.exists(train_store_path):
        shutil.rmtree(train_store_path)
    os.makedirs(train_store_path)
    if os.path.exists(test_store_path):
        shutil.rmtree(test_store_path)
    os.makedirs(test_store_path)

    # Read Test dataset path.
    # BUGFIX: close the file via `with`, and strip only the trailing newline —
    # line[:-1] would eat the last character of a file without a final newline.
    test_dataset_lists = []
    with open(test_dataset_predefined_path, "r") as read_file:
        for line in read_file:
            test_dataset_lists.append(line.rstrip("\n"))
    print("test_dataset_lists is ", test_dataset_lists)

    read_bridge_v2(dataset_path, train_store_path, test_store_path, test_dataset_lists)