Spaces:
Running
on
Zero
Running
on
Zero
'''
Prepare the dataset as a CSV file following the format required by Open-Sora.
'''
import os, sys, shutil | |
import json | |
import csv | |
# Import files from the local folder | |
root_path = os.path.abspath('.') | |
sys.path.append(root_path) | |
# from curation_pipeline.prepare_bridge_v1 import read_bridge_v1 | |
# from curation_pipeline.prepare_bridge_v2 import read_bridge_v2 | |
def iter_dataset(dataset_path):
    """Collect one CSV row per trajectory sub-folder of *dataset_path*.

    Each sub-folder is expected to hold sequentially numbered frames
    (im_0.jpg, im_1.jpg, ...) plus a ``lang.txt`` file whose first line
    is the language instruction for that trajectory.

    Args:
        dataset_path: Root folder whose immediate sub-folders are trajectories.

    Returns:
        List of rows ``[path, text, num_frames, height, width]``; height and
        width are hard-coded to 480x640 (presumably the Bridge camera
        resolution — TODO confirm).
    """
    rows = []
    # sorted() makes the row order deterministic across filesystems.
    for sub_folder_name in sorted(os.listdir(dataset_path)):
        sub_folder_path = os.path.join(dataset_path, sub_folder_name)
        if not os.path.isdir(sub_folder_path):
            continue  # skip stray files sitting next to trajectory folders

        # Frames must exist sequentially: count im_0.jpg, im_1.jpg, ... up to
        # the first gap.  (The original iterated range(len(listdir(...))),
        # which left the counter undefined for an empty folder and relied on
        # lang.txt padding the file count to avoid an off-by-one.)
        num_frames = 0
        while os.path.exists(
            os.path.join(sub_folder_path, "im_" + str(num_frames) + ".jpg")
        ):
            num_frames += 1

        # First line of lang.txt is the prompt; strip the trailing newline so
        # it does not leak into the CSV cell.  `with` closes the handle (the
        # original leaked it).
        txt_path = os.path.join(sub_folder_path, "lang.txt")
        with open(txt_path, "r") as f:
            lang_prompt = f.readline().strip()

        rows.append([sub_folder_path, lang_prompt, num_frames, 480, 640])
    return rows
if __name__ == "__main__": | |
v1_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v1_raw" | |
v2_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v2_raw" | |
store_name = "Bridge_raw.csv" | |
if os.path.exists(store_name): | |
os.remove(store_name) | |
# Execute | |
full_lists = [["path", "text", "num_frames", "height", "width"]] | |
v1_lists = iter_dataset(v1_dataset_path) | |
full_lists.extend(v1_lists) | |
v2_lists = iter_dataset(v2_dataset_path) | |
full_lists.extend(v2_lists) | |
print("Full length is ", len(full_lists)) | |
# Store as csv file | |
with open(store_name, 'w') as outfile: | |
write = csv.writer(outfile) | |
write.writerows(full_lists) | |
# with open('output.jsonl', 'w') as outfile: | |
# for entry in JSON_file: | |
# json.dump(entry, outfile) | |
# outfile.write('\n') |