'''
    This file prepares the dataset as a CSV file following the format required by Open-Sora.
'''
import os, sys, shutil
import json
import csv

# Import files from the local folder
root_path = os.path.abspath('.')
sys.path.append(root_path)
# from curation_pipeline.prepare_bridge_v1 import read_bridge_v1
# from curation_pipeline.prepare_bridge_v2 import read_bridge_v2


def iter_dataset(dataset_path):
    lists = []
    for sub_folder_name in os.listdir(dataset_path):
        sub_folder_path = os.path.join(dataset_path, sub_folder_name)

        # Count the sequentially named frames (im_0.jpg, im_1.jpg, ...); stop at the first gap
        num_frames = 0
        while os.path.exists(os.path.join(sub_folder_path, 'im_' + str(num_frames) + '.jpg')):
            num_frames += 1

        # Read the language instruction for this episode
        txt_path = os.path.join(sub_folder_path, "lang.txt")
        with open(txt_path, "r") as f:
            lang_prompt = f.readline().strip()

        lists.append([sub_folder_path, lang_prompt, num_frames, 480, 640])
        # break

    return lists
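
# Example of one returned row (illustrative values only; the episode folder name, prompt,
# and frame count below are hypothetical, while 480x640 is the resolution hard-coded above):
#   ["/path/to/bridge_v1_raw/episode_0001", "put the spoon in the pot", 30, 480, 640]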
| if __name__ == "__main__": | |
| v1_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v1_raw" | |
| v2_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v2_raw" | |
| store_name = "Bridge_raw.csv" | |
| if os.path.exists(store_name): | |
| os.remove(store_name) | |
| # Execute | |
| full_lists = [["path", "text", "num_frames", "height", "width"]] | |
| v1_lists = iter_dataset(v1_dataset_path) | |
| full_lists.extend(v1_lists) | |
| v2_lists = iter_dataset(v2_dataset_path) | |
| full_lists.extend(v2_lists) | |
| print("Full length is ", len(full_lists)) | |
| # Store as csv file | |
| with open(store_name, 'w') as outfile: | |
| write = csv.writer(outfile) | |
| write.writerows(full_lists) | |

    # with open('output.jsonl', 'w') as outfile:
    #     for entry in JSON_file:
    #         json.dump(entry, outfile)
    #         outfile.write('\n')
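
    # Optional sanity check (added sketch, not part of the original pipeline): re-read the
    # CSV and confirm that the header plus one row per episode were written.
    with open(store_name, "r", newline='') as infile:
        rows = list(csv.reader(infile))
    assert rows[0] == ["path", "text", "num_frames", "height", "width"]
    print("Wrote", len(rows) - 1, "episode rows to", store_name)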