ayousanz's picture
Add files using upload-large-folder tool
1e00de9 verified
from curses import meta
import os
import pandas as pd
import json
from tqdm import tqdm
# Lookup table from AudioSet clip basename (extension stripped) to
# [wav_path, labels]. Starts empty; filled by build_audioset_metadata().
AUDIOSET_PATH_DICT = dict()

# Directory that is joined with "Y<youtube_id>.npy" to form each output
# entry's "seg_label" path.
segment_label_path = "/mnt/fast/nobackup/scratch4weeks/hl01486/datasets/audiocaps_segment_labels/averaged"

# AudioCaps CSV files converted by the script at the bottom of this file,
# in this order. Paths are relative to the current working directory.
audiocaps_csv = ["test.csv", "train.csv", "val.csv"]

# AudioSet metadata JSON files, keyed by split name. Every file listed here
# is read when the lookup table is built.
audioset_path = dict(
    bal_unbal_train="/mnt/fast/nobackup/users/hl01486/metadata/audioset/datafiles/audioset_bal_unbal_train_data.json",
    eval="/mnt/fast/nobackup/users/hl01486/metadata/audioset/datafiles/audioset_eval_data.json",
    bal_train="/mnt/fast/nobackup/users/hl01486/metadata/audioset/datafiles/audioset_bal_train_data.json",
)
def read_json(dataset_json_file):
    """Load a dataset JSON file and return the value under its "data" key.

    Args:
        dataset_json_file: Path to a JSON file whose top-level object
            contains a "data" entry.

    Returns:
        The value stored under the top-level "data" key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the top-level object has no "data" key.
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's locale-dependent default encoding.
    with open(dataset_json_file, "r", encoding="utf-8") as fp:
        return json.load(fp)["data"]
def build_audioset_metadata():
    """Populate ``AUDIOSET_PATH_DICT`` from every file in ``audioset_path``.

    Each metadata entry's "wav" path is reduced to its basename without the
    file extension, and that stem is mapped to ``[wav_path, labels]``. Files
    are processed in the order they appear in ``audioset_path``; when the
    same stem occurs more than once, the entry read last wins.

    Raises:
        KeyError: If an entry lacks a "wav" or "labels" field.
    """
    for json_path in audioset_path.values():
        for entry in read_json(json_path):
            wav_path = entry["wav"]
            labels = entry["labels"]
            # splitext removes whatever extension is present; slicing off a
            # fixed four characters is only right for ".wav"-length suffixes.
            stem, _ext = os.path.splitext(os.path.basename(wav_path))
            AUDIOSET_PATH_DICT[stem] = [wav_path, labels]
def read_audiocaps_csv(csvfile):
    """Convert an AudioCaps CSV into a list of metadata dicts.

    Every row is resolved through ``lookup_path``. Rows that cannot be
    resolved are reported on stdout and skipped, so the result may contain
    fewer items than the CSV has rows.

    Args:
        csvfile: Path to a CSV file with at least "youtube_id" and
            "caption" columns.

    Returns:
        A list of dicts, in CSV row order, each with the keys "wav",
        "seg_label", "labels" and "caption".
    """
    df = pd.read_csv(csvfile)
    data_list = []
    # iterrows() is a generator with no length, so pass the row count
    # explicitly to let tqdm display real progress.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        youtube_id = row["youtube_id"]
        caption = row["caption"]
        try:
            wav, labels = lookup_path(youtube_id)
        except (KeyError, TypeError) as e:
            # KeyError: the clip is absent from the lookup table.
            # TypeError: the id is not a string (e.g. an empty cell read as
            # NaN). Either way the row is skipped, best-effort.
            print(e)
            continue
        data_list.append({
            "wav": wav,
            "seg_label": os.path.join(segment_label_path, "Y%s.npy" % youtube_id),
            "labels": labels,
            "caption": caption,
        })
    return data_list
def lookup_path(youtube_id):
    """Return the ``(wav_path, labels)`` pair recorded for a YouTube id.

    The id is prefixed with "Y" to form the key into ``AUDIOSET_PATH_DICT``.

    Raises:
        KeyError: If no entry exists for the prefixed id.
    """
    entry = AUDIOSET_PATH_DICT["Y" + youtube_id]
    wav_path = entry[0]
    labels = entry[1]
    return wav_path, labels
def save_json(wav_list, fname):
    """Write ``wav_list`` to ``datafiles/<fname>.json`` as ``{"data": wav_list}``.

    The output path is relative to the current working directory. An
    existing file with the same name is overwritten.

    Args:
        wav_list: JSON-serialisable list of metadata entries.
        fname: Output file name without the ".json" suffix.
    """
    # Create the output directory on demand so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs("datafiles", exist_ok=True)
    with open('datafiles/%s.json' % fname, 'w') as f:
        json.dump({'data': wav_list}, f, indent=1)
# Build the basename -> [wav_path, labels] lookup once, then convert each
# AudioCaps CSV into a JSON datafile.
build_audioset_metadata()
for csv_name in audiocaps_csv:
    # NOTE: fname keeps the ".csv" suffix, so the outputs are named
    # "datafiles/test.csv.json", "datafiles/train.csv.json", etc.
    save_json(read_audiocaps_csv(csv_name), fname=csv_name)