{ "feature_extractor": { "init_args": { "sample_rate": 24000, "n_fft": 1024, "hop_length": 256, "n_mels": 100 } }, "backbone": { "init_args": { "input_channels": 100, "dim": 512, "intermediate_dim": 1536, "num_layers": 8 } }, "head": { "init_args": { "dim": 512, "n_fft": 1024, "hop_length": 256 } } }