--- |
tags: |
- espnet |
- audio |
- spoken-language-understanding |
language: en |
datasets: |
- slue-ted |
license: cc-by-4.0 |
--- |
## ESPnet2 SLU model |
### `espnet/slueted_whisper_summ` |
This model was trained by “siddhu001” using the slue-ted recipe in [espnet](https://github.com/espnet/espnet/). |
### Demo: How to use in ESPnet2 |
Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) |
if you haven't done that already. |
```bash |
cd espnet |
git checkout e23ef85f0b3116ad5c60d0833f186da0deec0734 |
pip install -e . |
cd egs2/slue-ted/slu1 |
./run.sh --skip_data_prep false --skip_train true --download_model espnet/slueted_whisper_summ |
``` |
``` |
{'rouge1': 0.2255418629519756, 'rouge2': 0.0485061537185737, 'rougeL': 0.1596465851004139, 'rougeLsum': 0.15968116069467322, 'meteor': 0.2129616261465529} |
RESULT 22.55418629519756 3.799127541421444e-132 15.96465851004139 21.29616261465529 83.78519008627457 |
``` |
## SLU config |
<details><summary>expand</summary> |
``` |
config: conf//train_asr_whisper_weighted_conv2d2.yaml |
print_config: false |
log_level: INFO |
drop_last_iter: false |
dry_run: false |
iterator_type: sequence |
valid_iterator_type: null |
output_dir: exp/slu_train_asr_whisper_weighted_conv2d2_raw_en_bpe500_sp |
ngpu: 1 |
seed: 2022 |
num_workers: 2 |
num_att_plot: 3 |
dist_backend: nccl |
dist_init_method: env:// |
dist_world_size: 4 |
dist_rank: 0 |
local_rank: 0 |
dist_master_addr: localhost |
dist_master_port: 42635 |
dist_launcher: null |
multiprocessing_distributed: true |
unused_parameters: false |
sharded_ddp: false |
cudnn_enabled: true |
cudnn_benchmark: false |
cudnn_deterministic: true |
collect_stats: false |
write_collected_feats: false |
max_epoch: 25 |
patience: null |
val_scheduler_criterion: |
- valid |
- loss |
early_stopping_criterion: |
- valid |
- loss |
- min |
best_model_criterion: |
- - valid |
- acc |
- max |
keep_nbest_models: 10 |
nbest_averaging_interval: 0 |
grad_clip: 5.0 |
grad_clip_type: 2.0 |
grad_noise: false |
accum_grad: 1 |
no_forward_run: false |
resume: true |
train_dtype: float32 |
use_amp: false |
log_interval: 100 |
use_matplotlib: true |
use_tensorboard: true |
create_graph_in_tensorboard: false |
use_wandb: false |
wandb_project: null |
wandb_id: null |
wandb_entity: null |
wandb_name: null |
wandb_model_log_interval: -1 |
detect_anomaly: false |
use_lora: false |
save_lora_only: true |
lora_conf: {} |
pretrain_path: null |
init_param: |
- /scratch/bbjs/arora1/espnet_slue_PR/espnet/egs2/tedlium3/asr1/exp/asr_train_asr_whisper_weighted_conv2d2_raw_en_bpe500/valid.acc.ave_10best.pth:::ctc |
ignore_init_mismatch: false |
freeze_param: |
- encoder |
num_iters_per_epoch: null |
batch_size: 20 |
valid_batch_size: null |
batch_bins: 12000000 |
valid_batch_bins: null |
train_shape_file: |
- exp/slu_stats_raw_en_bpe500_sp/train/speech_shape |
- exp/slu_stats_raw_en_bpe500_sp/train/text_shape.bpe |
valid_shape_file: |
- exp/slu_stats_raw_en_bpe500_sp/valid/speech_shape |
- exp/slu_stats_raw_en_bpe500_sp/valid/text_shape.bpe |
batch_type: numel |
valid_batch_type: null |
fold_length: |
- 80000 |
- 150 |
sort_in_batch: descending |
shuffle_within_batch: false |
sort_batch: descending |
multiple_iterator: false |
chunk_length: 500 |
chunk_shift_ratio: 0.5 |
num_cache_chunks: 1024 |
chunk_excluded_key_prefixes: [] |
chunk_default_fs: null |
train_data_path_and_name_and_type: |
- - dump/raw/train_sp/wav.scp |
- speech |
- kaldi_ark |
- - dump/raw/train_sp/text |
- text |
- text |
valid_data_path_and_name_and_type: |
- - dump/raw/devel/wav.scp |
- speech |
- kaldi_ark |
- - dump/raw/devel/text |
- text |
- text |
allow_variable_data_keys: false |
max_cache_size: 0.0 |
max_cache_fd: 32 |
allow_multi_rates: false |
valid_max_cache_size: null |
exclude_weight_decay: false |
exclude_weight_decay_conf: {} |
optim: adam |
optim_conf: |
lr: 0.002 |
weight_decay: 1.0e-06 |
scheduler: warmuplr |
scheduler_conf: |
warmup_steps: 5000 |
token_list: |
- <blank> |
- <unk> |
- '[sep]' |
- '"' |
- s |
- ▁ |
- ▁the |
- ',' |
- t |
- d |
- ▁a |
- . |
- ing |
- o |
- e |
- ▁to |
- a |
- ▁and |
- y |
- n |
- ▁of |
- r |
- ▁in |
- u |
- i |
- m |
- p |
- c |
- er |
- g |
- l |
- al |
- re |
- ed |
- b |
- '''' |
- ar |
- k |
- in |
- f |
- ▁" |
- le |
- 'on' |
- v |
- or |
- th |
- '-' |
- ▁c |
- en |
- ▁f |
- ▁-- |
- ▁we |
- ▁for |
- ▁how |
- ly |
- ▁re |
- se |
- ▁that |
- es |
- w |
- ic |
- st |
- ▁w |
- ▁be |
- ri |
- an |
- ra |
- ve |
- ce |
- ur |
- ▁by |
- ▁it |
- li |
- ▁de |
- '?' |
- it |
- ch |
- ent |
- ▁is |
- ter |
- el |
- ▁on |
- ▁e |
- ▁he |
- ▁co |
- ▁an |
- ▁ma |
- ▁st |
- ll |
- ▁with |
- ▁can |
- il |
- ▁you |
- ▁us |
- ation |
- te |
- ▁this |
- ▁b |
- ▁do |
- ▁g |
- me |
- ▁what |
- ck |
- ▁from |
- ate |
- ▁p |
- z |
- la |
- ▁mo |
- ▁di |
- ive |
- mp |
- ▁talk |
- ity |
- vi |
- ta |
- at |
- ge |
- ▁tr |
- ▁she |
- ▁our |
- ▁pa |
- ci |
- et |
- h |
- ▁su |
- ver |
- ▁world |
- pe |
- ▁about |
- ▁me |
- ▁so |
- and |
- ▁con |
- tion |
- de |
- ir |
- ▁her |
- im |
- ':' |
- ▁his |
- ies |
- ▁po |
- ▁are |
- ect |
- lo |
- ▁your |
- un |
- ist |
- hi |
- ▁mi |
- x |
- id |
- ment |
- ol |
- ul |
- ti |
- ne |
- qu |
- ▁but |
- ▁ca |
- ▁fa |
- ▁as |
- ▁un |
- ers |
- ight |
- ▁says |
- '0' |
- ng |
- op |
- '1' |
- ▁k |
- ad |
- j |
- ma |
- ▁pro |
- ▁work |
- ▁ba |
- ▁share |
- ▁new |
- ▁more |
- ▁vi |
- ▁sa |
- ▁at |
- ▁la |
- ut |
- bi |
- sion |
- ▁ho |
- na |
- act |
- age |
- ke |
- if |
- ▁bo |
- ▁br |
- ▁ha |
- ▁no |
- co |
- ▁lo |
- mi |
- ▁make |
- ▁people |
- ▁why |
- ant |
- ▁their |
- ▁i |
- ▁life |
- ▁all |
- ting |
- ▁human |
- ▁have |
- om |
- ) |
- ▁( |
- ▁help |
- ▁ted |
- wa |
- sh |
- ▁da |
- ▁le |
- ▁out |
- ph |
- ical |
- ▁way |
- ff |
- ▁ro |
- able |
- ▁some |
- est |
- ure |
- em |
- ho |
- ▁ex |
- gen |
- ha |
- ia |
- ine |
- ▁into |
- ca |
- ▁was |
- ▁who |
- ther |
- ▁they |
- ow |
- he |
- ▁one |
- ▁when |
- form |
- ▁pre |
- ni |
- ▁could |
- ▁like |
- ▁per |
- ▁up |
- ance |
- com |
- ▁go |
- ion |
- tor |
- ▁fe |
- ▁ra |
- ▁or |
- ▁en |
- ▁change |
- tic |
- ▁every |
- ▁jo |
- ence |
- ▁not |
- ▁art |
- one |
- use |
- ous |
- ▁plan |
- ▁music |
- ▁exp |
- und |
- ▁ne |
- um |
- ative |
- pp |
- ▁need |
- tro |
- directed |
- ▁learn |
- ▁narrate |
- ▁has |
- lar |
- '].' |
- man |
- ▁car |
- ▁future |
- ▁real |
- ▁time |
- ize |
- ▁live |
- ber |
- ▁mar |
- ▁ga |
- ▁take |
- ▁dr |
- ful |
- ▁get |
- ▁shows |
- day |
- ▁cha |
- ▁than |
- ▁know |
- ian |
- ▁see |
- ▁just |
- '2' |
- ▁other |
- old |
- ▁design |
- ▁chi |
- ▁build |
- ious |
- ▁most |
- ▁si |
- ▁will |
- ▁power |
- ▁think |
- port |
- ▁over |
- ▁ja |
- ish |
- ▁climate |
- ▁sha |
- ▁through |
- less |
- '3' |
- ▁my |
- ▁where |
- ▁global |
- ▁health |
- ▁pri |
- ▁20 |
- ▁story |
- gu |
- ugh |
- ▁create |
- ▁look |
- ▁trans |
- ▁har |
- ▁even |
- ▁part |
- ▁years |
- ▁lead |
- side |
- low |
- long |
- ▁technolog |
- ness |
- '5' |
- ▁call |
- ▁sc |
- ▁system |
- '9' |
- line |
- ▁brain |
- ▁data |
- ▁own |
- ition |
- ▁explains |
- ▁tell |
- ▁explore |
- ▁start |
- ▁ru |
- ▁which |
- ▁anderson |
- ▁find |
- ▁hu |
- ▁women |
- ▁better |
- ▁idea |
- ▁history |
- ▁research |
- ▁science |
- ism |
- ▁first |
- ▁grow |
- ▁right |
- clu |
- ▁space |
- ▁develop |
- ▁problem |
- ▁two |
- ▁earth |
- ologist |
- ▁many |
- ▁should |
- ▁three |
- ▁fellow |
- ▁social |
- ▁africa |
- ▁... |
- '4' |
- ▁addis |
- ▁powerful |
- ▁found |
- ▁under |
- ▁understand |
- ▁after |
- ▁stories |
- ▁around |
- ▁personal |
- ▁project |
- ▁between |
- ▁question |
- ▁play |
- ▁scientist |
- ▁happen |
- ▁good |
- ▁produc |
- ▁experience |
- ▁step |
- ▁america |
- '8' |
- ▁great |
- ▁down |
- ▁high |
- ▁would |
- ▁turn |
- ▁surprising |
- ▁imagin |
- ▁teach |
- cross |
- ▁place |
- ▁medic |
- ▁million |
- ▁things |
- '7' |
- ▁reveal |
- ▁without |
- ▁challenge |
- ▁next |
- ▁each |
- ▁studio |
- organ |
- '6' |
- ▁business |
- ▁much |
- ▁show |
- ▁conversation |
- ▁energy |
- ▁school |
- ▁ocean |
- ▁while |
- source |
- ization |
- ▁break |
- ▁robot |
- ▁disease |
- ▁behind |
- ability |
- ▁team |
- ▁chris |
- ▁become |
- ▁solution |
- ▁protect |
- ▁collect |
- ▁different |
- ▁those |
- ▁connect |
- ▁architect |
- ▁language |
- ▁simple |
- ▁solve |
- ▁before |
- ▁community |
- ▁country |
- ▁secret |
- ▁keep |
- ▁food |
- ▁thought |
- ▁discover |
- ▁environment |
- ▁government |
- ▁public |
- ; |
- '!' |
- / |
- q |
- '%' |
- '@' |
- ']' |
- + |
- '&' |
- '|' |
- _ |
- ( |
- '"' |
- $ |
- '*' |
- '=' |
- '[' |
- '`' |
- <sos/eos> |
transcript_token_list: null |
two_pass: false |
pre_postencoder_norm: false |
init: null |
input_size: 1 |
ctc_conf: |
dropout_rate: 0.0 |
ctc_type: builtin |
reduce: true |
ignore_nan_grad: null |
zero_infinity: true |
brctc_risk_strategy: exp |
brctc_group_strategy: end |
brctc_risk_factor: 0.0 |
joint_net_conf: null |
use_preprocessor: true |
token_type: bpe |
bpemodel: data/en_token_list/bpe_unigram500/bpe.model |
non_linguistic_symbols: null |
cleaner: null |
g2p: null |
speech_volume_normalize: null |
rir_scp: null |
rir_apply_prob: 1.0 |
noise_scp: null |
noise_apply_prob: 1.0 |
noise_db_range: '13_15' |
short_noise_thres: 0.5 |
frontend: null |
frontend_conf: {} |
specaug: null |
specaug_conf: {} |
normalize: null |
normalize_conf: {} |
model: espnet |
model_conf: |
ctc_weight: 0.0 |
lsm_weight: 0.1 |
length_normalized_loss: false |
weighted_sum: true |
extract_feats_in_collect_stats: false |
preencoder: null |
preencoder_conf: {} |
encoder: whisper |
encoder_conf: |
whisper_model: medium |
dropout_rate: 0.0 |
use_specaug: true |
specaug_conf: |
apply_time_warp: true |
time_warp_window: 5 |
time_warp_mode: bicubic |
apply_freq_mask: true |
freq_mask_width_range: |
- 0 |
- 40 |
num_freq_mask: 2 |
apply_time_mask: true |
time_mask_width_ratio_range: |
- 0.0 |
- 0.12 |
num_time_mask: 5 |
prepostencoder: linear |
prepostencoder_conf: |
input_size: 1024 |
output_size: 80 |
postencoder: conformer_full |
postencoder_conf: |
output_size: 256 |
attention_heads: 4 |
linear_units: 1024 |
num_blocks: 12 |
dropout_rate: 0.1 |
positional_dropout_rate: 0.1 |
attention_dropout_rate: 0.1 |
input_layer: conv2d2 |
normalize_before: true |
macaron_style: true |
rel_pos_type: latest |
pos_enc_layer_type: rel_pos |
selfattention_layer_type: rel_selfattn |
activation_type: swish |
use_cnn_module: true |
cnn_module_kernel: 31 |
deliberationencoder: null |
deliberationencoder_conf: {} |
decoder: transformer |
decoder_conf: |
attention_heads: 4 |
linear_units: 2048 |
num_blocks: 6 |
dropout_rate: 0.1 |
positional_dropout_rate: 0.1 |
self_attention_dropout_rate: 0.1 |
src_attention_dropout_rate: 0.1 |
postdecoder: null |
postdecoder_conf: {} |
required: |
- output_dir |
- token_list |
version: '202310' |
distributed: true |
``` |
</details> |
### Citing ESPnet |
```bibtex |
@inproceedings{ESPnet-SLU, |
title={{ESPnet-SLU}: Advancing Spoken Language Understanding through {ESPnet}}, |
author={Arora, Siddhant and Dalmia, Siddharth and Denisov, Pavel and Chang, Xuankai and Ueda, Yushi and Peng, Yifan and Zhang, Yuekai and Kumar, Sujay and Ganesan, Karthik and Yan, Brian and others}, |
booktitle={ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, |
pages={7167--7171}, |
year={2022}, |
organization={IEEE} |
} |
@inproceedings{watanabe2018espnet, |
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
year={2018}, |
booktitle={Proceedings of Interspeech}, |
pages={2207--2211}, |
doi={10.21437/Interspeech.2018-1456}, |
url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
} |
``` |
or arXiv: |
```bibtex |
@misc{watanabe2018espnet, |
title={ESPnet: End-to-End Speech Processing Toolkit}, |
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
year={2018}, |
eprint={1804.00015}, |
archivePrefix={arXiv}, |
primaryClass={cs.CL} |
} |
``` |