|
--- |
|
tags: |
|
- espnet |
|
- audio |
|
- spoken-language-understanding |
|
language: en |
|
datasets: |
|
- slue-ted |
|
license: cc-by-4.0 |
|
--- |
|
|
|
## ESPnet2 SLU model |
|
|
|
### `espnet/slueted_whisper_summ` |
|
|
|
This model was trained by "siddhu001" using the slue-ted recipe in [ESPnet](https://github.com/espnet/espnet/). |
|
|
|
### Demo: How to use in ESPnet2 |
|
|
|
Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) |
|
if you haven't done that already. |
|
|
|
```bash |
|
cd espnet |
|
git checkout e23ef85f0b3116ad5c60d0833f186da0deec0734 |
|
pip install -e . |
|
cd egs2/slue-ted/slu1 |
|
./run.sh --skip_data_prep false --skip_train true --download_model espnet/slueted_whisper_summ |
|
``` |
|
|
|
Evaluation results reported for this model (ROUGE and METEOR scores): |
|
``` |
|
{'rouge1': 0.2255418629519756, 'rouge2': 0.0485061537185737, 'rougeL': 0.1596465851004139, 'rougeLsum': 0.15968116069467322, 'meteor': 0.2129616261465529} |
|
RESULT 22.55418629519756 3.799127541421444e-132 15.96465851004139 21.29616261465529 83.78519008627457 |
|
``` |
|
|
|
## SLU config |
|
|
|
<details><summary>expand</summary> |
|
|
|
``` |
|
config: conf//train_asr_whisper_weighted_conv2d2.yaml |
|
print_config: false |
|
log_level: INFO |
|
drop_last_iter: false |
|
dry_run: false |
|
iterator_type: sequence |
|
valid_iterator_type: null |
|
output_dir: exp/slu_train_asr_whisper_weighted_conv2d2_raw_en_bpe500_sp |
|
ngpu: 1 |
|
seed: 2022 |
|
num_workers: 2 |
|
num_att_plot: 3 |
|
dist_backend: nccl |
|
dist_init_method: env:// |
|
dist_world_size: 4 |
|
dist_rank: 0 |
|
local_rank: 0 |
|
dist_master_addr: localhost |
|
dist_master_port: 42635 |
|
dist_launcher: null |
|
multiprocessing_distributed: true |
|
unused_parameters: false |
|
sharded_ddp: false |
|
cudnn_enabled: true |
|
cudnn_benchmark: false |
|
cudnn_deterministic: true |
|
collect_stats: false |
|
write_collected_feats: false |
|
max_epoch: 25 |
|
patience: null |
|
val_scheduler_criterion: |
|
- valid |
|
- loss |
|
early_stopping_criterion: |
|
- valid |
|
- loss |
|
- min |
|
best_model_criterion: |
|
- - valid |
|
- acc |
|
- max |
|
keep_nbest_models: 10 |
|
nbest_averaging_interval: 0 |
|
grad_clip: 5.0 |
|
grad_clip_type: 2.0 |
|
grad_noise: false |
|
accum_grad: 1 |
|
no_forward_run: false |
|
resume: true |
|
train_dtype: float32 |
|
use_amp: false |
|
log_interval: 100 |
|
use_matplotlib: true |
|
use_tensorboard: true |
|
create_graph_in_tensorboard: false |
|
use_wandb: false |
|
wandb_project: null |
|
wandb_id: null |
|
wandb_entity: null |
|
wandb_name: null |
|
wandb_model_log_interval: -1 |
|
detect_anomaly: false |
|
use_lora: false |
|
save_lora_only: true |
|
lora_conf: {} |
|
pretrain_path: null |
|
init_param: |
|
- /scratch/bbjs/arora1/espnet_slue_PR/espnet/egs2/tedlium3/asr1/exp/asr_train_asr_whisper_weighted_conv2d2_raw_en_bpe500/valid.acc.ave_10best.pth:::ctc |
|
ignore_init_mismatch: false |
|
freeze_param: |
|
- encoder |
|
num_iters_per_epoch: null |
|
batch_size: 20 |
|
valid_batch_size: null |
|
batch_bins: 12000000 |
|
valid_batch_bins: null |
|
train_shape_file: |
|
- exp/slu_stats_raw_en_bpe500_sp/train/speech_shape |
|
- exp/slu_stats_raw_en_bpe500_sp/train/text_shape.bpe |
|
valid_shape_file: |
|
- exp/slu_stats_raw_en_bpe500_sp/valid/speech_shape |
|
- exp/slu_stats_raw_en_bpe500_sp/valid/text_shape.bpe |
|
batch_type: numel |
|
valid_batch_type: null |
|
fold_length: |
|
- 80000 |
|
- 150 |
|
sort_in_batch: descending |
|
shuffle_within_batch: false |
|
sort_batch: descending |
|
multiple_iterator: false |
|
chunk_length: 500 |
|
chunk_shift_ratio: 0.5 |
|
num_cache_chunks: 1024 |
|
chunk_excluded_key_prefixes: [] |
|
chunk_default_fs: null |
|
train_data_path_and_name_and_type: |
|
- - dump/raw/train_sp/wav.scp |
|
- speech |
|
- kaldi_ark |
|
- - dump/raw/train_sp/text |
|
- text |
|
- text |
|
valid_data_path_and_name_and_type: |
|
- - dump/raw/devel/wav.scp |
|
- speech |
|
- kaldi_ark |
|
- - dump/raw/devel/text |
|
- text |
|
- text |
|
allow_variable_data_keys: false |
|
max_cache_size: 0.0 |
|
max_cache_fd: 32 |
|
allow_multi_rates: false |
|
valid_max_cache_size: null |
|
exclude_weight_decay: false |
|
exclude_weight_decay_conf: {} |
|
optim: adam |
|
optim_conf: |
|
lr: 0.002 |
|
weight_decay: 1.0e-06 |
|
scheduler: warmuplr |
|
scheduler_conf: |
|
warmup_steps: 5000 |
|
token_list: |
|
- <blank> |
|
- <unk> |
|
- '[sep]' |
|
- '"' |
|
- s |
|
- ▁ |
|
- ▁the |
|
- ',' |
|
- t |
|
- d |
|
- ▁a |
|
- . |
|
- ing |
|
- o |
|
- e |
|
- ▁to |
|
- a |
|
- ▁and |
|
- y |
|
- n |
|
- ▁of |
|
- r |
|
- ▁in |
|
- u |
|
- i |
|
- m |
|
- p |
|
- c |
|
- er |
|
- g |
|
- l |
|
- al |
|
- re |
|
- ed |
|
- b |
|
- '''' |
|
- ar |
|
- k |
|
- in |
|
- f |
|
- ▁" |
|
- le |
|
- 'on' |
|
- v |
|
- or |
|
- th |
|
- '-' |
|
- ▁c |
|
- en |
|
- ▁f |
|
- ▁-- |
|
- ▁we |
|
- ▁for |
|
- ▁how |
|
- ly |
|
- ▁re |
|
- se |
|
- ▁that |
|
- es |
|
- w |
|
- ic |
|
- st |
|
- ▁w |
|
- ▁be |
|
- ri |
|
- an |
|
- ra |
|
- ve |
|
- ce |
|
- ur |
|
- ▁by |
|
- ▁it |
|
- li |
|
- ▁de |
|
- '?' |
|
- it |
|
- ch |
|
- ent |
|
- ▁is |
|
- ter |
|
- el |
|
- ▁on |
|
- ▁e |
|
- ▁he |
|
- ▁co |
|
- ▁an |
|
- ▁ma |
|
- ▁st |
|
- ll |
|
- ▁with |
|
- ▁can |
|
- il |
|
- ▁you |
|
- ▁us |
|
- ation |
|
- te |
|
- ▁this |
|
- ▁b |
|
- ▁do |
|
- ▁g |
|
- me |
|
- ▁what |
|
- ck |
|
- ▁from |
|
- ate |
|
- ▁p |
|
- z |
|
- la |
|
- ▁mo |
|
- ▁di |
|
- ive |
|
- mp |
|
- ▁talk |
|
- ity |
|
- vi |
|
- ta |
|
- at |
|
- ge |
|
- ▁tr |
|
- ▁she |
|
- ▁our |
|
- ▁pa |
|
- ci |
|
- et |
|
- h |
|
- ▁su |
|
- ver |
|
- ▁world |
|
- pe |
|
- ▁about |
|
- ▁me |
|
- ▁so |
|
- and |
|
- ▁con |
|
- tion |
|
- de |
|
- ir |
|
- ▁her |
|
- im |
|
- ':' |
|
- ▁his |
|
- ies |
|
- ▁po |
|
- ▁are |
|
- ect |
|
- lo |
|
- ▁your |
|
- un |
|
- ist |
|
- hi |
|
- ▁mi |
|
- x |
|
- id |
|
- ment |
|
- ol |
|
- ul |
|
- ti |
|
- ne |
|
- qu |
|
- ▁but |
|
- ▁ca |
|
- ▁fa |
|
- ▁as |
|
- ▁un |
|
- ers |
|
- ight |
|
- ▁says |
|
- '0' |
|
- ng |
|
- op |
|
- '1' |
|
- ▁k |
|
- ad |
|
- j |
|
- ma |
|
- ▁pro |
|
- ▁work |
|
- ▁ba |
|
- ▁share |
|
- ▁new |
|
- ▁more |
|
- ▁vi |
|
- ▁sa |
|
- ▁at |
|
- ▁la |
|
- ut |
|
- bi |
|
- sion |
|
- ▁ho |
|
- na |
|
- act |
|
- age |
|
- ke |
|
- if |
|
- ▁bo |
|
- ▁br |
|
- ▁ha |
|
- ▁no |
|
- co |
|
- ▁lo |
|
- mi |
|
- ▁make |
|
- ▁people |
|
- ▁why |
|
- ant |
|
- ▁their |
|
- ▁i |
|
- ▁life |
|
- ▁all |
|
- ting |
|
- ▁human |
|
- ▁have |
|
- om |
|
- ) |
|
- ▁( |
|
- ▁help |
|
- ▁ted |
|
- wa |
|
- sh |
|
- ▁da |
|
- ▁le |
|
- ▁out |
|
- ph |
|
- ical |
|
- ▁way |
|
- ff |
|
- ▁ro |
|
- able |
|
- ▁some |
|
- est |
|
- ure |
|
- em |
|
- ho |
|
- ▁ex |
|
- gen |
|
- ha |
|
- ia |
|
- ine |
|
- ▁into |
|
- ca |
|
- ▁was |
|
- ▁who |
|
- ther |
|
- ▁they |
|
- ow |
|
- he |
|
- ▁one |
|
- ▁when |
|
- form |
|
- ▁pre |
|
- ni |
|
- ▁could |
|
- ▁like |
|
- ▁per |
|
- ▁up |
|
- ance |
|
- com |
|
- ▁go |
|
- ion |
|
- tor |
|
- ▁fe |
|
- ▁ra |
|
- ▁or |
|
- ▁en |
|
- ▁change |
|
- tic |
|
- ▁every |
|
- ▁jo |
|
- ence |
|
- ▁not |
|
- ▁art |
|
- one |
|
- use |
|
- ous |
|
- ▁plan |
|
- ▁music |
|
- ▁exp |
|
- und |
|
- ▁ne |
|
- um |
|
- ative |
|
- pp |
|
- ▁need |
|
- tro |
|
- directed |
|
- ▁learn |
|
- ▁narrate |
|
- ▁has |
|
- lar |
|
- '].' |
|
- man |
|
- ▁car |
|
- ▁future |
|
- ▁real |
|
- ▁time |
|
- ize |
|
- ▁live |
|
- ber |
|
- ▁mar |
|
- ▁ga |
|
- ▁take |
|
- ▁dr |
|
- ful |
|
- ▁get |
|
- ▁shows |
|
- day |
|
- ▁cha |
|
- ▁than |
|
- ▁know |
|
- ian |
|
- ▁see |
|
- ▁just |
|
- '2' |
|
- ▁other |
|
- old |
|
- ▁design |
|
- ▁chi |
|
- ▁build |
|
- ious |
|
- ▁most |
|
- ▁si |
|
- ▁will |
|
- ▁power |
|
- ▁think |
|
- port |
|
- ▁over |
|
- ▁ja |
|
- ish |
|
- ▁climate |
|
- ▁sha |
|
- ▁through |
|
- less |
|
- '3' |
|
- ▁my |
|
- ▁where |
|
- ▁global |
|
- ▁health |
|
- ▁pri |
|
- ▁20 |
|
- ▁story |
|
- gu |
|
- ugh |
|
- ▁create |
|
- ▁look |
|
- ▁trans |
|
- ▁har |
|
- ▁even |
|
- ▁part |
|
- ▁years |
|
- ▁lead |
|
- side |
|
- low |
|
- long |
|
- ▁technolog |
|
- ness |
|
- '5' |
|
- ▁call |
|
- ▁sc |
|
- ▁system |
|
- '9' |
|
- line |
|
- ▁brain |
|
- ▁data |
|
- ▁own |
|
- ition |
|
- ▁explains |
|
- ▁tell |
|
- ▁explore |
|
- ▁start |
|
- ▁ru |
|
- ▁which |
|
- ▁anderson |
|
- ▁find |
|
- ▁hu |
|
- ▁women |
|
- ▁better |
|
- ▁idea |
|
- ▁history |
|
- ▁research |
|
- ▁science |
|
- ism |
|
- ▁first |
|
- ▁grow |
|
- ▁right |
|
- clu |
|
- ▁space |
|
- ▁develop |
|
- ▁problem |
|
- ▁two |
|
- ▁earth |
|
- ologist |
|
- ▁many |
|
- ▁should |
|
- ▁three |
|
- ▁fellow |
|
- ▁social |
|
- ▁africa |
|
- ▁... |
|
- '4' |
|
- ▁addis |
|
- ▁powerful |
|
- ▁found |
|
- ▁under |
|
- ▁understand |
|
- ▁after |
|
- ▁stories |
|
- ▁around |
|
- ▁personal |
|
- ▁project |
|
- ▁between |
|
- ▁question |
|
- ▁play |
|
- ▁scientist |
|
- ▁happen |
|
- ▁good |
|
- ▁produc |
|
- ▁experience |
|
- ▁step |
|
- ▁america |
|
- '8' |
|
- ▁great |
|
- ▁down |
|
- ▁high |
|
- ▁would |
|
- ▁turn |
|
- ▁surprising |
|
- ▁imagin |
|
- ▁teach |
|
- cross |
|
- ▁place |
|
- ▁medic |
|
- ▁million |
|
- ▁things |
|
- '7' |
|
- ▁reveal |
|
- ▁without |
|
- ▁challenge |
|
- ▁next |
|
- ▁each |
|
- ▁studio |
|
- organ |
|
- '6' |
|
- ▁business |
|
- ▁much |
|
- ▁show |
|
- ▁conversation |
|
- ▁energy |
|
- ▁school |
|
- ▁ocean |
|
- ▁while |
|
- source |
|
- ization |
|
- ▁break |
|
- ▁robot |
|
- ▁disease |
|
- ▁behind |
|
- ability |
|
- ▁team |
|
- ▁chris |
|
- ▁become |
|
- ▁solution |
|
- ▁protect |
|
- ▁collect |
|
- ▁different |
|
- ▁those |
|
- ▁connect |
|
- ▁architect |
|
- ▁language |
|
- ▁simple |
|
- ▁solve |
|
- ▁before |
|
- ▁community |
|
- ▁country |
|
- ▁secret |
|
- ▁keep |
|
- ▁food |
|
- ▁thought |
|
- ▁discover |
|
- ▁environment |
|
- ▁government |
|
- ▁public |
|
- ; |
|
- '!' |
|
- / |
|
- q |
|
- '%' |
|
- '@' |
|
- ']' |
|
- + |
|
- '&' |
|
- '|' |
|
- _ |
|
- ( |
|
- '"' |
|
- $ |
|
- '*' |
|
- '=' |
|
- '[' |
|
- '`' |
|
- <sos/eos> |
|
transcript_token_list: null |
|
two_pass: false |
|
pre_postencoder_norm: false |
|
init: null |
|
input_size: 1 |
|
ctc_conf: |
|
dropout_rate: 0.0 |
|
ctc_type: builtin |
|
reduce: true |
|
ignore_nan_grad: null |
|
zero_infinity: true |
|
brctc_risk_strategy: exp |
|
brctc_group_strategy: end |
|
brctc_risk_factor: 0.0 |
|
joint_net_conf: null |
|
use_preprocessor: true |
|
token_type: bpe |
|
bpemodel: data/en_token_list/bpe_unigram500/bpe.model |
|
non_linguistic_symbols: null |
|
cleaner: null |
|
g2p: null |
|
speech_volume_normalize: null |
|
rir_scp: null |
|
rir_apply_prob: 1.0 |
|
noise_scp: null |
|
noise_apply_prob: 1.0 |
|
noise_db_range: '13_15' |
|
short_noise_thres: 0.5 |
|
frontend: null |
|
frontend_conf: {} |
|
specaug: null |
|
specaug_conf: {} |
|
normalize: null |
|
normalize_conf: {} |
|
model: espnet |
|
model_conf: |
|
ctc_weight: 0.0 |
|
lsm_weight: 0.1 |
|
length_normalized_loss: false |
|
weighted_sum: true |
|
extract_feats_in_collect_stats: false |
|
preencoder: null |
|
preencoder_conf: {} |
|
encoder: whisper |
|
encoder_conf: |
|
whisper_model: medium |
|
dropout_rate: 0.0 |
|
use_specaug: true |
|
specaug_conf: |
|
apply_time_warp: true |
|
time_warp_window: 5 |
|
time_warp_mode: bicubic |
|
apply_freq_mask: true |
|
freq_mask_width_range: |
|
- 0 |
|
- 40 |
|
num_freq_mask: 2 |
|
apply_time_mask: true |
|
time_mask_width_ratio_range: |
|
- 0.0 |
|
- 0.12 |
|
num_time_mask: 5 |
|
prepostencoder: linear |
|
prepostencoder_conf: |
|
input_size: 1024 |
|
output_size: 80 |
|
postencoder: conformer_full |
|
postencoder_conf: |
|
output_size: 256 |
|
attention_heads: 4 |
|
linear_units: 1024 |
|
num_blocks: 12 |
|
dropout_rate: 0.1 |
|
positional_dropout_rate: 0.1 |
|
attention_dropout_rate: 0.1 |
|
input_layer: conv2d2 |
|
normalize_before: true |
|
macaron_style: true |
|
rel_pos_type: latest |
|
pos_enc_layer_type: rel_pos |
|
selfattention_layer_type: rel_selfattn |
|
activation_type: swish |
|
use_cnn_module: true |
|
cnn_module_kernel: 31 |
|
deliberationencoder: null |
|
deliberationencoder_conf: {} |
|
decoder: transformer |
|
decoder_conf: |
|
attention_heads: 4 |
|
linear_units: 2048 |
|
num_blocks: 6 |
|
dropout_rate: 0.1 |
|
positional_dropout_rate: 0.1 |
|
self_attention_dropout_rate: 0.1 |
|
src_attention_dropout_rate: 0.1 |
|
postdecoder: null |
|
postdecoder_conf: {} |
|
required: |
|
- output_dir |
|
- token_list |
|
version: '202310' |
|
distributed: true |
|
``` |
|
|
|
</details> |
|
|
|
|
|
|
|
### Citing ESPnet |
|
|
|
```bibtex |
|
@inproceedings{ESPnet-SLU, |
|
title={{ESPnet-SLU}: Advancing Spoken Language Understanding through {ESPnet}}, |
|
author={Arora, Siddhant and Dalmia, Siddharth and Denisov, Pavel and Chang, Xuankai and Ueda, Yushi and Peng, Yifan and Zhang, Yuekai and Kumar, Sujay and Ganesan, Karthik and Yan, Brian and others}, |
|
booktitle={ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, |
|
pages={7167--7171}, |
|
year={2022}, |
|
organization={IEEE} |
|
} |
|
|
|
@inproceedings{watanabe2018espnet, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
|
year={2018}, |
|
booktitle={Proceedings of Interspeech}, |
|
pages={2207--2211}, |
|
doi={10.21437/Interspeech.2018-1456}, |
|
url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
|
} |
|
|
|
``` |
|
|
|
or arXiv: |
|
|
|
```bibtex |
|
@misc{watanabe2018espnet, |
|
title={ESPnet: End-to-End Speech Processing Toolkit}, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
year={2018}, |
|
eprint={1804.00015}, |
|
archivePrefix={arXiv}, |
|
primaryClass={cs.CL} |
|
} |
|
``` |
|
|