novateur committed on
Commit
df714f0
·
verified ·
1 Parent(s): 894a02c

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +22 -0
  2. Wav-Omni/.github/ISSUE_TEMPLATE/bug.yml +79 -0
  3. Wav-Omni/.github/ISSUE_TEMPLATE/feature-request.yml +31 -0
  4. Wav-Omni/.github/PULL_REQUEST_TEMPLATE.md +38 -0
  5. Wav-Omni/.github/workflows/spellcheck.yml +66 -0
  6. Wav-Omni/.gitignore +25 -0
  7. Wav-Omni/Dockerfile +41 -0
  8. Wav-Omni/README.md +10 -0
  9. Wav-Omni/ckpt/test.py +0 -0
  10. Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/config.yaml +68 -0
  11. Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/hydra.yaml +216 -0
  12. Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/overrides.yaml +61 -0
  13. Wav-Omni/ckpt/wavomni_chinese_3_2/inference_s2s.log +175 -0
  14. Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt +3 -0
  15. Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/gt_text +4 -0
  16. Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/1.wav +3 -0
  17. Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/2.wav +3 -0
  18. Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/3.wav +3 -0
  19. Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/4.wav +3 -0
  20. Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_text +12 -0
  21. Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/question_text +4 -0
  22. Wav-Omni/environment.yml +243 -0
  23. Wav-Omni/examples/s2s/__pycache__/s2s_config.cpython-39.pyc +0 -0
  24. Wav-Omni/examples/s2s/__pycache__/speech_dataset_s2s.cpython-39.pyc +0 -0
  25. Wav-Omni/examples/s2s/audio_prompt/en/prompt_1.wav +3 -0
  26. Wav-Omni/examples/s2s/audio_prompt/en/prompt_2.wav +3 -0
  27. Wav-Omni/examples/s2s/audio_prompt/en/prompt_3.wav +3 -0
  28. Wav-Omni/examples/s2s/audio_prompt/en/prompt_4.wav +3 -0
  29. Wav-Omni/examples/s2s/audio_prompt/en/prompt_5.wav +3 -0
  30. Wav-Omni/examples/s2s/audio_prompt/en/prompt_6.wav +3 -0
  31. Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav +3 -0
  32. Wav-Omni/examples/s2s/audio_prompt/zh/prompt_2.wav +3 -0
  33. Wav-Omni/examples/s2s/audio_prompt/zh/prompt_3.wav +3 -0
  34. Wav-Omni/examples/s2s/audio_prompt/zh/prompt_4.wav +3 -0
  35. Wav-Omni/examples/s2s/audio_prompt/zh/prompt_5.wav +3 -0
  36. Wav-Omni/examples/s2s/audio_prompt/zh/prompt_6.wav +3 -0
  37. Wav-Omni/examples/s2s/conf/ds_config.json +19 -0
  38. Wav-Omni/examples/s2s/conf/prompt.yaml +3 -0
  39. Wav-Omni/examples/s2s/conf/prompt_asr.yaml +2 -0
  40. Wav-Omni/examples/s2s/conf/prompt_tts.yaml +4 -0
  41. Wav-Omni/examples/s2s/deepspeed_finetune_s2s.py +47 -0
  42. Wav-Omni/examples/s2s/demo/142001_0_rlhf.wav +3 -0
  43. Wav-Omni/examples/s2s/demo/142023_0_rlhf.wav +3 -0
  44. Wav-Omni/examples/s2s/demo/142038_0_rlhf.wav +3 -0
  45. Wav-Omni/examples/s2s/demo/64474_1_rlhf.wav +3 -0
  46. Wav-Omni/examples/s2s/demo/demo_data/142038_0_rlhf.wav +3 -0
  47. Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp.jsonl +4 -0
  48. Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl +4 -0
  49. Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_train.jsonl +30 -0
  50. Wav-Omni/examples/s2s/demo/question_jsp.wav +3 -0
.gitattributes CHANGED
@@ -34,3 +34,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  mucodec.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  mucodec.wav filter=lfs diff=lfs merge=lfs -text
37
+ Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/1.wav filter=lfs diff=lfs merge=lfs -text
38
+ Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/2.wav filter=lfs diff=lfs merge=lfs -text
39
+ Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/3.wav filter=lfs diff=lfs merge=lfs -text
40
+ Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/4.wav filter=lfs diff=lfs merge=lfs -text
41
+ Wav-Omni/examples/s2s/audio_prompt/en/prompt_1.wav filter=lfs diff=lfs merge=lfs -text
42
+ Wav-Omni/examples/s2s/audio_prompt/en/prompt_2.wav filter=lfs diff=lfs merge=lfs -text
43
+ Wav-Omni/examples/s2s/audio_prompt/en/prompt_3.wav filter=lfs diff=lfs merge=lfs -text
44
+ Wav-Omni/examples/s2s/audio_prompt/en/prompt_4.wav filter=lfs diff=lfs merge=lfs -text
45
+ Wav-Omni/examples/s2s/audio_prompt/en/prompt_5.wav filter=lfs diff=lfs merge=lfs -text
46
+ Wav-Omni/examples/s2s/audio_prompt/en/prompt_6.wav filter=lfs diff=lfs merge=lfs -text
47
+ Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav filter=lfs diff=lfs merge=lfs -text
48
+ Wav-Omni/examples/s2s/audio_prompt/zh/prompt_2.wav filter=lfs diff=lfs merge=lfs -text
49
+ Wav-Omni/examples/s2s/audio_prompt/zh/prompt_3.wav filter=lfs diff=lfs merge=lfs -text
50
+ Wav-Omni/examples/s2s/audio_prompt/zh/prompt_4.wav filter=lfs diff=lfs merge=lfs -text
51
+ Wav-Omni/examples/s2s/audio_prompt/zh/prompt_5.wav filter=lfs diff=lfs merge=lfs -text
52
+ Wav-Omni/examples/s2s/audio_prompt/zh/prompt_6.wav filter=lfs diff=lfs merge=lfs -text
53
+ Wav-Omni/examples/s2s/demo/142001_0_rlhf.wav filter=lfs diff=lfs merge=lfs -text
54
+ Wav-Omni/examples/s2s/demo/142023_0_rlhf.wav filter=lfs diff=lfs merge=lfs -text
55
+ Wav-Omni/examples/s2s/demo/142038_0_rlhf.wav filter=lfs diff=lfs merge=lfs -text
56
+ Wav-Omni/examples/s2s/demo/64474_1_rlhf.wav filter=lfs diff=lfs merge=lfs -text
57
+ Wav-Omni/examples/s2s/demo/demo_data/142038_0_rlhf.wav filter=lfs diff=lfs merge=lfs -text
58
+ Wav-Omni/examples/s2s/demo/question_jsp.wav filter=lfs diff=lfs merge=lfs -text
Wav-Omni/.github/ISSUE_TEMPLATE/bug.yml ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 🐛 Bug Report
2
+ description: Create a report to help us reproduce and fix the bug
3
+
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: >
8
+ #### Before submitting a bug, please make sure the issue hasn't already been addressed by searching through [the
9
+ existing and past issues](https://github.com/ddlBoJack/Wav-Omni/issues).
10
+
11
+ - type: textarea
12
+ id: system-info
13
+ attributes:
14
+ label: System Info
15
+ description: |
16
+ Please share your system info with us. You can use the following command to capture your environment information:
17
+ python -m "torch.utils.collect_env"
18
+
19
+ placeholder: |
20
+ PyTorch version, CUDA version, GPU type, #num of GPUs...
21
+ validations:
22
+ required: true
23
+
24
+ - type: checkboxes
25
+ id: information-scripts-examples
26
+ attributes:
27
+ label: Information
28
+ description: 'The problem arises when using:'
29
+ options:
30
+ - label: "The official example scripts"
31
+ - label: "My own modified scripts"
32
+
33
+ - type: textarea
34
+ id: bug-description
35
+ attributes:
36
+ label: 🐛 Describe the bug
37
+ description: |
38
+ Please provide a clear and concise description of what the bug is.
39
+
40
+ Provide the exact command(s) that you ran, including the settings, e.g., using FSDP and PEFT or pure FSDP.
41
+
42
+ Please also paste or describe the results you observe instead of the expected results.
43
+ placeholder: |
44
+ A clear and concise description of what the bug is.
45
+
46
+ ```python
47
+ # Command that you used for running the examples
48
+ ```
49
+ Description of the results
50
+ validations:
51
+ required: true
52
+
53
+ - type: textarea
54
+ attributes:
55
+ label: Error logs
56
+ description: |
57
+ If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
58
+
59
+ placeholder: |
60
+ ```
61
+ The error message you got, with the full traceback.
62
+ ```
63
+
64
+ validations:
65
+ required: true
66
+
67
+
68
+ - type: textarea
69
+ id: expected-behavior
70
+ validations:
71
+ required: true
72
+ attributes:
73
+ label: Expected behavior
74
+ description: "A clear and concise description of what you would expect to happen."
75
+
76
+ - type: markdown
77
+ attributes:
78
+ value: >
79
+ Thanks for contributing 🎉!
Wav-Omni/.github/ISSUE_TEMPLATE/feature-request.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 🚀 Feature request
2
+ description: Submit a proposal/request for a new Wav-Omni feature
3
+
4
+ body:
5
+ - type: textarea
6
+ id: feature-pitch
7
+ attributes:
8
+ label: 🚀 The feature, motivation and pitch
9
+ description: >
10
+ A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
11
+ validations:
12
+ required: true
13
+
14
+ - type: textarea
15
+ id: alternatives
16
+ attributes:
17
+ label: Alternatives
18
+ description: >
19
+ A description of any alternative solutions or features you've considered, if any.
20
+
21
+ - type: textarea
22
+ id: additional-context
23
+ attributes:
24
+ label: Additional context
25
+ description: >
26
+ Add any other context or screenshots about the feature request.
27
+
28
+ - type: markdown
29
+ attributes:
30
+ value: >
31
+ Thanks for contributing 🎉!
Wav-Omni/.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # What does this PR do?
2
+
3
+ <!--
4
+ Congratulations! You've made it this far! You're not quite done yet though.
5
+
6
+ Please include a good title that fully reflects the extent of your awesome contribution.
7
+
8
+ Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.
9
+
10
+ -->
11
+
12
+ <!-- Remove if not applicable -->
13
+
14
+ Fixes # (issue)
15
+
16
+
17
+ ## Feature/Issue validation/testing
18
+
19
+ Please describe the tests that you ran to verify your changes and relevant result summary. Provide instructions so it can be reproduced.
20
+ Please also list any relevant details for your test configuration.
21
+
22
+ - [ ] Test A
23
+ Logs for Test A
24
+
25
+ - [ ] Test B
26
+ Logs for Test B
27
+
28
+
29
+ ## Before submitting
30
+ - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
31
+ - [ ] Did you read the [contributor guideline](https://github.com/facebookresearch/llama-recipes/blob/main/CONTRIBUTING.md#pull-requests),
32
+ Pull Request section?
33
+ - [ ] Was this discussed/approved via a GitHub issue? Please add a link
34
+ to it if that's the case.
35
+ - [ ] Did you make sure to update the documentation with your changes?
36
+ - [ ] Did you write any new necessary tests?
37
+
38
+ Thanks for contributing 🎉!
Wav-Omni/.github/workflows/spellcheck.yml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: SpellCheck
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ pull_request:
8
+ branches:
9
+ - main
10
+ jobs:
11
+ build:
12
+ runs-on: ubuntu-20.04
13
+ name: Lint changed files
14
+ steps:
15
+ - uses: actions/checkout@v3
16
+ with:
17
+ fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.
18
+
19
+ - name: Check links in all markdown files
20
+ uses: gaurav-nelson/github-action-markdown-link-check@1.0.13
21
+ with:
22
+ use-verbose-mode: 'yes'
23
+ config-file: "scripts/markdown_link_check_config.json"
24
+
25
+ - name: Get changed files
26
+ id: changed-files
27
+ uses: tj-actions/changed-files@v29.0.4
28
+ with:
29
+
30
+ files: |
31
+ **/*.py
32
+
33
+ spellcheck:
34
+ runs-on: ubuntu-20.04
35
+ steps:
36
+ - uses: actions/checkout@v3
37
+
38
+ - name: Install dependencies
39
+ run: |
40
+ sudo apt-get install aspell aspell-en
41
+ pip install pyspelling
42
+
43
+ - name: Get changed files
44
+ id: changed-files
45
+ uses: tj-actions/changed-files@v29.0.4
46
+ with:
47
+ files: |
48
+ **/*.md
49
+
50
+ - name: Check spellings
51
+ run: |
52
+ sources=""
53
+ for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
54
+ sources="${sources} -S $file"
55
+ done
56
+ if [ ! "$sources" ]; then
57
+ echo "No files to spellcheck"
58
+ else
59
+ pyspelling -c $GITHUB_WORKSPACE/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
60
+ fi
61
+
62
+ - name: In the case of misspellings
63
+ if: ${{ failure() }}
64
+ run: |
65
+ echo "Please fix the misspellings. If you are sure about some of them, "
66
+ echo "so append those to scripts/spellcheck_conf/wordlist.txt"
Wav-Omni/.gitignore ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ __pycache__
3
+ .ipynb_checkpoints
4
+ .vscode
5
+ debug.py
6
+ debug.ipynb
7
+ debug.sh
8
+ .idea/*
9
+ transformers
10
+ wandb/
11
+ log/
12
+ *.log
13
+ outputs/
14
+ data/
15
+ jobs/
16
+ debug/
17
+ audio/
18
+
19
+ examples/s2s/scripts/debug
20
+ examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh
21
+ examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh
22
+ examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_copy.sh
23
+ scripts_all
24
+ examples/hotwords_librispeech
25
+ examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_debug.sh
Wav-Omni/Dockerfile ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
2
+
3
+ USER root
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ LABEL github_repo="https://github.com/ddlBoJack/Wav-Omni"
8
+
9
+ RUN set -x \
10
+ && apt-get update \
11
+ && apt-get -y install wget curl man git less openssl libssl-dev unzip unar build-essential aria2 tmux vim ninja-build\
12
+ && apt-get install -y openssh-server sox libsox-fmt-all libsox-fmt-mp3 libsndfile1-dev ffmpeg \
13
+ && rm -rf /var/lib/apt/lists/* \
14
+ && apt-get clean
15
+
16
+ RUN pip install --no-cache-dir packaging editdistance gpustat wandb einops debugpy tqdm soundfile matplotlib scipy sentencepiece pandas \
17
+ && pip install --no-cache-dir torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
18
+
19
+ WORKDIR /workspace
20
+
21
+ RUN git clone https://github.com/huggingface/transformers.git \
22
+ && cd transformers \
23
+ && git checkout tags/v4.35.2 \
24
+ && pip install --no-cache-dir -e .
25
+
26
+ RUN git clone https://github.com/huggingface/peft.git \
27
+ && cd peft \
28
+ && git checkout tags/v0.6.0 \
29
+ && pip install --no-cache-dir -e .
30
+
31
+ RUN git clone https://github.com/pytorch/fairseq \
32
+ && cd fairseq \
33
+ && pip install --no-cache-dir --editable ./
34
+
35
+ RUN git clone https://github.com/ddlBoJack/Wav-Omni.git \
36
+ && cd Wav-Omni \
37
+ && pip install --no-cache-dir -e .
38
+
39
+ ENV SHELL=/bin/bash
40
+
41
+ WORKDIR /workspace/Wav-Omni
Wav-Omni/README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ### 安装环境
2
+ ./Wav-Omni/environment.yml
3
+
4
+ ### 训练和推理
5
+ bash ./Wav-Omni/train.sh
6
+ bash ./Wav-Omni/infer.sh
7
+
8
+ 需要提前从 Hugging Face 上下载 Qwen 和 CosyVoice 的权重，并替换配置文件中的路径。
9
+
10
+ 数据集可以按照 demo 给出的样例格式构建。
Wav-Omni/ckpt/test.py ADDED
File without changes
Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/config.yaml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_config:
2
+ prompt: 'Conduct a spoken conversation with the user. '
3
+ dataset: speech_dataset_s2s
4
+ val_data_path: /home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
5
+ train_data_path: /home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
6
+ input_type: mel
7
+ mel_size: 128
8
+ inference_mode: true
9
+ manifest_format: jsonl
10
+ split_size: 0.002
11
+ load_from_cache_file: false
12
+ task_type: s2s
13
+ seed: 777
14
+ vocab_config:
15
+ code_layer: 3
16
+ total_audio_vocabsize: 4160
17
+ total_vocabsize: 156160
18
+ code_type: CosyVoice
19
+ num_latency_tokens: 0
20
+ do_layershift: false
21
+ model_config:
22
+ llm_name: qwen2-0.5b
23
+ llm_path: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B
24
+ llm_dim: 896
25
+ encoder_name: whisper
26
+ encoder_projector_ds_rate: 5
27
+ encoder_path: large-v3
28
+ encoder_dim: 1280
29
+ encoder_projector: linear
30
+ codec_decoder_path: /home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT
31
+ codec_decode: true
32
+ vocab_config:
33
+ code_layer: 3
34
+ total_audio_vocabsize: 4160
35
+ total_vocabsize: 156160
36
+ code_type: CosyVoice
37
+ codec_decoder_type: CosyVoice
38
+ group_decode: true
39
+ group_decode_adapter_type: linear
40
+ train_config:
41
+ model_name: s2s
42
+ freeze_encoder: true
43
+ freeze_llm: true
44
+ freeze_encoder_projector: true
45
+ freeze_group_decode_adapter: true
46
+ batching_strategy: custom
47
+ num_epochs: 1
48
+ val_batch_size: 1
49
+ num_workers_dataloader: 2
50
+ task_type: s2s
51
+ decode_config:
52
+ text_repetition_penalty: 1.2
53
+ audio_repetition_penalty: 1.2
54
+ max_new_tokens: 3000
55
+ task_type: s2s
56
+ do_sample: false
57
+ top_p: 1.0
58
+ top_k: 0
59
+ temperature: 1.0
60
+ decode_text_only: false
61
+ do_layershift: false
62
+ num_latency_tokens: 0
63
+ decode_log: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy
64
+ ckpt_path: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
65
+ output_text_only: false
66
+ inference_online: false
67
+ speech_sample_rate: 22050
68
+ audio_prompt_path: /home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav
Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/hydra.yaml ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.run.dir=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2
114
+ - hydra.mode=RUN
115
+ task:
116
+ - ++model_config.llm_name=qwen2-0.5b
117
+ - ++model_config.llm_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B
118
+ - ++model_config.llm_dim=896
119
+ - ++model_config.encoder_name=whisper
120
+ - ++model_config.encoder_projector_ds_rate=5
121
+ - ++model_config.encoder_path=large-v3
122
+ - ++model_config.encoder_dim=1280
123
+ - ++model_config.encoder_projector=linear
124
+ - ++model_config.codec_decoder_path=/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT
125
+ - ++model_config.codec_decode=true
126
+ - ++model_config.vocab_config.code_layer=3
127
+ - ++model_config.vocab_config.total_audio_vocabsize=4160
128
+ - ++model_config.vocab_config.total_vocabsize=156160
129
+ - ++model_config.code_type=CosyVoice
130
+ - ++model_config.codec_decoder_type=CosyVoice
131
+ - ++model_config.group_decode=true
132
+ - ++model_config.group_decode_adapter_type=linear
133
+ - ++dataset_config.dataset=speech_dataset_s2s
134
+ - ++dataset_config.val_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
135
+ - ++dataset_config.train_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
136
+ - ++dataset_config.input_type=mel
137
+ - ++dataset_config.mel_size=128
138
+ - ++dataset_config.inference_mode=true
139
+ - ++dataset_config.manifest_format=jsonl
140
+ - ++dataset_config.split_size=0.002
141
+ - ++dataset_config.load_from_cache_file=false
142
+ - ++dataset_config.task_type=s2s
143
+ - ++dataset_config.seed=777
144
+ - ++dataset_config.vocab_config.code_layer=3
145
+ - ++dataset_config.vocab_config.total_audio_vocabsize=4160
146
+ - ++dataset_config.vocab_config.total_vocabsize=156160
147
+ - ++dataset_config.code_type=CosyVoice
148
+ - ++dataset_config.num_latency_tokens=0
149
+ - ++dataset_config.do_layershift=false
150
+ - ++train_config.model_name=s2s
151
+ - ++train_config.freeze_encoder=true
152
+ - ++train_config.freeze_llm=true
153
+ - ++train_config.freeze_encoder_projector=true
154
+ - ++train_config.freeze_group_decode_adapter=true
155
+ - ++train_config.batching_strategy=custom
156
+ - ++train_config.num_epochs=1
157
+ - ++train_config.val_batch_size=1
158
+ - ++train_config.num_workers_dataloader=2
159
+ - ++train_config.task_type=s2s
160
+ - ++decode_config.text_repetition_penalty=1.2
161
+ - ++decode_config.audio_repetition_penalty=1.2
162
+ - ++decode_config.max_new_tokens=3000
163
+ - ++decode_config.task_type=s2s
164
+ - ++decode_config.do_sample=false
165
+ - ++decode_config.top_p=1.0
166
+ - ++decode_config.top_k=0
167
+ - ++decode_config.temperature=1.0
168
+ - ++decode_config.decode_text_only=false
169
+ - ++decode_config.do_layershift=false
170
+ - ++decode_log=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy
171
+ - ++decode_config.num_latency_tokens=0
172
+ - ++ckpt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
173
+ - ++output_text_only=false
174
+ - ++inference_online=false
175
+ - ++speech_sample_rate=22050
176
+ - ++audio_prompt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav
177
+ job:
178
+ name: inference_s2s
179
+ chdir: null
180
+ override_dirname: ++audio_prompt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav,++ckpt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt,++dataset_config.code_type=CosyVoice,++dataset_config.dataset=speech_dataset_s2s,++dataset_config.do_layershift=false,++dataset_config.inference_mode=true,++dataset_config.input_type=mel,++dataset_config.load_from_cache_file=false,++dataset_config.manifest_format=jsonl,++dataset_config.mel_size=128,++dataset_config.num_latency_tokens=0,++dataset_config.seed=777,++dataset_config.split_size=0.002,++dataset_config.task_type=s2s,++dataset_config.train_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl,++dataset_config.val_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl,++dataset_config.vocab_config.code_layer=3,++dataset_config.vocab_config.total_audio_vocabsize=4160,++dataset_config.vocab_config.total_vocabsize=156160,++decode_config.audio_repetition_penalty=1.2,++decode_config.decode_text_only=false,++decode_config.do_layershift=false,++decode_config.do_sample=false,++decode_config.max_new_tokens=3000,++decode_config.num_latency_tokens=0,++decode_config.task_type=s2s,++decode_config.temperature=1.0,++decode_config.text_repetition_penalty=1.2,++decode_config.top_k=0,++decode_config.top_p=1.0,++decode_log=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy,++inference_online=false,++model_config.code_type=CosyVoice,++model_config.codec_decode=true,++model_config.codec_decoder_path=/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT,++model_config.codec_decoder_type=CosyVoice,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=large-v3,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.group_
decode=true,++model_config.group_decode_adapter_type=linear,++model_config.llm_dim=896,++model_config.llm_name=qwen2-0.5b,++model_config.llm_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B,++model_config.vocab_config.code_layer=3,++model_config.vocab_config.total_audio_vocabsize=4160,++model_config.vocab_config.total_vocabsize=156160,++output_text_only=false,++speech_sample_rate=22050,++train_config.batching_strategy=custom,++train_config.freeze_encoder=true,++train_config.freeze_encoder_projector=true,++train_config.freeze_group_decode_adapter=true,++train_config.freeze_llm=true,++train_config.model_name=s2s,++train_config.num_epochs=1,++train_config.num_workers_dataloader=2,++train_config.task_type=s2s,++train_config.val_batch_size=1
181
+ id: ???
182
+ num: ???
183
+ config_name: prompt.yaml
184
+ env_set: {}
185
+ env_copy: []
186
+ config:
187
+ override_dirname:
188
+ kv_sep: '='
189
+ item_sep: ','
190
+ exclude_keys: []
191
+ runtime:
192
+ version: 1.3.2
193
+ version_base: '1.3'
194
+ cwd: /home/xj_data/jishengpeng/huawei/Wav-Omni
195
+ config_sources:
196
+ - path: hydra.conf
197
+ schema: pkg
198
+ provider: hydra
199
+ - path: /home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/conf
200
+ schema: file
201
+ provider: main
202
+ - path: ''
203
+ schema: structured
204
+ provider: schema
205
+ output_dir: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2
206
+ choices:
207
+ hydra/env: default
208
+ hydra/callbacks: null
209
+ hydra/job_logging: default
210
+ hydra/hydra_logging: default
211
+ hydra/hydra_help: default
212
+ hydra/help: default
213
+ hydra/sweeper: basic
214
+ hydra/launcher: basic
215
+ hydra/output: default
216
+ verbose: false
Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/overrides.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - ++model_config.llm_name=qwen2-0.5b
2
+ - ++model_config.llm_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B
3
+ - ++model_config.llm_dim=896
4
+ - ++model_config.encoder_name=whisper
5
+ - ++model_config.encoder_projector_ds_rate=5
6
+ - ++model_config.encoder_path=large-v3
7
+ - ++model_config.encoder_dim=1280
8
+ - ++model_config.encoder_projector=linear
9
+ - ++model_config.codec_decoder_path=/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT
10
+ - ++model_config.codec_decode=true
11
+ - ++model_config.vocab_config.code_layer=3
12
+ - ++model_config.vocab_config.total_audio_vocabsize=4160
13
+ - ++model_config.vocab_config.total_vocabsize=156160
14
+ - ++model_config.code_type=CosyVoice
15
+ - ++model_config.codec_decoder_type=CosyVoice
16
+ - ++model_config.group_decode=true
17
+ - ++model_config.group_decode_adapter_type=linear
18
+ - ++dataset_config.dataset=speech_dataset_s2s
19
+ - ++dataset_config.val_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
20
+ - ++dataset_config.train_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
21
+ - ++dataset_config.input_type=mel
22
+ - ++dataset_config.mel_size=128
23
+ - ++dataset_config.inference_mode=true
24
+ - ++dataset_config.manifest_format=jsonl
25
+ - ++dataset_config.split_size=0.002
26
+ - ++dataset_config.load_from_cache_file=false
27
+ - ++dataset_config.task_type=s2s
28
+ - ++dataset_config.seed=777
29
+ - ++dataset_config.vocab_config.code_layer=3
30
+ - ++dataset_config.vocab_config.total_audio_vocabsize=4160
31
+ - ++dataset_config.vocab_config.total_vocabsize=156160
32
+ - ++dataset_config.code_type=CosyVoice
33
+ - ++dataset_config.num_latency_tokens=0
34
+ - ++dataset_config.do_layershift=false
35
+ - ++train_config.model_name=s2s
36
+ - ++train_config.freeze_encoder=true
37
+ - ++train_config.freeze_llm=true
38
+ - ++train_config.freeze_encoder_projector=true
39
+ - ++train_config.freeze_group_decode_adapter=true
40
+ - ++train_config.batching_strategy=custom
41
+ - ++train_config.num_epochs=1
42
+ - ++train_config.val_batch_size=1
43
+ - ++train_config.num_workers_dataloader=2
44
+ - ++train_config.task_type=s2s
45
+ - ++decode_config.text_repetition_penalty=1.2
46
+ - ++decode_config.audio_repetition_penalty=1.2
47
+ - ++decode_config.max_new_tokens=3000
48
+ - ++decode_config.task_type=s2s
49
+ - ++decode_config.do_sample=false
50
+ - ++decode_config.top_p=1.0
51
+ - ++decode_config.top_k=0
52
+ - ++decode_config.temperature=1.0
53
+ - ++decode_config.decode_text_only=false
54
+ - ++decode_config.do_layershift=false
55
+ - ++decode_log=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy
56
+ - ++decode_config.num_latency_tokens=0
57
+ - ++ckpt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
58
+ - ++output_text_only=false
59
+ - ++inference_online=false
60
+ - ++speech_sample_rate=22050
61
+ - ++audio_prompt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav
Wav-Omni/ckpt/wavomni_chinese_3_2/inference_s2s.log ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-03-05 13:27:43,011][root][INFO] - train_config: {'model_name': 's2s', 'enable_ddp': False, 'enable_deepspeed': False, 'enable_fsdp': False, 'low_cpu_fsdp': False, 'run_validation': True, 'batch_size_training': 4, 'batching_strategy': 'custom', 'context_length': 4096, 'gradient_accumulation_steps': 1, 'num_epochs': 1, 'num_workers_dataloader': 2, 'warmup_steps': 1000, 'total_steps': 100000, 'validation_interval': 1000, 'lr': 0.0001, 'weight_decay': 0.0, 'gamma': 0.85, 'seed': 42, 'use_fp16': False, 'mixed_precision': True, 'val_batch_size': 1, 'use_peft': False, 'peft_config': {'peft_method': 'lora', 'r': 8, 'lora_alpha': 32, 'target_modules': ['q_proj', 'v_proj'], 'bias': 'none', 'task_type': 'CAUSAL_LM', 'lora_dropout': 0.05, 'inference_mode': False}, 'output_dir': 'PATH/to/save/PEFT/model', 'freeze_layers': False, 'num_freeze_layers': 1, 'quantization': False, 'one_gpu': False, 'save_model': True, 'dist_checkpoint_root_folder': 'PATH/to/save/FSDP/model', 'dist_checkpoint_folder': 'fine-tuned', 'save_optimizer': False, 'use_fast_kernels': False, 'run_test_during_validation': False, 'run_test_during_validation_file': 'test.wav', 'run_test_during_validation_prompt': '<|S2S|>', 'freeze_llm': True, 'freeze_encoder': True, 'train_embed_only': False, 'train_audio_embed_only': False, 'train_embed': False, 'task_type': 's2s', 'freeze_encoder_projector': True, 'freeze_group_decode_adapter': True}
2
+ [2025-03-05 13:27:43,011][root][INFO] - fsdp_config: {'mixed_precision': True, 'use_fp16': False, 'sharding_strategy': 'NO_SHARD', 'checkpoint_type': 'SHARDED_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'pure_bf16': False, 'optimizer': 'AdamW'}
3
+ [2025-03-05 13:27:43,011][root][INFO] - model_config: {'file': 'examples/s2s/model/slam_model_s2s.py:model_factory', 'llm_name': 'qwen2-0.5b', 'llm_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B', 'llm_type': 'decoder_only', 'llm_dim': 896, 'encoder_name': 'whisper', 'encoder_ds_rate': 2, 'encoder_path': 'large-v3', 'encoder_dim': 1280, 'encoder_projector': 'linear', 'encoder_projector_ds_rate': 5, 'modal': 'audio', 'normalize': False, 'encoder_type': 'finetune', 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'codec_decode': True, 'codec_decoder_type': 'CosyVoice', 'codec_decoder_path': '/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT', 'tts_adapter': False, 'tts_adapter_config': {'add_qkv_bias': True, 'bias': False, 'gelu_approximate': None, 'head_size': 64, 'intermediate_size': 4864, 'lm_head_bias': False, 'mlp_class_name': 'GptNeoxMLP', 'n_layer': 6, 'n_head': 14, 'n_embd': 896, 'n_query_groups': 2, 'norm_class_name': 'RMSNorm', 'norm_eps': 1e-06, 'parallel_residual': False, 'rotary_percentage': 1.0, 'shared_attention_norm': False}, 'encoder_path_hf': None, 'code_type': 'CosyVoice', 'group_decode': True, 'group_decode_adapter_type': 'linear', 'whisper_decode': False}
4
+ [2025-03-05 13:28:09,748][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
5
+ [2025-03-05 13:28:09,750][wavomni_llm.utils.train_utils][INFO] - --> whisper has 635.04896 Million params
6
+
7
+ [2025-03-05 13:28:09,752][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
8
+ [2025-03-05 13:28:09,753][wavomni_llm.utils.train_utils][INFO] - --> whisper has 0.0 Million params
9
+
10
+ [2025-03-05 13:28:33,421][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
11
+ [2025-03-05 13:28:33,423][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 494.032768 Million params
12
+
13
+ [2025-03-05 13:28:33,424][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
14
+ [2025-03-05 13:28:33,425][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 0.0 Million params
15
+
16
+ [2025-03-05 13:28:33,502][wavomni_llm.utils.train_utils][INFO] - --> Module linear
17
+ [2025-03-05 13:28:33,502][wavomni_llm.utils.train_utils][INFO] - --> linear has 14.945152 Million params
18
+
19
+ [2025-03-05 13:28:46,757][root][INFO] - input frame rate=50
20
+ [2025-03-05 13:28:56,015][wetext-zh_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_tagger.fst
21
+ [2025-03-05 13:28:56,015][wetext-zh_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_verbalizer.fst
22
+ [2025-03-05 13:28:56,015][wetext-zh_normalizer][INFO] - skip building fst for zh_normalizer ...
23
+ [2025-03-05 13:28:56,372][wetext-en_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_tagger.fst
24
+ [2025-03-05 13:28:56,373][wetext-en_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_verbalizer.fst
25
+ [2025-03-05 13:28:56,373][wetext-en_normalizer][INFO] - skip building fst for en_normalizer ...
26
+ [2025-03-05 13:29:46,255][wavomni_llm.utils.train_utils][INFO] - --> Module CosyVoice Codec
27
+ [2025-03-05 13:29:46,260][wavomni_llm.utils.train_utils][INFO] - --> CosyVoice Codec has 104.577327 Million params
28
+
29
+ [2025-03-05 13:29:46,543][wavomni_llm.utils.train_utils][INFO] - --> Module GroupDecodeAdapter_linear
30
+ [2025-03-05 13:29:46,543][wavomni_llm.utils.train_utils][INFO] - --> GroupDecodeAdapter_linear has 51.92928 Million params
31
+
32
+ [2025-03-05 13:29:48,085][slam_model_s2s.py][INFO] - Resize llm embedding layer's vocab size to 156160
33
+
34
+ [2025-03-05 13:29:48,085][slam_model_s2s.py][INFO] - loading other parts from: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
35
+
36
+ [2025-03-05 13:30:16,442][wavomni_llm.utils.train_utils][INFO] - --> Model s2s
37
+ [2025-03-05 13:30:16,444][wavomni_llm.utils.train_utils][INFO] - --> s2s has 0.0 Million params
38
+
39
+ [2025-03-05 13:30:17,360][root][INFO] - dataset_config: {'dataset': 'speech_dataset_s2s', 'file': 'examples/s2s/speech_dataset_s2s.py:get_speech_dataset', 'train_data_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'val_data_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'train_split': 'train', 'test_split': 'validation', 'prompt': 'Conduct a spoken conversation with the user. ', 'data_path': None, 'max_words': None, 'max_mel': None, 'fix_length_audio': -1, 'inference_mode': True, 'input_type': 'mel', 'mel_size': 128, 'normalize': False, 'seed': 777, 'manifest_format': 'jsonl', 'split_size': 0.002, 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'load_from_cache_file': False, 'task_type': 's2s', 'upsample_text_tokens': False, 'upsampling_factor': 1, 'upsample_method': 'repeat', 'code_type': 'CosyVoice', 'num_latency_tokens': 0, 'do_layershift': False}
40
+ [2025-03-05 13:30:17,420][root][INFO] - --> Training Set Length = 4
41
+ [2025-03-05 13:30:17,429][root][INFO] - decode_config: {'do_sample': False, 'max_new_tokens': 3000, 'min_length': 10, 'temperature': 1.0, 'top_k': 0, 'top_p': 1.0, 'num_beams': 1, 'num_return_sequences': 1, 'num_samples': 1, 'max_time': 0.0, 'text_repetition_penalty': 1.2, 'audio_repetition_penalty': 1.2, 'length_penalty': 1.0, 'early_stopping': False, 'no_repeat_ngram_size': 0, 'bad_words_ids': [], 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'task_type': 's2s', 'decode_text_only': False, 'streaming': False, 'stream_stride': 4, 'upsampling_factor': 1, 'input_text': False, 'do_layershift': False, 'num_latency_tokens': 0}
42
+ [2025-03-05 13:30:17,429][root][INFO] - Decode Strategy: Greedy
43
+ [2025-03-05 13:30:17,430][root][INFO] - Decode Text & Audio
44
+ [2025-03-05 13:30:17,430][root][INFO] - Decode Code Type: CosyVoice
45
+ [2025-03-05 13:30:17,430][root][INFO] - Decode Code Layer: 3
46
+ [2025-03-05 13:30:17,430][root][INFO] - Tone for Audio Generation: prompt_1
47
+ [2025-03-05 13:30:17,430][root][INFO] - ============== Start s2s Inference ==============
48
+ [2025-03-05 13:30:38,187][root][INFO] - LLM Inference Time: 19.46s
49
+ [2025-03-05 13:30:38,188][root][INFO] - Question: 回答该题:柠檬的科学名称是什么?
50
+ [2025-03-05 13:30:38,188][root][INFO] - Generated Text: 柠檬的科学名称是“柑橘科柠檬”,也被称为“柑橘属柠檬”。柑橘科柠檬是一种常见的水果,其果实通常呈圆锥形或椭圆形,并且含有大量的汁液。它富含维生素C、钾和纤维素等营养成分,对身体健康有很好的保健作用。柑橘科柠檬也被广泛用于烹饪和饮料制作中,被人们认为具有美容养颜功效。
51
+ [2025-03-05 13:30:44,356][root][INFO] - Generated Audio: prompt_1/1.wav, audio length: 25.80s, generation time: 25.60s, RTF: 0.99
52
+ [2025-03-05 13:30:44,356][root][INFO] - LLM RTF: 0.75
53
+ [2025-03-05 13:57:49,762][root][INFO] - train_config: {'model_name': 's2s', 'enable_ddp': False, 'enable_deepspeed': False, 'enable_fsdp': False, 'low_cpu_fsdp': False, 'run_validation': True, 'batch_size_training': 4, 'batching_strategy': 'custom', 'context_length': 4096, 'gradient_accumulation_steps': 1, 'num_epochs': 1, 'num_workers_dataloader': 2, 'warmup_steps': 1000, 'total_steps': 100000, 'validation_interval': 1000, 'lr': 0.0001, 'weight_decay': 0.0, 'gamma': 0.85, 'seed': 42, 'use_fp16': False, 'mixed_precision': True, 'val_batch_size': 1, 'use_peft': False, 'peft_config': {'peft_method': 'lora', 'r': 8, 'lora_alpha': 32, 'target_modules': ['q_proj', 'v_proj'], 'bias': 'none', 'task_type': 'CAUSAL_LM', 'lora_dropout': 0.05, 'inference_mode': False}, 'output_dir': 'PATH/to/save/PEFT/model', 'freeze_layers': False, 'num_freeze_layers': 1, 'quantization': False, 'one_gpu': False, 'save_model': True, 'dist_checkpoint_root_folder': 'PATH/to/save/FSDP/model', 'dist_checkpoint_folder': 'fine-tuned', 'save_optimizer': False, 'use_fast_kernels': False, 'run_test_during_validation': False, 'run_test_during_validation_file': 'test.wav', 'run_test_during_validation_prompt': '<|S2S|>', 'freeze_llm': True, 'freeze_encoder': True, 'train_embed_only': False, 'train_audio_embed_only': False, 'train_embed': False, 'task_type': 's2s', 'freeze_encoder_projector': True, 'freeze_group_decode_adapter': True}
54
+ [2025-03-05 13:57:49,762][root][INFO] - fsdp_config: {'mixed_precision': True, 'use_fp16': False, 'sharding_strategy': 'NO_SHARD', 'checkpoint_type': 'SHARDED_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'pure_bf16': False, 'optimizer': 'AdamW'}
55
+ [2025-03-05 13:57:49,762][root][INFO] - model_config: {'file': 'examples/s2s/model/slam_model_s2s.py:model_factory', 'llm_name': 'qwen2-0.5b', 'llm_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B', 'llm_type': 'decoder_only', 'llm_dim': 896, 'encoder_name': 'whisper', 'encoder_ds_rate': 2, 'encoder_path': 'large-v3', 'encoder_dim': 1280, 'encoder_projector': 'linear', 'encoder_projector_ds_rate': 5, 'modal': 'audio', 'normalize': False, 'encoder_type': 'finetune', 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'codec_decode': True, 'codec_decoder_type': 'CosyVoice', 'codec_decoder_path': '/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT', 'tts_adapter': False, 'tts_adapter_config': {'add_qkv_bias': True, 'bias': False, 'gelu_approximate': None, 'head_size': 64, 'intermediate_size': 4864, 'lm_head_bias': False, 'mlp_class_name': 'GptNeoxMLP', 'n_layer': 6, 'n_head': 14, 'n_embd': 896, 'n_query_groups': 2, 'norm_class_name': 'RMSNorm', 'norm_eps': 1e-06, 'parallel_residual': False, 'rotary_percentage': 1.0, 'shared_attention_norm': False}, 'encoder_path_hf': None, 'code_type': 'CosyVoice', 'group_decode': True, 'group_decode_adapter_type': 'linear', 'whisper_decode': False}
56
+ [2025-03-05 13:58:15,152][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
57
+ [2025-03-05 13:58:15,154][wavomni_llm.utils.train_utils][INFO] - --> whisper has 635.04896 Million params
58
+
59
+ [2025-03-05 13:58:15,157][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
60
+ [2025-03-05 13:58:15,158][wavomni_llm.utils.train_utils][INFO] - --> whisper has 0.0 Million params
61
+
62
+ [2025-03-05 13:58:17,223][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
63
+ [2025-03-05 13:58:17,224][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 494.032768 Million params
64
+
65
+ [2025-03-05 13:58:17,226][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
66
+ [2025-03-05 13:58:17,227][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 0.0 Million params
67
+
68
+ [2025-03-05 13:58:17,322][wavomni_llm.utils.train_utils][INFO] - --> Module linear
69
+ [2025-03-05 13:58:17,322][wavomni_llm.utils.train_utils][INFO] - --> linear has 14.945152 Million params
70
+
71
+ [2025-03-05 13:58:27,777][root][INFO] - input frame rate=50
72
+ [2025-03-05 13:58:30,984][wetext-zh_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_tagger.fst
73
+ [2025-03-05 13:58:30,984][wetext-zh_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_verbalizer.fst
74
+ [2025-03-05 13:58:30,984][wetext-zh_normalizer][INFO] - skip building fst for zh_normalizer ...
75
+ [2025-03-05 13:58:31,355][wetext-en_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_tagger.fst
76
+ [2025-03-05 13:58:31,355][wetext-en_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_verbalizer.fst
77
+ [2025-03-05 13:58:31,355][wetext-en_normalizer][INFO] - skip building fst for en_normalizer ...
78
+ [2025-03-05 13:58:35,642][wavomni_llm.utils.train_utils][INFO] - --> Module CosyVoice Codec
79
+ [2025-03-05 13:58:35,647][wavomni_llm.utils.train_utils][INFO] - --> CosyVoice Codec has 104.577327 Million params
80
+
81
+ [2025-03-05 13:58:35,932][wavomni_llm.utils.train_utils][INFO] - --> Module GroupDecodeAdapter_linear
82
+ [2025-03-05 13:58:35,932][wavomni_llm.utils.train_utils][INFO] - --> GroupDecodeAdapter_linear has 51.92928 Million params
83
+
84
+ [2025-03-05 13:58:37,463][slam_model_s2s.py][INFO] - Resize llm embedding layer's vocab size to 156160
85
+
86
+ [2025-03-05 13:58:37,463][slam_model_s2s.py][INFO] - loading other parts from: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
87
+
88
+ [2025-03-05 13:58:38,622][wavomni_llm.utils.train_utils][INFO] - --> Model s2s
89
+ [2025-03-05 13:58:38,625][wavomni_llm.utils.train_utils][INFO] - --> s2s has 0.0 Million params
90
+
91
+ [2025-03-05 13:58:39,524][root][INFO] - dataset_config: {'dataset': 'speech_dataset_s2s', 'file': 'examples/s2s/speech_dataset_s2s.py:get_speech_dataset', 'train_data_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'val_data_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'train_split': 'train', 'test_split': 'validation', 'prompt': 'Conduct a spoken conversation with the user. ', 'data_path': None, 'max_words': None, 'max_mel': None, 'fix_length_audio': -1, 'inference_mode': True, 'input_type': 'mel', 'mel_size': 128, 'normalize': False, 'seed': 777, 'manifest_format': 'jsonl', 'split_size': 0.002, 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'load_from_cache_file': False, 'task_type': 's2s', 'upsample_text_tokens': False, 'upsampling_factor': 1, 'upsample_method': 'repeat', 'code_type': 'CosyVoice', 'num_latency_tokens': 0, 'do_layershift': False}
92
+ [2025-03-05 13:58:39,527][root][INFO] - --> Training Set Length = 4
93
+ [2025-03-05 13:58:39,530][root][INFO] - decode_config: {'do_sample': False, 'max_new_tokens': 3000, 'min_length': 10, 'temperature': 1.0, 'top_k': 0, 'top_p': 1.0, 'num_beams': 1, 'num_return_sequences': 1, 'num_samples': 1, 'max_time': 0.0, 'text_repetition_penalty': 1.2, 'audio_repetition_penalty': 1.2, 'length_penalty': 1.0, 'early_stopping': False, 'no_repeat_ngram_size': 0, 'bad_words_ids': [], 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'task_type': 's2s', 'decode_text_only': False, 'streaming': False, 'stream_stride': 4, 'upsampling_factor': 1, 'input_text': False, 'do_layershift': False, 'num_latency_tokens': 0}
94
+ [2025-03-05 13:58:39,530][root][INFO] - Decode Strategy: Greedy
95
+ [2025-03-05 13:58:39,530][root][INFO] - Decode Text & Audio
96
+ [2025-03-05 13:58:39,530][root][INFO] - Decode Code Type: CosyVoice
97
+ [2025-03-05 13:58:39,530][root][INFO] - Decode Code Layer: 3
98
+ [2025-03-05 13:58:39,530][root][INFO] - Tone for Audio Generation: prompt_1
99
+ [2025-03-05 13:58:39,530][root][INFO] - ============== Start s2s Inference ==============
100
+ [2025-03-05 14:00:42,800][root][INFO] - train_config: {'model_name': 's2s', 'enable_ddp': False, 'enable_deepspeed': False, 'enable_fsdp': False, 'low_cpu_fsdp': False, 'run_validation': True, 'batch_size_training': 4, 'batching_strategy': 'custom', 'context_length': 4096, 'gradient_accumulation_steps': 1, 'num_epochs': 1, 'num_workers_dataloader': 2, 'warmup_steps': 1000, 'total_steps': 100000, 'validation_interval': 1000, 'lr': 0.0001, 'weight_decay': 0.0, 'gamma': 0.85, 'seed': 42, 'use_fp16': False, 'mixed_precision': True, 'val_batch_size': 1, 'use_peft': False, 'peft_config': {'peft_method': 'lora', 'r': 8, 'lora_alpha': 32, 'target_modules': ['q_proj', 'v_proj'], 'bias': 'none', 'task_type': 'CAUSAL_LM', 'lora_dropout': 0.05, 'inference_mode': False}, 'output_dir': 'PATH/to/save/PEFT/model', 'freeze_layers': False, 'num_freeze_layers': 1, 'quantization': False, 'one_gpu': False, 'save_model': True, 'dist_checkpoint_root_folder': 'PATH/to/save/FSDP/model', 'dist_checkpoint_folder': 'fine-tuned', 'save_optimizer': False, 'use_fast_kernels': False, 'run_test_during_validation': False, 'run_test_during_validation_file': 'test.wav', 'run_test_during_validation_prompt': '<|S2S|>', 'freeze_llm': True, 'freeze_encoder': True, 'train_embed_only': False, 'train_audio_embed_only': False, 'train_embed': False, 'task_type': 's2s', 'freeze_encoder_projector': True, 'freeze_group_decode_adapter': True}
101
+ [2025-03-05 14:00:42,800][root][INFO] - fsdp_config: {'mixed_precision': True, 'use_fp16': False, 'sharding_strategy': 'NO_SHARD', 'checkpoint_type': 'SHARDED_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'pure_bf16': False, 'optimizer': 'AdamW'}
102
+ [2025-03-05 14:00:42,800][root][INFO] - model_config: {'file': 'examples/s2s/model/slam_model_s2s.py:model_factory', 'llm_name': 'qwen2-0.5b', 'llm_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B', 'llm_type': 'decoder_only', 'llm_dim': 896, 'encoder_name': 'whisper', 'encoder_ds_rate': 2, 'encoder_path': 'large-v3', 'encoder_dim': 1280, 'encoder_projector': 'linear', 'encoder_projector_ds_rate': 5, 'modal': 'audio', 'normalize': False, 'encoder_type': 'finetune', 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'codec_decode': True, 'codec_decoder_type': 'CosyVoice', 'codec_decoder_path': '/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT', 'tts_adapter': False, 'tts_adapter_config': {'add_qkv_bias': True, 'bias': False, 'gelu_approximate': None, 'head_size': 64, 'intermediate_size': 4864, 'lm_head_bias': False, 'mlp_class_name': 'GptNeoxMLP', 'n_layer': 6, 'n_head': 14, 'n_embd': 896, 'n_query_groups': 2, 'norm_class_name': 'RMSNorm', 'norm_eps': 1e-06, 'parallel_residual': False, 'rotary_percentage': 1.0, 'shared_attention_norm': False}, 'encoder_path_hf': None, 'code_type': 'CosyVoice', 'group_decode': True, 'group_decode_adapter_type': 'linear', 'whisper_decode': False}
103
+ [2025-03-05 14:01:06,313][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
104
+ [2025-03-05 14:01:06,315][wavomni_llm.utils.train_utils][INFO] - --> whisper has 635.04896 Million params
105
+
106
+ [2025-03-05 14:01:06,316][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
107
+ [2025-03-05 14:01:06,317][wavomni_llm.utils.train_utils][INFO] - --> whisper has 0.0 Million params
108
+
109
+ [2025-03-05 14:01:08,196][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
110
+ [2025-03-05 14:01:08,197][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 494.032768 Million params
111
+
112
+ [2025-03-05 14:01:08,198][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
113
+ [2025-03-05 14:01:08,199][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 0.0 Million params
114
+
115
+ [2025-03-05 14:01:08,274][wavomni_llm.utils.train_utils][INFO] - --> Module linear
116
+ [2025-03-05 14:01:08,274][wavomni_llm.utils.train_utils][INFO] - --> linear has 14.945152 Million params
117
+
118
+ [2025-03-05 14:01:17,364][root][INFO] - input frame rate=50
119
+ [2025-03-05 14:01:20,258][wetext-zh_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_tagger.fst
120
+ [2025-03-05 14:01:20,258][wetext-zh_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_verbalizer.fst
121
+ [2025-03-05 14:01:20,258][wetext-zh_normalizer][INFO] - skip building fst for zh_normalizer ...
122
+ [2025-03-05 14:01:20,643][wetext-en_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_tagger.fst
123
+ [2025-03-05 14:01:20,643][wetext-en_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_verbalizer.fst
124
+ [2025-03-05 14:01:20,643][wetext-en_normalizer][INFO] - skip building fst for en_normalizer ...
125
+ [2025-03-05 14:01:24,558][wavomni_llm.utils.train_utils][INFO] - --> Module CosyVoice Codec
126
+ [2025-03-05 14:01:24,563][wavomni_llm.utils.train_utils][INFO] - --> CosyVoice Codec has 104.577327 Million params
127
+
128
+ [2025-03-05 14:01:24,850][wavomni_llm.utils.train_utils][INFO] - --> Module GroupDecodeAdapter_linear
129
+ [2025-03-05 14:01:24,850][wavomni_llm.utils.train_utils][INFO] - --> GroupDecodeAdapter_linear has 51.92928 Million params
130
+
131
+ [2025-03-05 14:01:26,392][slam_model_s2s.py][INFO] - Resize llm embedding layer's vocab size to 156160
132
+
133
+ [2025-03-05 14:01:26,393][slam_model_s2s.py][INFO] - loading other parts from: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
134
+
135
+ [2025-03-05 14:01:27,554][wavomni_llm.utils.train_utils][INFO] - --> Model s2s
136
+ [2025-03-05 14:01:27,556][wavomni_llm.utils.train_utils][INFO] - --> s2s has 0.0 Million params
137
+
138
+ [2025-03-05 14:01:28,452][root][INFO] - dataset_config: {'dataset': 'speech_dataset_s2s', 'file': 'examples/s2s/speech_dataset_s2s.py:get_speech_dataset', 'train_data_path': '/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'val_data_path': '/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'train_split': 'train', 'test_split': 'validation', 'prompt': 'Conduct a spoken conversation with the user. ', 'data_path': None, 'max_words': None, 'max_mel': None, 'fix_length_audio': -1, 'inference_mode': True, 'input_type': 'mel', 'mel_size': 128, 'normalize': False, 'seed': 777, 'manifest_format': 'jsonl', 'split_size': 0.002, 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'load_from_cache_file': False, 'task_type': 's2s', 'upsample_text_tokens': False, 'upsampling_factor': 1, 'upsample_method': 'repeat', 'code_type': 'CosyVoice', 'num_latency_tokens': 0, 'do_layershift': False}
139
+ [2025-03-05 14:01:28,499][root][INFO] - --> Training Set Length = 4
140
+ [2025-03-05 14:01:28,502][root][INFO] - decode_config: {'do_sample': False, 'max_new_tokens': 3000, 'min_length': 10, 'temperature': 1.0, 'top_k': 0, 'top_p': 1.0, 'num_beams': 1, 'num_return_sequences': 1, 'num_samples': 1, 'max_time': 0.0, 'text_repetition_penalty': 1.2, 'audio_repetition_penalty': 1.2, 'length_penalty': 1.0, 'early_stopping': False, 'no_repeat_ngram_size': 0, 'bad_words_ids': [], 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'task_type': 's2s', 'decode_text_only': False, 'streaming': False, 'stream_stride': 4, 'upsampling_factor': 1, 'input_text': False, 'do_layershift': False, 'num_latency_tokens': 0}
141
+ [2025-03-05 14:01:28,502][root][INFO] - Decode Strategy: Greedy
142
+ [2025-03-05 14:01:28,503][root][INFO] - Decode Text & Audio
143
+ [2025-03-05 14:01:28,503][root][INFO] - Decode Code Type: CosyVoice
144
+ [2025-03-05 14:01:28,503][root][INFO] - Decode Code Layer: 3
145
+ [2025-03-05 14:01:28,503][root][INFO] - Tone for Audio Generation: prompt_1
146
+ [2025-03-05 14:01:28,503][root][INFO] - ============== Start s2s Inference ==============
147
+ [2025-03-05 14:01:49,477][root][INFO] - LLM Inference Time: 19.84s
148
+ [2025-03-05 14:01:49,478][root][INFO] - Question: 回答该题:柠檬的科学名称是什么?
149
+ [2025-03-05 14:01:49,478][root][INFO] - Generated Text: 柠檬的科学名称是“柑橘科柠檬”,也被称为“柑橘属柠檬”。柑橘科柠檬是一种常见的水果,其果实通常呈圆锥形或椭圆形,并且含有大量的汁液。它富含维生素C、钾和纤维素等营养成分,对身体健康有很好的保健作用。柑橘科柠檬也被广泛用于烹饪和饮料制作中,被人们认为具有美容养颜功效。
150
+ [2025-03-05 14:01:55,437][root][INFO] - Generated Audio: prompt_1/1.wav, audio length: 25.80s, generation time: 25.76s, RTF: 1.00
151
+ [2025-03-05 14:01:55,437][root][INFO] - LLM RTF: 0.77
152
+ [2025-03-05 14:02:07,906][root][INFO] - LLM Inference Time: 12.47s
153
+ [2025-03-05 14:02:07,906][root][INFO] - Question: 请提出五种减少塑料垃圾的方法。
154
+ [2025-03-05 14:02:07,906][root][INFO] - Generated Text: 1. 使用可重复使用的购物袋和水瓶
155
+ 2. 减少使用一次性塑料制品,如塑料餐具、瓶子等
156
+ 3. 避免购买包装过多的食品,选择天然食材制作的食物
157
+ 4. 选择购买环保型产品,如有机纸袋、玻璃杯等
158
+ 5. 在家中安装玻璃窗或窗户,减少室内空气流通。
159
+ [2025-03-05 14:02:16,500][root][INFO] - Generated Audio: prompt_1/2.wav, audio length: 23.09s, generation time: 21.04s, RTF: 0.91
160
+ [2025-03-05 14:02:16,500][root][INFO] - LLM RTF: 0.54
161
+ [2025-03-05 14:02:30,030][root][INFO] - LLM Inference Time: 13.53s
162
+ [2025-03-05 14:02:30,030][root][INFO] - Question: 小李的薪水是每月8000元,他每个月需要交400元的养老保险和200元的医疗保险,请问他每个月实际能拿到的薪水是多少?
163
+ [2025-03-05 14:02:30,030][root][INFO] - Generated Text: 小李每月实际能拿到的薪水是8000元+400元+200元=16000元。
164
+ 解题过程:
165
+ 1. 小李每个月需要交的养老保险和医疗保健的金额之和为 16000元
166
+ 2. 计算小李每月实际能拿到的薪水,即 16000元 - 15000元 = 15000元
167
+ 因此,小李每月实际能拿到的薪水是15000元。
168
+ [2025-03-05 14:02:38,316][root][INFO] - Generated Audio: prompt_1/3.wav, audio length: 25.14s, generation time: 21.79s, RTF: 0.87
169
+ [2025-03-05 14:02:38,316][root][INFO] - LLM RTF: 0.54
170
+ [2025-03-05 14:02:48,677][root][INFO] - LLM Inference Time: 10.36s
171
+ [2025-03-05 14:02:48,677][root][INFO] - Question: 你好,我现在是浙江大学计算机二年级的硕士生,攻读语音方向,你能给我推荐一些学习的建议吗?
172
+ [2025-03-05 14:02:48,677][root][INFO] - Generated Text: 当然可以。首先,建议你多读一些经典的数学书籍,例如《数学原理》、《代数与几何》等。其次,要注重练习和思考,可以通过实践项目或者项目来巩固知识。最后,不要忘记在学习过程中保持耐心和积极性,因为只有通过不断的努力才能取得好的成果。
173
+ [2025-03-05 14:02:57,149][root][INFO] - Generated Audio: prompt_1/4.wav, audio length: 19.41s, generation time: 18.80s, RTF: 0.97
174
+ [2025-03-05 14:02:57,150][root][INFO] - LLM RTF: 0.53
175
+ [2025-03-05 14:02:57,379][root][INFO] - ============== Inference Finished ==============
Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcd393406fe8e519802f73520504470f109a67519a3ab5d5442274793cace3bc
3
+ size 2258846794
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/gt_text ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 1 柠檬的科学名称为Citrus limon,属于芸香科柑橘属的常绿小乔木或灌木植物。它起源于亚洲南部,并在欧洲和地中海地区以及美洲热带地区广泛栽培。柠檬树可以在强烈的阳光下生长,所以它经常被栽种在阳光充足的地方,如地中海气候区。柠檬的果实是圆锥形的,皮表面略带纵行皱纹,果肉酸爽,用于制作饮料和调味料等。柠檬也是一种富含维生素C和抗氧化剂的水果,对保持健康有很大的益处。
2
+ 2 1. 减少使用塑料制品,转而使用可再生或可分解材料制品。2. 训练良好的垃圾分类习惯,将可回收和可分解的塑料垃圾分别分类处理。3. 限制超市、商场,以及快餐店等商家使用一次性塑料袋、餐具和纸杯等,鼓励折叠包与可重复使用的餐具。4. 呼吁政府出台相关的环境保护法律法规,限制塑料垃圾排放,处罚违规行为。5. 开展垃圾分类教育活动,吸引公众关注塑料垃圾减量,培养环保意识,减少不必要的消费行为
3
+ 3 小李每个月的薪水是8000元,需要交400元的养老保险和200元的医疗保险。因此,他实际能拿到的薪水是多少呢?我们可以用一个简单的计算公式来计算:实际薪水 = 总薪水 - 养老保险 - 医疗保险。将小李每月的总薪水8000元代入公式,养老保险400元和医疗保险200元分别代入得:实际薪水 = 8000 - 400 - 200 = 7400元,所以小李每个月实际能拿到的薪水是7400元。
4
+ 4 没有标准回答,请随意发挥。
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6a88562a43f4076affb94dada09d329021ecd7ad041bb2b1b67c22fc8fb3345
3
+ size 1137708
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a94d7a467901758c23a0fc998b44542b7f46f521ce2cb05fe36a1cdc485c9f04
3
+ size 1018412
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e7f3a3d1c9cf1da6280012358379bcb0af32d7d2f4556fcd19de66b114a3676
3
+ size 1108524
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:101fc7477936e0d7ab017d8b2ac78837a0f539a814bdd7df40004e6d3d974eb8
3
+ size 856108
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_text ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1 柠檬的科学名称是“柑橘科柠檬”,也被称为“柑橘属柠檬”。柑橘科柠檬是一种常见的水果,其果实通常呈圆锥形或椭圆形,并且含有大量的汁液。它富含维生素C、钾和纤维素等营养成分,对身体健康有很好的保健作用。柑橘科柠檬也被广泛用于烹饪和饮料制作中,被人们认为具有美容养颜功效。
2
+ 2 1. 使用可重复使用的购物袋和水瓶
3
+ 2. 减少使用一次性塑料制品,如塑料餐具、瓶子等
4
+ 3. 避免购买包装过多的食品,选择天然食材制作的食物
5
+ 4. 选择购买环保型产品,如有机纸袋、玻璃杯等
6
+ 5. 在家中安装玻璃窗或窗户,减少室内空气流通。
7
+ 3 小李每月实际能拿到的薪水是8000元+400元+200元=16000元。
8
+ 解题过程:
9
+ 1. 小李每个月需要交的养老保险和医疗保健的金额之和为 16000元
10
+ 2. 计算小李每月实际能拿到的薪水,即 16000元 - 15000元 = 15000元
11
+ 因此,小李每月实际能拿到的薪水是15000元。
12
+ 4 当然可以。首先,建议你多读一些经典的数学书籍,例如《数学原理》、《代数与几何》等。其次,要注重练习和思考,可以通过实践项目或者项目来巩固知识。最后,不要忘记在学习过程中保持耐心和积极性,因为只有通过不断的努力才能取得好的成果。
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/question_text ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 1 回答该题:柠檬的科学名称是什么?
2
+ 2 请提出五种减少塑料垃圾的方法。
3
+ 3 小李的薪水是每月8000元,他每个月需要交400元的养老保险和200元的医疗保险,请问他每个月实际能拿到的薪水是多少?
4
+ 4 你好,我现在是浙江大学计算机二年级的硕士生,攻读语音方向,你能给我推荐一些学习的建议吗?
Wav-Omni/environment.yml ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: wavomni
2
+ channels:
3
+ - defaults
4
+ - https://repo.anaconda.com/pkgs/main
5
+ - https://repo.anaconda.com/pkgs/r
6
+ dependencies:
7
+ - _libgcc_mutex=0.1=main
8
+ - _openmp_mutex=5.1=1_gnu
9
+ - ca-certificates=2024.12.31=h06a4308_0
10
+ - ld_impl_linux-64=2.40=h12ee557_0
11
+ - libffi=3.4.4=h6a678d5_1
12
+ - libgcc-ng=11.2.0=h1234567_1
13
+ - libgomp=11.2.0=h1234567_1
14
+ - libstdcxx-ng=11.2.0=h1234567_1
15
+ - ncurses=6.4=h6a678d5_0
16
+ - openssl=3.0.15=h5eee18b_0
17
+ - pip=25.0=py39h06a4308_0
18
+ - python=3.9.21=he870216_1
19
+ - readline=8.2=h5eee18b_0
20
+ - setuptools=75.8.0=py39h06a4308_0
21
+ - sqlite=3.45.3=h5eee18b_0
22
+ - tk=8.6.14=h39e8969_0
23
+ - wheel=0.45.1=py39h06a4308_0
24
+ - xz=5.4.6=h5eee18b_1
25
+ - zlib=1.2.13=h5eee18b_1
26
+ - pip:
27
+ - accelerate==1.3.0
28
+ - addict==2.4.0
29
+ - aiofiles==23.2.1
30
+ - aiohappyeyeballs==2.4.4
31
+ - aiohttp==3.11.12
32
+ - aiosignal==1.3.2
33
+ - aliyun-python-sdk-core==2.16.0
34
+ - aliyun-python-sdk-kms==2.16.5
35
+ - annotated-types==0.7.0
36
+ - antlr4-python3-runtime==4.9.3
37
+ - anyio==4.8.0
38
+ - appdirs==1.4.4
39
+ - asttokens==3.0.0
40
+ - async-timeout==5.0.1
41
+ - attrs==25.1.0
42
+ - audioread==3.0.1
43
+ - beautifulsoup4==4.13.3
44
+ - bitsandbytes==0.45.2
45
+ - black==25.1.0
46
+ - brotli==1.1.0
47
+ - certifi==2025.1.31
48
+ - cffi==1.17.1
49
+ - charset-normalizer==3.4.1
50
+ - click==8.1.8
51
+ - cmake==3.25.0
52
+ - coloredlogs==15.0.1
53
+ - conformer==0.3.2
54
+ - contourpy==1.3.0
55
+ - crcmod==1.7
56
+ - cryptography==44.0.0
57
+ - cycler==0.12.1
58
+ - cython==3.0.11
59
+ - datasets==2.18.0
60
+ - decorator==5.1.1
61
+ - deepspeed==0.14.2
62
+ - diffusers==0.27.2
63
+ - dill==0.3.8
64
+ - docker-pycreds==0.4.0
65
+ - einops==0.8.0
66
+ - eval-type-backport==0.2.2
67
+ - exceptiongroup==1.2.2
68
+ - executing==2.2.0
69
+ - fastapi==0.115.8
70
+ - ffmpy==0.5.0
71
+ - filelock==3.17.0
72
+ - fire==0.7.0
73
+ - flatbuffers==25.1.24
74
+ - fonttools==4.55.8
75
+ - frozenlist==1.5.0
76
+ - fsspec==2024.2.0
77
+ - gast==0.6.0
78
+ - gdown==5.2.0
79
+ - gitdb==4.0.12
80
+ - gitpython==3.1.44
81
+ - gradio==4.44.1
82
+ - gradio-client==1.3.0
83
+ - grpcio==1.57.0
84
+ - grpcio-tools==1.57.0
85
+ - h11==0.14.0
86
+ - hjson==3.1.0
87
+ - httpcore==1.0.7
88
+ - httpx==0.28.1
89
+ - huggingface-hub==0.25.2
90
+ - humanfriendly==10.0
91
+ - hydra-core==1.3.2
92
+ - hyperpyyaml==1.2.2
93
+ - idna==3.10
94
+ - importlib-metadata==8.6.1
95
+ - importlib-resources==6.5.2
96
+ - inflate64==1.0.1
97
+ - inflect==7.3.1
98
+ - ipython==8.18.1
99
+ - jedi==0.19.2
100
+ - jinja2==3.1.5
101
+ - jmespath==0.10.0
102
+ - joblib==1.4.2
103
+ - kiwisolver==1.4.7
104
+ - lazy-loader==0.4
105
+ - librosa==0.10.2.post1
106
+ - lightning==2.2.4
107
+ - lightning-utilities==0.12.0
108
+ - lit==15.0.7
109
+ - llvmlite==0.43.0
110
+ - loralib==0.1.2
111
+ - markdown-it-py==3.0.0
112
+ - markupsafe==2.1.5
113
+ - matplotlib==3.7.5
114
+ - matplotlib-inline==0.1.7
115
+ - mdurl==0.1.2
116
+ - modelscope==1.15.0
117
+ - more-itertools==10.6.0
118
+ - mpmath==1.3.0
119
+ - msgpack==1.1.0
120
+ - multidict==6.1.0
121
+ - multiprocess==0.70.16
122
+ - multivolumefile==0.2.3
123
+ - mypy-extensions==1.0.0
124
+ - networkx==3.2.1
125
+ - ninja==1.11.1.3
126
+ - numba==0.60.0
127
+ - numpy==1.26.4
128
+ - nvidia-cublas-cu12==12.1.3.1
129
+ - nvidia-cuda-cupti-cu12==12.1.105
130
+ - nvidia-cuda-nvrtc-cu12==12.1.105
131
+ - nvidia-cuda-runtime-cu12==12.1.105
132
+ - nvidia-cudnn-cu12==8.9.2.26
133
+ - nvidia-cufft-cu12==11.0.2.54
134
+ - nvidia-curand-cu12==10.3.2.106
135
+ - nvidia-cusolver-cu12==11.4.5.107
136
+ - nvidia-cusparse-cu12==12.1.0.106
137
+ - nvidia-cusparselt-cu12==0.6.2
138
+ - nvidia-ml-py==12.570.86
139
+ - nvidia-nccl-cu12==2.20.5
140
+ - nvidia-nvjitlink-cu12==12.4.127
141
+ - nvidia-nvtx-cu12==12.1.105
142
+ - omegaconf==2.3.0
143
+ - onnxruntime-gpu==1.18.0
144
+ - openai-whisper==20231117
145
+ - optimum==1.24.0
146
+ - orjson==3.10.15
147
+ - oss2==2.19.1
148
+ - packaging==24.2
149
+ - pandas==2.2.3
150
+ - parso==0.8.4
151
+ - pathspec==0.12.1
152
+ - peft==0.6.0
153
+ - pexpect==4.9.0
154
+ - pillow==10.4.0
155
+ - platformdirs==4.3.6
156
+ - pooch==1.8.2
157
+ - prompt-toolkit==3.0.50
158
+ - propcache==0.2.1
159
+ - protobuf==4.25.6
160
+ - psutil==6.1.1
161
+ - ptyprocess==0.7.0
162
+ - pure-eval==0.2.3
163
+ - py-cpuinfo==9.0.0
164
+ - py7zr==0.22.0
165
+ - pyarrow==19.0.0
166
+ - pyarrow-hotfix==0.6
167
+ - pybcj==1.0.3
168
+ - pycparser==2.22
169
+ - pycryptodome==3.21.0
170
+ - pycryptodomex==3.21.0
171
+ - pydantic==2.10.6
172
+ - pydantic-core==2.27.2
173
+ - pydub==0.25.1
174
+ - pygments==2.19.1
175
+ - pynini==2.1.5
176
+ - pynvml==12.0.0
177
+ - pyparsing==3.2.1
178
+ - pyppmd==1.1.1
179
+ - pysocks==1.7.1
180
+ - python-dateutil==2.9.0.post0
181
+ - python-multipart==0.0.20
182
+ - pytorch-lightning==2.5.0.post0
183
+ - pytz==2025.1
184
+ - pyyaml==6.0.2
185
+ - pyzstd==0.16.2
186
+ - regex==2024.11.6
187
+ - requests==2.32.3
188
+ - rich==13.9.4
189
+ - ruamel-yaml==0.18.10
190
+ - ruamel-yaml-clib==0.2.12
191
+ - ruff==0.9.5
192
+ - safetensors==0.5.2
193
+ - scikit-learn==1.6.1
194
+ - scipy==1.13.1
195
+ - semantic-version==2.10.0
196
+ - sentencepiece==0.2.0
197
+ - sentry-sdk==2.20.0
198
+ - setproctitle==1.3.4
199
+ - shellingham==1.5.4
200
+ - simplejson==3.19.3
201
+ - six==1.17.0
202
+ - slam-llm==0.0.1
203
+ - smmap==5.0.2
204
+ - sniffio==1.3.1
205
+ - sortedcontainers==2.4.0
206
+ - soundfile==0.13.1
207
+ - soupsieve==2.6
208
+ - soxr==0.5.0.post1
209
+ - stack-data==0.6.3
210
+ - starlette==0.45.3
211
+ - sympy==1.13.1
212
+ - termcolor==2.5.0
213
+ - texttable==1.7.0
214
+ - threadpoolctl==3.5.0
215
+ - tiktoken==0.8.0
216
+ - tokenize-rt==6.1.0
217
+ - tokenizers==0.19.1
218
+ - tomli==2.2.1
219
+ - tomlkit==0.12.0
220
+ - torch==2.3.0
221
+ - torchaudio==2.3.0
222
+ - torchmetrics==1.6.1
223
+ - torchvision==0.18.0
224
+ - tqdm==4.67.1
225
+ - traitlets==5.14.3
226
+ - transformers==4.43.4
227
+ - triton==2.3.0
228
+ - typeguard==4.4.1
229
+ - typer==0.15.1
230
+ - typing-extensions==4.12.2
231
+ - tzdata==2025.1
232
+ - urllib3==2.3.0
233
+ - uvicorn==0.34.0
234
+ - wandb==0.19.6
235
+ - wcwidth==0.2.13
236
+ - websockets==12.0
237
+ - wetextprocessing==1.0.3
238
+ - wget==3.2
239
+ - xxhash==3.5.0
240
+ - yapf==0.43.0
241
+ - yarl==1.18.3
242
+ - zipp==3.21.0
243
+ prefix: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni
Wav-Omni/examples/s2s/__pycache__/s2s_config.cpython-39.pyc ADDED
Binary file (10.2 kB). View file
 
Wav-Omni/examples/s2s/__pycache__/speech_dataset_s2s.cpython-39.pyc ADDED
Binary file (15 kB). View file
 
Wav-Omni/examples/s2s/audio_prompt/en/prompt_1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:feeda768f3add18351ee20bed5bea58f5bb0d1936bb6073b4539496e20226888
3
+ size 303016
Wav-Omni/examples/s2s/audio_prompt/en/prompt_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:685356ad8d595caed18b836df579d29e1764327f02c740e34c166e53fe7ff7e0
3
+ size 308662
Wav-Omni/examples/s2s/audio_prompt/en/prompt_3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5993d88854f032e50e3bd850d6aec76222d9300d9999cec24bb4ea7186c46799
3
+ size 303016
Wav-Omni/examples/s2s/audio_prompt/en/prompt_4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e8403303c63f3ecd78a71b423aecbff73f2c5ec50cd105f26391707ec2f7b8d
3
+ size 319950
Wav-Omni/examples/s2s/audio_prompt/en/prompt_5.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8415294e7303851998823ea67e23d5d9033f04fd6c2170a83f3be8d1c65e7583
3
+ size 312424
Wav-Omni/examples/s2s/audio_prompt/en/prompt_6.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c17b0cbe97a1d195493cf903077a90caa50029c5a57644f60e98e4e2e547b61b
3
+ size 335004
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:739ab2183a65576f14eec6f72ea7de6954e02014491b2876b1c528feea4ba5bc
3
+ size 214316
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28fc8a7443a5fbadb04566e1344792fb6fc5847d27b16afeb441b813063fec58
3
+ size 197676
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c70be19e3a974efae41a727872d2385f597f3cb48054dd2b19bd41c8e3910749
3
+ size 198444
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa7c7ee1b196484661326e915ee3cbb5a313efdbbf67ed735ded8a0b7cd9f441
3
+ size 216364
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_5.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8580fc452743c8fb8554b8fbf72f0962fb82346d64471c0d2f976d9abbf3f0e7
3
+ size 219180
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_6.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:751bc465053b4cdcb184cef48c3f58168a3d6c699d3b6b72230c383507d77fb8
3
+ size 197164
Wav-Omni/examples/s2s/conf/ds_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_micro_batch_size_per_gpu": 4,
3
+ "gradient_accumulation_steps": 1,
4
+ "optimizer": {
5
+ "type": "Adam",
6
+ "params": {
7
+ "lr": 1e-4
8
+ }
9
+ },
10
+ "fp16": {
11
+ "enabled": true
12
+ },
13
+ "zero_optimization": {
14
+ "stage": 3,
15
+ "offload_optimizer": {
16
+ "device": "cpu"
17
+ }
18
+ }
19
+ }
Wav-Omni/examples/s2s/conf/prompt.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ dataset_config:
2
+ # We put the prompt here because Hydra overrides in shell scripts only support a small subset of characters.
3
+ prompt: "Conduct a spoken conversation with the user. "
Wav-Omni/examples/s2s/conf/prompt_asr.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ dataset_config:
2
+ prompt: "Transcribe the provided audio into accurate text. "
Wav-Omni/examples/s2s/conf/prompt_tts.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ dataset_config:
2
+ # We put the prompt here because Hydra overrides in shell scripts only support a small subset of characters.
3
+ # prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. "
4
+ prompt: "Generate a natural and expressive spoken version of the given text. "
Wav-Omni/examples/s2s/deepspeed_finetune_s2s.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from wavomni_llm.pipeline.finetune_deepspeed import main as train
2
+ from wavomni_llm.utils.deepspeed_utils import deepspeed_main_wrapper
3
+
4
+ import logging
5
+ from dataclasses import dataclass, field
6
+ from omegaconf import DictConfig, ListConfig, OmegaConf
7
+ from s2s_config import ModelConfig, TrainConfig, DataConfig, LogConfig
8
+
9
+
10
@dataclass
class RunConfig:
    """Top-level run configuration for the DeepSpeed s2s fine-tuning script.

    Bundles the dataset, model, training and logging sub-configs imported
    from ``s2s_config`` together with a few run-level switches.
    """

    dataset_config: DataConfig = field(default_factory=DataConfig)
    model_config: ModelConfig = field(default_factory=ModelConfig)
    train_config: TrainConfig = field(default_factory=TrainConfig)
    log_config: LogConfig = field(default_factory=LogConfig)
    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
    # NOTE(review): the default points at the asr_librispeech example rather than
    # examples/s2s/conf/ds_config.json -- confirm this is intended.
    deepspeed_config: str = field(
        default="examples/asr_librispeech/conf/ds_config.json",
        metadata={"help": "Path to the DeepSpeed JSON config file"},
    )
19
+
20
+
21
@deepspeed_main_wrapper(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    """Merge the incoming config over ``RunConfig`` defaults and launch training.

    Args:
        cfg: Configuration object handed in by ``deepspeed_main_wrapper``.
    """
    # Start from the RunConfig defaults; entries present in cfg override them.
    kwargs = OmegaConf.merge(RunConfig(), cfg)

    # Map the level name (default "INFO") onto the logging module's constant.
    log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())
    logging.basicConfig(level=log_level)

    if kwargs.get("debug", False):
        # Imported lazily so pdb is only loaded when debugging is requested.
        import pdb

        pdb.set_trace()

    train(kwargs)


if __name__ == "__main__":
    main_hydra()
Wav-Omni/examples/s2s/demo/142001_0_rlhf.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d027ee1e2cbe76cef17ead8d8d92bfb2a68df09d756ebcce8e9a4339e8738be
3
+ size 330796
Wav-Omni/examples/s2s/demo/142023_0_rlhf.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64278538582f31b7657b9496876a9cc5ea3c1204bfe49aaf2d085c0ca7356400
3
+ size 221228
Wav-Omni/examples/s2s/demo/142038_0_rlhf.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a09e765fb5e025c206ea65531143d8c60e8b545c82dc5e0d040f4e7c5a4417
3
+ size 362540
Wav-Omni/examples/s2s/demo/64474_1_rlhf.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb2a25bb5a13844494e4e8069e277ea12aa199a6476eae06cfbfc47cf668ff64
3
+ size 183340
Wav-Omni/examples/s2s/demo/demo_data/142038_0_rlhf.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a09e765fb5e025c206ea65531143d8c60e8b545c82dc5e0d040f4e7c5a4417
3
+ size 362540
Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"key": "1", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
2
+ {"key": "2", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00244-of-00325/142001_0_rlhf.wav", "source_text": "What are some things I can mix into a pizza crust?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142001_0_rlhf.wav", "target_text": "There are lots of tasty ingredients you can mix into a pizza crust to enhance flavor and texture! You can try adding herbs like oregano, basil, or rosemary for a fragrant touch. Garlic powder or minced garlic can add a nice, strong flavor. Cheese like Parmesan or shredded mozzarella can make your crust richer and more savory. Some people also like to add a bit of sugar or honey for a slightly sweet note. You can even mix in finely chopped olives or sundried tomatoes for a unique twist. Always ensure your added ingredients are well-incorporated into the dough before baking for the best results."}
3
+ {"key": "3", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00244-of-00325/142023_0_rlhf.wav", "source_text": "How do you take care of a frog?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142023_0_rlhf.wav", "target_text": "Taking care of a frog involves several important steps. First, you'll need a proper habitat, like a terrarium, with enough space, humidity, and proper temperature. Ensure the terrarium has a secure lid to prevent escape. Use substrates like coconut fiber or moss to keep the environment humid. Provide hiding spots like logs or rocks for your frog to feel safe. Frogs require clean water, so provide a shallow dish with dechlorinated or bottled water. Feed your frog a diet of live insects, like crickets or worms, and dust them with a calcium supplement occasionally. Regularly clean the terrarium to prevent bacterial buildup and monitor your frog for signs of illness, consulting a vet if anything seems amiss."}
4
+ {"key": "4", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00244-of-00325/142038_0_rlhf.wav", "source_text": "Search for a peach crumble bar recipe.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142038_0_rlhf.wav", "target_text": "Sure! A quick search online will give you some great peach crumble bar recipes. Typically, these recipes involve making a base crust, a peach filling, and a crumbly topping. You'll need ingredients like fresh peaches, flour, sugar, butter, and oats. You start by creating the crust, layering the peach filling, and finishing with the crumble topping before baking it all together. Enjoy your delicious treat!"}
Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"key": "1", "source_wav": "/cpfs01/shared/jishengpeng/Data/wavomni/tmp/73856459_1_train_3.5M_CN_ready4cosy_wo_code_switching.wav", "source_text": "回答该题:柠檬的科学名称是什么?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "柠檬的科学名称为Citrus limon,属于芸香科柑橘属的常绿小乔木或灌木植物。它起源于亚洲南部,并在欧洲和地中海地区以及美洲热带地区广泛栽培。柠檬树可以在强烈的阳光下生长,所以它经常被栽种在阳光充足的地方,如地中海气候区。柠檬的果实是圆锥形的,皮表面略带纵行皱纹,果肉酸爽,用于制作饮料和调味料等。柠檬也是一种富含维生素C和抗氧化剂的水果,对保持健康有很大的益处。"}
2
+ {"key": "2", "source_wav": "/cpfs01/shared/jishengpeng/Data/wavomni/tmp/96297378_1_train_3.5M_CN_ready4cosy_wo_code_switching.wav", "source_text": "请提出五种减少塑料垃圾的方法。", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142001_0_rlhf.wav", "target_text": "1. 减少使用塑料制品,转而使用可再生或可分解材料制品。2. 训练良好的垃圾分类习惯,将可回收和可分解的塑料垃圾分别分类处理。3. 限制超市、商场,以及快餐店等商家使用一次性塑料袋、餐具和纸杯等,鼓励折叠包与可重复使用的餐具。4. 呼吁政府出台相关的环境保护法律法规,限制塑料垃圾排放,处罚违规行为。5. 开展垃圾分类教育活动,吸引公众关注塑料垃圾减量,培养环保意识,减少不必要的消费行为"}
3
+ {"key": "3", "source_wav": "/cpfs01/shared/jishengpeng/Data/wavomni/tmp/4412288_1_train_3.5M_CN_ready4cosy_wo_code_switching.wav", "source_text": "小李的薪水是每月8000元,他每个月需要交400元的养老保险和200元的医疗保险,请问他每个月实际能拿到的薪水是多少?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142023_0_rlhf.wav", "target_text": "小李每个月的薪水是8000元,需要交400元的养老保险和200元的医疗保险。因此,他实际能拿到的薪水是多少呢?我们可以用一个简单的计算公式来计算:实际薪水 = 总薪水 - 养老保险 - 医疗保险。将小李每月的总薪水8000元代入公式,养老保险400元和医疗保险200元分别代入得:实际薪水 = 8000 - 400 - 200 = 7400元,所以小李每个月实际能拿到的薪水是7400元。"}
4
+ {"key": "4", "source_wav": "/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/question_jsp.wav", "source_text": "你好,我现在是浙江大学计算机二年级的硕士生,攻读语音方向,你能给我推荐一些学习的建议吗?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142038_0_rlhf.wav", "target_text": "没有标准回答,请随意发挥。"}
Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_train.jsonl ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"key": "1", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
2
+ {"key": "2", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
3
+ {"key": "3", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
4
+ {"key": "4", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
5
+ {"key": "5", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
6
+ {"key": "6", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
7
+ {"key": "7", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
8
+ {"key": "8", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
9
+ {"key": "9", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
10
+ {"key": "10", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
11
+ {"key": "11", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
12
+ {"key": "12", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
13
+ {"key": "13", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
14
+ {"key": "14", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
15
+ {"key": "15", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
16
+ {"key": "16", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
17
+ {"key": "17", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
18
+ {"key": "18", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
19
+ {"key": "19", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
20
+ {"key": "20", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
21
+ {"key": "21", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
22
+ {"key": "22", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
23
+ {"key": "23", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
24
+ {"key": "24", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
25
+ {"key": "25", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
26
+ {"key": "26", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
27
+ {"key": "27", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
28
+ {"key": "28", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
29
+ {"key": "29", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
30
+ {"key": "30", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
Wav-Omni/examples/s2s/demo/question_jsp.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a83d78aca6d0973e96ed1d0c59082a1f13234c9b4a80408cce29d3a9e13fb7b4
3
+ size 248997