Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
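For context, a commit like this one is what the `upload_folder` helper in the `huggingface_hub` Python library produces. A minimal sketch under assumed names (the `repo_id` and local path below are placeholders, not the actual destination of this upload):

```python
# Minimal sketch of reproducing an upload like this commit.
# repo_id and folder_path are placeholders -- adapt them to your own repo.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./Wav-Omni",           # local directory to push
    path_in_repo="Wav-Omni",            # keep the Wav-Omni/ prefix seen in this diff
    repo_id="your-username/your-repo",  # placeholder target repository
    commit_message="Upload folder using huggingface_hub",
)
```

Large binaries such as the `.wav` files are stored through Git LFS, which is consistent with the `.gitattributes` entries added in this commit.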
- .gitattributes +22 -0
- Wav-Omni/.github/ISSUE_TEMPLATE/bug.yml +79 -0
- Wav-Omni/.github/ISSUE_TEMPLATE/feature-request.yml +31 -0
- Wav-Omni/.github/PULL_REQUEST_TEMPLATE.md +38 -0
- Wav-Omni/.github/workflows/spellcheck.yml +66 -0
- Wav-Omni/.gitignore +25 -0
- Wav-Omni/Dockerfile +41 -0
- Wav-Omni/README.md +10 -0
- Wav-Omni/ckpt/test.py +0 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/config.yaml +68 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/hydra.yaml +216 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/overrides.yaml +61 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/inference_s2s.log +175 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt +3 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/gt_text +4 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/1.wav +3 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/2.wav +3 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/3.wav +3 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/4.wav +3 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_text +12 -0
- Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/question_text +4 -0
- Wav-Omni/environment.yml +243 -0
- Wav-Omni/examples/s2s/__pycache__/s2s_config.cpython-39.pyc +0 -0
- Wav-Omni/examples/s2s/__pycache__/speech_dataset_s2s.cpython-39.pyc +0 -0
- Wav-Omni/examples/s2s/audio_prompt/en/prompt_1.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/en/prompt_2.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/en/prompt_3.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/en/prompt_4.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/en/prompt_5.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/en/prompt_6.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/zh/prompt_2.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/zh/prompt_3.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/zh/prompt_4.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/zh/prompt_5.wav +3 -0
- Wav-Omni/examples/s2s/audio_prompt/zh/prompt_6.wav +3 -0
- Wav-Omni/examples/s2s/conf/ds_config.json +19 -0
- Wav-Omni/examples/s2s/conf/prompt.yaml +3 -0
- Wav-Omni/examples/s2s/conf/prompt_asr.yaml +2 -0
- Wav-Omni/examples/s2s/conf/prompt_tts.yaml +4 -0
- Wav-Omni/examples/s2s/deepspeed_finetune_s2s.py +47 -0
- Wav-Omni/examples/s2s/demo/142001_0_rlhf.wav +3 -0
- Wav-Omni/examples/s2s/demo/142023_0_rlhf.wav +3 -0
- Wav-Omni/examples/s2s/demo/142038_0_rlhf.wav +3 -0
- Wav-Omni/examples/s2s/demo/64474_1_rlhf.wav +3 -0
- Wav-Omni/examples/s2s/demo/demo_data/142038_0_rlhf.wav +3 -0
- Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp.jsonl +4 -0
- Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl +4 -0
- Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_train.jsonl +30 -0
- Wav-Omni/examples/s2s/demo/question_jsp.wav +3 -0
.gitattributes
CHANGED
@@ -34,3 +34,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 mucodec.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/1.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/2.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/3.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/4.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/en/prompt_1.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/en/prompt_2.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/en/prompt_3.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/en/prompt_4.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/en/prompt_5.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/en/prompt_6.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/zh/prompt_2.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/zh/prompt_3.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/zh/prompt_4.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/zh/prompt_5.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/audio_prompt/zh/prompt_6.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/demo/142001_0_rlhf.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/demo/142023_0_rlhf.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/demo/142038_0_rlhf.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/demo/64474_1_rlhf.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/demo/demo_data/142038_0_rlhf.wav filter=lfs diff=lfs merge=lfs -text
+Wav-Omni/examples/s2s/demo/question_jsp.wav filter=lfs diff=lfs merge=lfs -text
Wav-Omni/.github/ISSUE_TEMPLATE/bug.yml
ADDED
@@ -0,0 +1,79 @@
+name: 🐛 Bug Report
+description: Create a report to help us reproduce and fix the bug
+
+body:
+  - type: markdown
+    attributes:
+      value: >
+        #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the
+        existing and past issues](https://github.com/ddlBoJack/Wav-Omni/issues).
+
+  - type: textarea
+    id: system-info
+    attributes:
+      label: System Info
+      description: |
+        Please share your system info with us. You can use the following command to capture your environment information
+        python -m "torch.utils.collect_env"
+
+      placeholder: |
+        PyTorch version, CUDA version, GPU type, #num of GPUs...
+    validations:
+      required: true
+
+  - type: checkboxes
+    id: information-scripts-examples
+    attributes:
+      label: Information
+      description: 'The problem arises when using:'
+      options:
+        - label: "The official example scripts"
+        - label: "My own modified scripts"
+
+  - type: textarea
+    id: bug-description
+    attributes:
+      label: 🐛 Describe the bug
+      description: |
+        Please provide a clear and concise description of what the bug is.
+
+        Provide the exact command(s) that you ran with the settings, e.g. using FSDP and PEFT or pure FSDP.
+
+        Please also paste or describe the results you observe instead of the expected results.
+      placeholder: |
+        A clear and concise description of what the bug is.
+
+        ```python
+        # Command that you used for running the examples
+        ```
+        Description of the results
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: Error logs
+      description: |
+        If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+
+      placeholder: |
+        ```
+        The error message you got, with the full traceback.
+        ```
+
+    validations:
+      required: true
+
+
+  - type: textarea
+    id: expected-behavior
+    validations:
+      required: true
+    attributes:
+      label: Expected behavior
+      description: "A clear and concise description of what you would expect to happen."
+
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉!
Wav-Omni/.github/ISSUE_TEMPLATE/feature-request.yml
ADDED
@@ -0,0 +1,31 @@
+name: 🚀 Feature request
+description: Submit a proposal/request for a new Wav-Omni feature
+
+body:
+  - type: textarea
+    id: feature-pitch
+    attributes:
+      label: 🚀 The feature, motivation and pitch
+      description: >
+        A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
+    validations:
+      required: true
+
+  - type: textarea
+    id: alternatives
+    attributes:
+      label: Alternatives
+      description: >
+        A description of any alternative solutions or features you've considered, if any.
+
+  - type: textarea
+    id: additional-context
+    attributes:
+      label: Additional context
+      description: >
+        Add any other context or screenshots about the feature request.
+
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉!
Wav-Omni/.github/PULL_REQUEST_TEMPLATE.md
ADDED
@@ -0,0 +1,38 @@
+# What does this PR do?
+
+<!--
+Congratulations! You've made it this far! You're not quite done yet though.
+
+Please include a good title that fully reflects the extent of your awesome contribution.
+
+Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.
+
+-->
+
+<!-- Remove if not applicable -->
+
+Fixes # (issue)
+
+
+## Feature/Issue validation/testing
+
+Please describe the tests that you ran to verify your changes and relevant result summary. Provide instructions so it can be reproduced.
+Please also list any relevant details for your test configuration.
+
+- [ ] Test A
+Logs for Test A
+
+- [ ] Test B
+Logs for Test B
+
+
+## Before submitting
+- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
+- [ ] Did you read the [contributor guideline](https://github.com/facebookresearch/llama-recipes/blob/main/CONTRIBUTING.md#pull-requests),
+      Pull Request section?
+- [ ] Was this discussed/approved via a Github issue? Please add a link
+      to it if that's the case.
+- [ ] Did you make sure to update the documentation with your changes?
+- [ ] Did you write any new necessary tests?
+
+Thanks for contributing 🎉!
Wav-Omni/.github/workflows/spellcheck.yml
ADDED
@@ -0,0 +1,66 @@
+name: SpellCheck
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+jobs:
+  build:
+    runs-on: ubuntu-20.04
+    name: Lint changed files
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0  # OR "2" -> To retrieve the preceding commit.
+
+      - name: Check links in all markdown files
+        uses: gaurav-nelson/github-action-markdown-link-check@1.0.13
+        with:
+          use-verbose-mode: 'yes'
+          config-file: "scripts/markdown_link_check_config.json"
+
+      - name: Get changed files
+        id: changed-files
+        uses: tj-actions/changed-files@v29.0.4
+        with:
+
+          files: |
+            **/*.py
+
+  spellcheck:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get install aspell aspell-en
+          pip install pyspelling
+
+      - name: Get changed files
+        id: changed-files
+        uses: tj-actions/changed-files@v29.0.4
+        with:
+          files: |
+            **/*.md
+
+      - name: Check spellings
+        run: |
+          sources=""
+          for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
+            sources="${sources} -S $file"
+          done
+          if [ ! "$sources" ]; then
+            echo "No files to spellcheck"
+          else
+            pyspelling -c $GITHUB_WORKSPACE/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
+          fi
+
+      - name: In the case of misspellings
+        if: ${{ failure() }}
+        run: |
+          echo "Please fix the misspellings. If you are sure about some of them, "
+          echo "so append those to scripts/spellcheck_conf/wordlist.txt"
Wav-Omni/.gitignore
ADDED
@@ -0,0 +1,25 @@
+.DS_Store
+__pycache__
+.ipynb_checkpoints
+.vscode
+debug.py
+debug.ipynb
+debug.sh
+.idea/*
+transformers
+wandb/
+log/
+*.log
+outputs/
+data/
+jobs/
+debug/
+audio/
+
+examples/s2s/scripts/debug
+examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh
+examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh
+examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_copy.sh
+scripts_all
+examples/hotwords_librispeech
+examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_debug.sh
Wav-Omni/Dockerfile
ADDED
@@ -0,0 +1,41 @@
+FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
+
+USER root
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+LABEL github_repo="https://github.com/ddlBoJack/Wav-Omni"
+
+RUN set -x \
+    && apt-get update \
+    && apt-get -y install wget curl man git less openssl libssl-dev unzip unar build-essential aria2 tmux vim ninja-build \
+    && apt-get install -y openssh-server sox libsox-fmt-all libsox-fmt-mp3 libsndfile1-dev ffmpeg \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN pip install --no-cache-dir packaging editdistance gpustat wandb einops debugpy tqdm soundfile matplotlib scipy sentencepiece pandas \
+    && pip install --no-cache-dir torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
+
+WORKDIR /workspace
+
+RUN git clone https://github.com/huggingface/transformers.git \
+    && cd transformers \
+    && git checkout tags/v4.35.2 \
+    && pip install --no-cache-dir -e .
+
+RUN git clone https://github.com/huggingface/peft.git \
+    && cd peft \
+    && git checkout tags/v0.6.0 \
+    && pip install --no-cache-dir -e .
+
+RUN git clone https://github.com/pytorch/fairseq \
+    && cd fairseq \
+    && pip install --no-cache-dir --editable ./
+
+RUN git clone https://github.com/ddlBoJack/Wav-Omni.git \
+    && cd Wav-Omni \
+    && pip install --no-cache-dir -e .
+
+ENV SHELL=/bin/bash
+
+WORKDIR /workspace/Wav-Omni
Wav-Omni/README.md
ADDED
@@ -0,0 +1,10 @@
+### Environment setup
+./Wav-Omni/environment.yml
+
+### Training and inference
+bash ./Wav-Omni/train.sh
+bash ./Wav-Omni/infer.sh
+
+Download the Qwen and cosyvoice weights from huggingface in advance, and replace the corresponding paths in the config files.
+
+Datasets can be constructed following the examples given in the demo.
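A minimal sketch of the weight-download step the README asks for, using `snapshot_download` from `huggingface_hub`; the repo ids (`Qwen/Qwen2-0.5B`, `FunAudioLLM/CosyVoice-300M-SFT`) and the local target directories are assumptions to adapt to your own setup:

```python
# Sketch of downloading the Qwen and cosyvoice weights ahead of time.
# Repo ids and local_dir values are assumptions; after downloading, point
# llm_path and codec_decoder_path in the config files at these directories.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="Qwen/Qwen2-0.5B",
                  local_dir="./Wav-Omni/ckpt/Qwen2-0.5B")
snapshot_download(repo_id="FunAudioLLM/CosyVoice-300M-SFT",
                  local_dir="./pretrained_models/CosyVoice-300M-SFT")
```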
Wav-Omni/ckpt/test.py
ADDED
File without changes
Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/config.yaml
ADDED
@@ -0,0 +1,68 @@
+dataset_config:
+  prompt: 'Conduct a spoken conversation with the user. '
+  dataset: speech_dataset_s2s
+  val_data_path: /home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
+  train_data_path: /home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
+  input_type: mel
+  mel_size: 128
+  inference_mode: true
+  manifest_format: jsonl
+  split_size: 0.002
+  load_from_cache_file: false
+  task_type: s2s
+  seed: 777
+  vocab_config:
+    code_layer: 3
+    total_audio_vocabsize: 4160
+    total_vocabsize: 156160
+  code_type: CosyVoice
+  num_latency_tokens: 0
+  do_layershift: false
+model_config:
+  llm_name: qwen2-0.5b
+  llm_path: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B
+  llm_dim: 896
+  encoder_name: whisper
+  encoder_projector_ds_rate: 5
+  encoder_path: large-v3
+  encoder_dim: 1280
+  encoder_projector: linear
+  codec_decoder_path: /home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT
+  codec_decode: true
+  vocab_config:
+    code_layer: 3
+    total_audio_vocabsize: 4160
+    total_vocabsize: 156160
+  code_type: CosyVoice
+  codec_decoder_type: CosyVoice
+  group_decode: true
+  group_decode_adapter_type: linear
+train_config:
+  model_name: s2s
+  freeze_encoder: true
+  freeze_llm: true
+  freeze_encoder_projector: true
+  freeze_group_decode_adapter: true
+  batching_strategy: custom
+  num_epochs: 1
+  val_batch_size: 1
+  num_workers_dataloader: 2
+  task_type: s2s
+decode_config:
+  text_repetition_penalty: 1.2
+  audio_repetition_penalty: 1.2
+  max_new_tokens: 3000
+  task_type: s2s
+  do_sample: false
+  top_p: 1.0
+  top_k: 0
+  temperature: 1.0
+  decode_text_only: false
+  do_layershift: false
+  num_latency_tokens: 0
+decode_log: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy
+ckpt_path: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
+output_text_only: false
+inference_online: false
+speech_sample_rate: 22050
+audio_prompt_path: /home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav
Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,216 @@
+hydra:
+  run:
+    dir: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.run.dir=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2
+    - hydra.mode=RUN
+    task:
+    - ++model_config.llm_name=qwen2-0.5b
+    - ++model_config.llm_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B
+    - ++model_config.llm_dim=896
+    - ++model_config.encoder_name=whisper
+    - ++model_config.encoder_projector_ds_rate=5
+    - ++model_config.encoder_path=large-v3
+    - ++model_config.encoder_dim=1280
+    - ++model_config.encoder_projector=linear
+    - ++model_config.codec_decoder_path=/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT
+    - ++model_config.codec_decode=true
+    - ++model_config.vocab_config.code_layer=3
+    - ++model_config.vocab_config.total_audio_vocabsize=4160
+    - ++model_config.vocab_config.total_vocabsize=156160
+    - ++model_config.code_type=CosyVoice
+    - ++model_config.codec_decoder_type=CosyVoice
+    - ++model_config.group_decode=true
+    - ++model_config.group_decode_adapter_type=linear
+    - ++dataset_config.dataset=speech_dataset_s2s
+    - ++dataset_config.val_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
+    - ++dataset_config.train_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
+    - ++dataset_config.input_type=mel
+    - ++dataset_config.mel_size=128
+    - ++dataset_config.inference_mode=true
+    - ++dataset_config.manifest_format=jsonl
+    - ++dataset_config.split_size=0.002
+    - ++dataset_config.load_from_cache_file=false
+    - ++dataset_config.task_type=s2s
+    - ++dataset_config.seed=777
+    - ++dataset_config.vocab_config.code_layer=3
+    - ++dataset_config.vocab_config.total_audio_vocabsize=4160
+    - ++dataset_config.vocab_config.total_vocabsize=156160
+    - ++dataset_config.code_type=CosyVoice
+    - ++dataset_config.num_latency_tokens=0
+    - ++dataset_config.do_layershift=false
+    - ++train_config.model_name=s2s
+    - ++train_config.freeze_encoder=true
+    - ++train_config.freeze_llm=true
+    - ++train_config.freeze_encoder_projector=true
+    - ++train_config.freeze_group_decode_adapter=true
+    - ++train_config.batching_strategy=custom
+    - ++train_config.num_epochs=1
+    - ++train_config.val_batch_size=1
+    - ++train_config.num_workers_dataloader=2
+    - ++train_config.task_type=s2s
+    - ++decode_config.text_repetition_penalty=1.2
+    - ++decode_config.audio_repetition_penalty=1.2
+    - ++decode_config.max_new_tokens=3000
+    - ++decode_config.task_type=s2s
+    - ++decode_config.do_sample=false
+    - ++decode_config.top_p=1.0
+    - ++decode_config.top_k=0
+    - ++decode_config.temperature=1.0
+    - ++decode_config.decode_text_only=false
+    - ++decode_config.do_layershift=false
+    - ++decode_log=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy
+    - ++decode_config.num_latency_tokens=0
+    - ++ckpt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
+    - ++output_text_only=false
+    - ++inference_online=false
+    - ++speech_sample_rate=22050
+    - ++audio_prompt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav
+  job:
+    name: inference_s2s
+    chdir: null
+    override_dirname: ++audio_prompt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav,++ckpt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt,++dataset_config.code_type=CosyVoice,++dataset_config.dataset=speech_dataset_s2s,++dataset_config.do_layershift=false,++dataset_config.inference_mode=true,++dataset_config.input_type=mel,++dataset_config.load_from_cache_file=false,++dataset_config.manifest_format=jsonl,++dataset_config.mel_size=128,++dataset_config.num_latency_tokens=0,++dataset_config.seed=777,++dataset_config.split_size=0.002,++dataset_config.task_type=s2s,++dataset_config.train_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl,++dataset_config.val_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl,++dataset_config.vocab_config.code_layer=3,++dataset_config.vocab_config.total_audio_vocabsize=4160,++dataset_config.vocab_config.total_vocabsize=156160,++decode_config.audio_repetition_penalty=1.2,++decode_config.decode_text_only=false,++decode_config.do_layershift=false,++decode_config.do_sample=false,++decode_config.max_new_tokens=3000,++decode_config.num_latency_tokens=0,++decode_config.task_type=s2s,++decode_config.temperature=1.0,++decode_config.text_repetition_penalty=1.2,++decode_config.top_k=0,++decode_config.top_p=1.0,++decode_log=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy,++inference_online=false,++model_config.code_type=CosyVoice,++model_config.codec_decode=true,++model_config.codec_decoder_path=/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT,++model_config.codec_decoder_type=CosyVoice,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=large-v3,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.group_decode=true,++model_config.group_decode_adapter_type=linear,++model_config.llm_dim=896,++model_config.llm_name=qwen2-0.5b,++model_config.llm_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B,++model_config.vocab_config.code_layer=3,++model_config.vocab_config.total_audio_vocabsize=4160,++model_config.vocab_config.total_vocabsize=156160,++output_text_only=false,++speech_sample_rate=22050,++train_config.batching_strategy=custom,++train_config.freeze_encoder=true,++train_config.freeze_encoder_projector=true,++train_config.freeze_group_decode_adapter=true,++train_config.freeze_llm=true,++train_config.model_name=s2s,++train_config.num_epochs=1,++train_config.num_workers_dataloader=2,++train_config.task_type=s2s,++train_config.val_batch_size=1
+    id: ???
+    num: ???
+    config_name: prompt.yaml
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.3'
+    cwd: /home/xj_data/jishengpeng/huawei/Wav-Omni
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/conf
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
Wav-Omni/ckpt/wavomni_chinese_3_2/.hydra/overrides.yaml
ADDED
@@ -0,0 +1,61 @@
+- ++model_config.llm_name=qwen2-0.5b
+- ++model_config.llm_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B
+- ++model_config.llm_dim=896
+- ++model_config.encoder_name=whisper
+- ++model_config.encoder_projector_ds_rate=5
+- ++model_config.encoder_path=large-v3
+- ++model_config.encoder_dim=1280
+- ++model_config.encoder_projector=linear
+- ++model_config.codec_decoder_path=/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT
+- ++model_config.codec_decode=true
+- ++model_config.vocab_config.code_layer=3
+- ++model_config.vocab_config.total_audio_vocabsize=4160
+- ++model_config.vocab_config.total_vocabsize=156160
+- ++model_config.code_type=CosyVoice
+- ++model_config.codec_decoder_type=CosyVoice
+- ++model_config.group_decode=true
+- ++model_config.group_decode_adapter_type=linear
+- ++dataset_config.dataset=speech_dataset_s2s
+- ++dataset_config.val_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
+- ++dataset_config.train_data_path=/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
+- ++dataset_config.input_type=mel
+- ++dataset_config.mel_size=128
+- ++dataset_config.inference_mode=true
+- ++dataset_config.manifest_format=jsonl
+- ++dataset_config.split_size=0.002
+- ++dataset_config.load_from_cache_file=false
+- ++dataset_config.task_type=s2s
+- ++dataset_config.seed=777
+- ++dataset_config.vocab_config.code_layer=3
+- ++dataset_config.vocab_config.total_audio_vocabsize=4160
+- ++dataset_config.vocab_config.total_vocabsize=156160
+- ++dataset_config.code_type=CosyVoice
+- ++dataset_config.num_latency_tokens=0
+- ++dataset_config.do_layershift=false
+- ++train_config.model_name=s2s
+- ++train_config.freeze_encoder=true
+- ++train_config.freeze_llm=true
+- ++train_config.freeze_encoder_projector=true
+- ++train_config.freeze_group_decode_adapter=true
+- ++train_config.batching_strategy=custom
+- ++train_config.num_epochs=1
+- ++train_config.val_batch_size=1
+- ++train_config.num_workers_dataloader=2
+- ++train_config.task_type=s2s
+- ++decode_config.text_repetition_penalty=1.2
+- ++decode_config.audio_repetition_penalty=1.2
+- ++decode_config.max_new_tokens=3000
+- ++decode_config.task_type=s2s
+- ++decode_config.do_sample=false
+- ++decode_config.top_p=1.0
+- ++decode_config.top_k=0
+- ++decode_config.temperature=1.0
+- ++decode_config.decode_text_only=false
+- ++decode_config.do_layershift=false
+- ++decode_log=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy
+- ++decode_config.num_latency_tokens=0
+- ++ckpt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
+- ++output_text_only=false
+- ++inference_online=false
+- ++speech_sample_rate=22050
+- ++audio_prompt_path=/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav
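The three `.hydra` files above are the snapshot Hydra writes into each run directory: the composed config, Hydra's own settings, and the raw command-line overrides. A minimal sketch of re-loading that snapshot afterwards with OmegaConf (the relative path assumes you are in the repo root recorded in `hydra.runtime.cwd`):

```python
# Sketch: inspect the composed config Hydra saved alongside this run.
# Adjust the path to wherever the run directory lives on your machine.
from omegaconf import OmegaConf

cfg = OmegaConf.load("ckpt/wavomni_chinese_3_2/.hydra/config.yaml")
print(cfg.model_config.llm_name)         # qwen2-0.5b
print(cfg.decode_config.max_new_tokens)  # 3000
```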
Wav-Omni/ckpt/wavomni_chinese_3_2/inference_s2s.log
ADDED
@@ -0,0 +1,175 @@
+[2025-03-05 13:27:43,011][root][INFO] - train_config: {'model_name': 's2s', 'enable_ddp': False, 'enable_deepspeed': False, 'enable_fsdp': False, 'low_cpu_fsdp': False, 'run_validation': True, 'batch_size_training': 4, 'batching_strategy': 'custom', 'context_length': 4096, 'gradient_accumulation_steps': 1, 'num_epochs': 1, 'num_workers_dataloader': 2, 'warmup_steps': 1000, 'total_steps': 100000, 'validation_interval': 1000, 'lr': 0.0001, 'weight_decay': 0.0, 'gamma': 0.85, 'seed': 42, 'use_fp16': False, 'mixed_precision': True, 'val_batch_size': 1, 'use_peft': False, 'peft_config': {'peft_method': 'lora', 'r': 8, 'lora_alpha': 32, 'target_modules': ['q_proj', 'v_proj'], 'bias': 'none', 'task_type': 'CAUSAL_LM', 'lora_dropout': 0.05, 'inference_mode': False}, 'output_dir': 'PATH/to/save/PEFT/model', 'freeze_layers': False, 'num_freeze_layers': 1, 'quantization': False, 'one_gpu': False, 'save_model': True, 'dist_checkpoint_root_folder': 'PATH/to/save/FSDP/model', 'dist_checkpoint_folder': 'fine-tuned', 'save_optimizer': False, 'use_fast_kernels': False, 'run_test_during_validation': False, 'run_test_during_validation_file': 'test.wav', 'run_test_during_validation_prompt': '<|S2S|>', 'freeze_llm': True, 'freeze_encoder': True, 'train_embed_only': False, 'train_audio_embed_only': False, 'train_embed': False, 'task_type': 's2s', 'freeze_encoder_projector': True, 'freeze_group_decode_adapter': True}
+[2025-03-05 13:27:43,011][root][INFO] - fsdp_config: {'mixed_precision': True, 'use_fp16': False, 'sharding_strategy': 'NO_SHARD', 'checkpoint_type': 'SHARDED_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'pure_bf16': False, 'optimizer': 'AdamW'}
+[2025-03-05 13:27:43,011][root][INFO] - model_config: {'file': 'examples/s2s/model/slam_model_s2s.py:model_factory', 'llm_name': 'qwen2-0.5b', 'llm_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B', 'llm_type': 'decoder_only', 'llm_dim': 896, 'encoder_name': 'whisper', 'encoder_ds_rate': 2, 'encoder_path': 'large-v3', 'encoder_dim': 1280, 'encoder_projector': 'linear', 'encoder_projector_ds_rate': 5, 'modal': 'audio', 'normalize': False, 'encoder_type': 'finetune', 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'codec_decode': True, 'codec_decoder_type': 'CosyVoice', 'codec_decoder_path': '/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT', 'tts_adapter': False, 'tts_adapter_config': {'add_qkv_bias': True, 'bias': False, 'gelu_approximate': None, 'head_size': 64, 'intermediate_size': 4864, 'lm_head_bias': False, 'mlp_class_name': 'GptNeoxMLP', 'n_layer': 6, 'n_head': 14, 'n_embd': 896, 'n_query_groups': 2, 'norm_class_name': 'RMSNorm', 'norm_eps': 1e-06, 'parallel_residual': False, 'rotary_percentage': 1.0, 'shared_attention_norm': False}, 'encoder_path_hf': None, 'code_type': 'CosyVoice', 'group_decode': True, 'group_decode_adapter_type': 'linear', 'whisper_decode': False}
+[2025-03-05 13:28:09,748][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
+[2025-03-05 13:28:09,750][wavomni_llm.utils.train_utils][INFO] - --> whisper has 635.04896 Million params
+
+[2025-03-05 13:28:09,752][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
+[2025-03-05 13:28:09,753][wavomni_llm.utils.train_utils][INFO] - --> whisper has 0.0 Million params
+
+[2025-03-05 13:28:33,421][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
+[2025-03-05 13:28:33,423][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 494.032768 Million params
+
+[2025-03-05 13:28:33,424][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
+[2025-03-05 13:28:33,425][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 0.0 Million params
+
+[2025-03-05 13:28:33,502][wavomni_llm.utils.train_utils][INFO] - --> Module linear
+[2025-03-05 13:28:33,502][wavomni_llm.utils.train_utils][INFO] - --> linear has 14.945152 Million params
+
+[2025-03-05 13:28:46,757][root][INFO] - input frame rate=50
+[2025-03-05 13:28:56,015][wetext-zh_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_tagger.fst
+[2025-03-05 13:28:56,015][wetext-zh_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_verbalizer.fst
+[2025-03-05 13:28:56,015][wetext-zh_normalizer][INFO] - skip building fst for zh_normalizer ...
+[2025-03-05 13:28:56,372][wetext-en_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_tagger.fst
+[2025-03-05 13:28:56,373][wetext-en_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_verbalizer.fst
+[2025-03-05 13:28:56,373][wetext-en_normalizer][INFO] - skip building fst for en_normalizer ...
+[2025-03-05 13:29:46,255][wavomni_llm.utils.train_utils][INFO] - --> Module CosyVoice Codec
+[2025-03-05 13:29:46,260][wavomni_llm.utils.train_utils][INFO] - --> CosyVoice Codec has 104.577327 Million params
+
+[2025-03-05 13:29:46,543][wavomni_llm.utils.train_utils][INFO] - --> Module GroupDecodeAdapter_linear
+[2025-03-05 13:29:46,543][wavomni_llm.utils.train_utils][INFO] - --> GroupDecodeAdapter_linear has 51.92928 Million params
+
+[2025-03-05 13:29:48,085][slam_model_s2s.py][INFO] - Resize llm embedding layer's vocab size to 156160
+
+[2025-03-05 13:29:48,085][slam_model_s2s.py][INFO] - loading other parts from: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
+
+[2025-03-05 13:30:16,442][wavomni_llm.utils.train_utils][INFO] - --> Model s2s
+[2025-03-05 13:30:16,444][wavomni_llm.utils.train_utils][INFO] - --> s2s has 0.0 Million params
+
+[2025-03-05 13:30:17,360][root][INFO] - dataset_config: {'dataset': 'speech_dataset_s2s', 'file': 'examples/s2s/speech_dataset_s2s.py:get_speech_dataset', 'train_data_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'val_data_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'train_split': 'train', 'test_split': 'validation', 'prompt': 'Conduct a spoken conversation with the user. ', 'data_path': None, 'max_words': None, 'max_mel': None, 'fix_length_audio': -1, 'inference_mode': True, 'input_type': 'mel', 'mel_size': 128, 'normalize': False, 'seed': 777, 'manifest_format': 'jsonl', 'split_size': 0.002, 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'load_from_cache_file': False, 'task_type': 's2s', 'upsample_text_tokens': False, 'upsampling_factor': 1, 'upsample_method': 'repeat', 'code_type': 'CosyVoice', 'num_latency_tokens': 0, 'do_layershift': False}
+[2025-03-05 13:30:17,420][root][INFO] - --> Training Set Length = 4
+[2025-03-05 13:30:17,429][root][INFO] - decode_config: {'do_sample': False, 'max_new_tokens': 3000, 'min_length': 10, 'temperature': 1.0, 'top_k': 0, 'top_p': 1.0, 'num_beams': 1, 'num_return_sequences': 1, 'num_samples': 1, 'max_time': 0.0, 'text_repetition_penalty': 1.2, 'audio_repetition_penalty': 1.2, 'length_penalty': 1.0, 'early_stopping': False, 'no_repeat_ngram_size': 0, 'bad_words_ids': [], 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'task_type': 's2s', 'decode_text_only': False, 'streaming': False, 'stream_stride': 4, 'upsampling_factor': 1, 'input_text': False, 'do_layershift': False, 'num_latency_tokens': 0}
+[2025-03-05 13:30:17,429][root][INFO] - Decode Strategy: Greedy
+[2025-03-05 13:30:17,430][root][INFO] - Decode Text & Audio
+[2025-03-05 13:30:17,430][root][INFO] - Decode Code Type: CosyVoice
+[2025-03-05 13:30:17,430][root][INFO] - Decode Code Layer: 3
+[2025-03-05 13:30:17,430][root][INFO] - Tone for Audio Generation: prompt_1
+[2025-03-05 13:30:17,430][root][INFO] - ============== Start s2s Inference ==============
+[2025-03-05 13:30:38,187][root][INFO] - LLM Inference Time: 19.46s
+[2025-03-05 13:30:38,188][root][INFO] - Question: 回答该题:柠檬的科学名称是什么?
+[2025-03-05 13:30:38,188][root][INFO] - Generated Text: 柠檬的科学名称是“柑橘科柠檬”,也被称为“柑橘属柠檬”。柑橘科柠檬是一种常见的水果,其果实通常呈圆锥形或椭圆形,并且含有大量的汁液。它富含维生素C、钾和纤维素等营养成分,对身体健康有很好的保健作用。柑橘科柠檬也被广泛用于烹饪和饮料制作中,被人们认为具有美容养颜功效。
+[2025-03-05 13:30:44,356][root][INFO] - Generated Audio: prompt_1/1.wav, audio length: 25.80s, generation time: 25.60s, RTF: 0.99
+[2025-03-05 13:30:44,356][root][INFO] - LLM RTF: 0.75
+[2025-03-05 13:57:49,762][root][INFO] - train_config: {'model_name': 's2s', 'enable_ddp': False, 'enable_deepspeed': False, 'enable_fsdp': False, 'low_cpu_fsdp': False, 'run_validation': True, 'batch_size_training': 4, 'batching_strategy': 'custom', 'context_length': 4096, 'gradient_accumulation_steps': 1, 'num_epochs': 1, 'num_workers_dataloader': 2, 'warmup_steps': 1000, 'total_steps': 100000, 'validation_interval': 1000, 'lr': 0.0001, 'weight_decay': 0.0, 'gamma': 0.85, 'seed': 42, 'use_fp16': False, 'mixed_precision': True, 'val_batch_size': 1, 'use_peft': False, 'peft_config': {'peft_method': 'lora', 'r': 8, 'lora_alpha': 32, 'target_modules': ['q_proj', 'v_proj'], 'bias': 'none', 'task_type': 'CAUSAL_LM', 'lora_dropout': 0.05, 'inference_mode': False}, 'output_dir': 'PATH/to/save/PEFT/model', 'freeze_layers': False, 'num_freeze_layers': 1, 'quantization': False, 'one_gpu': False, 'save_model': True, 'dist_checkpoint_root_folder': 'PATH/to/save/FSDP/model', 'dist_checkpoint_folder': 'fine-tuned', 'save_optimizer': False, 'use_fast_kernels': False, 'run_test_during_validation': False, 'run_test_during_validation_file': 'test.wav', 'run_test_during_validation_prompt': '<|S2S|>', 'freeze_llm': True, 'freeze_encoder': True, 'train_embed_only': False, 'train_audio_embed_only': False, 'train_embed': False, 'task_type': 's2s', 'freeze_encoder_projector': True, 'freeze_group_decode_adapter': True}
+[2025-03-05 13:57:49,762][root][INFO] - fsdp_config: {'mixed_precision': True, 'use_fp16': False, 'sharding_strategy': 'NO_SHARD', 'checkpoint_type': 'SHARDED_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'pure_bf16': False, 'optimizer': 'AdamW'}
+[2025-03-05 13:57:49,762][root][INFO] - model_config: {'file': 'examples/s2s/model/slam_model_s2s.py:model_factory', 'llm_name': 'qwen2-0.5b', 'llm_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B', 'llm_type': 'decoder_only', 'llm_dim': 896, 'encoder_name': 'whisper', 'encoder_ds_rate': 2, 'encoder_path': 'large-v3', 'encoder_dim': 1280, 'encoder_projector': 'linear', 'encoder_projector_ds_rate': 5, 'modal': 'audio', 'normalize': False, 'encoder_type': 'finetune', 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'codec_decode': True, 'codec_decoder_type': 'CosyVoice', 'codec_decoder_path': '/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT', 'tts_adapter': False, 'tts_adapter_config': {'add_qkv_bias': True, 'bias': False, 'gelu_approximate': None, 'head_size': 64, 'intermediate_size': 4864, 'lm_head_bias': False, 'mlp_class_name': 'GptNeoxMLP', 'n_layer': 6, 'n_head': 14, 'n_embd': 896, 'n_query_groups': 2, 'norm_class_name': 'RMSNorm', 'norm_eps': 1e-06, 'parallel_residual': False, 'rotary_percentage': 1.0, 'shared_attention_norm': False}, 'encoder_path_hf': None, 'code_type': 'CosyVoice', 'group_decode': True, 'group_decode_adapter_type': 'linear', 'whisper_decode': False}
+[2025-03-05 13:58:15,152][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
+[2025-03-05 13:58:15,154][wavomni_llm.utils.train_utils][INFO] - --> whisper has 635.04896 Million params
+
+[2025-03-05 13:58:15,157][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
+[2025-03-05 13:58:15,158][wavomni_llm.utils.train_utils][INFO] - --> whisper has 0.0 Million params
+
+[2025-03-05 13:58:17,223][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
+[2025-03-05 13:58:17,224][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 494.032768 Million params
+
+[2025-03-05 13:58:17,226][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
+[2025-03-05 13:58:17,227][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 0.0 Million params
+
+[2025-03-05 13:58:17,322][wavomni_llm.utils.train_utils][INFO] - --> Module linear
+[2025-03-05 13:58:17,322][wavomni_llm.utils.train_utils][INFO] - --> linear has 14.945152 Million params
+
+[2025-03-05 13:58:27,777][root][INFO] - input frame rate=50
+[2025-03-05 13:58:30,984][wetext-zh_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_tagger.fst
+[2025-03-05 13:58:30,984][wetext-zh_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_verbalizer.fst
+[2025-03-05 13:58:30,984][wetext-zh_normalizer][INFO] - skip building fst for zh_normalizer ...
+[2025-03-05 13:58:31,355][wetext-en_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_tagger.fst
+[2025-03-05 13:58:31,355][wetext-en_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_verbalizer.fst
+[2025-03-05 13:58:31,355][wetext-en_normalizer][INFO] - skip building fst for en_normalizer ...
+[2025-03-05 13:58:35,642][wavomni_llm.utils.train_utils][INFO] - --> Module CosyVoice Codec
+[2025-03-05 13:58:35,647][wavomni_llm.utils.train_utils][INFO] - --> CosyVoice Codec has 104.577327 Million params
+
+[2025-03-05 13:58:35,932][wavomni_llm.utils.train_utils][INFO] - --> Module GroupDecodeAdapter_linear
+[2025-03-05 13:58:35,932][wavomni_llm.utils.train_utils][INFO] - --> GroupDecodeAdapter_linear has 51.92928 Million params
+
+[2025-03-05 13:58:37,463][slam_model_s2s.py][INFO] - Resize llm embedding layer's vocab size to 156160
+
+[2025-03-05 13:58:37,463][slam_model_s2s.py][INFO] - loading other parts from: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
+
+[2025-03-05 13:58:38,622][wavomni_llm.utils.train_utils][INFO] - --> Model s2s
+[2025-03-05 13:58:38,625][wavomni_llm.utils.train_utils][INFO] - --> s2s has 0.0 Million params
+
+[2025-03-05 13:58:39,524][root][INFO] - dataset_config: {'dataset': 'speech_dataset_s2s', 'file': 'examples/s2s/speech_dataset_s2s.py:get_speech_dataset', 'train_data_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'val_data_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'train_split': 'train', 'test_split': 'validation', 'prompt': 'Conduct a spoken conversation with the user. ', 'data_path': None, 'max_words': None, 'max_mel': None, 'fix_length_audio': -1, 'inference_mode': True, 'input_type': 'mel', 'mel_size': 128, 'normalize': False, 'seed': 777, 'manifest_format': 'jsonl', 'split_size': 0.002, 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'load_from_cache_file': False, 'task_type': 's2s', 'upsample_text_tokens': False, 'upsampling_factor': 1, 'upsample_method': 'repeat', 'code_type': 'CosyVoice', 'num_latency_tokens': 0, 'do_layershift': False}
+[2025-03-05 13:58:39,527][root][INFO] - --> Training Set Length = 4
+[2025-03-05 13:58:39,530][root][INFO] - decode_config: {'do_sample': False, 'max_new_tokens': 3000, 'min_length': 10, 'temperature': 1.0, 'top_k': 0, 'top_p': 1.0, 'num_beams': 1, 'num_return_sequences': 1, 'num_samples': 1, 'max_time': 0.0, 'text_repetition_penalty': 1.2, 'audio_repetition_penalty': 1.2, 'length_penalty': 1.0, 'early_stopping': False, 'no_repeat_ngram_size': 0, 'bad_words_ids': [], 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'task_type': 's2s', 'decode_text_only': False, 'streaming': False, 'stream_stride': 4, 'upsampling_factor': 1, 'input_text': False, 'do_layershift': False, 'num_latency_tokens': 0}
+[2025-03-05 13:58:39,530][root][INFO] - Decode Strategy: Greedy
+[2025-03-05 13:58:39,530][root][INFO] - Decode Text & Audio
+[2025-03-05 13:58:39,530][root][INFO] - Decode Code Type: CosyVoice
+[2025-03-05 13:58:39,530][root][INFO] - Decode Code Layer: 3
+[2025-03-05 13:58:39,530][root][INFO] - Tone for Audio Generation: prompt_1
+[2025-03-05 13:58:39,530][root][INFO] - ============== Start s2s Inference ==============
+[2025-03-05 14:00:42,800][root][INFO] - train_config: {'model_name': 's2s', 'enable_ddp': False, 'enable_deepspeed': False, 'enable_fsdp': False, 'low_cpu_fsdp': False, 'run_validation': True, 'batch_size_training': 4, 'batching_strategy': 'custom', 'context_length': 4096, 'gradient_accumulation_steps': 1, 'num_epochs': 1, 'num_workers_dataloader': 2, 'warmup_steps': 1000, 'total_steps': 100000, 'validation_interval': 1000, 'lr': 0.0001, 'weight_decay': 0.0, 'gamma': 0.85, 'seed': 42, 'use_fp16': False, 'mixed_precision': True, 'val_batch_size': 1, 'use_peft': False, 'peft_config': {'peft_method': 'lora', 'r': 8, 'lora_alpha': 32, 'target_modules': ['q_proj', 'v_proj'], 'bias': 'none', 'task_type': 'CAUSAL_LM', 'lora_dropout': 0.05, 'inference_mode': False}, 'output_dir': 'PATH/to/save/PEFT/model', 'freeze_layers': False, 'num_freeze_layers': 1, 'quantization': False, 'one_gpu': False, 'save_model': True, 'dist_checkpoint_root_folder': 'PATH/to/save/FSDP/model', 'dist_checkpoint_folder': 'fine-tuned', 'save_optimizer': False, 'use_fast_kernels': False, 'run_test_during_validation': False, 'run_test_during_validation_file': 'test.wav', 'run_test_during_validation_prompt': '<|S2S|>', 'freeze_llm': True, 'freeze_encoder': True, 'train_embed_only': False, 'train_audio_embed_only': False, 'train_embed': False, 'task_type': 's2s', 'freeze_encoder_projector': True, 'freeze_group_decode_adapter': True}
+[2025-03-05 14:00:42,800][root][INFO] - fsdp_config: {'mixed_precision': True, 'use_fp16': False, 'sharding_strategy': 'NO_SHARD', 'checkpoint_type': 'SHARDED_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'pure_bf16': False, 'optimizer': 'AdamW'}
+[2025-03-05 14:00:42,800][root][INFO] - model_config: {'file': 'examples/s2s/model/slam_model_s2s.py:model_factory', 'llm_name': 'qwen2-0.5b', 'llm_path': '/home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/Qwen2-0.5B', 'llm_type': 'decoder_only', 'llm_dim': 896, 'encoder_name': 'whisper', 'encoder_ds_rate': 2, 'encoder_path': 'large-v3', 'encoder_dim': 1280, 'encoder_projector': 'linear', 'encoder_projector_ds_rate': 5, 'modal': 'audio', 'normalize': False, 'encoder_type': 'finetune', 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'codec_decode': True, 'codec_decoder_type': 'CosyVoice', 'codec_decoder_path': '/home/xj_data/jishengpeng/CosyVoice/pretrained_models/CosyVoice-300M-SFT', 'tts_adapter': False, 'tts_adapter_config': {'add_qkv_bias': True, 'bias': False, 'gelu_approximate': None, 'head_size': 64, 'intermediate_size': 4864, 'lm_head_bias': False, 'mlp_class_name': 'GptNeoxMLP', 'n_layer': 6, 'n_head': 14, 'n_embd': 896, 'n_query_groups': 2, 'norm_class_name': 'RMSNorm', 'norm_eps': 1e-06, 'parallel_residual': False, 'rotary_percentage': 1.0, 'shared_attention_norm': False}, 'encoder_path_hf': None, 'code_type': 'CosyVoice', 'group_decode': True, 'group_decode_adapter_type': 'linear', 'whisper_decode': False}
+[2025-03-05 14:01:06,313][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
+[2025-03-05 14:01:06,315][wavomni_llm.utils.train_utils][INFO] - --> whisper has 635.04896 Million params
+
+[2025-03-05 14:01:06,316][wavomni_llm.utils.train_utils][INFO] - --> Module whisper
+[2025-03-05 14:01:06,317][wavomni_llm.utils.train_utils][INFO] - --> whisper has 0.0 Million params
+
+[2025-03-05 14:01:08,196][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
+[2025-03-05 14:01:08,197][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 494.032768 Million params
+
+[2025-03-05 14:01:08,198][wavomni_llm.utils.train_utils][INFO] - --> Module qwen2-0.5b
+[2025-03-05 14:01:08,199][wavomni_llm.utils.train_utils][INFO] - --> qwen2-0.5b has 0.0 Million params
+
+[2025-03-05 14:01:08,274][wavomni_llm.utils.train_utils][INFO] - --> Module linear
+[2025-03-05 14:01:08,274][wavomni_llm.utils.train_utils][INFO] - --> linear has 14.945152 Million params
+
+[2025-03-05 14:01:17,364][root][INFO] - input frame rate=50
+[2025-03-05 14:01:20,258][wetext-zh_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_tagger.fst
+[2025-03-05 14:01:20,258][wetext-zh_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_verbalizer.fst
+[2025-03-05 14:01:20,258][wetext-zh_normalizer][INFO] - skip building fst for zh_normalizer ...
+[2025-03-05 14:01:20,643][wetext-en_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_tagger.fst
+[2025-03-05 14:01:20,643][wetext-en_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_verbalizer.fst
+[2025-03-05 14:01:20,643][wetext-en_normalizer][INFO] - skip building fst for en_normalizer ...
+[2025-03-05 14:01:24,558][wavomni_llm.utils.train_utils][INFO] - --> Module CosyVoice Codec
+[2025-03-05 14:01:24,563][wavomni_llm.utils.train_utils][INFO] - --> CosyVoice Codec has 104.577327 Million params
+
+[2025-03-05 14:01:24,850][wavomni_llm.utils.train_utils][INFO] - --> Module GroupDecodeAdapter_linear
+[2025-03-05 14:01:24,850][wavomni_llm.utils.train_utils][INFO] - --> GroupDecodeAdapter_linear has 51.92928 Million params
+
+[2025-03-05 14:01:26,392][slam_model_s2s.py][INFO] - Resize llm embedding layer's vocab size to 156160
+
+[2025-03-05 14:01:26,393][slam_model_s2s.py][INFO] - loading other parts from: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
+
+[2025-03-05 14:01:27,554][wavomni_llm.utils.train_utils][INFO] - --> Model s2s
+[2025-03-05 14:01:27,556][wavomni_llm.utils.train_utils][INFO] - --> s2s has 0.0 Million params
+
+[2025-03-05 14:01:28,452][root][INFO] - dataset_config: {'dataset': 'speech_dataset_s2s', 'file': 'examples/s2s/speech_dataset_s2s.py:get_speech_dataset', 'train_data_path': '/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'val_data_path': '/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'train_split': 'train', 'test_split': 'validation', 'prompt': 'Conduct a spoken conversation with the user. ', 'data_path': None, 'max_words': None, 'max_mel': None, 'fix_length_audio': -1, 'inference_mode': True, 'input_type': 'mel', 'mel_size': 128, 'normalize': False, 'seed': 777, 'manifest_format': 'jsonl', 'split_size': 0.002, 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'load_from_cache_file': False, 'task_type': 's2s', 'upsample_text_tokens': False, 'upsampling_factor': 1, 'upsample_method': 'repeat', 'code_type': 'CosyVoice', 'num_latency_tokens': 0, 'do_layershift': False}
118 |
+
[2025-03-05 14:01:17,364][root][INFO] - input frame rate=50
|
119 |
+
[2025-03-05 14:01:20,258][wetext-zh_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_tagger.fst
|
120 |
+
[2025-03-05 14:01:20,258][wetext-zh_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/zh_tn_verbalizer.fst
|
121 |
+
[2025-03-05 14:01:20,258][wetext-zh_normalizer][INFO] - skip building fst for zh_normalizer ...
|
122 |
+
[2025-03-05 14:01:20,643][wetext-en_normalizer][INFO] - found existing fst: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_tagger.fst
|
123 |
+
[2025-03-05 14:01:20,643][wetext-en_normalizer][INFO] - /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni/lib/python3.9/site-packages/tn/en_tn_verbalizer.fst
|
124 |
+
[2025-03-05 14:01:20,643][wetext-en_normalizer][INFO] - skip building fst for en_normalizer ...
|
125 |
+
[2025-03-05 14:01:24,558][wavomni_llm.utils.train_utils][INFO] - --> Module CosyVoice Codec
|
126 |
+
[2025-03-05 14:01:24,563][wavomni_llm.utils.train_utils][INFO] - --> CosyVoice Codec has 104.577327 Million params
|
127 |
+
|
128 |
+
[2025-03-05 14:01:24,850][wavomni_llm.utils.train_utils][INFO] - --> Module GroupDecodeAdapter_linear
|
129 |
+
[2025-03-05 14:01:24,850][wavomni_llm.utils.train_utils][INFO] - --> GroupDecodeAdapter_linear has 51.92928 Million params
|
130 |
+
|
131 |
+
[2025-03-05 14:01:26,392][slam_model_s2s.py][INFO] - Resize llm embedding layer's vocab size to 156160
|
132 |
+
|
133 |
+
[2025-03-05 14:01:26,393][slam_model_s2s.py][INFO] - loading other parts from: /home/xj_data/jishengpeng/huawei/Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
|
134 |
+
|
135 |
+
[2025-03-05 14:01:27,554][wavomni_llm.utils.train_utils][INFO] - --> Model s2s
|
136 |
+
[2025-03-05 14:01:27,556][wavomni_llm.utils.train_utils][INFO] - --> s2s has 0.0 Million params
|
137 |
+
|
138 |
+
[2025-03-05 14:01:28,452][root][INFO] - dataset_config: {'dataset': 'speech_dataset_s2s', 'file': 'examples/s2s/speech_dataset_s2s.py:get_speech_dataset', 'train_data_path': '/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'val_data_path': '/home/xj_data/jishengpeng/Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl', 'train_split': 'train', 'test_split': 'validation', 'prompt': 'Conduct a spoken conversation with the user. ', 'data_path': None, 'max_words': None, 'max_mel': None, 'fix_length_audio': -1, 'inference_mode': True, 'input_type': 'mel', 'mel_size': 128, 'normalize': False, 'seed': 777, 'manifest_format': 'jsonl', 'split_size': 0.002, 'vocab_config': {'text_vocabsize': 151936, 'text_specialtokens': 64, 'audio_vocabsize': 4096, 'audio_specialtokens': 64, 'code_layer': 3, 'padded_text_vocabsize': 152000, 'padded_audio_vocabsize': 4160, 'total_audio_vocabsize': 4160, 'total_vocabsize': 156160, 'eot': 151936, 'pad_t': 151937, 'input_t': 151938, 'answer_t': 151939, 'asr': 151940, 'eoa': 4096, 'pad_a': 4097, 'input_a': 4098, 'answer_a': 4099, 'split': 4100}, 'load_from_cache_file': False, 'task_type': 's2s', 'upsample_text_tokens': False, 'upsampling_factor': 1, 'upsample_method': 'repeat', 'code_type': 'CosyVoice', 'num_latency_tokens': 0, 'do_layershift': False}
|
139 |
+
[2025-03-05 14:01:28,499][root][INFO] - --> Training Set Length = 4
|
140 |
+
[2025-03-05 14:01:28,502][root][INFO] - decode_config: {'do_sample': False, 'max_new_tokens': 3000, 'min_length': 10, 'temperature': 1.0, 'top_k': 0, 'top_p': 1.0, 'num_beams': 1, 'num_return_sequences': 1, 'num_samples': 1, 'max_time': 0.0, 'text_repetition_penalty': 1.2, 'audio_repetition_penalty': 1.2, 'length_penalty': 1.0, 'early_stopping': False, 'no_repeat_ngram_size': 0, 'bad_words_ids': [], 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'task_type': 's2s', 'decode_text_only': False, 'streaming': False, 'stream_stride': 4, 'upsampling_factor': 1, 'input_text': False, 'do_layershift': False, 'num_latency_tokens': 0}
|
141 |
+
[2025-03-05 14:01:28,502][root][INFO] - Decode Strategy: Greedy
|
142 |
+
[2025-03-05 14:01:28,503][root][INFO] - Decode Text & Audio
|
143 |
+
[2025-03-05 14:01:28,503][root][INFO] - Decode Code Type: CosyVoice
|
144 |
+
[2025-03-05 14:01:28,503][root][INFO] - Decode Code Layer: 3
|
145 |
+
[2025-03-05 14:01:28,503][root][INFO] - Tone for Audio Generation: prompt_1
|
146 |
+
[2025-03-05 14:01:28,503][root][INFO] - ============== Start s2s Inference ==============
|
147 |
+
[2025-03-05 14:01:49,477][root][INFO] - LLM Inference Time: 19.84s
|
148 |
+
[2025-03-05 14:01:49,478][root][INFO] - Question: 回答该题:柠檬的科学名称是什么?
|
149 |
+
[2025-03-05 14:01:49,478][root][INFO] - Generated Text: 柠檬的科学名称是“柑橘科柠檬”,也被称为“柑橘属柠檬”。柑橘科柠檬是一种常见的水果,其果实通常呈圆锥形或椭圆形,并且含有大量的汁液。它富含维生素C、钾和纤维素等营养成分,对身体健康有很好的保健作用。柑橘科柠檬也被广泛用于烹饪和饮料制作中,被人们认为具有美容养颜功效。
|
150 |
+
[2025-03-05 14:01:55,437][root][INFO] - Generated Audio: prompt_1/1.wav, audio length: 25.80s, generation time: 25.76s, RTF: 1.00
|
151 |
+
[2025-03-05 14:01:55,437][root][INFO] - LLM RTF: 0.77
|
152 |
+
[2025-03-05 14:02:07,906][root][INFO] - LLM Inference Time: 12.47s
|
153 |
+
[2025-03-05 14:02:07,906][root][INFO] - Question: 请提出五种减少塑料垃圾的方法。
|
154 |
+
[2025-03-05 14:02:07,906][root][INFO] - Generated Text: 1. 使用可重复使用的购物袋和水瓶
|
155 |
+
2. 减少使用一次性塑料制品,如塑料餐具、瓶子等
|
156 |
+
3. 避免购买包装过多的食品,选择天然食材制作的食物
|
157 |
+
4. 选择购买环保型产品,如有机纸袋、玻璃杯等
|
158 |
+
5. 在家中安装玻璃窗或窗户,减少室内空气流通。
|
159 |
+
[2025-03-05 14:02:16,500][root][INFO] - Generated Audio: prompt_1/2.wav, audio length: 23.09s, generation time: 21.04s, RTF: 0.91
|
160 |
+
[2025-03-05 14:02:16,500][root][INFO] - LLM RTF: 0.54
|
161 |
+
[2025-03-05 14:02:30,030][root][INFO] - LLM Inference Time: 13.53s
|
162 |
+
[2025-03-05 14:02:30,030][root][INFO] - Question: 小李的薪水是每月8000元,他每个月需要交400元的养老保险和200元的医疗保险,请问他每个月实际能拿到的薪水是多少?
|
163 |
+
[2025-03-05 14:02:30,030][root][INFO] - Generated Text: 小李每月实际能拿到的薪水是8000元+400元+200元=16000元。
|
164 |
+
解题过程:
|
165 |
+
1. 小李每个月需要交的养老保险和医疗保健的金额之和为 16000元
|
166 |
+
2. 计算小李每月实际能拿到的薪水,即 16000元 - 15000元 = 15000元
|
167 |
+
因此,小李每月实际能拿到的薪水是15000元。
|
168 |
+
[2025-03-05 14:02:38,316][root][INFO] - Generated Audio: prompt_1/3.wav, audio length: 25.14s, generation time: 21.79s, RTF: 0.87
|
169 |
+
[2025-03-05 14:02:38,316][root][INFO] - LLM RTF: 0.54
|
170 |
+
[2025-03-05 14:02:48,677][root][INFO] - LLM Inference Time: 10.36s
|
171 |
+
[2025-03-05 14:02:48,677][root][INFO] - Question: 你好,我现在是浙江大学计算机二年级的硕士生,攻读语音方向,你能给我推荐一些学习的建议吗?
|
172 |
+
[2025-03-05 14:02:48,677][root][INFO] - Generated Text: 当然可以。首先,建议你多读一些经典的数学书籍,例如《数学原理》、《代数与几何》等。其次,要注重练习和思考,可以通过实践项目或者项目来巩固知识。最后,不要忘记在学习过程中保持耐心和积极性,因为只有通过不断的努力才能取得好的成果。
|
173 |
+
[2025-03-05 14:02:57,149][root][INFO] - Generated Audio: prompt_1/4.wav, audio length: 19.41s, generation time: 18.80s, RTF: 0.97
|
174 |
+
[2025-03-05 14:02:57,150][root][INFO] - LLM RTF: 0.53
|
175 |
+
[2025-03-05 14:02:57,379][root][INFO] - ============== Inference Finished ==============
|
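The RTF figures in the log are simple ratios: processing time divided by the length of the generated audio. For the first sample above, 25.76 / 25.80 ≈ 1.00 for end-to-end generation and 19.84 / 25.80 ≈ 0.77 for the LLM alone. A minimal sketch of that arithmetic (the helper is ours for illustration, not a function from the repo):

```python
# Sketch: reproduce the RTF numbers reported in inference_s2s.log above.
# `rtf` is a hypothetical helper, not part of the Wav-Omni codebase.
def rtf(processing_time_s: float, audio_length_s: float) -> float:
    """Real-time factor: seconds of compute per second of generated audio."""
    return processing_time_s / audio_length_s

# First sample in the log: 25.80 s of generated audio.
print(f"overall RTF: {rtf(25.76, 25.80):.2f}")  # 1.00
print(f"LLM RTF:     {rtf(19.84, 25.80):.2f}")  # 0.77
```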
Wav-Omni/ckpt/wavomni_chinese_3_2/model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dcd393406fe8e519802f73520504470f109a67519a3ab5d5442274793cace3bc
size 2258846794
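model.pt is stored as a Git LFS pointer rather than as raw weights: the three lines above (spec version, sha256 oid, byte size) are the entire file, and `git lfs pull` resolves them to the ~2.3 GB checkpoint. The `.wav` entries below use the same pointer format. A minimal parser sketch, assuming only this standard three-field layout:

```python
# Sketch: parse a Git LFS pointer file (like model.pt above) into its fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:dcd393406fe8e519802f73520504470f109a67519a3ab5d5442274793cace3bc
size 2258846794"""

info = parse_lfs_pointer(pointer)
print(info["oid"])                    # sha256:dcd39340...
print(int(info["size"]) / 1e9, "GB")  # ~2.26 GB
```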
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/gt_text
ADDED
@@ -0,0 +1,4 @@
1 柠檬的科学名称为Citrus limon,属于芸香科柑橘属的常绿小乔木或灌木植物。它起源于亚洲南部,并在欧洲和地中海地区以及美洲热带地区广泛栽培。柠檬树可以在强烈的阳光下生长,所以它经常被栽种在阳光充足的地方,如地中海气候区。柠檬的果实是圆锥形的,皮表面略带纵行皱纹,果肉酸爽,用于制作饮料和调味料等。柠檬也是一种富含维生素C和抗氧化剂的水果,对保持健康有很大的益处。
2 1. 减少使用塑料制品,转而使用可再生或可分解材料制品。2. 训练良好的垃圾分类习惯,将可回收和可分解的塑料垃圾分别分类处理。3. 限制超市、商场,以及快餐店等商家使用一次性塑料袋、餐具和纸杯等,鼓励折叠包与可重复使用的餐具。4. 呼吁政府出台相关的环境保护法律法规,限制塑料垃圾排放,处罚违规行为。5. 开展垃圾分类教育活动,吸引公众关注塑料垃圾减量,培养环保意识,减少不必要的消费行为
3 小李每个月的薪水是8000元,需要交400元的养老保险和200元的医疗保险。因此,他实际能拿到的薪水是多少呢?我们可以用一个简单的计算公式来计算:实际薪水 = 总薪水 - 养老保险 - 医疗保险。将小李每月的总薪水8000元代入公式,养老保险400元和医疗保险200元分别代入得:实际薪水 = 8000 - 400 - 200 = 7400元,所以小李每个月实际能拿到的薪水是7400元。
4 没有标准回答,请随意发挥。
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/1.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e6a88562a43f4076affb94dada09d329021ecd7ad041bb2b1b67c22fc8fb3345
size 1137708
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/2.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a94d7a467901758c23a0fc998b44542b7f46f521ce2cb05fe36a1cdc485c9f04
size 1018412
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/3.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6e7f3a3d1c9cf1da6280012358379bcb0af32d7d2f4556fcd19de66b114a3676
size 1108524
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_audio/prompt_1/4.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:101fc7477936e0d7ab017d8b2ac78837a0f539a814bdd7df40004e6d3d974eb8
size 856108
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/pred_text
ADDED
@@ -0,0 +1,12 @@
1 柠檬的科学名称是“柑橘科柠檬”,也被称为“柑橘属柠檬”。柑橘科柠檬是一种常见的水果,其果实通常呈圆锥形或椭圆形,并且含有大量的汁液。它富含维生素C、钾和纤维素等营养成分,对身体健康有很好的保健作用。柑橘科柠檬也被广泛用于烹饪和饮料制作中,被人们认为具有美容养颜功效。
2 1. 使用可重复使用的购物袋和水瓶
2. 减少使用一次性塑料制品,如塑料餐具、瓶子等
3. 避免购买包装过多的食品,选择天然食材制作的食物
4. 选择购买环保型产品,如有机纸袋、玻璃杯等
5. 在家中安装玻璃窗或窗户,减少室内空气流通。
3 小李每月实际能拿到的薪水是8000元+400元+200元=16000元。
解题过程:
1. 小李每个月需要交的养老保险和医疗保健的金额之和为 16000元
2. 计算小李每月实际能拿到的薪水,即 16000元 - 15000元 = 15000元
因此,小李每月实际能拿到的薪水是15000元。
4 当然可以。首先,建议你多读一些经典的数学书籍,例如《数学原理》、《代数与几何》等。其次,要注重练习和思考,可以通过实践项目或者项目来巩固知识。最后,不要忘记在学习过程中保持耐心和积极性,因为只有通过不断的努力才能取得好的成果。
Wav-Omni/ckpt/wavomni_chinese_3_2/s2s_decode_jsp_trp1.2_arp1.2_seed777_greedy/question_text
ADDED
@@ -0,0 +1,4 @@
1 回答该题:柠檬的科学名称是什么?
2 请提出五种减少塑料垃圾的方法。
3 小李的薪水是每月8000元,他每个月需要交400元的养老保险和200元的医疗保险,请问他每个月实际能拿到的薪水是多少?
4 你好,我现在是浙江大学计算机二年级的硕士生,攻读语音方向,你能给我推荐一些学习的建议吗?
Wav-Omni/environment.yml
ADDED
@@ -0,0 +1,243 @@
name: wavomni
channels:
  - defaults
  - https://repo.anaconda.com/pkgs/main
  - https://repo.anaconda.com/pkgs/r
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - ca-certificates=2024.12.31=h06a4308_0
  - ld_impl_linux-64=2.40=h12ee557_0
  - libffi=3.4.4=h6a678d5_1
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.15=h5eee18b_0
  - pip=25.0=py39h06a4308_0
  - python=3.9.21=he870216_1
  - readline=8.2=h5eee18b_0
  - setuptools=75.8.0=py39h06a4308_0
  - sqlite=3.45.3=h5eee18b_0
  - tk=8.6.14=h39e8969_0
  - wheel=0.45.1=py39h06a4308_0
  - xz=5.4.6=h5eee18b_1
  - zlib=1.2.13=h5eee18b_1
  - pip:
    - accelerate==1.3.0
    - addict==2.4.0
    - aiofiles==23.2.1
    - aiohappyeyeballs==2.4.4
    - aiohttp==3.11.12
    - aiosignal==1.3.2
    - aliyun-python-sdk-core==2.16.0
    - aliyun-python-sdk-kms==2.16.5
    - annotated-types==0.7.0
    - antlr4-python3-runtime==4.9.3
    - anyio==4.8.0
    - appdirs==1.4.4
    - asttokens==3.0.0
    - async-timeout==5.0.1
    - attrs==25.1.0
    - audioread==3.0.1
    - beautifulsoup4==4.13.3
    - bitsandbytes==0.45.2
    - black==25.1.0
    - brotli==1.1.0
    - certifi==2025.1.31
    - cffi==1.17.1
    - charset-normalizer==3.4.1
    - click==8.1.8
    - cmake==3.25.0
    - coloredlogs==15.0.1
    - conformer==0.3.2
    - contourpy==1.3.0
    - crcmod==1.7
    - cryptography==44.0.0
    - cycler==0.12.1
    - cython==3.0.11
    - datasets==2.18.0
    - decorator==5.1.1
    - deepspeed==0.14.2
    - diffusers==0.27.2
    - dill==0.3.8
    - docker-pycreds==0.4.0
    - einops==0.8.0
    - eval-type-backport==0.2.2
    - exceptiongroup==1.2.2
    - executing==2.2.0
    - fastapi==0.115.8
    - ffmpy==0.5.0
    - filelock==3.17.0
    - fire==0.7.0
    - flatbuffers==25.1.24
    - fonttools==4.55.8
    - frozenlist==1.5.0
    - fsspec==2024.2.0
    - gast==0.6.0
    - gdown==5.2.0
    - gitdb==4.0.12
    - gitpython==3.1.44
    - gradio==4.44.1
    - gradio-client==1.3.0
    - grpcio==1.57.0
    - grpcio-tools==1.57.0
    - h11==0.14.0
    - hjson==3.1.0
    - httpcore==1.0.7
    - httpx==0.28.1
    - huggingface-hub==0.25.2
    - humanfriendly==10.0
    - hydra-core==1.3.2
    - hyperpyyaml==1.2.2
    - idna==3.10
    - importlib-metadata==8.6.1
    - importlib-resources==6.5.2
    - inflate64==1.0.1
    - inflect==7.3.1
    - ipython==8.18.1
    - jedi==0.19.2
    - jinja2==3.1.5
    - jmespath==0.10.0
    - joblib==1.4.2
    - kiwisolver==1.4.7
    - lazy-loader==0.4
    - librosa==0.10.2.post1
    - lightning==2.2.4
    - lightning-utilities==0.12.0
    - lit==15.0.7
    - llvmlite==0.43.0
    - loralib==0.1.2
    - markdown-it-py==3.0.0
    - markupsafe==2.1.5
    - matplotlib==3.7.5
    - matplotlib-inline==0.1.7
    - mdurl==0.1.2
    - modelscope==1.15.0
    - more-itertools==10.6.0
    - mpmath==1.3.0
    - msgpack==1.1.0
    - multidict==6.1.0
    - multiprocess==0.70.16
    - multivolumefile==0.2.3
    - mypy-extensions==1.0.0
    - networkx==3.2.1
    - ninja==1.11.1.3
    - numba==0.60.0
    - numpy==1.26.4
    - nvidia-cublas-cu12==12.1.3.1
    - nvidia-cuda-cupti-cu12==12.1.105
    - nvidia-cuda-nvrtc-cu12==12.1.105
    - nvidia-cuda-runtime-cu12==12.1.105
    - nvidia-cudnn-cu12==8.9.2.26
    - nvidia-cufft-cu12==11.0.2.54
    - nvidia-curand-cu12==10.3.2.106
    - nvidia-cusolver-cu12==11.4.5.107
    - nvidia-cusparse-cu12==12.1.0.106
    - nvidia-cusparselt-cu12==0.6.2
    - nvidia-ml-py==12.570.86
    - nvidia-nccl-cu12==2.20.5
    - nvidia-nvjitlink-cu12==12.4.127
    - nvidia-nvtx-cu12==12.1.105
    - omegaconf==2.3.0
    - onnxruntime-gpu==1.18.0
    - openai-whisper==20231117
    - optimum==1.24.0
    - orjson==3.10.15
    - oss2==2.19.1
    - packaging==24.2
    - pandas==2.2.3
    - parso==0.8.4
    - pathspec==0.12.1
    - peft==0.6.0
    - pexpect==4.9.0
    - pillow==10.4.0
    - platformdirs==4.3.6
    - pooch==1.8.2
    - prompt-toolkit==3.0.50
    - propcache==0.2.1
    - protobuf==4.25.6
    - psutil==6.1.1
    - ptyprocess==0.7.0
    - pure-eval==0.2.3
    - py-cpuinfo==9.0.0
    - py7zr==0.22.0
    - pyarrow==19.0.0
    - pyarrow-hotfix==0.6
    - pybcj==1.0.3
    - pycparser==2.22
    - pycryptodome==3.21.0
    - pycryptodomex==3.21.0
    - pydantic==2.10.6
    - pydantic-core==2.27.2
    - pydub==0.25.1
    - pygments==2.19.1
    - pynini==2.1.5
    - pynvml==12.0.0
    - pyparsing==3.2.1
    - pyppmd==1.1.1
    - pysocks==1.7.1
    - python-dateutil==2.9.0.post0
    - python-multipart==0.0.20
    - pytorch-lightning==2.5.0.post0
    - pytz==2025.1
    - pyyaml==6.0.2
    - pyzstd==0.16.2
    - regex==2024.11.6
    - requests==2.32.3
    - rich==13.9.4
    - ruamel-yaml==0.18.10
    - ruamel-yaml-clib==0.2.12
    - ruff==0.9.5
    - safetensors==0.5.2
    - scikit-learn==1.6.1
    - scipy==1.13.1
    - semantic-version==2.10.0
    - sentencepiece==0.2.0
    - sentry-sdk==2.20.0
    - setproctitle==1.3.4
    - shellingham==1.5.4
    - simplejson==3.19.3
    - six==1.17.0
    - slam-llm==0.0.1
    - smmap==5.0.2
    - sniffio==1.3.1
    - sortedcontainers==2.4.0
    - soundfile==0.13.1
    - soupsieve==2.6
    - soxr==0.5.0.post1
    - stack-data==0.6.3
    - starlette==0.45.3
    - sympy==1.13.1
    - termcolor==2.5.0
    - texttable==1.7.0
    - threadpoolctl==3.5.0
    - tiktoken==0.8.0
    - tokenize-rt==6.1.0
    - tokenizers==0.19.1
    - tomli==2.2.1
    - tomlkit==0.12.0
    - torch==2.3.0
    - torchaudio==2.3.0
    - torchmetrics==1.6.1
    - torchvision==0.15.2+cu118
    - tqdm==4.67.1
    - traitlets==5.14.3
    - transformers==4.43.4
    - triton==2.3.0
    - typeguard==4.4.1
    - typer==0.15.1
    - typing-extensions==4.12.2
    - tzdata==2025.1
    - urllib3==2.3.0
    - uvicorn==0.34.0
    - wandb==0.19.6
    - wcwidth==0.2.13
    - websockets==12.0
    - wetextprocessing==1.0.3
    - wget==3.2
    - xxhash==3.5.0
    - yapf==0.43.0
    - yarl==1.18.3
    - zipp==3.21.0
prefix: /cpfs01/shared/jishengpeng/miniconda3/envs/wavomni
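Every pip dependency above is pinned to an exact version, so an environment created from this file (typically with `conda env create -f environment.yml`) can be spot-checked for drift using the standard library alone. A small sketch; the three pins checked here are just a sample from the list:

```python
# Sketch: verify a few of the pinned versions from environment.yml above.
# Extend `pins` with any other entries you care about.
from importlib.metadata import version, PackageNotFoundError

pins = {"torch": "2.3.0", "transformers": "4.43.4", "gradio": "4.44.1"}

for name, expected in pins.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (expected {expected})")
        continue
    status = "OK" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{name}=={installed}: {status}")
```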
Wav-Omni/examples/s2s/__pycache__/s2s_config.cpython-39.pyc
ADDED
Binary file (10.2 kB)
Wav-Omni/examples/s2s/__pycache__/speech_dataset_s2s.cpython-39.pyc
ADDED
Binary file (15 kB)
Wav-Omni/examples/s2s/audio_prompt/en/prompt_1.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:feeda768f3add18351ee20bed5bea58f5bb0d1936bb6073b4539496e20226888
size 303016
Wav-Omni/examples/s2s/audio_prompt/en/prompt_2.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:685356ad8d595caed18b836df579d29e1764327f02c740e34c166e53fe7ff7e0
size 308662
Wav-Omni/examples/s2s/audio_prompt/en/prompt_3.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5993d88854f032e50e3bd850d6aec76222d9300d9999cec24bb4ea7186c46799
size 303016
Wav-Omni/examples/s2s/audio_prompt/en/prompt_4.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2e8403303c63f3ecd78a71b423aecbff73f2c5ec50cd105f26391707ec2f7b8d
size 319950
Wav-Omni/examples/s2s/audio_prompt/en/prompt_5.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8415294e7303851998823ea67e23d5d9033f04fd6c2170a83f3be8d1c65e7583
size 312424
Wav-Omni/examples/s2s/audio_prompt/en/prompt_6.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c17b0cbe97a1d195493cf903077a90caa50029c5a57644f60e98e4e2e547b61b
size 335004
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_1.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:739ab2183a65576f14eec6f72ea7de6954e02014491b2876b1c528feea4ba5bc
size 214316
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_2.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:28fc8a7443a5fbadb04566e1344792fb6fc5847d27b16afeb441b813063fec58
size 197676
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_3.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c70be19e3a974efae41a727872d2385f597f3cb48054dd2b19bd41c8e3910749
size 198444
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_4.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aa7c7ee1b196484661326e915ee3cbb5a313efdbbf67ed735ded8a0b7cd9f441
size 216364
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_5.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8580fc452743c8fb8554b8fbf72f0962fb82346d64471c0d2f976d9abbf3f0e7
size 219180
Wav-Omni/examples/s2s/audio_prompt/zh/prompt_6.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:751bc465053b4cdcb184cef48c3f58168a3d6c699d3b6b72230c383507d77fb8
size 197164
Wav-Omni/examples/s2s/conf/ds_config.json
ADDED
@@ -0,0 +1,19 @@
{
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 1e-4
        }
    },
    "fp16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu"
        }
    }
}
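The config above turns on fp16 and ZeRO stage 3 with optimizer state offloaded to CPU. As a rough illustration only (the repo's actual wiring goes through `wavomni_llm.pipeline.finetune_deepspeed`), a JSON file like this is normally handed straight to `deepspeed.initialize`; `ToyModel` is a placeholder, and the script would be launched with the `deepspeed` CLI so distributed state is set up:

```python
# Sketch: initialize DeepSpeed from a JSON config like ds_config.json above.
# ToyModel is a stand-in, not the repo's s2s model; run under the deepspeed launcher.
import torch.nn as nn
import deepspeed

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(896, 896)  # 896 matches llm_dim in the logs above

    def forward(self, x):
        return self.proj(x)

model = ToyModel()
# Batch size, fp16, optimizer, and ZeRO settings are all read from the config file.
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="examples/s2s/conf/ds_config.json",
)
```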
Wav-Omni/examples/s2s/conf/prompt.yaml
ADDED
@@ -0,0 +1,3 @@
dataset_config:
  # we put the prompt here because the hydra override in the shell script only supports a small subset of characters
  prompt: "Conduct a spoken conversation with the user. "
Wav-Omni/examples/s2s/conf/prompt_asr.yaml
ADDED
@@ -0,0 +1,2 @@
dataset_config:
  prompt: "Transcribe the provided audio into accurate text. "
Wav-Omni/examples/s2s/conf/prompt_tts.yaml
ADDED
@@ -0,0 +1,4 @@
dataset_config:
  # we put the prompt here because the hydra override in the shell script only supports a small subset of characters
  # prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. "
  prompt: "Generate a natural and expressive spoken version of the given text. "
Wav-Omni/examples/s2s/deepspeed_finetune_s2s.py
ADDED
@@ -0,0 +1,47 @@
from wavomni_llm.pipeline.finetune_deepspeed import main as train
from wavomni_llm.utils.deepspeed_utils import deepspeed_main_wrapper

import logging
from dataclasses import dataclass, field
from omegaconf import DictConfig, ListConfig, OmegaConf
from s2s_config import ModelConfig, TrainConfig, DataConfig, LogConfig


@dataclass
class RunConfig:
    dataset_config: DataConfig = field(default_factory=DataConfig)
    model_config: ModelConfig = field(default_factory=ModelConfig)
    train_config: TrainConfig = field(default_factory=TrainConfig)
    log_config: LogConfig = field(default_factory=LogConfig)
    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
    deepspeed_config: str = field(default="examples/asr_librispeech/conf/ds_config.json", metadata={"help": "Path to the DeepSpeed config file"})


@deepspeed_main_wrapper(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    run_config = RunConfig()
    cfg = OmegaConf.merge(run_config, cfg)

    def to_plain_list(cfg_item):
        if isinstance(cfg_item, ListConfig):
            return OmegaConf.to_container(cfg_item, resolve=True)
        elif isinstance(cfg_item, DictConfig):
            return {k: to_plain_list(v) for k, v in cfg_item.items()}
        else:
            return cfg_item

    # kwargs = to_plain_list(cfg)
    kwargs = cfg
    log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())

    logging.basicConfig(level=log_level)

    if kwargs.get("debug", False):
        import pdb
        pdb.set_trace()

    train(kwargs)


if __name__ == "__main__":
    main_hydra()
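One pattern in the entry point above is worth spelling out: defaults come from the `RunConfig` dataclass, and whatever hydra forwards from the command line overrides them through `OmegaConf.merge`. A standalone sketch of that merge (`DemoRunConfig` is an illustrative stand-in, not the repo's `RunConfig`):

```python
# Sketch of the structured-config merge used by main_hydra above.
# Requires only `pip install omegaconf`.
from dataclasses import dataclass
from omegaconf import OmegaConf

@dataclass
class DemoRunConfig:
    metric: str = "acc"
    debug: bool = False
    deepspeed_config: str = "examples/s2s/conf/ds_config.json"

base = OmegaConf.structured(DemoRunConfig)  # dataclass defaults
overrides = OmegaConf.from_dotlist(["metric=wer", "debug=true"])  # CLI-style overrides
cfg = OmegaConf.merge(base, overrides)  # later configs win on merge

print(cfg.metric)  # wer
print(cfg.debug)   # True
```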
Wav-Omni/examples/s2s/demo/142001_0_rlhf.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d027ee1e2cbe76cef17ead8d8d92bfb2a68df09d756ebcce8e9a4339e8738be
size 330796
Wav-Omni/examples/s2s/demo/142023_0_rlhf.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:64278538582f31b7657b9496876a9cc5ea3c1204bfe49aaf2d085c0ca7356400
size 221228
Wav-Omni/examples/s2s/demo/142038_0_rlhf.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3a09e765fb5e025c206ea65531143d8c60e8b545c82dc5e0d040f4e7c5a4417
size 362540
Wav-Omni/examples/s2s/demo/64474_1_rlhf.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eb2a25bb5a13844494e4e8069e277ea12aa199a6476eae06cfbfc47cf668ff64
size 183340
Wav-Omni/examples/s2s/demo/demo_data/142038_0_rlhf.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3a09e765fb5e025c206ea65531143d8c60e8b545c82dc5e0d040f4e7c5a4417
size 362540
Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp.jsonl
ADDED
@@ -0,0 +1,4 @@
{"key": "1", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "2", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00244-of-00325/142001_0_rlhf.wav", "source_text": "What are some things I can mix into a pizza crust?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142001_0_rlhf.wav", "target_text": "There are lots of tasty ingredients you can mix into a pizza crust to enhance flavor and texture! You can try adding herbs like oregano, basil, or rosemary for a fragrant touch. Garlic powder or minced garlic can add a nice, strong flavor. Cheese like Parmesan or shredded mozzarella can make your crust richer and more savory. Some people also like to add a bit of sugar or honey for a slightly sweet note. You can even mix in finely chopped olives or sundried tomatoes for a unique twist. Always ensure your added ingredients are well-incorporated into the dough before baking for the best results."}
{"key": "3", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00244-of-00325/142023_0_rlhf.wav", "source_text": "How do you take care of a frog?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142023_0_rlhf.wav", "target_text": "Taking care of a frog involves several important steps. First, you'll need a proper habitat, like a terrarium, with enough space, humidity, and proper temperature. Ensure the terrarium has a secure lid to prevent escape. Use substrates like coconut fiber or moss to keep the environment humid. Provide hiding spots like logs or rocks for your frog to feel safe. Frogs require clean water, so provide a shallow dish with dechlorinated or bottled water. Feed your frog a diet of live insects, like crickets or worms, and dust them with a calcium supplement occasionally. Regularly clean the terrarium to prevent bacterial buildup and monitor your frog for signs of illness, consulting a vet if anything seems amiss."}
{"key": "4", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00244-of-00325/142038_0_rlhf.wav", "source_text": "Search for a peach crumble bar recipe.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142038_0_rlhf.wav", "target_text": "Sure! A quick search online will give you some great peach crumble bar recipes. Typically, these recipes involve making a base crust, a peach filling, and a crumbly topping. You'll need ingredients like fresh peaches, flour, sugar, butter, and oats. You start by creating the crust, layering the peach filling, and finishing with the crumble topping before baking it all together. Enjoy your delicious treat!"}
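Each manifest line above pairs a spoken question (`source_wav`/`source_text`) with a reference answer (`target_wav`/`target_text`) under a string `key`. A minimal reader sketch; the wav paths in the file refer to a cluster filesystem, so only the text fields are printed:

```python
# Sketch: iterate over the s2s demo manifest shown above.
import json

manifest = "Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp.jsonl"

with open(manifest, encoding="utf-8") as f:
    for line in f:
        sample = json.loads(line)
        print(f"[{sample['key']}] Q: {sample['source_text']}")
        print(f"    A: {sample['target_text'][:80]}...")
```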
Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_chinese.jsonl
ADDED
@@ -0,0 +1,4 @@
{"key": "1", "source_wav": "/cpfs01/shared/jishengpeng/Data/wavomni/tmp/73856459_1_train_3.5M_CN_ready4cosy_wo_code_switching.wav", "source_text": "回答该题:柠檬的科学名称是什么?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "柠檬的科学名称为Citrus limon,属于芸香科柑橘属的常绿小乔木或灌木植物。它起源于亚洲南部,并在欧洲和地中海地区以及美洲热带地区广泛栽培。柠檬树可以在强烈的阳光下生长,所以它经常被栽种在阳光充足的地方,如地中海气候区。柠檬的果实是圆锥形的,皮表面略带纵行皱纹,果肉酸爽,用于制作饮料和调味料等。柠檬也是一种富含维生素C和抗氧化剂的水果,对保持健康有很大的益处。"}
{"key": "2", "source_wav": "/cpfs01/shared/jishengpeng/Data/wavomni/tmp/96297378_1_train_3.5M_CN_ready4cosy_wo_code_switching.wav", "source_text": "请提出五种减少塑料垃圾的方法。", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142001_0_rlhf.wav", "target_text": "1. 减少使用塑料制品,转而使用可再生或可分解材料制品。2. 训练良好的垃圾分类习惯,将可回收和可分解的塑料垃圾分别分类处理。3. 限制超市、商场,以及快餐店等商家使用一次性塑料袋、餐具和纸杯等,鼓励折叠包与可重复使用的餐具。4. 呼吁政府出台相关的环境保护法律法规,限制塑料垃圾排放,处罚违规行为。5. 开展垃圾分类教育活动,吸引公众关注塑料垃圾减量,培养环保意识,减少不必要的消费行为"}
{"key": "3", "source_wav": "/cpfs01/shared/jishengpeng/Data/wavomni/tmp/4412288_1_train_3.5M_CN_ready4cosy_wo_code_switching.wav", "source_text": "小李的薪水是每月8000元,他每个月需要交400元的养老保险和200元的医疗保险,请问他每个月实际能拿到的薪水是多少?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142023_0_rlhf.wav", "target_text": "小李每个月的薪水是8000元,需要交400元的养老保险和200元的医疗保险。因此,他实际能拿到的薪水是多少呢?我们可以用一个简单的计算公式来计算:实际薪水 = 总薪水 - 养老保险 - 医疗保险。将小李每月的总薪水8000元代入公式,养老保险400元和医疗保险200元分别代入得:实际薪水 = 8000 - 400 - 200 = 7400元,所以小李每个月实际能拿到的薪水是7400元。"}
{"key": "4", "source_wav": "/home/xj_data/jishengpeng/huawei/Wav-Omni/examples/s2s/demo/question_jsp.wav", "source_text": "你好,我现在是浙江大学计算机二年级的硕士生,攻读语音方向,你能给我推荐一些学习的建议吗?", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00244-of-00325/142038_0_rlhf.wav", "target_text": "没有标准回答,请随意发挥。"}
Wav-Omni/examples/s2s/demo/demo_data/jsonl_demo_jsp_train.jsonl
ADDED
@@ -0,0 +1,30 @@
{"key": "1", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "2", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "3", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "4", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "5", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "6", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "7", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "8", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "9", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "10", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "11", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "12", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "13", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "14", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "15", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "16", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "17", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "18", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "19", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "20", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "21", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "22", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "23", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "24", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "25", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "26", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "27", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "28", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "29", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
{"key": "30", "source_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/question_audio/train-00060-of-00325/64474_1_rlhf.wav", "source_text": "I want to help people.", "target_wav": "/cpfs01/shared/jishengpeng/Data/VoiceAssistant-400K/answer_audio/girl/train-00060-of-00325/64474_1_rlhf.wav", "target_text": "That's wonderful to hear! There are many ways you can help others. You can volunteer at local shelters, donate to charities, or simply offer support and kindness to those around you. Listening to someone's problems and providing a comforting presence can make a huge difference. Small acts of kindness, like helping a neighbor with groceries, can also have a big impact. Follow your heart and find the way that feels right for you."}
Wav-Omni/examples/s2s/demo/question_jsp.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a83d78aca6d0973e96ed1d0c59082a1f13234c9b4a80408cce29d3a9e13fb7b4
size 248997