ZeroTwo3 Xubo-Liu committed on
Commit
8811068
0 Parent(s):

Duplicate from Audio-AGI/WavJourney


Co-authored-by: Xubo Liu <Xubo-Liu@users.noreply.huggingface.co>

Files changed (50)
  1. .gitattributes +35 -0
  2. .gitignore +8 -0
  3. APIs.py +202 -0
  4. Dockerfile +75 -0
  5. Envs/AudioCraft.yml +237 -0
  6. Envs/Bark.yml +179 -0
  7. Envs/VoiceFixer.yml +123 -0
  8. Envs/WavJourney.yml +248 -0
  9. LICENSE +251 -0
  10. README.md +112 -0
  11. VoiceParser/__init__.py +0 -0
  12. VoiceParser/customtokenizer.py +202 -0
  13. VoiceParser/hubert_manager.py +33 -0
  14. VoiceParser/model.py +102 -0
  15. VoiceParser/pre_kmeans_hubert.py +106 -0
  16. add_voice_preset.py +21 -0
  17. code_generator.py +188 -0
  18. config.yaml +17 -0
  19. convert_json_to_audio_gen_code.py +30 -0
  20. data/voice_presets/metadata.json +272 -0
  21. data/voice_presets/npz/child_boy.npz +0 -0
  22. data/voice_presets/npz/cnn_male_speaker.npz +0 -0
  23. data/voice_presets/npz/elder_morgen.npz +0 -0
  24. data/voice_presets/npz/news_female_speaker.npz +0 -0
  25. data/voice_presets/npz/news_female_speaker_outside.npz +0 -0
  26. data/voice_presets/npz/news_male_speaker.npz +0 -0
  27. examples/1.mp4 +0 -0
  28. examples/2.mp4 +0 -0
  29. examples/3.mp4 +0 -0
  30. examples/example1.wav +0 -0
  31. examples/example2.wav +0 -0
  32. examples/examples.py +87 -0
  33. parse_voice.py +31 -0
  34. pipeline.py +229 -0
  35. prompts/audio_script_to_character_voice_map.prompt +11 -0
  36. prompts/audio_script_to_json.prompt +74 -0
  37. prompts/script_to_json.prompt +58 -0
  38. prompts/text_to_audio_script.prompt +34 -0
  39. prompts/text_to_json.prompt +33 -0
  40. scripts/EnvsSetup.sh +7 -0
  41. scripts/download_models.py +32 -0
  42. scripts/kill_services.py +11 -0
  43. scripts/start_service_and_ui.sh +2 -0
  44. scripts/start_services.sh +1 -0
  45. scripts/start_ui.sh +1 -0
  46. services.py +231 -0
  47. share_btn.py +74 -0
  48. ui_client.py +632 -0
  49. utils.py +82 -0
  50. voice_presets.py +96 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,8 @@
+ output/*
+ __pycache__/*
+ service_logs/*
+ convert_script_to_audio_gen_code.py
+ /cache/
+ VoiceParser/hubert/*
+ VoiceParser/__pycache__
+ /services_logs/
APIs.py ADDED
@@ -0,0 +1,202 @@
+ import os
+ import numpy as np
+ import requests
+ import yaml
+ import pyloudnorm as pyln
+ from scipy.io.wavfile import write
+ import torchaudio
+ from retrying import retry
+ from utils import get_service_port, get_service_url
+
+
+ os.environ['OPENBLAS_NUM_THREADS'] = '1'
+ SAMPLE_RATE = 32000
+
+
+ with open('config.yaml', 'r') as file:
+     config = yaml.safe_load(file)
+ service_port = get_service_port()
+ localhost_addr = get_service_url()
+ enable_sr = config['Speech-Restoration']['Enable']
+
+
+ def LOUDNESS_NORM(audio, sr=32000, volumn=-25):
+     # peak normalize audio to -10 dB
+     peak_normalized_audio = pyln.normalize.peak(audio, -10.0)
+     # measure the loudness first
+     meter = pyln.Meter(sr)  # create BS.1770 meter
+     loudness = meter.integrated_loudness(peak_normalized_audio)
+     # loudness normalize audio to the target level (LUFS)
+     normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, volumn)
+     return normalized_audio
+
+
+ def WRITE_AUDIO(wav, name=None, sr=SAMPLE_RATE):
+     """
+     function: write audio numpy to .wav file
+     @params:
+         wav: np.array [samples]
+     """
+     if name is None:
+         name = 'output.wav'
+
+     if len(wav.shape) > 1:
+         wav = wav[0]
+
+     # declipping
+     max_value = np.max(np.abs(wav))
+     if max_value > 1:
+         wav *= 0.9 / max_value
+
+     # write audio
+     write(name, sr, np.round(wav * 32767).astype(np.int16))
+
+
+ def READ_AUDIO_NUMPY(wav, sr=SAMPLE_RATE):
+     """
+     function: read audio into numpy
+     return: np.array [samples]
+     """
+     waveform, sample_rate = torchaudio.load(wav)
+
+     if sample_rate != sr:
+         waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=sr)
+
+     wav_numpy = waveform[0].numpy()
+
+     return wav_numpy
+
+
+ def MIX(wavs=[['1.wav', 0.], ['2.wav', 10.]], out_wav='out.wav', sr=SAMPLE_RATE):
+     """
+     wavs: [[wav_name, absolute_offset], ...]
+     """
+     max_length = max([int(wav[1] * sr + len(READ_AUDIO_NUMPY(wav[0]))) for wav in wavs])
+     template_wav = np.zeros(max_length)
+
+     for wav in wavs:
+         cur_name, cur_offset = wav
+         cur_wav = READ_AUDIO_NUMPY(cur_name)
+         cur_len = len(cur_wav)
+         cur_offset = int(cur_offset * sr)
+
+         # mix
+         template_wav[cur_offset:cur_offset + cur_len] += cur_wav
+
+     WRITE_AUDIO(template_wav, name=out_wav)
+
+
+ def CAT(wavs, out_wav='out.wav'):
+     """
+     wavs: list of wav files ['1.wav', '2.wav', ...]
+     """
+     wav_num = len(wavs)
+
+     segment0 = READ_AUDIO_NUMPY(wavs[0])
+
+     cat_wav = segment0
+
+     if wav_num > 1:
+         for i in range(1, wav_num):
+             next_wav = READ_AUDIO_NUMPY(wavs[i])
+             cat_wav = np.concatenate((cat_wav, next_wav), axis=-1)
+
+     WRITE_AUDIO(cat_wav, name=out_wav)
+
+
+ def COMPUTE_LEN(wav):
+     wav = READ_AUDIO_NUMPY(wav)
+     return len(wav) / SAMPLE_RATE
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def TTM(text, length=10, volume=-28, out_wav='out.wav'):
+     url = f'http://{localhost_addr}:{service_port}/generate_music'
+     data = {
+         'text': f'{text}',
+         'length': f'{length}',
+         'volume': f'{volume}',
+         'output_wav': f'{out_wav}',
+     }
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def TTA(text, length=5, volume=-35, out_wav='out.wav'):
+     url = f'http://{localhost_addr}:{service_port}/generate_audio'
+     data = {
+         'text': f'{text}',
+         'length': f'{length}',
+         'volume': f'{volume}',
+         'output_wav': f'{out_wav}',
+     }
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def TTS(text, volume=-20, out_wav='out.wav', enhanced=enable_sr, speaker_id='', speaker_npz=''):
+     url = f'http://{localhost_addr}:{service_port}/generate_speech'
+     data = {
+         'text': f'{text}',
+         'speaker_id': f'{speaker_id}',
+         'speaker_npz': f'{speaker_npz}',
+         'volume': f'{volume}',
+         'output_wav': f'{out_wav}',
+     }
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
+
+     if enhanced:
+         SR(processfile=out_wav)
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def SR(processfile):
+     url = f'http://{localhost_addr}:{service_port}/fix_audio'
+     data = {'processfile': f'{processfile}'}
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def VP(wav_path, out_dir):
+     url = f'http://{localhost_addr}:{service_port}/parse_voice'
+     data = {
+         'wav_path': f'{wav_path}',
+         'out_dir': f'{out_dir}'
+     }
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
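
The functions above are thin HTTP wrappers around the local WavJourney services: each POSTs its arguments as JSON to an endpoint at `WAVJOURNEY_SERVICE_URL:WAVJOURNEY_SERVICE_PORT` and retries up to five times on failure. As a rough illustration of how they compose (this sketch is not part of the commit; the file names, speaker id, and offsets are invented for the example, and the services must already be running):

```python
# Hedged usage sketch for the APIs.py wrappers above. Assumes the WavJourney
# services have been started (scripts/start_services.sh) and that config.yaml
# and the environment variables are in place. All file names are made up.
from APIs import TTS, TTM, TTA, MIX, COMPUTE_LEN

# Generate the individual stems via the local services.
TTS('Welcome to the evening news.', out_wav='speech.wav',
    speaker_id='news_male_speaker')  # speaker id is illustrative
TTM('uplifting orchestral news intro', length=10, out_wav='music.wav')
TTA('audience applause', length=3, out_wav='applause.wav')

# Lay the stems on a shared timeline (offsets in seconds) and mix them.
speech_len = COMPUTE_LEN('speech.wav')
MIX(wavs=[['music.wav', 0.0],
          ['speech.wav', 1.0],
          ['applause.wav', 1.0 + speech_len]],
    out_wav='scene.wav')
```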
Dockerfile ADDED
@@ -0,0 +1,75 @@
+ FROM python:3.11
+
+ FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ RUN apt-get update && \
+     apt-get upgrade -y && \
+     apt-get install -y --no-install-recommends \
+     git \
+     git-lfs \
+     wget \
+     curl \
+     # python build dependencies \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libxml2-dev \
+     libxmlsec1-dev \
+     libffi-dev \
+     liblzma-dev \
+     # gradio dependencies \
+     ffmpeg \
+     # fairseq2 dependencies \
+     libsndfile-dev && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+
+ # Install miniconda
+ RUN apt-get install -y wget && rm -rf /var/lib/apt/lists/*
+
+ RUN wget \
+     https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+     && bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/miniconda3 \
+     && rm -f Miniconda3-latest-Linux-x86_64.sh
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+
+ # Switch to the "user" user
+ USER user
+
+ # Add conda binary to PATH variable
+ ENV HOME=/home/user \
+     PATH=/opt/miniconda3/bin:/home/user/.local/bin:$PATH \
+     CONDA_PREFIX=/opt/miniconda3/envs
+
+ # Setup conda envs
+ WORKDIR $HOME/app
+ COPY --chown=user . $HOME/app
+
+ # Conda envs setup
+ RUN bash ./scripts/EnvsSetup.sh
+
+ # pre-download all models
+ RUN conda run --live-stream -n WavJourney python scripts/download_models.py
+ RUN mkdir $HOME/app/services_logs
+
+ # Env settings to get docker images to work on HF Spaces
+ ENV PYTHONPATH=${HOME}/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ # entrypoint
+ ENTRYPOINT bash /home/user/app/scripts/start_service_and_ui.sh
Envs/AudioCraft.yml ADDED
@@ -0,0 +1,237 @@
+ name: WavJourney
+ channels:
+   - nvidia/label/cuda-11.8.0
+   - conda-forge
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=conda_forge
+   - _openmp_mutex=4.5=2_gnu
+   - bzip2=1.0.8=h7f98852_4
+   - ca-certificates=2023.05.30=h06a4308_0
+   - cuda-cccl=11.8.89=0
+   - cuda-command-line-tools=11.8.0=0
+   - cuda-compiler=11.8.0=0
+   - cuda-cudart=11.8.89=0
+   - cuda-cudart-dev=11.8.89=0
+   - cuda-cuobjdump=11.8.86=0
+   - cuda-cupti=11.8.87=0
+   - cuda-cuxxfilt=11.8.86=0
+   - cuda-documentation=11.8.86=0
+   - cuda-driver-dev=11.8.89=0
+   - cuda-gdb=11.8.86=0
+   - cuda-libraries=11.8.0=0
+   - cuda-libraries-dev=11.8.0=0
+   - cuda-memcheck=11.8.86=0
+   - cuda-nsight=11.8.86=0
+   - cuda-nsight-compute=11.8.0=0
+   - cuda-nvcc=11.8.89=0
+   - cuda-nvdisasm=11.8.86=0
+   - cuda-nvml-dev=11.8.86=0
+   - cuda-nvprof=11.8.87=0
+   - cuda-nvprune=11.8.86=0
+   - cuda-nvrtc=11.8.89=0
+   - cuda-nvrtc-dev=11.8.89=0
+   - cuda-nvtx=11.8.86=0
+   - cuda-nvvp=11.8.87=0
+   - cuda-profiler-api=11.8.86=0
+   - cuda-sanitizer-api=11.8.86=0
+   - cuda-toolkit=11.8.0=0
+   - cuda-tools=11.8.0=0
+   - cuda-visual-tools=11.8.0=0
+   - gds-tools=1.4.0.31=0
+   - ld_impl_linux-64=2.40=h41732ed_0
+   - libcublas=11.11.3.6=0
+   - libcublas-dev=11.11.3.6=0
+   - libcufft=10.9.0.58=0
+   - libcufft-dev=10.9.0.58=0
+   - libcufile=1.4.0.31=0
+   - libcufile-dev=1.4.0.31=0
+   - libcurand=10.3.0.86=0
+   - libcurand-dev=10.3.0.86=0
+   - libcusolver=11.4.1.48=0
+   - libcusolver-dev=11.4.1.48=0
+   - libcusparse=11.7.5.86=0
+   - libcusparse-dev=11.7.5.86=0
+   - libffi=3.4.2=h7f98852_5
+   - libgcc-ng=13.1.0=he5830b7_0
+   - libgomp=13.1.0=he5830b7_0
+   - libnpp=11.8.0.86=0
+   - libnpp-dev=11.8.0.86=0
+   - libnsl=2.0.0=h7f98852_0
+   - libnvjpeg=11.9.0.86=0
+   - libnvjpeg-dev=11.9.0.86=0
+   - libsqlite=3.42.0=h2797004_0
+   - libuuid=2.38.1=h0b41bf4_0
+   - libzlib=1.2.13=hd590300_5
+   - ncurses=6.4=hcb278e6_0
+   - nsight-compute=2022.3.0.22=0
+   - openssl=3.1.1=hd590300_1
+   - pip=23.1.2=pyhd8ed1ab_0
+   - python=3.8.17=he550d4f_0_cpython
+   - readline=8.2=h8228510_1
+   - setuptools=68.0.0=pyhd8ed1ab_0
+   - tk=8.6.12=h27826a3_0
+   - wheel=0.40.0=pyhd8ed1ab_0
+   - xz=5.2.6=h166bdaf_0
+   - pip:
+     - aiofiles==23.1.0
+     - aiohttp==3.8.4
+     - aiosignal==1.3.1
+     - altair==5.0.1
+     - antlr4-python3-runtime==4.9.3
+     - anyio==3.7.1
+     - appdirs==1.4.4
+     - async-timeout==4.0.2
+     - attrs==23.1.0
+     - audioread==3.0.0
+     - av==10.0.0
+     - blinker==1.6.2
+     - blis==0.7.9
+     - catalogue==2.0.8
+     - certifi==2023.5.7
+     - cffi==1.15.1
+     - charset-normalizer==3.2.0
+     - click==8.1.5
+     - cloudpickle==2.2.1
+     - cmake==3.26.4
+     - colorlog==6.7.0
+     - confection==0.1.0
+     - contourpy==1.1.0
+     - cycler==0.11.0
+     - cymem==2.0.7
+     - cython==0.29.36
+     - decorator==5.1.1
+     - demucs==4.0.0
+     - diffq==0.2.4
+     - docopt==0.6.2
+     - dora-search==0.1.12
+     - einops==0.6.1
+     - encodec==0.1.1
+     - exceptiongroup==1.1.2
+     - fastapi==0.100.0
+     - ffmpy==0.3.0
+     - filelock==3.12.2
+     - flashy==0.0.2
+     - flask==2.3.2
+     - fonttools==4.41.0
+     - frozenlist==1.4.0
+     - fsspec==2023.6.0
+     - future==0.18.3
+     - gradio==3.36.1
+     - gradio-client==0.2.9
+     - h11==0.14.0
+     - httpcore==0.17.3
+     - httpx==0.24.1
+     - huggingface-hub==0.16.4
+     - hydra-colorlog==1.2.0
+     - hydra-core==1.3.2
+     - idna==3.4
+     - importlib-metadata==6.8.0
+     - importlib-resources==6.0.0
+     - itsdangerous==2.1.2
+     - jinja2==3.1.2
+     - joblib==1.3.1
+     - jsonschema==4.18.3
+     - jsonschema-specifications==2023.6.1
+     - julius==0.2.7
+     - kiwisolver==1.4.4
+     - lameenc==1.5.1
+     - langcodes==3.3.0
+     - lazy-loader==0.3
+     - librosa==0.10.0.post2
+     - lightning-utilities==0.9.0
+     - linkify-it-py==2.0.2
+     - lit==16.0.6
+     - llvmlite==0.40.1
+     - markdown-it-py==2.2.0
+     - markupsafe==2.1.3
+     - matplotlib==3.7.2
+     - mdit-py-plugins==0.3.3
+     - mdurl==0.1.2
+     - mpmath==1.3.0
+     - msgpack==1.0.5
+     - multidict==6.0.4
+     - murmurhash==1.0.9
+     - mypy-extensions==1.0.0
+     - networkx==3.1
+     - num2words==0.5.12
+     - numba==0.57.1
+     - numpy==1.24.4
+     - nvidia-cublas-cu11==11.10.3.66
+     - nvidia-cuda-cupti-cu11==11.7.101
+     - nvidia-cuda-nvrtc-cu11==11.7.99
+     - nvidia-cuda-runtime-cu11==11.7.99
+     - nvidia-cudnn-cu11==8.5.0.96
+     - nvidia-cufft-cu11==10.9.0.58
+     - nvidia-curand-cu11==10.2.10.91
+     - nvidia-cusolver-cu11==11.4.0.1
+     - nvidia-cusparse-cu11==11.7.4.91
+     - nvidia-nccl-cu11==2.14.3
+     - nvidia-nvtx-cu11==11.7.91
+     - omegaconf==2.3.0
+     - openunmix==1.2.1
+     - orjson==3.9.2
+     - packaging==23.1
+     - pandas==2.0.3
+     - pathy==0.10.2
+     - pillow==10.0.0
+     - pkgutil-resolve-name==1.3.10
+     - pooch==1.6.0
+     - preshed==3.0.8
+     - pycparser==2.21
+     - pydantic==1.10.11
+     - pydub==0.25.1
+     - pygments==2.15.1
+     - pyloudnorm==0.1.1
+     - pyparsing==3.0.9
+     - pyre-extensions==0.0.29
+     - python-dateutil==2.8.2
+     - python-multipart==0.0.6
+     - pytz==2023.3
+     - pyyaml==6.0
+     - referencing==0.29.1
+     - regex==2023.6.3
+     - requests==2.31.0
+     - retrying==1.3.4
+     - rpds-py==0.8.10
+     - safetensors==0.3.1
+     - scikit-learn==1.3.0
+     - scipy==1.10.1
+     - semantic-version==2.10.0
+     - sentencepiece==0.1.99
+     - six==1.16.0
+     - smart-open==6.3.0
+     - sniffio==1.3.0
+     - soundfile==0.12.1
+     - soxr==0.3.5
+     - spacy==3.5.2
+     - spacy-legacy==3.0.12
+     - spacy-loggers==1.0.4
+     - srsly==2.4.6
+     - starlette==0.27.0
+     - submitit==1.4.5
+     - sympy==1.12
+     - thinc==8.1.10
+     - threadpoolctl==3.2.0
+     - tokenizers==0.13.3
+     - toolz==0.12.0
+     - torch==2.0.1
+     - torchaudio==2.0.2
+     - torchmetrics==1.0.1
+     - tqdm==4.65.0
+     - transformers==4.31.0
+     - treetable==0.2.5
+     - triton==2.0.0
+     - typer==0.7.0
+     - typing-extensions==4.7.1
+     - typing-inspect==0.9.0
+     - tzdata==2023.3
+     - uc-micro-py==1.0.2
+     - urllib3==2.0.3
+     - uvicorn==0.22.0
+     - wasabi==1.1.2
+     - websockets==11.0.3
+     - werkzeug==2.3.6
+     - xformers==0.0.20
+     - yarl==1.9.2
+     - zipp==3.16.2
Envs/Bark.yml ADDED
@@ -0,0 +1,179 @@
+ name: WavJourney
+ channels:
+   - conda-forge
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=conda_forge
+   - _openmp_mutex=4.5=2_gnu
+   - bzip2=1.0.8=h7f98852_4
+   - ca-certificates=2023.5.7=hbcca054_0
+   - ld_impl_linux-64=2.40=h41732ed_0
+   - libffi=3.4.2=h7f98852_5
+   - libgcc-ng=13.1.0=he5830b7_0
+   - libgomp=13.1.0=he5830b7_0
+   - libnsl=2.0.0=h7f98852_0
+   - libsqlite=3.42.0=h2797004_0
+   - libuuid=2.38.1=h0b41bf4_0
+   - libzlib=1.2.13=hd590300_5
+   - ncurses=6.4=hcb278e6_0
+   - openssl=3.1.1=hd590300_1
+   - pip=23.1.2=pyhd8ed1ab_0
+   - python=3.8.17=he550d4f_0_cpython
+   - readline=8.2=h8228510_1
+   - setuptools=68.0.0=pyhd8ed1ab_0
+   - tk=8.6.12=h27826a3_0
+   - wheel=0.40.0=pyhd8ed1ab_0
+   - xz=5.2.6=h166bdaf_0
+   - pip:
+     - aiohttp==3.8.5
+     - aiosignal==1.3.1
+     - altair==5.0.1
+     - appdirs==1.4.4
+     - asttokens==2.2.1
+     - async-timeout==4.0.3
+     - attrs==23.1.0
+     - audioread==3.0.0
+     - backcall==0.2.0
+     - backports-zoneinfo==0.2.1
+     - blinker==1.6.2
+     - boto3==1.28.3
+     - botocore==1.31.3
+     - cachetools==5.3.1
+     - certifi==2023.5.7
+     - cffi==1.15.1
+     - charset-normalizer==3.2.0
+     - click==8.1.5
+     - cmake==3.26.4
+     - coloredlogs==15.0.1
+     - contourpy==1.1.0
+     - cycler==0.11.0
+     - datasets==2.14.4
+     - decorator==5.1.1
+     - dill==0.3.7
+     - einops==0.6.1
+     - encodec==0.1.1
+     - executing==1.2.0
+     - filelock==3.12.2
+     - fire==0.5.0
+     - flask==2.3.2
+     - fonttools==4.41.0
+     - frozenlist==1.4.0
+     - fsspec==2023.6.0
+     - funcy==2.0
+     - future==0.18.3
+     - gitdb==4.0.10
+     - gitpython==3.1.32
+     - huggingface-hub==0.16.4
+     - humanfriendly==10.0
+     - idna==3.4
+     - importlib-metadata==6.8.0
+     - importlib-resources==6.0.0
+     - ipdb==0.13.13
+     - ipython==8.12.2
+     - itsdangerous==2.1.2
+     - jedi==0.19.0
+     - jinja2==3.1.2
+     - jmespath==1.0.1
+     - joblib==1.3.1
+     - jsonschema==4.18.3
+     - jsonschema-specifications==2023.6.1
+     - kiwisolver==1.4.4
+     - lazy-loader==0.3
+     - librosa==0.10.0.post2
+     - lit==16.0.6
+     - llvmlite==0.40.1
+     - markdown-it-py==3.0.0
+     - markupsafe==2.1.3
+     - matplotlib==3.7.2
+     - matplotlib-inline==0.1.6
+     - mdurl==0.1.2
+     - mpmath==1.3.0
+     - msgpack==1.0.5
+     - multidict==6.0.4
+     - multiprocess==0.70.15
+     - networkx==3.1
+     - nltk==3.8.1
+     - numba==0.57.1
+     - numpy==1.24.4
+     - nvidia-cublas-cu11==11.10.3.66
+     - nvidia-cuda-cupti-cu11==11.7.101
+     - nvidia-cuda-nvrtc-cu11==11.7.99
+     - nvidia-cuda-runtime-cu11==11.7.99
+     - nvidia-cudnn-cu11==8.5.0.96
+     - nvidia-cufft-cu11==10.9.0.58
+     - nvidia-curand-cu11==10.2.10.91
+     - nvidia-cusolver-cu11==11.4.0.1
+     - nvidia-cusparse-cu11==11.7.4.91
+     - nvidia-nccl-cu11==2.14.3
+     - nvidia-nvtx-cu11==11.7.91
+     - optimum==1.11.1
+     - packaging==23.1
+     - pandas==2.0.3
+     - parso==0.8.3
+     - pexpect==4.8.0
+     - pickleshare==0.7.5
+     - pillow==9.5.0
+     - pkgutil-resolve-name==1.3.10
+     - pooch==1.6.0
+     - progressbar==2.5
+     - prompt-toolkit==3.0.39
+     - protobuf==4.23.4
+     - ptyprocess==0.7.0
+     - pure-eval==0.2.2
+     - pyarrow==12.0.1
+     - pycparser==2.21
+     - pydeck==0.8.1b0
+     - pygments==2.15.1
+     - pyloudnorm==0.1.1
+     - pympler==1.0.1
+     - pyparsing==3.0.9
+     - python-dateutil==2.8.2
+     - pytz==2023.3
+     - pytz-deprecation-shim==0.1.0.post0
+     - pyyaml==6.0
+     - referencing==0.29.1
+     - regex==2023.6.3
+     - requests==2.31.0
+     - resampy==0.4.2
+     - retrying==1.3.4
+     - rich==13.4.2
+     - rpds-py==0.8.10
+     - s3transfer==0.6.1
+     - safetensors==0.3.1
+     - scikit-learn==1.3.0
+     - scipy==1.10.1
+     - sentencepiece==0.1.99
+     - six==1.16.0
+     - smmap==5.0.0
+     - soundfile==0.12.1
+     - soxr==0.3.5
+     - stack-data==0.6.2
+     - streamlit==1.24.1
+     - suno-bark==0.1.5
+     - sympy==1.12
+     - tenacity==8.2.2
+     - termcolor==2.3.0
+     - threadpoolctl==3.2.0
+     - tokenizers==0.13.3
+     - toml==0.10.2
+     - tomli==2.0.1
+     - toolz==0.12.0
+     - torch==2.0.1
+     - torchaudio==2.0.2
+     - torchlibrosa==0.0.7
+     - tornado==6.3.2
+     - tqdm==4.65.0
+     - traitlets==5.9.0
+     - transformers==4.31.0
+     - triton==2.0.0
+     - typing-extensions==4.7.1
+     - tzdata==2023.3
+     - tzlocal==4.3.1
+     - urllib3==1.26.16
+     - validators==0.20.0
+     - watchdog==3.0.0
+     - wcwidth==0.2.6
+     - werkzeug==2.3.6
+     - xxhash==3.3.0
+     - yarl==1.9.2
+     - zipp==3.16.1
Envs/VoiceFixer.yml ADDED
@@ -0,0 +1,123 @@
+ name: VoiceFixer
+ channels:
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=main
+   - _openmp_mutex=5.1=1_gnu
+   - ca-certificates=2023.05.30=h06a4308_0
+   - ld_impl_linux-64=2.38=h1181459_1
+   - libffi=3.4.4=h6a678d5_0
+   - libgcc-ng=11.2.0=h1234567_1
+   - libgomp=11.2.0=h1234567_1
+   - libstdcxx-ng=11.2.0=h1234567_1
+   - ncurses=6.4=h6a678d5_0
+   - openssl=3.0.9=h7f8727e_0
+   - pip=23.2.1=py38h06a4308_0
+   - python=3.8.17=h955ad1f_0
+   - readline=8.2=h5eee18b_0
+   - setuptools=68.0.0=py38h06a4308_0
+   - sqlite=3.41.2=h5eee18b_0
+   - tk=8.6.12=h1ccaba5_0
+   - wheel=0.38.4=py38h06a4308_0
+   - xz=5.4.2=h5eee18b_0
+   - zlib=1.2.13=h5eee18b_0
+   - pip:
+     - altair==5.0.1
+     - attrs==23.1.0
+     - audioread==3.0.0
+     - backports-zoneinfo==0.2.1
+     - blinker==1.6.2
+     - cachetools==5.3.1
+     - certifi==2023.5.7
+     - cffi==1.15.1
+     - charset-normalizer==3.2.0
+     - click==8.1.5
+     - cmake==3.27.0
+     - contourpy==1.1.0
+     - cycler==0.11.0
+     - decorator==5.1.1
+     - filelock==3.12.2
+     - flask==2.3.2
+     - fonttools==4.38.0
+     - gitdb==4.0.10
+     - gitpython==3.1.32
+     - idna==3.4
+     - importlib-metadata==6.7.0
+     - importlib-resources==5.12.0
+     - itsdangerous==2.1.2
+     - jinja2==3.1.2
+     - joblib==1.3.1
+     - jsonschema==4.17.3
+     - jsonschema-specifications==2023.7.1
+     - kiwisolver==1.4.4
+     - librosa==0.8.1
+     - lit==16.0.6
+     - llvmlite==0.39.1
+     - markdown-it-py==2.2.0
+     - markupsafe==2.1.3
+     - matplotlib==3.5.3
+     - mdurl==0.1.2
+     - mpmath==1.3.0
+     - networkx==3.1
+     - numba==0.56.4
+     - numpy==1.21.6
+     - nvidia-cublas-cu11==11.10.3.66
+     - nvidia-cuda-cupti-cu11==11.7.101
+     - nvidia-cuda-nvrtc-cu11==11.7.99
+     - nvidia-cuda-runtime-cu11==11.7.99
+     - nvidia-cudnn-cu11==8.5.0.96
+     - nvidia-cufft-cu11==10.9.0.58
+     - nvidia-curand-cu11==10.2.10.91
+     - nvidia-cusolver-cu11==11.4.0.1
+     - nvidia-cusparse-cu11==11.7.4.91
+     - nvidia-nccl-cu11==2.14.3
+     - nvidia-nvtx-cu11==11.7.91
+     - packaging==23.1
+     - pandas==1.3.5
+     - pillow==9.5.0
+     - pkgutil-resolve-name==1.3.10
+     - platformdirs==3.9.1
+     - pooch==1.7.0
+     - progressbar==2.5
+     - protobuf==4.23.4
+     - pyarrow==12.0.1
+     - pycparser==2.21
+     - pydeck==0.8.1b0
+     - pygments==2.15.1
+     - pympler==1.0.1
+     - pyparsing==3.1.0
+     - pyrsistent==0.19.3
+     - python-dateutil==2.8.2
+     - pytz==2023.3
+     - pytz-deprecation-shim==0.1.0.post0
+     - pyyaml==6.0.1
+     - referencing==0.30.0
+     - requests==2.31.0
+     - resampy==0.4.2
+     - retrying==1.3.4
+     - rich==13.4.2
+     - rpds-py==0.9.2
+     - scikit-learn==1.0.2
+     - scipy==1.7.3
+     - six==1.16.0
+     - smmap==5.0.0
+     - soundfile==0.12.1
+     - streamlit==1.23.1
+     - sympy==1.12
+     - tenacity==8.2.2
+     - threadpoolctl==3.1.0
+     - toml==0.10.2
+     - toolz==0.12.0
+     - torch==1.13.1
+     - torchlibrosa==0.0.7
+     - tornado==6.2
+     - triton==2.0.0
+     - typing-extensions==4.7.1
+     - tzdata==2023.3
+     - tzlocal==4.3.1
+     - urllib3==2.0.3
+     - validators==0.20.0
+     - voicefixer==0.1.2
+     - watchdog==3.0.0
+     - werkzeug==2.3.6
+     - zipp==3.15.0
Envs/WavJourney.yml ADDED
@@ -0,0 +1,248 @@
+ name: WavJourney
+ channels:
+   - conda-forge
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=conda_forge
+   - _openmp_mutex=4.5=2_gnu
+   - aom=3.5.0=h27087fc_0
+   - bzip2=1.0.8=h7f98852_4
+   - ca-certificates=2023.7.22=hbcca054_0
+   - cairo=1.16.0=hbbf8b49_1016
+   - dav1d=1.2.1=hd590300_0
+   - expat=2.5.0=hcb278e6_1
+   - ffmpeg=6.0.0=gpl_hdbbbd96_103
+   - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
+   - font-ttf-inconsolata=3.000=h77eed37_0
+   - font-ttf-source-code-pro=2.038=h77eed37_0
+   - font-ttf-ubuntu=0.83=hab24e00_0
+   - fontconfig=2.14.2=h14ed4e7_0
+   - fonts-conda-ecosystem=1=0
+   - fonts-conda-forge=1=0
+   - freetype=2.12.1=hca18f0e_1
+   - fribidi=1.0.10=h36c2ea0_0
+   - gettext=0.21.1=h27087fc_0
+   - gmp=6.2.1=h58526e2_0
+   - gnutls=3.7.8=hf3e180e_0
+   - graphite2=1.3.13=h58526e2_1001
+   - harfbuzz=7.3.0=hdb3a94d_0
+   - icu=72.1=hcb278e6_0
+   - lame=3.100=h166bdaf_1003
+   - ld_impl_linux-64=2.40=h41732ed_0
+   - libass=0.17.1=hc9aadba_0
+   - libdrm=2.4.114=h166bdaf_0
+   - libexpat=2.5.0=hcb278e6_1
+   - libffi=3.4.2=h7f98852_5
+   - libgcc-ng=13.1.0=he5830b7_0
+   - libglib=2.76.4=hebfc3b9_0
+   - libgomp=13.1.0=he5830b7_0
+   - libiconv=1.17=h166bdaf_0
+   - libidn2=2.3.4=h166bdaf_0
+   - libnsl=2.0.0=h7f98852_0
+   - libopus=1.3.1=h7f98852_1
+   - libpciaccess=0.17=h166bdaf_0
+   - libpng=1.6.39=h753d276_0
+   - libsqlite=3.42.0=h2797004_0
+   - libstdcxx-ng=13.1.0=hfd8a6a1_0
+   - libtasn1=4.19.0=h166bdaf_0
+   - libunistring=0.9.10=h7f98852_0
+   - libuuid=2.38.1=h0b41bf4_0
+   - libva=2.19.0=hd590300_0
+   - libvpx=1.13.0=hcb278e6_0
+   - libxcb=1.15=h0b41bf4_0
+   - libxml2=2.11.5=h0d562d8_0
+   - libzlib=1.2.13=hd590300_5
+   - ncurses=6.4=hcb278e6_0
+   - nettle=3.8.1=hc379101_1
+   - openh264=2.3.1=hcb278e6_2
+   - openssl=3.1.2=hd590300_0
+   - p11-kit=0.24.1=hc5aa10d_0
+   - pcre2=10.40=hc3806b6_0
+   - pip=23.2=pyhd8ed1ab_0
+   - pixman=0.40.0=h36c2ea0_0
+   - pthread-stubs=0.4=h36c2ea0_1001
+   - python=3.8.17=he550d4f_0_cpython
+   - readline=8.2=h8228510_1
+   - setuptools=68.0.0=pyhd8ed1ab_0
+   - svt-av1=1.6.0=h59595ed_0
+   - tk=8.6.12=h27826a3_0
+   - wheel=0.40.0=pyhd8ed1ab_1
+   - x264=1!164.3095=h166bdaf_2
+   - x265=3.5=h924138e_3
+   - xorg-fixesproto=5.0=h7f98852_1002
+   - xorg-kbproto=1.0.7=h7f98852_1002
+   - xorg-libice=1.1.1=hd590300_0
+   - xorg-libsm=1.2.4=h7391055_0
+   - xorg-libx11=1.8.6=h8ee46fc_0
+   - xorg-libxau=1.0.11=hd590300_0
+   - xorg-libxdmcp=1.1.3=h7f98852_0
+   - xorg-libxext=1.3.4=h0b41bf4_2
+   - xorg-libxfixes=5.0.3=h7f98852_1004
+   - xorg-libxrender=0.9.11=hd590300_0
+   - xorg-renderproto=0.11.1=h7f98852_1002
+   - xorg-xextproto=7.3.0=h0b41bf4_1003
+   - xorg-xproto=7.0.31=h7f98852_1007
+   - xz=5.2.6=h166bdaf_0
+   - zlib=1.2.13=hd590300_5
+   - pip:
+     - accelerate==0.21.0
+     - aiofiles==23.1.0
+     - aiohttp==3.8.5
+     - aiosignal==1.3.1
+     - altair==5.0.1
+     - annotated-types==0.5.0
+     - antlr4-python3-runtime==4.8
+     - anyio==3.7.1
+     - appdirs==1.4.4
+     - asttokens==2.2.1
+     - async-timeout==4.0.2
+     - attrs==23.1.0
+     - audiolm-pytorch==1.1.4
+     - audioread==3.0.0
+     - backcall==0.2.0
+     - beartype==0.15.0
+     - bitarray==2.8.1
+     - blinker==1.6.2
+     - certifi==2023.5.7
+     - cffi==1.15.1
+     - charset-normalizer==3.2.0
+     - click==8.1.6
+     - cmake==3.26.4
+     - colorama==0.4.6
+     - contourpy==1.1.0
+     - cycler==0.11.0
+     - cython==3.0.0
+     - decorator==5.1.1
+     - einops==0.6.1
+     - ema-pytorch==0.2.3
+     - encodec==0.1.1
+     - exceptiongroup==1.1.2
+     - executing==1.2.0
+     - fairseq==0.12.2
+     - fastapi==0.100.1
+     - ffmpy==0.3.1
+     - filelock==3.12.2
+     - flask==2.3.2
+     - fonttools==4.42.0
+     - frozenlist==1.4.0
+     - fsspec==2023.6.0
+     - future==0.18.3
+     - gradio==3.39.0
+     - gradio-client==0.3.0
+     - h11==0.14.0
+     - httpcore==0.17.3
+     - httpx==0.24.1
+     - huggingface-hub==0.16.4
+     - hydra-core==1.0.7
+     - idna==3.4
+     - importlib-metadata==6.8.0
+     - importlib-resources==6.0.0
+     - ipdb==0.13.13
+     - ipython==8.12.2
+     - itsdangerous==2.1.2
+     - jedi==0.18.2
+     - jinja2==3.1.2
+     - joblib==1.3.1
+     - json5==0.9.14
+     - jsonschema==4.18.6
+     - jsonschema-specifications==2023.7.1
+     - kiwisolver==1.4.4
+     - lazy-loader==0.3
+     - librosa==0.10.0.post2
+     - linkify-it-py==2.0.2
+     - lion-pytorch==0.1.2
+     - lit==16.0.6
+     - llvmlite==0.40.1
+     - local-attention==1.8.6
+     - lxml==4.9.3
+     - markdown-it-py==2.2.0
+     - markupsafe==2.1.3
+     - matplotlib==3.7.2
+     - matplotlib-inline==0.1.6
+     - mdit-py-plugins==0.3.3
+     - mdurl==0.1.2
+     - mpmath==1.3.0
+     - msgpack==1.0.5
+     - multidict==6.0.4
+     - networkx==3.1
+     - nltk==3.8.1
+     - numba==0.57.1
+     - numpy==1.24.4
+     - nvidia-cublas-cu11==11.10.3.66
+     - nvidia-cuda-cupti-cu11==11.7.101
+     - nvidia-cuda-nvrtc-cu11==11.7.99
+     - nvidia-cuda-runtime-cu11==11.7.99
+     - nvidia-cudnn-cu11==8.5.0.96
+     - nvidia-cufft-cu11==10.9.0.58
+     - nvidia-curand-cu11==10.2.10.91
+     - nvidia-cusolver-cu11==11.4.0.1
+     - nvidia-cusparse-cu11==11.7.4.91
+     - nvidia-nccl-cu11==2.14.3
+     - nvidia-nvtx-cu11==11.7.91
+     - omegaconf==2.0.6
+     - openai==0.27.8
+     - orjson==3.9.2
+     - packaging==23.1
+     - pandas==2.0.3
+     - parso==0.8.3
+     - pexpect==4.8.0
+     - pickleshare==0.7.5
+     - pillow==10.0.0
+     - pkgutil-resolve-name==1.3.10
+     - pooch==1.6.0
+     - portalocker==2.7.0
+     - prompt-toolkit==3.0.39
+     - psutil==5.9.5
+     - ptyprocess==0.7.0
+     - pure-eval==0.2.2
+     - pycparser==2.21
+     - pydantic==2.1.1
+     - pydantic-core==2.4.0
+     - pydub==0.25.1
+     - pygments==2.15.1
+     - pyloudnorm==0.1.1
+     - pyparsing==3.0.9
+     - python-dateutil==2.8.2
+     - python-multipart==0.0.6
+     - pytz==2023.3
+     - pyyaml==6.0.1
+     - referencing==0.30.1
+     - regex==2023.6.3
+     - requests==2.31.0
+     - retrying==1.3.4
+     - rpds-py==0.9.2
+     - sacrebleu==2.3.1
+     - safetensors==0.3.2
+     - scikit-learn==1.3.0
+     - scipy==1.10.1
+     - semantic-version==2.10.0
+     - sentencepiece==0.1.99
+     - six==1.16.0
+     - sniffio==1.3.0
+     - soundfile==0.12.1
+     - soxr==0.3.5
+     - stack-data==0.6.2
+     - starlette==0.27.0
+     - sympy==1.12
+     - tabulate==0.9.0
+     - threadpoolctl==3.2.0
+     - tokenizers==0.13.3
+     - tomli==2.0.1
+     - toolz==0.12.0
+     - torch==2.0.1
+     - torchaudio==2.0.2
+     - tqdm==4.65.0
+     - traitlets==5.9.0
+     - transformers==4.31.0
+     - triton==2.0.0
+     - typing-extensions==4.7.1
+     - tzdata==2023.3
+     - uc-micro-py==1.0.2
+     - urllib3==2.0.4
+     - uvicorn==0.23.2
+     - vector-quantize-pytorch==1.6.30
+     - wcwidth==0.2.6
+     - websockets==11.0.3
+     - werkzeug==2.3.6
+     - yarl==1.9.2
+     - zipp==3.16.2
LICENSE ADDED
@@ -0,0 +1,251 @@
+ Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License
+
+ By exercising the Licensed Rights (defined below), You accept and agree to be
+ bound by the terms and conditions of this Creative Commons
+ Attribution-NonCommercial-NoDerivatives 4.0 International Public License
+ ("Public License"). To the extent this Public License may be interpreted as a
+ contract, You are granted the Licensed Rights in consideration of Your
+ acceptance of these terms and conditions, and the Licensor grants You such
+ rights in consideration of benefits the Licensor receives from making the
+ Licensed Material available under these terms and conditions.
+
+ Section 1 – Definitions.
+
+ a. Adapted Material means material subject to Copyright and Similar Rights
+    that is derived from or based upon the Licensed Material and in which
+    the Licensed Material is translated, altered, arranged, transformed, or
+    otherwise modified in a manner requiring permission under the Copyright
+    and Similar Rights held by the Licensor. For purposes of this Public
+    License, where the Licensed Material is a musical work, performance, or
+    sound recording, Adapted Material is always produced where the Licensed
+    Material is synched in timed relation with a moving image.
+ b. Copyright and Similar Rights means copyright and/or similar rights
+    closely related to copyright including, without limitation,
+    performance, broadcast, sound recording, and Sui Generis Database
+    Rights, without regard to how the rights are labeled or categorized.
+    For purposes of this Public License, the rights specified in Section
+    2(b)(1)-(2) are not Copyright and Similar Rights.
+ c. Effective Technological Measures means those measures that, in the
+    absence of proper authority, may not be circumvented under laws
+    fulfilling obligations under Article 11 of the WIPO Copyright Treaty
+    adopted on December 20, 1996, and/or similar international agreements.
+ d. Exceptions and Limitations means fair use, fair dealing, and/or any
+    other exception or limitation to Copyright and Similar Rights that
+    applies to Your use of the Licensed Material.
+ e. Licensed Material means the artistic or literary work, database, or
+    other material to which the Licensor applied this Public License.
+ f. Licensed Rights means the rights granted to You subject to the terms
+    and conditions of this Public License, which are limited to all
+    Copyright and Similar Rights that apply to Your use of the Licensed
+    Material and that the Licensor has authority to license.
+ g. Licensor means the individual(s) or entity(ies) granting rights under
+    this Public License.
+ h. NonCommercial means not primarily intended for or directed towards
+    commercial advantage or monetary compensation. For purposes of this
+    Public License, the exchange of the Licensed Material for other
+    material subject to Copyright and Similar Rights by digital
+    file-sharing or similar means is NonCommercial provided there is no
+    payment of monetary compensation in connection with the exchange.
+ i. Share means to provide material to the public by any means or process
+    that requires permission under the Licensed Rights, such as
+    reproduction, public display, public performance, distribution,
+    dissemination, communication, or importation, and to make material
+    available to the public including in ways that members of the public
+    may access the material from a place and at a time individually chosen
+    by them.
+ j. Sui Generis Database Rights means rights other than copyright resulting
+    from Directive 96/9/EC of the European Parliament and of the Council of
+    11 March 1996 on the legal protection of databases, as amended and/or
+    succeeded, as well as other essentially equivalent rights anywhere in
+    the world.
+ k. You means the individual or entity exercising the Licensed Rights under
+    this Public License. Your has a corresponding meaning.
+
+ Section 2 – Scope.
+
+ a. License grant.
+    1. Subject to the terms and conditions of this Public License, the
+       Licensor hereby grants You a worldwide, royalty-free,
+       non-sublicensable, non-exclusive, irrevocable license to exercise
+       the Licensed Rights in the Licensed Material to:
+       A. reproduce and Share the Licensed Material, in whole or in part,
+          for NonCommercial purposes only; and
+       B. produce and reproduce, but not Share, Adapted Material for
+          NonCommercial purposes only.
+    2. Exceptions and Limitations. For the avoidance of doubt, where
+       Exceptions and Limitations apply to Your use, this Public License
+       does not apply, and You do not need to comply with its terms and
+       conditions.
+    3. Term. The term of this Public License is specified in Section 6(a).
+    4. Media and formats; technical modifications allowed. The Licensor
+       authorizes You to exercise the Licensed Rights in all media and
+       formats whether now known or hereafter created, and to make
+       technical modifications necessary to do so. The Licensor waives
+       and/or agrees not to assert any right or authority to forbid You
+       from making technical modifications necessary to exercise the
+       Licensed Rights, including technical modifications necessary to
+       circumvent Effective Technological Measures. For purposes of this
+       Public License, simply making modifications authorized by this
+       Section 2(a)(4) never produces Adapted Material.
+    5. Downstream recipients.
+       A. Offer from the Licensor – Licensed Material. Every recipient of
+          the Licensed Material automatically receives an offer from the
+          Licensor to exercise the Licensed Rights under the terms and
+          conditions of this Public License.
+       B. No downstream restrictions. You may not offer or impose any
+          additional or different terms or conditions on, or apply any
+          Effective Technological Measures to, the Licensed Material if
+          doing so restricts exercise of the Licensed Rights by any
+          recipient of the Licensed Material.
+    6. No endorsement. Nothing in this Public License constitutes or may
+       be construed as permission to assert or imply that You are, or that
+       Your use of the Licensed Material is, connected with, or sponsored,
+       endorsed, or granted official status by, the Licensor or others
+       designated to receive attribution as provided in Section
+       3(a)(1)(A)(i).
+
+ b. Other rights.
+    1. Moral rights, such as the right of integrity, are not licensed
+       under this Public License, nor are publicity, privacy, and/or other
+       similar personality rights; however, to the extent possible, the
+       Licensor waives and/or agrees not to assert any such rights held by
+       the Licensor to the limited extent necessary to allow You to
+       exercise the Licensed Rights, but not otherwise.
+    2. Patent and trademark rights are not licensed under this Public
+       License.
+    3. To the extent possible, the Licensor waives any right to collect
+       royalties from You for the exercise of the Licensed Rights, whether
+       directly or through a collecting society under any voluntary or
+       waivable statutory or compulsory licensing scheme. In all other
+       cases the Licensor expressly reserves any right to collect such
+       royalties, including when the Licensed Material is used other than
+       for NonCommercial purposes.
+
+ Section 3 – License Conditions.
+
+ Your exercise of the Licensed Rights is expressly made subject to the following conditions.
+
+ a. Attribution.
+
+    1. If You Share the Licensed Material, You must:
+       A. retain the following if it is supplied by the Licensor with the
+          Licensed Material:
+          i. identification of the creator(s) of the Licensed Material
+             and any others designated to receive attribution, in any
+             reasonable manner requested by the Licensor (including by
+             pseudonym if designated);
+          ii. a copyright notice;
+          iii. a notice that refers to this Public License;
+          iv. a notice that refers to the disclaimer of warranties;
+          v. a URI or hyperlink to the Licensed Material to the extent
+             reasonably practicable;
+       B. indicate if You modified the Licensed Material and retain an
+          indication of any previous modifications; and
+       C. indicate the Licensed Material is licensed under this Public
+          License, and include the text of, or the URI or hyperlink to,
+          this Public License.
+
+       For the avoidance of doubt, You do not have permission under this
+       Public License to Share Adapted Material.
+
+    2. You may satisfy the conditions in Section 3(a)(1) in any reasonable
+       manner based on the medium, means, and context in which You Share
+       the Licensed Material. For example, it may be reasonable to satisfy
+       the conditions by providing a URI or hyperlink to a resource that
+       includes the required information.
+    3. If requested by the Licensor, You must remove any of the
+       information required by Section 3(a)(1)(A) to the extent reasonably
+       practicable.
+
+ Section 4 – Sui Generis Database Rights.
+
+ Where the Licensed Rights include Sui Generis Database Rights that apply to
+ Your use of the Licensed Material:
+
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right to
+    extract, reuse, reproduce, and Share all or a substantial portion of
+    the contents of the database for NonCommercial purposes only and
+    provided You do not Share Adapted Material;
+ b. if You include all or a substantial portion of the database contents in
+    a database in which You have Sui Generis Database Rights, then the
+    database in which You have Sui Generis Database Rights (but not its
+    individual contents) is Adapted Material; and
+ c. You must comply with the conditions in Section 3(a) if You Share all or
+    a substantial portion of the contents of the database.
+
+ For the avoidance of doubt, this Section 4 supplements and does not replace
+ Your obligations under this Public License where the Licensed Rights include
+ other Copyright and Similar Rights.
+
+ Section 5 – Disclaimer of Warranties and Limitation of Liability.
+
+ a. Unless otherwise separately undertaken by the Licensor, to the extent
+    possible, the Licensor offers the Licensed Material as-is and
+    as-available, and makes no representations or warranties of any kind
+    concerning the Licensed Material, whether express, implied, statutory,
+    or other. This includes, without limitation, warranties of title,
+    merchantability, fitness for a particular purpose, non-infringement,
+    absence of latent or other defects, accuracy, or the presence or
+    absence of errors, whether or not known or discoverable. Where
+    disclaimers of warranties are not allowed in full or in part, this
+    disclaimer may not apply to You.
+ b. To the extent possible, in no event will the Licensor be liable to You
+    on any legal theory (including, without limitation, negligence) or
+    otherwise for any direct, special, indirect, incidental, consequential,
+    punitive, exemplary, or other losses, costs, expenses, or damages
+    arising out of this Public License or use of the Licensed Material,
+    even if the Licensor has been advised of the possibility of such
+    losses, costs, expenses, or damages. Where a limitation of liability is
+    not allowed in full or in part, this limitation may not apply to You.
+ c. The disclaimer of warranties and limitation of liability provided above
+    shall be interpreted in a manner that, to the extent possible, most
+    closely approximates an absolute disclaimer and waiver of all
+    liability.
+
+ Section 6 – Term and Termination.
+
+ a. This Public License applies for the term of the Copyright and Similar
+    Rights licensed here. However, if You fail to comply with this Public
+    License, then Your rights under this Public License terminate
+    automatically.
+ b. Where Your right to use the Licensed Material has terminated under
+    Section 6(a), it reinstates:
+    1. automatically as of the date the violation is cured, provided it is
+       cured within 30 days of Your discovery of the violation; or
+    2. upon express reinstatement by the Licensor.
+
+    For the avoidance of doubt, this Section 6(b) does not affect any right
+    the Licensor may have to seek remedies for Your violations of this
+    Public License.
+
+ c. For the avoidance of doubt, the Licensor may also offer the Licensed
+    Material under separate terms or conditions or stop distributing the
+    Licensed Material at any time; however, doing so will not terminate
+    this Public License.
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
+
+ Section 7 – Other Terms and Conditions.
+
+ a. The Licensor shall not be bound by any additional or different terms or
+    conditions communicated by You unless expressly agreed.
+ b. Any arrangements, understandings, or agreements regarding the Licensed
+    Material not stated herein are separate from and independent of the
+    terms and conditions of this Public License.
+
+ Section 8 – Interpretation.
+
+ a. For the avoidance of doubt, this Public License does not, and shall not
+    be interpreted to, reduce, limit, restrict, or impose conditions on any
+    use of the Licensed Material that could lawfully be made without
+    permission under this Public License.
+ b. To the extent possible, if any provision of this Public License is
+    deemed unenforceable, it shall be automatically reformed to the minimum
+    extent necessary to make it enforceable. If the provision cannot be
+    reformed, it shall be severed from this Public License without
+    affecting the enforceability of the remaining terms and conditions.
+ c. No term or condition of this Public License will be waived and no
+    failure to comply consented to unless expressly agreed to by the Licensor.
+ d. Nothing in this Public License constitutes or may be interpreted as a
+    limitation upon, or waiver of, any privileges and immunities that apply
+    to the Licensor or You, including from the legal processes of any
+    jurisdiction or authority.
README.md ADDED
@@ -0,0 +1,112 @@
+ ---
+ title: WavJourney
+ emoji: 🔥
+ colorFrom: blue
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: cc-by-nc-4.0
+ duplicated_from: Audio-AGI/WavJourney
+ ---
+ # <span style="color: blue;">🎵</span> WavJourney: Compositional Audio Creation with LLMs
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2307.14335) [![GitHub Stars](https://img.shields.io/github/stars/Audio-AGI/WavJourney?style=social)](https://github.com/Audio-AGI/WavJourney/) [![githubio](https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=Github&style=flat-square)](https://audio-agi.github.io/WavJourney_demopage/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Audio-AGI/WavJourney)
+
+
+ This repository contains the official implementation of ["WavJourney: Compositional Audio Creation with Large Language Models"](https://audio-agi.github.io/WavJourney_demopage/WavJourney_arXiv.pdf).
+
+ Starting with a text prompt, WavJourney can create audio content with engaging storylines encompassing personalized speakers, lifelike speech in context, emotionally resonant music compositions, and impactful sound effects that enhance the auditory experience. Check out the audio examples on the [Project Page](https://audio-agi.github.io/WavJourney_demopage/)!
+
+ <!-- <p align="center">
+ <img align="middle" width="800" src="assets/WavJourney.png"/>
+ </p> -->
+
+ <hr>
+
+
+ ## Preliminaries
+ 1. Install the environment:
+ ```bash
+ bash ./scripts/EnvsSetup.sh
+ ```
+ 2. Activate the conda environment:
+ ```bash
+ conda activate WavJourney
+ ```
+
+ 3. (Optional) You can modify the default configuration in `config.yaml`; the details are described in the configuration file.
+ 4. Pre-download the models (this might take some time):
+ ```bash
+ python scripts/download_models.py
+ ```
+
+ 5. Set the `WAVJOURNEY_OPENAI_KEY` environment variable for accessing the [GPT-4 API](https://platform.openai.com/account/api-keys) [[Guidance](https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4)]:
+ ```bash
+ export WAVJOURNEY_OPENAI_KEY=your_openai_key_here
+ ```
+
+ 6. Set the environment variables for using the API services:
+ ```bash
+ # Set the port for the WAVJOURNEY service to 8021
+ export WAVJOURNEY_SERVICE_PORT=8021
+
+ # Set the URL for the WAVJOURNEY service to 127.0.0.1
+ export WAVJOURNEY_SERVICE_URL=127.0.0.1
+
+ # Limit the maximum number of script lines for WAVJOURNEY to 999
+ export WAVJOURNEY_MAX_SCRIPT_LINES=999
+ ```
+
+
+ 7. Start the Python API services (e.g., Text-to-Speech, Text-to-Audio):
+ ```bash
+ bash scripts/start_services.sh
+ ```
+
+ ## Web APP
+ ```bash
+ bash scripts/start_ui.sh
+ ```
+
+ ## Command-line Usage
+ ```bash
+ python wavjourney_cli.py -f --input-text "Generate a one-minute introduction to quantum mechanics"
+ ```
+
+
+ ## Kill the services
+ You can kill the running services via this command:
+ ```bash
+ python scripts/kill_services.py
+ ```
+
+ ## (Advanced features) Speaker customization
+ You can add voice presets to WavJourney to customize the voice actors. Simply provide a voice id, a description, and a sample wav file, and WavJourney will pick the voice automatically based on the audio script. Predefined system voice presets are in `data/voice_presets`.
+
+ You can manage voice presets via the UI. Alternatively, to add a voice to the presets from the command line, run:
+ ```bash
+ python add_voice_preset.py --id "id" --desc "description" --wav-path path/to/wav --session-id ''
+ ```
+ What makes a good voice prompt? See the detailed instructions <a href="https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer">here</a>.
+ ## Hardware requirement
+ - The default configuration requires a GPU with more than 16 GB of VRAM.
+ - Operating system: Linux.
+
+ ## Citation
+ If you find this work useful, you can cite the paper below:
+
+ @article{liu2023wavjourney,
+ title = {WavJourney: Compositional Audio Creation with Large Language Models},
+ author = {Liu, Xubo and Zhu, Zhongkai and Liu, Haohe and Yuan, Yi and Huang, Qiushi and Liang, Jinhua and Cao, Yin and Kong, Qiuqiang and Plumbley, Mark D and Wang, Wenwu},
+ journal = {arXiv preprint arXiv:2307.14335},
+ year = {2023}
+ }
+
+ [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://www.buymeacoffee.com/liuxubo)
+
+ ## Appreciation
+ - [Bark](https://github.com/suno-ai/bark) for a zero-shot text-to-speech synthesis model.
+ - [AudioCraft](https://github.com/facebookresearch/audiocraft) for state-of-the-art audio generation models.
+
+ ## Disclaimer
+ We are not responsible for audio generated using semantics created by this model. Just don't use it for illegal purposes.
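
Note that the `WAVJOURNEY_SERVICE_PORT` and `WAVJOURNEY_SERVICE_URL` variables above are the ones consumed by the `get_service_port` / `get_service_url` helpers that `APIs.py` imports from `utils.py`. `utils.py` is part of this commit but its contents are cut off in this 50-file view, so the following is only a plausible minimal sketch of those two helpers, assuming they simply read the environment:

```python
# Hypothetical sketch of the utils.py helpers used by APIs.py. utils.py itself
# is not shown in this view, so everything here is an assumption based on the
# environment variables documented in the README above.
import os

def get_service_port() -> int:
    # e.g., WAVJOURNEY_SERVICE_PORT=8021 as in the README
    return int(os.environ['WAVJOURNEY_SERVICE_PORT'])

def get_service_url() -> str:
    # e.g., WAVJOURNEY_SERVICE_URL=127.0.0.1 as in the README
    return os.environ.get('WAVJOURNEY_SERVICE_URL', '127.0.0.1')
```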
VoiceParser/__init__.py ADDED
File without changes
VoiceParser/customtokenizer.py ADDED
@@ -0,0 +1,202 @@
1
+ """
2
+ Custom tokenizer model.
3
+ Author: https://www.github.com/gitmylo/
4
+ License: MIT
5
+ """
6
+
7
+ import json
8
+ import os.path
9
+ from zipfile import ZipFile
10
+ from typing import Union
11
+
12
+
13
+ import numpy
14
+ import torch
15
+ from torch import nn, optim
16
+ from torch.serialization import MAP_LOCATION
17
+
18
+
19
+ class CustomTokenizer(nn.Module):
20
+ def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
21
+ super(CustomTokenizer, self).__init__()
22
+ next_size = input_size
23
+ if version == 0:
24
+ self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
25
+ next_size = hidden_size
26
+ if version == 1:
27
+ self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
28
+ self.intermediate = nn.Linear(hidden_size, 4096)
29
+ next_size = 4096
30
+
31
+ self.fc = nn.Linear(next_size, output_size)
32
+ self.softmax = nn.LogSoftmax(dim=1)
33
+ self.optimizer: optim.Optimizer = None
34
+ self.lossfunc = nn.CrossEntropyLoss()
35
+ self.input_size = input_size
36
+ self.hidden_size = hidden_size
37
+ self.output_size = output_size
38
+ self.version = version
39
+
40
+ def forward(self, x):
41
+ x, _ = self.lstm(x)
42
+ if self.version == 1:
43
+ x = self.intermediate(x)
44
+ x = self.fc(x)
45
+ x = self.softmax(x)
46
+ return x
47
+
48
+ @torch.no_grad()
49
+ def get_token(self, x):
50
+ """
51
+ Used to get the semantic tokens for the given feature sequence.
52
+ :param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
53
+ :return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
54
+ """
55
+ return torch.argmax(self(x), dim=1)
56
+
57
+ def prepare_training(self):
58
+ self.optimizer = optim.Adam(self.parameters(), 0.001)
59
+
60
+ def train_step(self, x_train, y_train, log_loss=False):
61
+ # y_train = y_train[:-1]
62
+ # y_train = y_train[1:]
63
+
64
+ optimizer = self.optimizer
65
+ lossfunc = self.lossfunc
66
+ # Zero the gradients
67
+ self.zero_grad()
68
+
69
+ # Forward pass
70
+ y_pred = self(x_train)
71
+
72
+ y_train_len = len(y_train)
73
+ y_pred_len = y_pred.shape[0]
74
+
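+ # The LSTM may emit a different number of frames than there are target tokens, so trim the longer side before computing the loss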
75
+ if y_train_len > y_pred_len:
76
+ diff = y_train_len - y_pred_len
77
+ y_train = y_train[diff:]
78
+ elif y_train_len < y_pred_len:
79
+ diff = y_pred_len - y_train_len
80
+ y_pred = y_pred[:-diff, :]
81
+
82
+ y_train_hot = torch.zeros(len(y_train), self.output_size)
83
+ y_train_hot[range(len(y_train)), y_train] = 1
84
+ y_train_hot = y_train_hot.to('cuda')
85
+
86
+ # Calculate the loss
87
+ loss = lossfunc(y_pred, y_train_hot)
88
+
89
+ # Print loss
90
+ if log_loss:
91
+ print('Loss', loss.item())
92
+
93
+ # Backward pass
94
+ loss.backward()
95
+
96
+ # Update the weights
97
+ optimizer.step()
98
+
99
+ def save(self, path):
100
+ info_path = '.'.join(os.path.basename(path).split('.')[:-1]) + '/.info'
101
+ torch.save(self.state_dict(), path)
102
+ data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
103
+ with ZipFile(path, 'a') as model_zip:
104
+ model_zip.writestr(info_path, data_from_model.save())
105
+ model_zip.close()
106
+
107
+ @staticmethod
108
+ def load_from_checkpoint(path, map_location: MAP_LOCATION = None):
109
+ old = True
110
+ with ZipFile(path) as model_zip:
111
+ filesMatch = [file for file in model_zip.namelist() if file.endswith('/.info')]
112
+ file = filesMatch[0] if filesMatch else None
113
+ if file:
114
+ old = False
115
+ data_from_model = Data.load(model_zip.read(file).decode('utf-8'))
116
+ model_zip.close()
117
+ if old:
118
+ model = CustomTokenizer()
119
+ else:
120
+ model = CustomTokenizer(data_from_model.hidden_size, data_from_model.input_size, data_from_model.output_size, data_from_model.version)
121
+ model.load_state_dict(torch.load(path, map_location=map_location))
122
+ if map_location:
123
+ model = model.to(map_location)
124
+ return model
125
+
126
+
127
+
128
+ class Data:
129
+ input_size: int
130
+ hidden_size: int
131
+ output_size: int
132
+ version: int
133
+
134
+ def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
135
+ self.input_size = input_size
136
+ self.hidden_size = hidden_size
137
+ self.output_size = output_size
138
+ self.version = version
139
+
140
+ @staticmethod
141
+ def load(string):
142
+ data = json.loads(string)
143
+ return Data(data['input_size'], data['hidden_size'], data['output_size'], data['version'])
144
+
145
+ def save(self):
146
+ data = {
147
+ 'input_size': self.input_size,
148
+ 'hidden_size': self.hidden_size,
149
+ 'output_size': self.output_size,
150
+ 'version': self.version,
151
+ }
152
+ return json.dumps(data)
153
+
154
+
155
+ def auto_train(data_path, save_path='model.pth', load_model: Union[str, None] = None, save_epochs=1):
156
+ data_x, data_y = {}, {}
157
+
158
+ if load_model and os.path.isfile(load_model):
159
+ print('Loading model from', load_model)
160
+ model_training = CustomTokenizer.load_from_checkpoint(load_model, 'cuda')
161
+ else:
162
+ print('Creating new model.')
163
+ model_training = CustomTokenizer(version=1).to('cuda')
164
+ save_path = os.path.join(data_path, save_path)
165
+ base_save_path = '.'.join(save_path.split('.')[:-1])
166
+
167
+ sem_string = '_semantic.npy'
168
+ feat_string = '_semantic_features.npy'
169
+
170
+ ready = os.path.join(data_path, 'ready')
171
+ for input_file in os.listdir(ready):
172
+ full_path = os.path.join(ready, input_file)
173
+ try:
174
+ prefix = input_file.split("_")[0]
175
+ number = int(prefix)
176
+ except ValueError as e:
177
+ raise e
178
+ if input_file.endswith(sem_string):
179
+ data_y[number] = numpy.load(full_path)
180
+ elif input_file.endswith(feat_string):
181
+ data_x[number] = numpy.load(full_path)
182
+
183
+ model_training.prepare_training()
184
+ epoch = 1
185
+
186
+ while 1:
187
+ for i in range(save_epochs):
188
+ j = 0
189
+ for i in range(max(len(data_x), len(data_y))):
190
+ x = data_x.get(i)
191
+ y = data_y.get(i)
192
+ if x is None or y is None:
193
+ print(f'The training data does not match. key={i}')
194
+ continue
195
+ model_training.train_step(torch.tensor(x).to('cuda'), torch.tensor(y).to('cuda'), j % 50 == 0) # Print loss every 50 steps
196
+ j += 1
197
+ save_p = save_path
198
+ save_p_2 = f'{base_save_path}_epoch_{epoch}.pth'
199
+ model_training.save(save_p)
200
+ model_training.save(save_p_2)
201
+ print(f'Epoch {epoch} completed')
202
+ epoch += 1
VoiceParser/hubert_manager.py ADDED
@@ -0,0 +1,33 @@
1
+ import os.path
2
+ import shutil
3
+ import urllib.request
4
+
5
+ import huggingface_hub
6
+
7
+
8
+ class HuBERTManager:
9
+ @staticmethod
10
+ def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'):
11
+ install_dir = os.path.join('VoiceParser', 'hubert')
12
+ if not os.path.isdir(install_dir):
13
+ os.makedirs(install_dir, exist_ok=True)
14
+ install_file = os.path.join(install_dir, file_name)
15
+ if not os.path.isfile(install_file):
16
+ print('Downloading HuBERT base model')
17
+ urllib.request.urlretrieve(download_url, install_file)
18
+ print('Downloaded HuBERT')
19
+ return install_file
20
+
21
+
22
+ @staticmethod
23
+ def make_sure_tokenizer_installed(model: str = 'quantifier_hubert_base_ls960_14.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
24
+ install_dir = os.path.join('VoiceParser', 'hubert')
25
+ if not os.path.isdir(install_dir):
26
+ os.makedirs(install_dir, exist_ok=True)
27
+ install_file = os.path.join(install_dir, local_file)
28
+ if not os.path.isfile(install_file):
29
+ print('Downloading HuBERT custom tokenizer')
30
+ huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
31
+ shutil.move(os.path.join(install_dir, model), install_file)
32
+ print('Downloaded tokenizer')
33
+ return install_file
VoiceParser/model.py ADDED
@@ -0,0 +1,102 @@
1
+ import os
2
+ import json
3
+ import numpy as np
4
+
5
+ import torch
6
+ import torchaudio
7
+ torchaudio.set_audio_backend("soundfile") # Use 'soundfile' backend
8
+
9
+ from encodec import EncodecModel
10
+ from encodec.utils import convert_audio
11
+ from .hubert_manager import HuBERTManager
12
+ from .pre_kmeans_hubert import CustomHubert
13
+ from .customtokenizer import CustomTokenizer
14
+
15
+ class VoiceParser():
16
+ def __init__(self, device='cpu'):
17
+ model = ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
18
+
19
+ hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)
20
+ quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
21
+ encodec_model = EncodecModel.encodec_model_24khz()
22
+ encodec_model.set_target_bandwidth(6.0)
23
+
24
+ self.hubert_model = hubert_model
25
+ self.quant_model = quant_model
26
+ self.encodec_model = encodec_model.to(device)
27
+ self.device = device
28
+ print('Loaded VoiceParser models!')
29
+
30
+
31
+ def extract_acoustic_embed(self, wav_path, npz_dir):
32
+ wav, sr = torchaudio.load(wav_path)
33
+
34
+ wav_hubert = wav.to(self.device)
35
+
36
+ if wav_hubert.shape[0] == 2: # Stereo to mono if needed
37
+ wav_hubert = wav_hubert.mean(0, keepdim=True)
38
+
39
+ semantic_vectors = self.hubert_model.forward(wav_hubert, input_sample_hz=sr)
40
+ semantic_tokens = self.quant_model.get_token(semantic_vectors)
41
+ wav = convert_audio(wav, sr, self.encodec_model.sample_rate, 1).unsqueeze(0)
42
+
43
+ wav = wav.to(self.device)
44
+
45
+ with torch.no_grad():
46
+ encoded_frames = self.encodec_model.encode(wav)
47
+
48
+ codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()
49
+
50
+ codes = codes.cpu()
51
+ semantic_tokens = semantic_tokens.cpu()
52
+
53
+ wav_name = os.path.split(wav_path)[1]
54
+ npz_name = wav_name[:-4] + '.npz'
55
+ npz_path = os.path.join(npz_dir, npz_name)
56
+
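+ # Save in Bark's voice-prompt (.npz) format: semantic tokens plus EnCodec codes; the coarse prompt keeps only the first two codebooks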
57
+ np.savez(
58
+ npz_path,
59
+ semantic_prompt=semantic_tokens,
60
+ fine_prompt=codes,
61
+ coarse_prompt=codes[:2, :]
62
+ )
63
+
64
+ return npz_path
65
+
66
+
67
+ def read_json_file(self, json_path):
68
+ with open(json_path, 'r') as file:
69
+ data = json.load(file)
70
+ return data
71
+
72
+
73
+ def parse_voice_json(self, voice_json, output_dir):
74
+ """
75
+ Parse a voice json file, generate the corresponding output json and npz files
76
+ Params:
77
+ voice_json: path of a json file or List of json nodes
78
+ output_dir: output dir for new json and npz files
79
+ """
80
+ if not isinstance(voice_json, list):
81
+ # If voice_json is a file path (str), read the JSON file
82
+ voice_json = self.read_json_file(voice_json)
85
+ for item in voice_json:
86
+ wav_path = item['wav']
87
+ npz_path = self.extract_acoustic_embed(wav_path=wav_path, npz_dir=output_dir)
88
+ item['npz'] = npz_path
89
+ del item['wav']
90
+
91
+ output_json = os.path.join(output_dir, 'metadata.json')
92
+
93
+ with open(output_json, 'w') as file:
94
+ json.dump(voice_json, file, indent=4)
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
VoiceParser/pre_kmeans_hubert.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ Modified HuBERT model without kmeans.
3
+ Original author: https://github.com/lucidrains/
4
+ Modified by: https://www.github.com/gitmylo/
5
+ License: MIT
6
+ """
7
+
8
+ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
9
+
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ from torch import nn
14
+ from einops import pack, unpack
15
+
16
+ import fairseq
17
+
18
+ from torchaudio.functional import resample
19
+
20
+ from audiolm_pytorch.utils import curtail_to_multiple
21
+
22
+ import logging
23
+ logging.root.setLevel(logging.ERROR)
24
+
25
+
26
+ def exists(val):
27
+ return val is not None
28
+
29
+
30
+ def default(val, d):
31
+ return val if exists(val) else d
32
+
33
+
34
+ class CustomHubert(nn.Module):
35
+ """
36
+ checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
37
+ or you can train your own
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ checkpoint_path,
43
+ target_sample_hz=16000,
44
+ seq_len_multiple_of=None,
45
+ output_layer=9,
46
+ device=None
47
+ ):
48
+ super().__init__()
49
+ self.target_sample_hz = target_sample_hz
50
+ self.seq_len_multiple_of = seq_len_multiple_of
51
+ self.output_layer = output_layer
52
+
53
+ if device is not None:
54
+ self.to(device)
55
+
56
+ model_path = Path(checkpoint_path)
57
+
58
+ assert model_path.exists(), f'path {checkpoint_path} does not exist'
59
+
60
+ checkpoint = torch.load(checkpoint_path, map_location=device)
61
+ load_model_input = {checkpoint_path: checkpoint}
62
+ model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
63
+
64
+ if device is not None:
65
+ model[0].to(device)
66
+
67
+ self.model = model[0]
68
+ self.model.eval()
69
+
70
+ @property
71
+ def groups(self):
72
+ return 1
73
+
74
+ @torch.no_grad()
75
+ def forward(
76
+ self,
77
+ wav_input,
78
+ flatten=True,
79
+ input_sample_hz=None
80
+ ):
81
+ device = wav_input.device
82
+
83
+ if exists(input_sample_hz):
84
+ wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
85
+
86
+ if exists(self.seq_len_multiple_of):
87
+ wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
88
+
89
+ embed = self.model(
90
+ wav_input,
91
+ features_only=True,
92
+ mask=False, # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
93
+ output_layer=self.output_layer
94
+ )
95
+
96
+ embed, packed_shape = pack([embed['x']], '* d')
97
+
98
+ # codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
99
+
100
+ codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device) # .long()
101
+
102
+ if flatten:
103
+ return codebook_indices
104
+
105
+ codebook_indices, = unpack(codebook_indices, packed_shape, '*')
106
+ return codebook_indices
add_voice_preset.py ADDED
@@ -0,0 +1,21 @@
1
+ import argparse
2
+ import voice_presets
3
+
4
+ def main():
5
+ # Argument Parsing
6
+ parser = argparse.ArgumentParser(description="Add Voice Preset")
7
+ parser.add_argument("--id", required=True, help="ID of the voice")
8
+ parser.add_argument("--desc", required=True, help="Description of the voice")
9
+ parser.add_argument("--wav-path", required=True, help="Path to the .wav file")
10
+ parser.add_argument("--session-id", required=True, help="session_id, if set to '' then it's system voice presets")
11
+ args = parser.parse_args()
12
+
13
+ if args.session_id:
14
+ print(voice_presets.add_session_voice_preset(args.id, args.desc, args.wav_path, args.session_id))
15
+ else:
16
+ print(voice_presets.add_system_voice_preset(args.id, args.desc, args.wav_path))
17
+
18
+
19
+
20
+ if __name__ == "__main__":
21
+ main()
code_generator.py ADDED
@@ -0,0 +1,188 @@
1
+ import os
2
+ import json5
3
+ import utils
4
+
5
+
6
+ def check_json_script(data):
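+ # Validate the LLM-generated JSON script: every audio item must carry a layout, an audio_type, and the type-specific mandatory attributes below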
7
+ foreground_mandatory_attrs_map = {
8
+ 'music': ['vol', 'len', 'desc'],
9
+ 'sound_effect': ['vol', 'len', 'desc'],
10
+ 'speech': ['vol', 'text']
11
+ }
12
+ background_mandatory_attrs_map = {
13
+ 'music': ['vol', 'desc'],
14
+ 'sound_effect': ['vol', 'desc'],
15
+ }
16
+
17
+ def check_by_audio_type(audio, mandatory_attrs_map, audio_str):
18
+ if audio['audio_type'] not in mandatory_attrs_map:
19
+ raise ValueError(f'audio_type is not allowed in this layout, audio={audio_str}')
20
+ for attr_name in mandatory_attrs_map[audio['audio_type']]:
21
+ if attr_name not in audio:
22
+ raise ValueError(f'{attr_name} does not exist, audio={audio_str}')
23
+
24
+ # Check json's format
25
+ for audio in data:
26
+ audio_str = json5.dumps(audio, indent=None)
27
+ if 'layout' not in audio:
28
+ raise ValueError(f'layout missing, audio={audio_str}')
29
+ elif 'audio_type' not in audio:
30
+ raise ValueError(f'audio_type missing, audio={audio_str}')
31
+ elif audio['layout'] == 'foreground':
32
+ check_by_audio_type(audio, foreground_mandatory_attrs_map, audio_str)
33
+ elif audio['layout'] == 'background':
34
+ if 'id' not in audio:
35
+ raise ValueError(f'id not in background audio, audio={audio_str}')
36
+ if 'action' not in audio:
37
+ raise ValueError(f'action not in background audio, audio={audio_str}')
38
+ if audio['action'] == 'begin':
39
+ check_by_audio_type(audio, background_mandatory_attrs_map, audio_str)
40
+ else:
41
+ if audio['action'] != 'end':
42
+ raise ValueError(f'Unknown action, audio={audio_str}')
43
+ else:
44
+ raise ValueError(f'Unknown layout, audio={audio_str}')
45
+ #except Exception as err:
46
+ # sys.stderr.write(f'PARSING ERROR: {err}, audio={json5.dumps(audio, indent=None)}\n')
47
+ # all_clear = False
48
+
49
+
50
+ def collect_and_check_audio_data(data):
51
+ fg_audio_id = 0
52
+ fg_audios = []
53
+ bg_audios = []
54
+ # Collect all the foreground and background audio ids used to calculate background audio length later
55
+ for audio in data:
56
+ if audio['layout'] == 'foreground':
57
+ audio['id'] = fg_audio_id
58
+ fg_audios.append(audio)
59
+ fg_audio_id += 1
60
+ else: # background
61
+ if audio['action'] == 'begin':
62
+ audio['begin_fg_audio_id'] = fg_audio_id
63
+ bg_audios.append(audio)
64
+ else: # ends
65
+ # find the background audio with the matching id, and update its 'end_fg_audio_id'
66
+ for bg_audio in bg_audios:
67
+ if bg_audio['id'] == audio['id'] and bg_audio['audio_type'] == audio['audio_type']:
68
+ bg_audio['end_fg_audio_id'] = fg_audio_id
69
+ break
70
+
71
+ # check if all background audios are valid
72
+ for bg_audio in bg_audios:
73
+ if 'begin_fg_audio_id' not in bg_audio:
74
+ raise ValueError(f'begin of background missing, audio={bg_audio}')
75
+ elif 'end_fg_audio_id' not in bg_audio:
76
+ raise ValueError(f'end of background missing, audio={bg_audio}')
77
+
78
+ if bg_audio['begin_fg_audio_id'] > bg_audio['end_fg_audio_id']:
79
+ raise ValueError(f'background audio ends before start, audio={bg_audio}')
80
+ elif bg_audio['begin_fg_audio_id'] == bg_audio['end_fg_audio_id']:
81
+ raise ValueError(f'background audio contains no foreground audio, audio={bg_audio}')
82
+ #except Exception as err:
83
+ # sys.stderr.write(f'ALIGNMENT ERROR: {err}, audio={bg_audio}\n')
84
+ # return None, None
85
+
86
+ return fg_audios, bg_audios
87
+
88
+
89
+ class AudioCodeGenerator:
90
+ def __init__(self):
91
+ self.wav_counters = {
92
+ 'bg_sound_effect': 0,
93
+ 'bg_music': 0,
94
+ 'idle': 0,
95
+ 'fg_sound_effect': 0,
96
+ 'fg_music': 0,
97
+ 'fg_speech': 0,
98
+ }
99
+ self.code = ''
100
+
101
+ def append_code(self, content):
102
+ self.code = f'{self.code}{content}\n'
103
+
104
+ def generate_code(self, fg_audios, bg_audios, output_path, result_filename):
105
+ def get_wav_name(audio):
106
+ audio_type = audio['audio_type']
107
+ layout = 'fg' if audio['layout'] == 'foreground' else 'bg'
108
+ wav_type = f'{layout}_{audio_type}' if layout else audio_type
109
+ desc = audio['text'] if 'text' in audio else audio['desc']
110
+ desc = utils.text_to_abbrev_prompt(desc)
111
+ wav_filename = f'{wav_type}_{self.wav_counters[wav_type]}_{desc}.wav'
112
+ self.wav_counters[wav_type] += 1
113
+ return wav_filename
114
+
115
+ header = f'''
116
+ import os
117
+ import sys
118
+ import datetime
119
+
120
+ from APIs import TTM, TTS, TTA, MIX, CAT, COMPUTE_LEN
121
+
122
+
123
+ fg_audio_lens = []
124
+ wav_path = \"{output_path.absolute()}/audio\"
125
+ os.makedirs(wav_path, exist_ok=True)
126
+
127
+ '''
128
+ self.append_code(header)
129
+
130
+ fg_audio_wavs = []
131
+ for fg_audio in fg_audios:
132
+ wav_name = get_wav_name(fg_audio)
133
+ if fg_audio['audio_type'] == 'sound_effect':
134
+ self.append_code(f'TTA(text=\"{fg_audio["desc"]}\", length={fg_audio["len"]}, volume={fg_audio["vol"]}, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
135
+ elif fg_audio['audio_type'] == 'music':
136
+ self.append_code(f'TTM(text=\"{fg_audio["desc"]}\", length={fg_audio["len"]}, volume={fg_audio["vol"]}, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
137
+ elif fg_audio['audio_type'] == 'speech':
138
+ npz_path = self.char_to_voice_map[fg_audio["character"]]["npz_path"]
139
+ npz_full_path = os.path.abspath(npz_path) if os.path.exists(npz_path) else npz_path
140
+ self.append_code(f'TTS(text=\"{fg_audio["text"]}\", speaker_id=\"{self.char_to_voice_map[fg_audio["character"]]["id"]}\", volume={fg_audio["vol"]}, out_wav=os.path.join(wav_path, \"{wav_name}\"), speaker_npz=\"{npz_full_path}\")')
141
+ fg_audio_wavs.append(wav_name)
142
+ self.append_code(f'fg_audio_lens.append(COMPUTE_LEN(os.path.join(wav_path, \"{wav_name}\")))\n')
143
+
144
+ # cat all foreground audio together
145
+ self.append_code(f'fg_audio_wavs = []')
146
+ for wav_filename in fg_audio_wavs:
147
+ self.append_code(f'fg_audio_wavs.append(os.path.join(wav_path, \"{wav_filename}\"))')
148
+ self.append_code(f'CAT(wavs=fg_audio_wavs, out_wav=os.path.join(wav_path, \"foreground.wav\"))')
149
+
150
+ bg_audio_wavs = []
151
+ self.append_code(f'\nbg_audio_offsets = []')
152
+ for bg_audio in bg_audios:
153
+ wav_name = get_wav_name(bg_audio)
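+ # A background clip spans the foreground clips between its begin/end markers, so its length and offset are derived from the foreground durations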
154
+ self.append_code(f'bg_audio_len = sum(fg_audio_lens[{bg_audio["begin_fg_audio_id"]}:{bg_audio["end_fg_audio_id"]}])')
155
+ self.append_code(f'bg_audio_offset = sum(fg_audio_lens[:{bg_audio["begin_fg_audio_id"]}])')
156
+ if bg_audio['audio_type'] == 'sound_effect':
157
+ self.append_code(f'TTA(text=\"{bg_audio["desc"]}\", volume={bg_audio["vol"]}, length=bg_audio_len, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
158
+ elif bg_audio['audio_type'] == 'music':
159
+ self.append_code(f'TTM(text=\"{bg_audio["desc"]}\", volume={bg_audio["vol"]}, length=bg_audio_len, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
160
+ else:
161
+ raise ValueError()
162
+ bg_audio_wavs.append(wav_name)
163
+ self.append_code(f'bg_audio_offsets.append(bg_audio_offset)\n')
164
+ self.append_code(f'bg_audio_wavs = []')
165
+ for wav_filename in bg_audio_wavs:
166
+ self.append_code(f'bg_audio_wavs.append(os.path.join(wav_path, \"{wav_filename}\"))')
167
+
168
+ self.append_code(f'bg_audio_wav_offset_pairs = list(zip(bg_audio_wavs, bg_audio_offsets))')
169
+ self.append_code(f'bg_audio_wav_offset_pairs.append((os.path.join(wav_path, \"foreground.wav\"), 0))')
170
+ self.append_code(f'MIX(wavs=bg_audio_wav_offset_pairs, out_wav=os.path.join(wav_path, \"{result_filename}.wav\"))')
171
+
172
+
173
+ def init_char_to_voice_map(self, filename):
174
+ with open(filename, 'r') as file:
175
+ self.char_to_voice_map = json5.load(file)
176
+
177
+
178
+ def parse_and_generate(self, script_filename, char_to_voice_map_filename, output_path, result_filename='result'):
179
+ self.code = ''
180
+ self.init_char_to_voice_map(char_to_voice_map_filename)
181
+
182
+ with open(script_filename, 'r') as file:
183
+ data = json5.load(file)
184
+
185
+ check_json_script(data)
186
+ fg_audios, bg_audios = collect_and_check_audio_data(data)
187
+ self.generate_code(fg_audios, bg_audios, output_path, result_filename)
188
+ return self.code
config.yaml ADDED
@@ -0,0 +1,17 @@
1
+ AudioCraft:
2
+ # MusicGen
3
+ ttm_model_size: small # [small, medium, large]
4
+ # AudioGen
5
+ tta_model_size: medium # [medium]
6
+
7
+ Text-to-Speech:
8
+ # Bark
9
+ speed: 1.05
10
+
11
+ Speech-Restoration:
12
+ # VoiceFixer
13
+ Enable: True
14
+
15
+ Voice-Parser:
16
+ # HuBERT
17
+ device: 'cpu'
convert_json_to_audio_gen_code.py ADDED
@@ -0,0 +1,30 @@
1
+ import argparse
2
+ import os
3
+ import json5
4
+ from pathlib import Path
5
+ from code_generator import AudioCodeGenerator
6
+
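+ # Example usage (paths are illustrative):
+ #   python convert_json_to_audio_gen_code.py --script audio_script.json \
+ #     --character-to-voice-map character_voice_map.json --path output/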
7
+
8
+ def main():
9
+ parser = argparse.ArgumentParser()
10
+ parser.add_argument("--script", help="Path to the json script file")
11
+ parser.add_argument("--character-to-voice-map", help="Path to the character-to-voice mapping CSV file")
12
+ parser.add_argument(
13
+ "--path",
14
+ type=str,
15
+ default=".",
16
+ help="Path of all the output wav files to be created by the generated code, default: current path"
17
+ )
18
+ args = parser.parse_args()
19
+
20
+ if not os.path.isfile(args.script):
21
+ print(f"File {args.script} does not exist.")
22
+ return
23
+
24
+ output_path = Path(args.path)
25
+ audio_code_generator = AudioCodeGenerator()
26
+ code = audio_code_generator.parse_and_generate(args.script, args.character_to_voice_map, output_path)
27
+ print(code)
28
+
29
+ if __name__ == "__main__":
30
+ main()
data/voice_presets/metadata.json ADDED
@@ -0,0 +1,272 @@
1
+ {
2
+ "Male1_En": {
3
+ "id": "Male1_En",
4
+ "desc": "A normal male adult voice, British accent; Language: English.",
5
+ "npz_path": "v2/en_speaker_1"
6
+ },
7
+ "Male2_En": {
8
+ "id": "Male2_En",
9
+ "desc": "A normal male adult voice, American accent; Language: English.",
10
+ "npz_path": "v2/en_speaker_6"
11
+ },
12
+ "Female1_En": {
13
+ "id": "Female1_En",
14
+ "desc": "A normal female adult voice, British accent; Language: English.",
15
+ "npz_path": "v2/en_speaker_9"
16
+ },
17
+ "Female2_En": {
18
+ "id": "Female2_En",
19
+ "desc": "A normal female adult voice, American accent; Language: English.",
20
+ "npz_path": "v2/de_speaker_3"
21
+ },
22
+ "News_Male_En": {
23
+ "id": "News_Male_En",
24
+ "desc": "A male voice of a news anchor, suitable for news scenarios; Language: English.",
25
+ "npz_path": "data/voice_presets/npz/news_male_speaker.npz"
26
+ },
27
+ "News_Female_En": {
28
+ "id": "News_Female_En",
29
+ "desc": "A female voice of a news anchor, suitable for news scenarios; Language: English.",
30
+ "npz_path": "data/voice_presets/npz/news_male_speaker.npz"
31
+ },
32
+ "News_Female_Out_En": {
33
+ "id": "News_Female_Out_En",
34
+ "desc": "A female voice of a off-site news reporter, suitable for news scenario; Language: English.",
35
+ "npz_path": "data/voice_presets/npz/news_female_speaker_outside.npz"
36
+ },
37
+ "Child_En": {
38
+ "id": "Child_En",
39
+ "desc": "A small young boy voice; Language: English.",
40
+ "npz_path": "data/voice_presets/npz/child_boy.npz"
41
+ },
42
+ "Old_Man_En": {
43
+ "id": "Old_Man_En",
44
+ "desc": "A voice of an old man; Language: English.",
45
+ "npz_path": "data/voice_presets/npz/elder_morgen.npz"
46
+ },
47
+ "Male1_Zh": {
48
+ "id": "Male1_Zh",
49
+ "desc": "A normal male adult voice; Language: Chinese.",
50
+ "npz_path": "v2/zh_speaker_0"
51
+ },
52
+ "Male2_Zh": {
53
+ "id": "Male2_Zh",
54
+ "desc": "A normal male adult voice; Language: Chinese.",
55
+ "npz_path": "v2/zh_speaker_1"
56
+ },
57
+ "Female1_Zh": {
58
+ "id": "Female1_Zh",
59
+ "desc": "A normal female adult voice; Language: Chinese.",
60
+ "npz_path": "v2/zh_speaker_9"
61
+ },
62
+ "Female2_Zh": {
63
+ "id": "Female2_Zh",
64
+ "desc": "A normal female adult voice; Language: Chinese.",
65
+ "npz_path": "v2/zh_speaker_4"
66
+ },
67
+ "Male1_Fr": {
68
+ "id": "Male1_Fr",
69
+ "desc": "A normal male adult voice; Language: French.",
70
+ "npz_path": "v2/fr_speaker_0"
71
+ },
72
+ "Male2_Fr": {
73
+ "id": "Male2_Fr",
74
+ "desc": "A normal male adult voice; Language: French.",
75
+ "npz_path": "v2/fr_speaker_8"
76
+ },
77
+ "Female1_Fr": {
78
+ "id": "Female1_Fr",
79
+ "desc": "A normal female adult voice; Language: French.",
80
+ "npz_path": "v2/fr_speaker_5"
81
+ },
82
+ "Female2_Fr": {
83
+ "id": "Female2_Fr",
84
+ "desc": "A normal female adult voice; Language: French.",
85
+ "npz_path": "v2/fr_speaker_1"
86
+ },
87
+ "Male1_De": {
88
+ "id": "Male1_De",
89
+ "desc": "A normal male adult voice; Language: German.",
90
+ "npz_path": "v2/de_speaker_0"
91
+ },
92
+ "Male2_De": {
93
+ "id": "Male2_De",
94
+ "desc": "A normal male adult voice; Language: German.",
95
+ "npz_path": "v2/de_speaker_1"
96
+ },
97
+ "Female1_De": {
98
+ "id": "Female1_De",
99
+ "desc": "A normal female adult voice; Language: German.",
100
+ "npz_path": "v2/de_speaker_3"
101
+ },
102
+ "Female2_De": {
103
+ "id": "Female2_De",
104
+ "desc": "A normal female adult voice; Language: German.",
105
+ "npz_path": "v2/de_speaker_8"
106
+ },
107
+ "Male1_Hi": {
108
+ "id": "Male1_Hi",
109
+ "desc": "A normal male adult voice; Language: Hindi.",
110
+ "npz_path": "v2/hi_speaker_5"
111
+ },
112
+ "Male2_Hi": {
113
+ "id": "Male2_Hi",
114
+ "desc": "A normal male adult voice; Language: Hindi.",
115
+ "npz_path": "v2/hi_speaker_8"
116
+ },
117
+ "Female1_Hi": {
118
+ "id": "Female1_Hi",
119
+ "desc": "A normal female adult voice; Language: Hindi.",
120
+ "npz_path": "v2/hi_speaker_0"
121
+ },
122
+ "Female2_Hi": {
123
+ "id": "Female2_Hi",
124
+ "desc": "A normal female adult voice; Language: Hindi.",
125
+ "npz_path": "v2/hi_speaker_3"
126
+ },
127
+ "Male1_It": {
128
+ "id": "Male1_It",
129
+ "desc": "A normal male adult voice; Language: Italian.",
130
+ "npz_path": "v2/it_speaker_4"
131
+ },
132
+ "Male2_It": {
133
+ "id": "Male2_It",
134
+ "desc": "A normal male adult voice; Language: Italian.",
135
+ "npz_path": "v2/it_speaker_5"
136
+ },
137
+ "Female1_It": {
138
+ "id": "Female1_It",
139
+ "desc": "A normal female adult voice; Language: Italian.",
140
+ "npz_path": "v2/it_speaker_7"
141
+ },
142
+ "Female2_It": {
143
+ "id": "Female2_It",
144
+ "desc": "A normal female adult voice; Language: Italian.",
145
+ "npz_path": "v2/it_speaker_9"
146
+ },
147
+ "Male1_Ja": {
148
+ "id": "Male1_Ja",
149
+ "desc": "A normal male adult voice; Language: Japanese.",
150
+ "npz_path": "v2/ja_speaker_2"
151
+ },
152
+ "Male2_Ja": {
153
+ "id": "Male2_Ja",
154
+ "desc": "A normal male adult voice; Language: Japanese.",
155
+ "npz_path": "v2/ja_speaker_6"
156
+ },
157
+ "Female1_Ja": {
158
+ "id": "Female1_Ja",
159
+ "desc": "A normal female adult voice; Language: Japanese.",
160
+ "npz_path": "v2/ja_speaker_4"
161
+ },
162
+ "Female2_Ja": {
163
+ "id": "Female2_Ja",
164
+ "desc": "A normal female adult voice; Language: Japanese.",
165
+ "npz_path": "v2/ja_speaker_5"
166
+ },
167
+ "Male1_Ko": {
168
+ "id": "Male1_Ko",
169
+ "desc": "A normal male adult voice; Language: Korean.",
170
+ "npz_path": "v2/ko_speaker_1"
171
+ },
172
+ "Male2_Ko": {
173
+ "id": "Male2_Ko",
174
+ "desc": "A normal male adult voice; Language: Korean.",
175
+ "npz_path": "v2/ko_speaker_2"
176
+ },
177
+ "Female1_Ko": {
178
+ "id": "Female1_Ko",
179
+ "desc": "A normal female adult voice; Language: Korean.",
180
+ "npz_path": "v2/ko_speaker_0"
181
+ },
182
+ "Female1_Ru": {
183
+ "id": "Female1_Ru",
184
+ "desc": "A normal female adult voice; Language: Russian.",
185
+ "npz_path": "v2/ru_speaker_5"
186
+ },
187
+ "Female2_Ru": {
188
+ "id": "Female2_Ru",
189
+ "desc": "A normal female adult voice; Language: Russian.",
190
+ "npz_path": "v2/ru_speaker_6"
191
+ },
192
+ "Male1_Ru": {
193
+ "id": "Male1_Ru",
194
+ "desc": "A normal male adult voice; Language: Russian.",
195
+ "npz_path": "v2/ru_speaker_3"
196
+ },
197
+ "Male2_Ru": {
198
+ "id": "Male2_Ru",
199
+ "desc": "A normal male adult voice; Language: Russian.",
200
+ "npz_path": "v2/ru_speaker_4"
201
+ },
202
+ "Female1_Es": {
203
+ "id": "Female1_Es",
204
+ "desc": "A normal female adult voice; Language: Spanish.",
205
+ "npz_path": "v2/es_speaker_8"
206
+ },
207
+ "Female2_Es": {
208
+ "id": "Female2_Es",
209
+ "desc": "A normal female adult voice; Language: Spanish.",
210
+ "npz_path": "v2/es_speaker_9"
211
+ },
212
+ "Male1_Es": {
213
+ "id": "Male1_Es",
214
+ "desc": "A normal male adult voice; Language: Spanish.",
215
+ "npz_path": "v2/es_speaker_6"
216
+ },
217
+ "Male2_Es": {
218
+ "id": "Male2_Es",
219
+ "desc": "A normal male adult voice; Language: Spanish.",
220
+ "npz_path": "v2/es_speaker_7"
221
+ },
222
+ "Female1_Tr": {
223
+ "id": "Female1_Tr",
224
+ "desc": "A normal female adult voice; Language: Turkish.",
225
+ "npz_path": "v2/tr_speaker_4"
226
+ },
227
+ "Female2_Tr": {
228
+ "id": "Female2_Tr",
229
+ "desc": "A normal female adult voice; Language: Turkish.",
230
+ "npz_path": "v2/tr_speaker_5"
231
+ },
232
+ "Male1_Tr": {
233
+ "id": "Male1_Tr",
234
+ "desc": "A normal male adult voice; Language: Turkish.",
235
+ "npz_path": "v2/tr_speaker_2"
236
+ },
237
+ "Male2_Tr": {
238
+ "id": "Male2_Tr",
239
+ "desc": "A normal male adult voice; Language: Turkish.",
240
+ "npz_path": "v2/tr_speaker_3"
241
+ },
242
+ "Male1_Pt": {
243
+ "id": "Male1_Pt",
244
+ "desc": "A normal male adult voice; Language: Purtuguese.",
245
+ "npz_path": "v2/pt_speaker_0"
246
+ },
247
+ "Male2_Pt": {
248
+ "id": "Male2_Pt",
249
+ "desc": "A normal male adult voice; Language: Purtuguese.",
250
+ "npz_path": "v2/pt_speaker_1"
251
+ },
252
+ "Female1_Pl": {
253
+ "id": "Female1_Pl",
254
+ "desc": "A normal female adult voice; Language: Polish.",
255
+ "npz_path": "v2/pl_speaker_4"
256
+ },
257
+ "Female2_Pl": {
258
+ "id": "Female2_Pl",
259
+ "desc": "A normal female adult voice; Language: Polish.",
260
+ "npz_path": "v2/pl_speaker_6"
261
+ },
262
+ "Male1_Pl": {
263
+ "id": "Male1_Pl",
264
+ "desc": "A normal male adult voice; Language: Polish.",
265
+ "npz_path": "v2/pl_speaker_5"
266
+ },
267
+ "Male2_Pl": {
268
+ "id": "Male2_Pl",
269
+ "desc": "A normal male adult voice; Language: Polish.",
270
+ "npz_path": "v2/pl_speaker_7"
271
+ }
272
+ }
data/voice_presets/npz/child_boy.npz ADDED
Binary file (33.1 kB). View file
 
data/voice_presets/npz/cnn_male_speaker.npz ADDED
Binary file (46.6 kB). View file
 
data/voice_presets/npz/elder_morgen.npz ADDED
Binary file (30.8 kB). View file
 
data/voice_presets/npz/news_female_speaker.npz ADDED
Binary file (71.8 kB). View file
 
data/voice_presets/npz/news_female_speaker_outside.npz ADDED
Binary file (60.5 kB). View file
 
data/voice_presets/npz/news_male_speaker.npz ADDED
Binary file (36 kB). View file
 
examples/1.mp4 ADDED
Binary file (365 kB). View file
 
examples/2.mp4 ADDED
Binary file (241 kB). View file
 
examples/3.mp4 ADDED
Binary file (346 kB). View file
 
examples/example1.wav ADDED
Binary file (320 kB). View file
 
examples/example2.wav ADDED
Binary file (320 kB). View file
 
examples/examples.py ADDED
@@ -0,0 +1,87 @@
1
+
2
+ example1 = {
3
+ 'text': "An introduction to AI-assisted audio content creation.",
4
+ 'table_script': """
5
+ | Audio Type | Layout | ID | Character | Action | Volume | Description | Length |
6
+ |--------------|------------|----|-----------|--------|--------|------------------------------------------------------------------|--------|
7
+ | music | background | 1 | N/A | begin | -35 | Inspirational technology-themed music | Auto |
8
+ | speech | foreground | N/A| Narrator | N/A | -15 | Welcome to the future of audio content creation. | Auto |
9
+ | sound_effect | foreground | N/A| N/A | N/A | -35 | Digital startup sound | 2 |
10
+ | speech | foreground | N/A| Narrator | N/A | -15 | With evolving technology, we are introducing AI-assisted tools for pristine audio production. | Auto |
11
+ | sound_effect | foreground | N/A| N/A | N/A | -35 | Keyboard typing noise | 3 |
12
+ | speech | foreground | N/A| Narrator | N/A | -15 | Imagine crafting audio content with the power of AI at your fingertips. | Auto |
13
+ | sound_effect | background | 2 | N/A | begin | -35 | Ambiance of a busy control room | Auto |
14
+ | speech | foreground | N/A| Narrator | N/A | -15 | Enhanced quality, efficient production and limitless creativity, all under one roof. | Auto |
15
+ | sound_effect | background | 2 | N/A | end | N/A | N/A | Auto |
16
+ | speech | foreground | N/A| Narrator | N/A | -15 | Unleash your potential with AI-assisted audio content creation. | Auto |
17
+ | music | background | 1 | N/A | end | N/A | N/A | Auto |
18
+
19
+ """,
20
+ 'table_voice': """
21
+ | Character | Voice |
22
+ |-------------|-----------|
23
+ | Narrator | News_Male_En |
24
+
25
+ """,
26
+ 'wav_file': 'examples/1.mp4',
27
+ }
28
+
29
+ example2 = {
30
+ 'text': "A couple dating in a cafe.",
31
+ 'table_script': """
32
+ | Audio Type | Layout | ID | Character | Action | Volume | Description | Length |
33
+ |--------------|------------|----|-----------|--------|--------|-----------------------------------------------|--------|
34
+ | sound_effect | background | 1 | N/A | begin | -35 | Soft chattering in a cafe | Auto |
35
+ | sound_effect | background | 2 | N/A | begin | -38 | Coffee brewing noises | Auto |
36
+ | music | background | 3 | N/A | begin | -35 | Soft jazz playing in the background | Auto |
37
+ | speech | foreground | N/A| Man | N/A | -15 | It’s really nice to finally get out and relax a little, isn’t it? | Auto |
38
+ | speech | foreground | N/A| Woman | N/A | -15 | I know, right? We should do this more often. | Auto |
39
+ | sound_effect | background | 2 | N/A | end | N/A | N/A | Auto |
40
+ | speech | foreground | N/A| Man | N/A | -15 | Here’s your coffee, just as you like it. | Auto |
41
+ | speech | foreground | N/A| Woman | N/A | -15 | Thank you, it smells wonderful. | Auto |
42
+ | music | background | 3 | N/A | end | N/A | N/A | Auto |
43
+ | sound_effect | background | 1 | N/A | end | N/A | N/A | Auto |
44
+
45
+ """,
46
+ 'table_voice': """
47
+ | Character | Voice |
48
+ |-------------|-----------|
49
+ | Man | Male1_En |
50
+ | Woman | Female1_En |
51
+
52
+ """,
53
+ 'wav_file': 'examples/2.mp4',
54
+ }
55
+
56
+
57
+ example3 = {
58
+ 'text': "A child is participating in a farting contest.",
59
+ 'table_script': """
60
+ | Audio Type | Layout | ID | Character | Action | Volume | Description | Length |
61
+ |--------------|------------|----|-----------|--------|--------|------------------------------------------------------|--------|
62
+ | sound_effect | background | 1 | N/A | begin | -35 | Outdoor park ambiance, people chattering | Auto |
63
+ | music | background | 2 | N/A | begin | -35 | Light comedy theme music, quirky | Auto |
64
+ | speech | foreground | N/A| Host | N/A | -15 | Welcome to the annual Fart Competition. | Auto |
65
+ | speech | foreground | N/A| Host | N/A | -15 | Now, let’s welcome our youngest participant. | Auto |
66
+ | sound_effect | foreground | N/A| N/A | N/A | -35 | Clapping sound | 2 |
67
+ | speech | foreground | N/A| Child | N/A | -15 | Hi, I’m excited to be here. | Auto |
68
+ | sound_effect | foreground | N/A| N/A | N/A | -35 | Short, cartoonish duration of a fart sound | 4 |
69
+ | sound_effect | foreground | N/A| N/A | N/A | -35 | Audience laughing and applauding | 2 |
70
+ | speech | foreground | N/A| Host | N/A | -15 | Wow, that was impressive! Let’s give another round of applause! | Auto |
71
+ | sound_effect | foreground | N/A| N/A | N/A | -35 | Audience clapping and cheering | 3 |
72
+ | music | background | 2 | N/A | end | N/A | N/A | Auto |
73
+ | sound_effect | background | 1 | N/A | end | N/A | N/A | Auto |
74
+ """,
75
+ 'table_voice': """
76
+ | Character | Voice |
77
+ |-------------|-----------|
78
+ | Host | Male1_En |
79
+ | Child | Child_En |
80
+
81
+ """,
82
+ 'wav_file': 'examples/3.mp4',
83
+ }
84
+
85
+
86
+
87
+ examples = [example1, example2, example3]
parse_voice.py ADDED
@@ -0,0 +1,31 @@
1
+ import os
2
+ import argparse
3
+ from VoiceParser.model import VoiceParser
4
+
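+ # Example usage (paths are illustrative):
+ #   python parse_voice.py --wav-path sample.wav --out-dir data/voice_presets/npz
+ #   python parse_voice.py --wav-dir wavs/ --out-dir data/voice_presets/npz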
5
+ def main():
6
+ parser = argparse.ArgumentParser()
7
+ parser.add_argument('--wav-path', type=str, help="Path of a wav file")
8
+ parser.add_argument('--wav-dir', type=str, help="Directory of wav files")
9
+ parser.add_argument('--out-dir', type=str, help="Directory of output npz files")
10
+ args = parser.parse_args()
11
+
12
+ if (args.wav_path is None and args.wav_dir is None) or (args.wav_path is not None and args.wav_dir is not None):
13
+ parser.error("Please provide either '--wav-path' or '--wav-dir', but not both.")
14
+
15
+ out_dir = args.out_dir
16
+
17
+ model = VoiceParser(device='cpu')
18
+
19
+ if args.wav_path is not None:
20
+ model.extract_acoustic_embed(args.wav_path, out_dir)
21
+ print(f'Successfully parsed {args.wav_path}')
22
+ else:
23
+ wav_name_list = os.listdir(args.wav_dir)
24
+ for wav_name in wav_name_list:
25
+ wav_path = os.path.join(args.wav_dir, wav_name)
26
+ model.extract_acoustic_embed(wav_path, out_dir)
27
+ print(f'Successfully parsed {wav_path}')
28
+
29
+
30
+ if __name__ == '__main__':
31
+ main()
pipeline.py ADDED
@@ -0,0 +1,229 @@
1
+ import datetime
2
+ import os
3
+ from string import Template
4
+ import openai
5
+ import re
6
+ import glob
7
+ import pickle
8
+ import time
9
+ import json5
10
+ from retrying import retry
11
+ from code_generator import check_json_script, collect_and_check_audio_data
12
+ import random
13
+ import string
14
+
15
+ import utils
16
+ import voice_presets
17
+ from code_generator import AudioCodeGenerator
18
+
19
+ # Enable this for debugging
20
+ USE_OPENAI_CACHE = False
21
+ openai_cache = []
22
+ if USE_OPENAI_CACHE:
23
+ os.makedirs('cache', exist_ok=True)
24
+ for cache_file in glob.glob('cache/*.pkl'):
25
+ with open(cache_file, 'rb') as file:
26
+ openai_cache.append(pickle.load(file))
27
+
28
+ def chat_with_gpt(prompt, api_key):
29
+ if USE_OPENAI_CACHE:
30
+ filtered_object = list(filter(lambda x: x['prompt'] == prompt, openai_cache))
31
+ if len(filtered_object) > 0:
32
+ response = filtered_object[0]['response']
33
+ return response
34
+
35
+ try:
36
+ openai.api_key = api_key
37
+ chat = openai.ChatCompletion.create(
38
+ # model="gpt-3.5-turbo",
39
+ model="gpt-4",
40
+ messages=[
41
+ {
42
+ "role": "system",
43
+ "content": "You are a helpful assistant."
44
+ },
45
+ {
46
+ "role": "user",
47
+ "content": prompt
48
+ }
49
+ ]
50
+ )
51
+ finally:
52
+ openai.api_key = ''
53
+
54
+ if USE_OPENAI_CACHE:
55
+ cache_obj = {
56
+ 'prompt': prompt,
57
+ 'response': chat['choices'][0]['message']['content']
58
+ }
59
+ with open(f'cache/{time.time()}.pkl', 'wb') as _openai_cache:
60
+ pickle.dump(cache_obj, _openai_cache)
61
+ openai_cache.append(cache_obj)
62
+
63
+ return chat['choices'][0]['message']['content']
64
+
65
+
66
+ def get_file_content(filename):
67
+ with open(filename, 'r') as file:
68
+ return file.read().strip()
69
+
70
+
71
+ def write_to_file(filename, content):
72
+ with open(filename, 'w') as file:
73
+ file.write(content)
74
+
75
+
76
+ def extract_substring_with_quotes(input_string, quotes="'''"):
77
+ pattern = f"{quotes}(.*?){quotes}"
78
+ matches = re.findall(pattern, input_string, re.DOTALL)
79
+ return matches
80
+
81
+
82
+ def try_extract_content_from_quotes(content):
83
+ if "'''" in content:
84
+ return extract_substring_with_quotes(content)[0]
85
+ elif "```" in content:
86
+ return extract_substring_with_quotes(content, quotes="```")[0]
87
+ else:
88
+ return content
89
+
90
+ def maybe_get_content_from_file(content_or_filename):
91
+ if os.path.exists(content_or_filename):
92
+ with open(content_or_filename, 'r') as file:
93
+ return file.read().strip()
94
+ return content_or_filename
95
+
96
+
97
+
98
+ # Pipeline Interface Guidelines:
99
+ #
100
+ # Init calls:
101
+ # - Init calls must be called before running the actual steps
102
+ # - init_session() is called every time a gradio webpage is loaded
103
+ #
104
+ # Single Step:
105
+ # - takes input (file or content) and output path as input
106
+ # - most of time just returns output content
107
+ #
108
+ # Compositional Step:
109
+ # - takes session_id as input (you have session_id, you have all the paths)
110
+ # - run a series of steps
111
+
112
+ # This is called for every new gradio webpage
113
+
114
+ def init_session(session_id=''):
115
+ def uid8():
116
+ return ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
117
+
118
+ if session_id == '':
119
+ session_id = f'{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}_{uid8()}'
120
+ # create the paths
121
+ os.makedirs(utils.get_session_voice_preset_path(session_id))
122
+ os.makedirs(utils.get_session_audio_path(session_id))
123
+ print(f'New session created, session_id={session_id}')
124
+ return session_id
125
+
126
+ @retry(stop_max_attempt_number=3)
127
+ def input_text_to_json_script_with_retry(complete_prompt_path, api_key):
128
+ print(" trying ...")
129
+ complete_prompt = get_file_content(complete_prompt_path)
130
+ json_response = try_extract_content_from_quotes(chat_with_gpt(complete_prompt, api_key))
131
+ json_data = json5.loads(json_response)
132
+
133
+ try:
134
+ check_json_script(json_data)
135
+ collect_and_check_audio_data(json_data)
136
+ except Exception as err:
137
+ print(f'JSON ERROR: {err}')
138
+ retry_complete_prompt = f'{complete_prompt}\n```\n{json_response}```\nThe script above has format error(s). Return the fixed script.\n\nScript:\n'
139
+ write_to_file(complete_prompt_path, retry_complete_prompt)
140
+ raise err
141
+
142
+ return json_response
143
+
144
+ # Step 1: input_text to json
145
+ def input_text_to_json_script(input_text, output_path, api_key):
146
+ input_text = maybe_get_content_from_file(input_text)
147
+ text_to_audio_script_prompt = get_file_content('prompts/text_to_json.prompt')
148
+ prompt = f'{text_to_audio_script_prompt}\n\nInput text: {input_text}\n\nScript:\n'
149
+ complete_prompt_path = output_path / 'complete_input_text_to_audio_script.prompt'
150
+ write_to_file(complete_prompt_path, prompt)
151
+ audio_script_response = input_text_to_json_script_with_retry(complete_prompt_path, api_key)
152
+ generated_audio_script_filename = output_path / 'audio_script.json'
153
+ write_to_file(generated_audio_script_filename, audio_script_response)
154
+ return audio_script_response
155
+
156
+ # Step 2: json to char-voice map
157
+ def json_script_to_char_voice_map(json_script, voices, output_path, api_key):
158
+ json_script_content = maybe_get_content_from_file(json_script)
159
+ prompt = get_file_content('prompts/audio_script_to_character_voice_map.prompt')
160
+ presets_str = '\n'.join(f"{preset['id']}: {preset['desc']}" for preset in voices.values())
161
+ prompt = Template(prompt).substitute(voice_and_desc=presets_str)
162
+ prompt = f"{prompt}\n\nAudio script:\n'''\n{json_script_content}\n'''\n\noutput:\n"
163
+ write_to_file(output_path / 'complete_audio_script_to_char_voice_map.prompt', prompt)
164
+ char_voice_map_response = try_extract_content_from_quotes(chat_with_gpt(prompt, api_key))
165
+ char_voice_map = json5.loads(char_voice_map_response)
166
+ # enrich char_voice_map with voice preset metadata
167
+ complete_char_voice_map = {c: voices[char_voice_map[c]] for c in char_voice_map}
168
+ char_voice_map_filename = output_path / 'character_voice_map.json'
169
+ write_to_file(char_voice_map_filename, json5.dumps(complete_char_voice_map))
170
+ return complete_char_voice_map
171
+
172
+ # Step 3: json to py code
173
+ def json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_filename):
174
+ audio_code_generator = AudioCodeGenerator()
175
+ code = audio_code_generator.parse_and_generate(
176
+ json_script_filename,
177
+ char_voice_map_filename,
178
+ output_path,
179
+ result_filename
180
+ )
181
+ write_to_file(output_path / 'audio_generation.py', code)
182
+
183
+ # Step 4: py code to final wav
184
+ def audio_code_gen_to_result(audio_gen_code_path):
185
+ audio_gen_code_filename = audio_gen_code_path / 'audio_generation.py'
186
+ os.system(f'PYTHONPATH=. python {audio_gen_code_filename}')
187
+
188
+ # Function call used by Gradio: input_text to json
189
+ def generate_json_file(session_id, input_text, api_key):
190
+ output_path = utils.get_session_path(session_id)
191
+ # Step 1
192
+ print(f'session_id={session_id}, Step 1: Writing audio script based on text: {input_text} ...')
193
+ return input_text_to_json_script(input_text, output_path, api_key)
194
+
195
+ # Function call used by Gradio: json to result wav
196
+ def generate_audio(session_id, json_script, api_key):
197
+ def count_lines(content):
198
+ # Split the string using the newline character and count the non-empty lines
199
+ return sum(1 for line in content.split('\n') if line.strip())
200
+
201
+ max_lines = utils.get_max_script_lines()
202
+ if count_lines(json_script) > max_lines:
203
+ raise ValueError(f'The number of lines of the JSON script has exceeded {max_lines}!')
204
+
205
+ output_path = utils.get_session_path(session_id)
206
+ output_audio_path = utils.get_session_audio_path(session_id)
207
+ voices = voice_presets.get_merged_voice_presets(session_id)
208
+
209
+ # Step 2
210
+ print(f'session_id={session_id}, Step 2: Parsing character voice with LLM...')
211
+ char_voice_map = json_script_to_char_voice_map(json_script, voices, output_path, api_key)
212
+ # Step 3
213
+ json_script_filename = output_path / 'audio_script.json'
214
+ char_voice_map_filename = output_path / 'character_voice_map.json'
215
+ result_wav_basename = f'res_{session_id}'
216
+ print(f'session_id={session_id}, Step 3: Compiling audio script to Python program ...')
217
+ json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_wav_basename)
218
+ # Step 4
219
+ print(f'session_id={session_id}, Step 4: Start running Python program ...')
220
+ audio_code_gen_to_result(output_path)
221
+
222
+ result_wav_filename = output_audio_path / f'{result_wav_basename}.wav'
223
+ print(f'Done all processes, result: {result_wav_filename}')
224
+ return result_wav_filename, char_voice_map
225
+
226
+ # Convenient function call used by wavjourney_cli
227
+ def full_steps(session_id, input_text, api_key):
228
+ json_script = generate_json_file(session_id, input_text, api_key)
229
+ return generate_audio(session_id, json_script, api_key)
prompts/audio_script_to_character_voice_map.prompt ADDED
@@ -0,0 +1,11 @@
1
+ Given an audio script in json format, for each character that appears in the "character" attribute, you should map the character to a "voice type" according to his/her lines and the voice type's features. Each character must be mapped to a different voice type, and each voice type must be from one of the following (each line in the format of "[voice_type_id]: [voice_type_description]"):
2
+ $voice_and_desc
3
+
4
+ Output should be in the format of json, like:
5
+ '''
6
+ {
7
+ "character_1": "voice_type_1",
8
+ "character_2": "voice_type_2",
9
+ ...
10
+ }
11
+ '''
prompts/audio_script_to_json.prompt ADDED
@@ -0,0 +1,74 @@
1
+ Given an audio script, adapt it into a json file. You must go through each line of the script, and try your best to convert it to a json object or multiple json objects.
2
+
3
+ Each json object represents an audio. There are three types of audios: sound effect, music, and speech. For each audio, there are two types of layouts: foreground and background. Foreground audios are played sequentially, and background audios are environmental sounds or music which are played while the foreground audios are being played.
4
+
5
+ While going through each line of the script, you have choices as below:
6
+ - For character lines, you need to convert it to a speech audio. Note that a speech audio can only be foreground. Example:
7
+ From
8
+ ```
9
+ News Anchor: Good evening, this is BBC News.
10
+ ```
11
+ To
12
+ ```
13
+ {"audio_type": "speech", "layout": "foreground", "character": "News Anchor", "vol": -15, "text": "Good evening, this is BBC News."},
14
+ ```
15
+ - For sound effects, you need to convert it to a sound_effect audio. Especially, you need to figure out its length according to the script's context, and put it into "len". Example:
16
+ From
17
+ ```
18
+ (SFX: Airport beeping sound)
19
+ ```
20
+ to
21
+ ```
22
+ {"audio_type": "sound_effect", "layout": "foreground", "vol": -35, "len": 2, "desc": "Airport beeping sound"},
23
+ ```
24
+ - For music, you need to convert it to a music audio. Especially, you need to figure out its length according to the script's context, and put it into "len". Example:
25
+ From
26
+ ```
27
+ (SFX: Uplifting newsroom music)
28
+ ```
29
+ to
30
+ ```
31
+ {"audio_type": "music", "layout": "foreground", "vol": -35, "len": 10, "desc": "Uplifting newsroom music"},
32
+ ```
33
+
34
+ When a sound effect or music is played environmentally in the background, you should set its layout to "background". You must give the background audio a unique id, and you must figure out the end of the background audio according to the context and indicate it explicitly. Example:
35
+ From
36
+ ```
37
+ ...
38
+ (SFX: Airport ambiance, people walking)
39
+ Airport Announcer: Ladies and gentlemen, attention please!
40
+ ...
41
+ ```
42
+ to
43
+ ```
44
+ ...
45
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "begin", "vol": -35, "desc": "Airport ambiance, people walking"},
46
+ [foreground audio]
47
+ ...
48
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "end"},
49
+ ...
50
+ ```
51
+
52
+ When a line contains multiple sound effects and pieces of music, you need to decompose it into multiple audios. Example:
53
+ From
54
+ ```
55
+ ...
56
+ (SFX: A classy restaurant, low chatter, clinking silverware, jazz music playing)
57
+ ...
58
+ ```
59
+ to
60
+ ```
61
+ ...
62
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "begin", "vol": -35, "desc": "low chatter"},
63
+ {"audio_type": "sound_effect", "layout": "background", "id":2, "action": "begin", "vol": -35, "desc": "clinking silverware"},
64
+ {"audio_type": "music", "layout": "background", "id":3, "action": "begin", "vol": -35, "desc": "jazz music"},
65
+ ...
66
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "end"},
67
+ {"audio_type": "sound_effect", "layout": "background", "id":2, "action": "end"},
68
+ {"audio_type": "music", "layout": "background", "id":3, "action": "end"},
69
+ ...
70
+ ```
71
+
72
+ The final json object contains a list of all the audio objects.
73
+
74
+ Script:
prompts/script_to_json.prompt ADDED
@@ -0,0 +1,58 @@
1
+ Convert an audio script line to another format. Each line will be converted to a simple json format. Below are the examples of conversion of each line.
2
+
3
+ Example line 1:
4
+ '''
5
+ [Background music 1 begins, -35dB: Uplifting newsroom music]
6
+ '''
7
+ convert to:
8
+ '''
9
+ {"voice_type": "back_ground_music", "id": 1, "state": "begin", "volume": -35, "desc": "Uplifting newsroom music"},
10
+ '''
11
+ Example line 2:
12
+ '''
13
+ [Background music 1 ends]
14
+ '''
15
+ convert to:
16
+ '''
17
+ {"voice_type": "back_ground_music", "id": 1, "state": "end"},
18
+ '''
19
+ Example line 3:
20
+ '''
21
+ [Background sound effect 2 begins, -35dB: Crowds cheering and arcade ambiance]
22
+ '''
23
+ convert to:
24
+ '''
25
+ {"voice_type": "back_ground_sound_effect", "id": 2, "state": "begin", "volume": -35, "desc": "Crowds cheering and arcade ambiance"},
26
+ '''
27
+ Example line 4:
28
+ '''
29
+ [Background sound effect 2 ends]
30
+ '''
31
+ convert to:
32
+ '''
33
+ {"voice_type": "back_ground_sound_effect", "id": 2, "state": "end"},
34
+ '''
35
+ Example line 5:
36
+ '''
37
+ News Anchor, -15dB: Good evening, this is BBC News.
38
+ '''
39
+ convert to:
40
+ '''
41
+ {"voice_type": "speech", "character": "News Anchor", "volume": -15, "desc": "Good evening, this is BBC News."},
42
+ '''
43
+ Example line 6:
44
+ '''
45
+ [Sound effect, 3s, -15dB: Keyboard typing and mouse clicking]
46
+ '''
47
+ convert to:
48
+ '''
49
+ {"voice_type": "sound_effect", "length": 3, "volume": -15, "desc": "Keyboard typing and mouse clicking"},
50
+ '''
51
+ Example line 7:
52
+ '''
53
+ [Music, 10s, -15dB: Uplifting newsroom music]
54
+ '''
55
+ convert to:
56
+ '''
57
+ {"voice_type": "music", "length": 10, "volume": -15, "desc": "Uplifting newsroom music"},
58
+ '''
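The bracketed formats above are regular enough to be matched mechanically. A hedged sketch for the foreground sound-effect case (example line 6); `SFX_RE` and `parse_sfx_line` are hypothetical names, not part of this repo.
```
import re

# matches e.g. "[Sound effect, 3s, -15dB: Keyboard typing and mouse clicking]"
SFX_RE = re.compile(r"\[Sound effect, (\d+)s, (-?\d+)dB: (.+)\]")

def parse_sfx_line(line):
    m = SFX_RE.match(line.strip())
    if m is None:
        return None
    length, volume, desc = m.groups()
    return {"voice_type": "sound_effect", "length": int(length),
            "volume": int(volume), "desc": desc}

assert parse_sfx_line("[Sound effect, 3s, -15dB: Keyboard typing and mouse clicking]") == {
    "voice_type": "sound_effect", "length": 3, "volume": -15,
    "desc": "Keyboard typing and mouse clicking"}
```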
prompts/text_to_audio_script.prompt ADDED
@@ -0,0 +1,34 @@
1
+ I want you to act as an audio script writer. I'll give you an instruction, which is a general idea, and you will turn it into a short audio script.
2
+
3
+ The script should follow the rules below:
4
+ - For dialogue, each line must contain the character's name, its volume in decibels (human voices are usually around -15dB), and the line, for example:
5
+ '''
6
+ Darth Vader, -16dB: Luke, I'm your father.
7
+ '''
8
+ - For foreground sound effects, you must wrap the line with brackets and start with "Sound effect, ", give the duration of the sound effect in seconds, specify the volume you want in decibels (for foreground sound effects it's usually around -15dB), and give a very detailed description of the sound effect, for example:
9
+ '''
10
+ [Sound effect, 2s, -15dB: Airport beeping sound]
11
+ '''
12
+ - For foreground music, you must wrap the line with brackets and start with "Music, ", give the duration of the music in seconds, specify the volume you want in decibels (for foreground music it's usually around -15dB), and give a very detailed description of the music, for example:
13
+ '''
14
+ [Music, 10s, -15dB: 80's Rock and Roll music]
15
+ '''
16
+ - For background sound effects, you must wrap the line with brackets and start with "Background sound effect" followed by its id, always explicitly indicate the start and end of the sound effect, specify the volume you want in decibels (for background sound effects it's usually around -35dB), and give a very detailed description of the sound effect, for example:
17
+ '''
18
+ [Background sound effect 1 begins, -34dB: Airport ambiance, including footsteps, luggage rolling, and distant airplane engine]
19
+ ...
20
+ [Background sound effect 1 ends]
21
+ '''
22
+ - For background music, you must wrap the line with brackets and start with "Background music" followed by its id, always explicitly indicate the start and end of the music, specify the volume you want in decibels (for background music it's usually around -35dB), and give a very detailed description of the music, for example:
23
+ '''
24
+ [Background music 1 begins, -35dB: Uplifting newsroom music]
25
+ ...
26
+ [Background music 1 ends]
27
+ '''
28
+ - For music and sound effects, you cannot name the element with anything outside this list:
29
+ ["Sound effect, ",
30
+ "Music, ",
31
+ "Background sound effect" followed by its id,
32
+ "Background music" followed by its id]
33
+ variants such as "Foreground sound effect" or "Foreground music" are forbidden
34
+
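For illustration only, a short hypothetical script that satisfies every rule above (dialogue lines with volumes, a background effect that explicitly begins and ends around foreground lines, and a foreground music cue):
```
[Background sound effect 1 begins, -35dB: Rain falling on a tin roof]
Narrator, -15dB: The storm had finally reached the coast.
[Sound effect, 2s, -15dB: A single thunder clap]
[Background sound effect 1 ends]
[Music, 10s, -15dB: Somber orchestral outro]
```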
prompts/text_to_json.prompt ADDED
@@ -0,0 +1,33 @@
1
+ I want you to act as an audio script writer. I'll give you input text, which is a general idea, and you will turn it into an audio script in json format. Instructions:
2
+ - Each line represents an audio. There are three types of audio: sound effects, music, and speech. For each audio, there are only two types of layouts: foreground and background. Foreground audios are played sequentially, and background audios are environmental sounds or music which are played while the foreground audios are being played.
3
+ - Sound effects can be either foreground or background. For sound effects, you must provide the layout, volume, length (in seconds), and a detailed description of the real-world sound effect. The description must not mention a specific person. Example:
4
+ '''
6
+ {"audio_type": "sound_effect", "layout": "foreground", "vol": -35, "len": 2, "desc": "Airport beeping sound"},
7
+ '''
8
+ - Music can be either foreground or background. For music, you must provide its layout, volume, length (in seconds), and a detailed description of the music. Example:
9
+ '''
10
+ {"audio_type": "music", "layout": "foreground", "vol": -35, "len": 10, "desc": "Uplifting newsroom music"},
11
+ '''
12
+ - Speech can only be foreground. For speech, you must provide the character, volume, and the character's line. You do not need to specify the length of the speech. Example:
13
+ '''
14
+ {"audio_type": "speech", "layout": "foreground", "character": "News Anchor", "vol": -15, "text": "Good evening, this is BBC News. In today's breaking news, we have an unexpected turn of events in the political arena"},
15
+ '''
16
+ - The speech text must contain only the spoken lines; do not include actions, expressions, emotions, etc.
17
+ - For background audio, you must specify its beginning and end in separate lines to indicate when it begins and when it ends. Example for a background sound effect (background music is similar):
18
+ '''
19
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "begin", "vol": -35, "desc": "Airport ambiance, people walking"},
20
+ [foreground audio 1]
21
+ [foreground audio 2]
22
+ ...
23
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "end"},
24
+ '''
25
+ - Each background audio must have a unique id.
26
+ - Do not specify the length of background audio.
27
+ - A background audio must wrap around at least one foreground audio.
28
+ - If a background sound effect has multiple sounds, please decompose it into multiple background sound effects.
29
+ - The speech text can be multilingual; the default is English.
30
+ - The description of sound effects and music must be in English.
31
+ - At any given time, at most one music audio may be playing, whether foreground or background.
32
+ - The volume of background sound effects/music is usually around -35 to -40 dB.
33
+ - The output json must be a list as the root node containing all the audio nodes, and must be wrapped with triple quotes '''.
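A minimal validation sketch for the structural constraints listed above (unique background ids, begin/end pairs wrapping at least one foreground audio, at most one concurrent music track). `validate_script` is a hypothetical helper operating on the parsed node list, not part of this repo.
```
def validate_script(nodes):
    open_bg = {}   # id -> audio_type of each background audio currently playing
    fg_seen = {}   # id -> has any foreground node played since this "begin"?
    for node in nodes:
        if node.get("layout") == "background":
            if node["action"] == "begin":
                assert node["id"] not in open_bg, "background ids must be unique"
                open_bg[node["id"]] = node["audio_type"]
                fg_seen[node["id"]] = False
                assert sum(t == "music" for t in open_bg.values()) <= 1, \
                    "at most one music track may play at a time"
            else:  # action == "end"
                assert fg_seen.pop(node["id"]), \
                    "a background audio must wrap at least one foreground audio"
                del open_bg[node["id"]]
        else:  # foreground node
            if node["audio_type"] == "music":
                assert "music" not in open_bg.values(), \
                    "foreground music may not overlap background music"
            for bg_id in fg_seen:
                fg_seen[bg_id] = True
    assert not open_bg, "every background audio must be explicitly ended"
```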
scripts/EnvsSetup.sh ADDED
@@ -0,0 +1,7 @@
1
+ conda env create -f Envs/WavJourney.yml && \
2
+ conda env update -f Envs/Bark.yml && \
3
+ conda env update -f Envs/AudioCraft.yml && \
4
+ conda run --live-stream -n WavJourney pip install -U git+https://git@github.com/facebookresearch/audiocraft@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft && \
5
+ conda run --live-stream -n WavJourney pip install -U --no-deps voicefixer==0.1.2 && \
6
+ conda run --live-stream -n WavJourney pip install -U --no-deps numpy==1.21 && \
7
+ conda run --live-stream -n WavJourney pip install -U --no-deps librosa==0.8.1
scripts/download_models.py ADDED
@@ -0,0 +1,32 @@
1
+ import yaml
2
+ import os
3
+
4
+ # Read the YAML file
5
+ with open('config.yaml', 'r') as file:
6
+ config = yaml.safe_load(file)
7
+
8
+ # Extract values for each application
9
+ ttm_model_size = config['AudioCraft']['ttm_model_size']
10
+ tta_model_size = config['AudioCraft']['tta_model_size']
11
+
12
+ # Download nltk
13
+ import nltk
14
+ nltk.download('punkt')
15
+
16
+ # Downloading the TTS models
17
+ print('Step 1: Downloading TTS model ...')
18
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from transformers import BarkModel; BarkModel.from_pretrained("suno/bark")\'')
19
+
20
+ print('Step 2: Downloading TTA model ...')
21
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from audiocraft.models import AudioGen; tta_model = AudioGen.get_pretrained("facebook/audiogen-{tta_model_size}")\'')
22
+
23
+ print('Step 3: Downloading TTM model ...')
24
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from audiocraft.models import MusicGen; ttm_model = MusicGen.get_pretrained("facebook/musicgen-{ttm_model_size}")\'')
25
+
26
+ print('Step 4: Downloading SR model ...')
27
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from voicefixer import VoiceFixer; vf = VoiceFixer()\'')
28
+
29
+ print('Step 5: Downloading VP model ...')
30
+ os.system(f'conda run --live-stream -n WavJourney python -c \'from VoiceParser.model import VoiceParser; vp = VoiceParser(device="cpu")\'')
31
+
32
+ print('All models successfully downloaded!')
scripts/kill_services.py ADDED
@@ -0,0 +1,11 @@
1
+ import os
2
+
3
+ # Read the service port from the environment
4
+ service_port = os.environ.get('WAVJOURNEY_SERVICE_PORT')
5
+
6
+ # Execute the commands
7
+ os.system(f'kill $(lsof -t -i :{service_port})')
8
+
9
+
10
+
11
+
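The lsof-based kill above is Unix-only. Below is a hedged cross-platform alternative, assuming the extra psutil dependency (which EnvsSetup.sh does not install).
```
import os
import psutil  # assumed extra dependency, not installed by EnvsSetup.sh

port = int(os.environ['WAVJOURNEY_SERVICE_PORT'])
for conn in psutil.net_connections(kind='tcp'):
    # terminate whatever is listening on the service port,
    # the same effect as kill $(lsof -t -i :port)
    if conn.status == psutil.CONN_LISTEN and conn.laddr.port == port and conn.pid:
        psutil.Process(conn.pid).terminate()
```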
scripts/start_service_and_ui.sh ADDED
@@ -0,0 +1,2 @@
1
+ conda run --live-stream -n WavJourney python -u services.py 2>&1 | tee services_logs/service.out &
2
+ conda run --live-stream -n WavJourney python -u ui_client.py 2>&1 | tee services_logs/wavejourney.out
scripts/start_services.sh ADDED
@@ -0,0 +1 @@
1
+ nohup conda run --live-stream -n WavJourney python services.py > services_logs/service.out 2>&1 &
scripts/start_ui.sh ADDED
@@ -0,0 +1 @@
1
+ conda run --live-stream -n WavJourney python -u ui_client.py 2>&1 | stdbuf -oL tee services_logs/wavejourney.out
services.py ADDED
@@ -0,0 +1,231 @@
1
+ import os
2
+ import yaml
3
+ import logging
4
+ import nltk
5
+ import torch
6
+ import torchaudio
7
+ from torchaudio.transforms import SpeedPerturbation
8
+ from APIs import WRITE_AUDIO, LOUDNESS_NORM
9
+ from utils import fade, get_service_port
10
+ from flask import Flask, request, jsonify
11
+
12
+ with open('config.yaml', 'r') as file:
13
+ config = yaml.safe_load(file)
14
+
15
+ # Configure the logging format and level
16
+ logging.basicConfig(
17
+ level=logging.INFO,
18
+ format='%(asctime)s - %(levelname)s - %(message)s'
19
+ )
20
+
21
+ # Create a FileHandler for the log file
22
+ os.makedirs('services_logs', exist_ok=True)
23
+ log_filename = 'services_logs/Wav-API.log'
24
+ file_handler = logging.FileHandler(log_filename, mode='w')
25
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
26
+
27
+ # Add the FileHandler to the root logger
28
+ logging.getLogger('').addHandler(file_handler)
29
+
30
+
31
+ """
32
+ Initialize the AudioCraft models here
33
+ """
34
+ from audiocraft.models import AudioGen, MusicGen
35
+ tta_model_size = config['AudioCraft']['tta_model_size']
36
+ tta_model = AudioGen.get_pretrained(f'facebook/audiogen-{tta_model_size}')
37
+ logging.info(f'AudioGen ({tta_model_size}) is loaded ...')
38
+
39
+ ttm_model_size = config['AudioCraft']['ttm_model_size']
40
+ ttm_model = MusicGen.get_pretrained(f'facebook/musicgen-{ttm_model_size}')
41
+ logging.info(f'MusicGen ({ttm_model_size}) is loaded ...')
42
+
43
+
44
+ """
45
+ Initialize the BarkModel here
46
+ """
47
+ from transformers import BarkModel, AutoProcessor
48
+ SPEED = float(config['Text-to-Speech']['speed'])
49
+ speed_perturb = SpeedPerturbation(32000, [SPEED])
50
+ tts_model = BarkModel.from_pretrained("suno/bark")
51
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
52
+ tts_model = tts_model.to(device)
53
+ tts_model = tts_model.to_bettertransformer() # Flash attention
54
+ SAMPLE_RATE = tts_model.generation_config.sample_rate
55
+ SEMANTIC_TEMPERATURE = 0.9
56
+ COARSE_TEMPERATURE = 0.5
57
+ FINE_TEMPERATURE = 0.5
58
+ processor = AutoProcessor.from_pretrained("suno/bark")
59
+ logging.info('Bark model is loaded ...')
60
+
61
+
62
+ """
63
+ Initialize the VoiceFixer model here
64
+ """
65
+ from voicefixer import VoiceFixer
66
+ vf = VoiceFixer()
67
+ logging.info('VoiceFixer is loaded ...')
68
+
69
+
70
+ """
71
+ Initialize the VoiceParser model here
72
+ """
73
+ from VoiceParser.model import VoiceParser
74
+ vp_device = config['Voice-Parser']['device']
75
+ vp = VoiceParser(device=vp_device)
76
+ logging.info('VoiceParser is loaded ...')
77
+
78
+
79
+ app = Flask(__name__)
80
+
81
+
82
+ @app.route('/generate_audio', methods=['POST'])
83
+ def generate_audio():
84
+ # Receive the text from the POST request
85
+ data = request.json
86
+ text = data['text']
87
+ length = float(data.get('length', 5.0))
88
+ volume = float(data.get('volume', -35))
89
+ output_wav = data.get('output_wav', 'out.wav')
90
+
91
+ logging.info(f'TTA (AudioGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
92
+
93
+ try:
94
+ tta_model.set_generation_params(duration=length)
95
+ wav = tta_model.generate([text])
96
+ wav = torchaudio.functional.resample(wav, orig_freq=16000, new_freq=32000)
97
+
98
+ wav = wav.squeeze().cpu().detach().numpy()
99
+ wav = fade(LOUDNESS_NORM(wav, volumn=volume))
100
+ WRITE_AUDIO(wav, name=output_wav)
101
+
102
+ # Return success message and the filename of the generated audio
103
+ return jsonify({'message': f'Text-to-Audio generated successfully | {text}', 'file': output_wav})
104
+
105
+ except Exception as e:
106
+ return jsonify({'API error': str(e)}), 500
107
+
108
+
109
+ @app.route('/generate_music', methods=['POST'])
110
+ def generate_music():
111
+ # Receive the text from the POST request
112
+ data = request.json
113
+ text = data['text']
114
+ length = float(data.get('length', 5.0))
115
+ volume = float(data.get('volume', -35))
116
+ output_wav = data.get('output_wav', 'out.wav')
117
+
118
+ logging.info(f'TTM (MusicGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
119
+
120
+
121
+ try:
122
+ ttm_model.set_generation_params(duration=length)
123
+ wav = ttm_model.generate([text])
124
+ wav = wav[0][0].cpu().detach().numpy()
125
+ wav = fade(LOUDNESS_NORM(wav, volumn=volume))
126
+ WRITE_AUDIO(wav, name=output_wav)
127
+
128
+ # Return success message and the filename of the generated audio
129
+ return jsonify({'message': f'Text-to-Music generated successfully | {text}', 'file': output_wav})
130
+
131
+ except Exception as e:
132
+ # Return error message if something goes wrong
133
+ return jsonify({'API error': str(e)}), 500
134
+
135
+
136
+ @app.route('/generate_speech', methods=['POST'])
137
+ def generate_speech():
138
+ # Receive the text from the POST request
139
+ data = request.json
140
+ text = data['text']
141
+ speaker_id = data['speaker_id']
142
+ speaker_npz = data['speaker_npz']
143
+ volume = float(data.get('volume', -35))
144
+ output_wav = data.get('output_wav', 'out.wav')
145
+
146
+ logging.info(f'TTS (Bark): Speaker: {speaker_id}, Volume: {volume} dB, Prompt: {text}')
147
+
148
+ try:
149
+ # Generate audio using the global pipe object
150
+ text = text.replace('\n', ' ').strip()
151
+ sentences = nltk.sent_tokenize(text)
152
+ silence = torch.zeros(int(0.1 * SAMPLE_RATE), device=device).unsqueeze(0) # 0.1 second of silence
153
+
154
+ pieces = []
155
+ for sentence in sentences:
156
+ inputs = processor(sentence, voice_preset=speaker_npz).to(device)
157
+ # NOTE: you must run the line below, otherwise you will see the runtime error
158
+ # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
159
+ inputs['history_prompt']['coarse_prompt'] = inputs['history_prompt']['coarse_prompt'].transpose(0, 1).contiguous().transpose(0, 1)
160
+
161
+ with torch.inference_mode():
162
+ # TODO: min_eos_p?
163
+ output = tts_model.generate(
164
+ **inputs,
165
+ do_sample = True,
166
+ semantic_temperature = SEMANTIC_TEMPERATURE,
167
+ coarse_temperature = COARSE_TEMPERATURE,
168
+ fine_temperature = FINE_TEMPERATURE
169
+ )
170
+
171
+ pieces += [output, silence]
172
+
173
+ result_audio = torch.cat(pieces, dim=1)
174
+ wav_tensor = result_audio.to(dtype=torch.float32).cpu()
175
+ wav = torchaudio.functional.resample(wav_tensor, orig_freq=SAMPLE_RATE, new_freq=32000)
176
+ wav = speed_perturb(wav.float())[0].squeeze(0)
177
+ wav = wav.numpy()
178
+ wav = LOUDNESS_NORM(wav, volumn=volume)
179
+ WRITE_AUDIO(wav, name=output_wav)
180
+
181
+ # Return success message and the filename of the generated audio
182
+ return jsonify({'message': f'Text-to-Speech generated successfully | {speaker_id}: {text}', 'file': output_wav})
183
+
184
+ except Exception as e:
185
+ # Return error message if something goes wrong
186
+ return jsonify({'API error': str(e)}), 500
187
+
188
+
189
+ @app.route('/fix_audio', methods=['POST'])
190
+ def fix_audio():
191
+ # Receive the text from the POST request
192
+ data = request.json
193
+ processfile = data['processfile']
194
+
195
+ logging.info(f'Fixing {processfile} ...')
196
+
197
+ try:
198
+ vf.restore(input=processfile, output=processfile, cuda=True, mode=0)
199
+
200
+ # Return success message and the filename of the generated audio
201
+ return jsonify({'message': 'Speech restored successfully', 'file': processfile})
202
+
203
+ except Exception as e:
204
+ # Return error message if something goes wrong
205
+ return jsonify({'API error': str(e)}), 500
206
+
207
+
208
+ @app.route('/parse_voice', methods=['POST'])
209
+ def parse_voice():
210
+ # Receive the text from the POST request
211
+ data = request.json
212
+ wav_path = data['wav_path']
213
+ out_dir = data['out_dir']
214
+
215
+ logging.info(f'Parsing {wav_path} ...')
216
+
217
+ try:
218
+ vp.extract_acoustic_embed(wav_path, out_dir)
219
+
220
+ # Return success message and the filename of the generated audio
221
+ return jsonify({'message': f'Successfully parsed {wav_path}'})
222
+
223
+ except Exception as e:
224
+ # Return error message if something goes wrong
225
+ return jsonify({'API error': str(e)}), 500
226
+
227
+
228
+ if __name__ == '__main__':
229
+ service_port = get_service_port()
230
+ # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
231
+ app.run(debug=False, threaded=False, port=service_port)
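Every endpoint above accepts a JSON body over POST and replies with a JSON status message. A minimal client sketch, assuming the service is running locally on WAVJOURNEY_SERVICE_PORT:
```
import os
import requests

port = os.environ['WAVJOURNEY_SERVICE_PORT']
url = f'http://127.0.0.1:{port}/generate_audio'

# fields mirror what generate_audio() reads from request.json
payload = {
    'text': 'Airport beeping sound',
    'length': 2.0,       # seconds
    'volume': -35,       # dB
    'output_wav': 'beep.wav',
}
resp = requests.post(url, json=payload)
print(resp.json())  # {'message': ..., 'file': 'beep.wav'} on success
```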
share_btn.py ADDED
@@ -0,0 +1,74 @@
1
+ community_icon_html = """<svg id="share-btn-share-icon" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32">
2
+ <path d="M20.6081 3C21.7684 3 22.8053 3.49196 23.5284 4.38415C23.9756 4.93678 24.4428 5.82749 24.4808 7.16133C24.9674 7.01707 25.4353 6.93643 25.8725 6.93643C26.9833 6.93643 27.9865 7.37587 28.696 8.17411C29.6075 9.19872 30.0124 10.4579 29.8361 11.7177C29.7523 12.3177 29.5581 12.8555 29.2678 13.3534C29.8798 13.8646 30.3306 14.5763 30.5485 15.4322C30.719 16.1032 30.8939 17.5006 29.9808 18.9403C30.0389 19.0342 30.0934 19.1319 30.1442 19.2318C30.6932 20.3074 30.7283 21.5229 30.2439 22.6548C29.5093 24.3704 27.6841 25.7219 24.1397 27.1727C21.9347 28.0753 19.9174 28.6523 19.8994 28.6575C16.9842 29.4379 14.3477 29.8345 12.0653 29.8345C7.87017 29.8345 4.8668 28.508 3.13831 25.8921C0.356375 21.6797 0.754104 17.8269 4.35369 14.1131C6.34591 12.058 7.67023 9.02782 7.94613 8.36275C8.50224 6.39343 9.97271 4.20438 12.4172 4.20438H12.4179C12.6236 4.20438 12.8314 4.2214 13.0364 4.25468C14.107 4.42854 15.0428 5.06476 15.7115 6.02205C16.4331 5.09583 17.134 4.359 17.7682 3.94323C18.7242 3.31737 19.6794 3 20.6081 3ZM20.6081 5.95917C20.2427 5.95917 19.7963 6.1197 19.3039 6.44225C17.7754 7.44319 14.8258 12.6772 13.7458 14.7131C13.3839 15.3952 12.7655 15.6837 12.2086 15.6837C11.1036 15.6837 10.2408 14.5497 12.1076 13.1085C14.9146 10.9402 13.9299 7.39584 12.5898 7.1776C12.5311 7.16799 12.4731 7.16355 12.4172 7.16355C11.1989 7.16355 10.6615 9.33114 10.6615 9.33114C10.6615 9.33114 9.0863 13.4148 6.38031 16.206C3.67434 18.998 3.5346 21.2388 5.50675 24.2246C6.85185 26.2606 9.42666 26.8753 12.0653 26.8753C14.8021 26.8753 17.6077 26.2139 19.1799 25.793C19.2574 25.7723 28.8193 22.984 27.6081 20.6107C27.4046 20.212 27.0693 20.0522 26.6471 20.0522C24.9416 20.0522 21.8393 22.6726 20.5057 22.6726C20.2076 22.6726 19.9976 22.5416 19.9116 22.222C19.3433 20.1173 28.552 19.2325 27.7758 16.1839C27.639 15.6445 27.2677 15.4256 26.746 15.4263C24.4923 15.4263 19.4358 19.5181 18.3759 19.5181C18.2949 19.5181 18.2368 19.4937 18.2053 19.4419C17.6743 18.557 17.9653 17.9394 21.7082 15.6009C25.4511 13.2617 28.0783 11.8545 26.5841 10.1752C26.4121 9.98141 26.1684 9.8956 25.8725 9.8956C23.6001 9.89634 18.2311 14.9403 18.2311 14.9403C18.2311 14.9403 16.7821 16.496 15.9057 16.496C15.7043 16.496 15.533 16.4139 15.4169 16.2112C14.7956 15.1296 21.1879 10.1286 21.5484 8.06535C21.7928 6.66715 21.3771 5.95917 20.6081 5.95917Z" fill="#FF9D00"></path>
3
+ <path d="M5.50686 24.2246C3.53472 21.2387 3.67446 18.9979 6.38043 16.206C9.08641 13.4147 10.6615 9.33111 10.6615 9.33111C10.6615 9.33111 11.2499 6.95933 12.59 7.17757C13.93 7.39581 14.9139 10.9401 12.1069 13.1084C9.29997 15.276 12.6659 16.7489 13.7459 14.713C14.8258 12.6772 17.7747 7.44316 19.304 6.44221C20.8326 5.44128 21.9089 6.00204 21.5484 8.06532C21.188 10.1286 14.795 15.1295 15.4171 16.2118C16.0391 17.2934 18.2312 14.9402 18.2312 14.9402C18.2312 14.9402 25.0907 8.49588 26.5842 10.1752C28.0776 11.8545 25.4512 13.2616 21.7082 15.6008C17.9646 17.9393 17.6744 18.557 18.2054 19.4418C18.7372 20.3266 26.9998 13.1351 27.7759 16.1838C28.5513 19.2324 19.3434 20.1173 19.9117 22.2219C20.48 24.3274 26.3979 18.2382 27.6082 20.6107C28.8193 22.9839 19.2574 25.7722 19.18 25.7929C16.0914 26.62 8.24723 28.3726 5.50686 24.2246Z" fill="#FFD21E"></path>
4
+ </svg>"""
5
+
6
+ loading_icon_html = """<svg id="share-btn-loading-icon" style="display:none;" class="animate-spin"
7
+ style="color: #ffffff;
8
+ "
9
+ xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" fill="none" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><circle style="opacity: 0.25;" cx="12" cy="12" r="10" stroke="white" stroke-width="4"></circle><path style="opacity: 0.75;" fill="white" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path></svg>"""
10
+
11
+ share_js = """async () => {
12
+ async function uploadFile(file){
13
+ const UPLOAD_URL = 'https://huggingface.co/uploads';
14
+ const response = await fetch(UPLOAD_URL, {
15
+ method: 'POST',
16
+ headers: {
17
+ 'Content-Type': file.type,
18
+ 'X-Requested-With': 'XMLHttpRequest',
19
+ },
20
+ body: file, /// <- File inherits from Blob
21
+ });
22
+ const url = await response.text();
23
+ return url;
24
+ }
25
+ async function getInputVideoFile(videoEl){
26
+ const res = await fetch(videoEl.src);
27
+ const blob = await res.blob();
28
+ const videoId = Date.now() % 200;
29
+ const fileName = `sd-perception-${videoId}.mp4`;
30
+ return new File([blob], fileName, { type: 'video/mp4' });
31
+ }
32
+
33
+ async function audioToBase64(audioFile) {
34
+ return new Promise((resolve, reject) => {
35
+ let reader = new FileReader();
36
+ reader.readAsDataURL(audioFile);
37
+ reader.onload = () => resolve(reader.result);
38
+ reader.onerror = error => reject(error);
39
+
40
+ });
41
+ }
42
+ const gradioEl = document.querySelector("gradio-app").shadowRoot || document.querySelector('body > gradio-app');
43
+ const inputPromptEl = gradioEl.querySelector('#prompt-in textarea').value;
44
+ const outputVideoEl = gradioEl.querySelector('#output-video video');
45
+
46
+ let titleTxt = `WavJourney: ${inputPromptEl}`;
47
+
48
+ const shareBtnEl = gradioEl.querySelector('#share-btn');
49
+ const shareIconEl = gradioEl.querySelector('#share-btn-share-icon');
50
+ const loadingIconEl = gradioEl.querySelector('#share-btn-loading-icon');
51
+ if(!outputVideoEl){
52
+ return;
53
+ };
54
+ shareBtnEl.style.pointerEvents = 'none';
55
+ shareIconEl.style.display = 'none';
56
+ loadingIconEl.style.removeProperty('display');
57
+ const outputVideo = await getInputVideoFile(outputVideoEl);
58
+ const urlOutputVideo = await uploadFile(outputVideo);
59
+
60
+ const descriptionMd = `
61
+ ##### ${inputPromptEl}
62
+
63
+ ${urlOutputVideo}
64
+ `;
65
+ const params = new URLSearchParams({
66
+ title: titleTxt,
67
+ description: descriptionMd,
68
+ });
69
+ const paramsStr = params.toString();
70
+ window.open(`https://huggingface.co/spaces/Audio-AGI/WavJourney/discussions/new?${paramsStr}`, '_blank');
71
+ shareBtnEl.style.removeProperty('pointer-events');
72
+ shareIconEl.style.removeProperty('display');
73
+ loadingIconEl.style.display = 'none';
74
+ }"""
ui_client.py ADDED
@@ -0,0 +1,632 @@
1
+ import shutil
2
+ import json5
3
+ import traceback
4
+
5
+ import gradio as gr
6
+ from tabulate import tabulate
7
+
8
+ import utils
9
+ import pipeline
10
+ from pipeline import generate_json_file, generate_audio
11
+ from voice_presets import load_voice_presets_metadata, add_session_voice_preset, \
12
+ remove_session_voice_preset
13
+ from share_btn import community_icon_html, loading_icon_html, share_js
14
+
15
+
16
+
17
+ VOICE_PRESETS_HEADERS = ['ID', 'Description']
18
+ DELETE_FILE_WHEN_DO_CLEAR = False
19
+ DEBUG = False
20
+
21
+
22
+ def convert_json_to_md(audio_script_response):
23
+ audio_json_data = json5.loads(audio_script_response)
24
+ table = [[node.get(field, 'N/A') for field in ["audio_type", "layout", "id", "character", "action", 'vol']] +
25
+ [node.get("desc", "N/A") if node.get("audio_type") != "speech" else node.get("text", "N/A")] +
26
+ [node.get("len", "Auto") if "len" in node else "Auto"]
27
+ for i, node in enumerate(audio_json_data)]
28
+
29
+ headers = ["Audio Type", "Layout", "ID", "Character", "Action", 'Volume', "Description", "Length" ]
30
+
31
+ # Tabulate
32
+ table_txt = tabulate(table, headers, tablefmt="github")
33
+ return table_txt
34
+
35
+
36
+ def convert_char_voice_map_to_md(char_voice_map):
37
+ table =[[character, char_voice_map[character]["id"]] for character in char_voice_map]
38
+ headers = ["Character", "Voice"]
39
+ # Tabulate
40
+ table_txt = tabulate(table, headers, tablefmt="github")
41
+ return table_txt
42
+
43
+
44
+ def get_or_create_session_from_state(ui_state):
45
+ if 'session_id' not in ui_state:
46
+ ui_state['session_id'] = pipeline.init_session()
47
+ return ui_state['session_id']
48
+
49
+
50
+ def generate_script_fn(instruction, _state: gr.State):
51
+ try:
52
+ session_id = get_or_create_session_from_state(_state)
53
+ api_key = utils.get_api_key()
54
+ json_script = generate_json_file(session_id, instruction, api_key)
55
+ table_text = convert_json_to_md(json_script)
56
+ except Exception as e:
57
+ gr.Warning(str(e))
58
+ print(f"Generating script error: {str(e)}")
59
+ traceback.print_exc()
60
+ return [
61
+ None,
62
+ _state,
63
+ gr.Button.update(interactive=False),
64
+ gr.Button.update(interactive=True),
65
+ gr.Button.update(interactive=True),
66
+ gr.Button.update(interactive=True),
67
+ ]
68
+
69
+ _state = {
70
+ **_state,
71
+ 'session_id': session_id,
72
+ 'json_script': json_script
73
+ }
74
+ return [
75
+ table_text,
76
+ _state,
77
+ gr.Button.update(interactive=True),
78
+ gr.Button.update(interactive=True),
79
+ gr.Button.update(interactive=True),
80
+ gr.Button.update(interactive=True),
81
+ ]
82
+
83
+
84
+ def generate_audio_fn(state):
85
+ btn_state = gr.Button.update(interactive=True)
86
+ try:
87
+ api_key = utils.get_api_key()
88
+ audio_path, char_voice_map = generate_audio(**state, api_key=api_key)
89
+ table_text = convert_char_voice_map_to_md(char_voice_map)
90
+ # TODO: output char_voice_map to a table
91
+ return [
92
+ table_text,
93
+ gr.make_waveform(str(audio_path)),
94
+ btn_state,
95
+ btn_state,
96
+ btn_state,
97
+ btn_state,
98
+ ]
99
+ except Exception as e:
100
+ print(f"Generation audio error: {str(e)}")
101
+ traceback.print_exc()
102
+ gr.Warning(str(e))
103
+
104
+ return [
105
+ None,
106
+ None,
107
+ btn_state,
108
+ btn_state,
109
+ btn_state,
110
+ btn_state,
111
+ ]
112
+
113
+
114
+ def clear_fn(state):
115
+ if DELETE_FILE_WHEN_DO_CLEAR:
116
+ shutil.rmtree('output', ignore_errors=True)
117
+ state = {'session_id': pipeline.init_session()}
118
+ return [gr.Markdown.update(value=''),
119
+ gr.Textbox.update(value=''),
120
+ gr.Video.update(value=None),
121
+ gr.Markdown.update(value=''),
122
+ gr.Button.update(interactive=False),
123
+ gr.Button.update(interactive=False),
124
+ state, gr.Dataframe.update(visible=False),
125
+ gr.Button.update(visible=False),
126
+ gr.Textbox.update(value=''),
127
+ gr.Textbox.update(value=''),
128
+ gr.File.update(value=None)]
129
+
130
+
131
+ def textbox_listener(textbox_input):
132
+ if len(textbox_input) > 0:
133
+ return gr.Button.update(interactive=True)
134
+ else:
135
+ return gr.Button.update(interactive=False)
136
+
137
+
138
+ def get_voice_preset_to_list(state: gr.State):
139
+ if state.__class__ == gr.State:
140
+ state = state.value
141
+ if 'session_id' in state:
142
+ path = utils.get_session_voice_preset_path(state['session_id'])
143
+ else:
144
+ path = ''
145
+ voice_presets = load_voice_presets_metadata(
146
+ path,
147
+ safe_if_metadata_not_exist=True
148
+ )
149
+ dataframe = []
150
+ for key in voice_presets.keys():
151
+ row = [key, voice_presets[key]['desc']]
152
+ dataframe.append(row)
153
+ return dataframe
154
+
155
+
156
+ def df_on_select(evt: gr.SelectData):
157
+ print(f"You selected {evt.value} at {evt.index} from {evt.target}")
158
+ return {'selected_voice_preset': evt.index}
159
+
160
+
161
+ def del_voice_preset(selected_voice_presets, ui_state, dataframe):
162
+ gr_visible = gr.Dataframe.update(visible=True)
163
+ btn_visible = gr.Button.update(visible=True)
164
+ current_presets = get_voice_preset_to_list(ui_state)
165
+ if selected_voice_presets['selected_voice_preset'] is None or \
166
+ selected_voice_presets['selected_voice_preset'][0] > len(current_presets) - 1:
167
+ gr.Warning('No row is selected')
168
+ return [current_presets, gr_visible, btn_visible, selected_voice_presets]
169
+ # Do the real file deletion
170
+ index = selected_voice_presets['selected_voice_preset'][0]
171
+ vp_id = dataframe['ID'][index]
172
+ remove_session_voice_preset(vp_id, ui_state['session_id'])
173
+ current_presets = get_voice_preset_to_list(ui_state)
174
+ gr.Dataframe.update(value=current_presets)
175
+ if len(current_presets) == 0:
176
+ gr_visible = gr.Dataframe.update(visible=False)
177
+ btn_visible = gr.Button.update(visible=False)
178
+ selected_voice_presets['selected_voice_preset'] = None
179
+ return [current_presets, gr_visible, btn_visible, selected_voice_presets]
180
+
181
+
182
+ def get_system_voice_presets():
183
+ system_presets = load_voice_presets_metadata(utils.get_system_voice_preset_path())
184
+ data = []
185
+ for k, v in system_presets.items():
186
+ data.append([k, v['desc']])
187
+ # headers = ['id', 'description']
188
+ # table_txt = tabulate(data, headers, tablefmt="github")
189
+ return data
190
+
191
+
192
+ def set_openai_key(key, _state):
193
+ _state['api_key'] = key
194
+ return key
195
+
196
+
197
+ def add_voice_preset(vp_id, vp_desc, file, ui_state, added_voice_preset):
198
+ if vp_id is None or vp_desc is None or file is None or vp_id.strip() == '' or vp_desc.strip() == '':
199
+ gr.Warning('Please complete all three fields')
200
+ else:
201
+ count: int = added_voice_preset['count']
202
+ # check if greater than 3
203
+ session_id = get_or_create_session_from_state(ui_state)
204
+ file_path = file.name
205
+ print(f'session {session_id}, id {vp_id}, desc {vp_desc}, file {file_path}')
206
+ # Do adding ...
207
+ try:
208
+ add_session_voice_preset(vp_id, vp_desc, file_path, session_id)
209
+ added_voice_preset['count'] = count + 1
210
+ except Exception as exception:
211
+ print(exception)
212
+ traceback.print_exc()
213
+ gr.Warning(str(exception))
214
+
215
+ # After added
216
+ dataframe = get_voice_preset_to_list(ui_state)
217
+ df_visible = gr.Dataframe.update(visible=True)
218
+ del_visible = gr.Button.update(visible=True)
219
+ if len(dataframe) == 0:
220
+ df_visible = gr.Dataframe.update(visible=False)
221
+ del_visible = gr.Button.update(visible=False)
222
+ return [gr.Textbox.update(value=''), gr.Textbox.update(value=''), gr.File.update(value=None),
223
+ ui_state, added_voice_preset, dataframe, gr.Button.update(interactive=True),
224
+ df_visible, del_visible]
225
+
226
+
227
+ css = """
228
+ a {
229
+ color: inherit;
230
+ text-decoration: underline;
231
+ }
232
+ .gradio-container {
233
+ font-family: 'IBM Plex Sans', sans-serif;
234
+ }
235
+ .gr-button {
236
+ color: white;
237
+ border-color: #000000;
238
+ background: #000000;
239
+ }
240
+ input[type='range'] {
241
+ accent-color: #000000;
242
+ }
243
+ .dark input[type='range'] {
244
+ accent-color: #dfdfdf;
245
+ }
246
+ .container {
247
+ max-width: 730px;
248
+ margin: auto;
249
+ padding-top: 1.5rem;
250
+ }
251
+ #gallery {
252
+ min-height: 22rem;
253
+ margin-bottom: 15px;
254
+ margin-left: auto;
255
+ margin-right: auto;
256
+ border-bottom-right-radius: .5rem !important;
257
+ border-bottom-left-radius: .5rem !important;
258
+ }
259
+ #gallery>div>.h-full {
260
+ min-height: 20rem;
261
+ }
262
+ .details:hover {
263
+ text-decoration: underline;
264
+ }
265
+ .gr-button {
266
+ white-space: nowrap;
267
+ }
268
+ .gr-button:focus {
269
+ border-color: rgb(147 197 253 / var(--tw-border-opacity));
270
+ outline: none;
271
+ box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
272
+ --tw-border-opacity: 1;
273
+ --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
274
+ --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
275
+ --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
276
+ --tw-ring-opacity: .5;
277
+ }
278
+ #advanced-btn {
279
+ font-size: .7rem !important;
280
+ line-height: 19px;
281
+ margin-top: 12px;
282
+ margin-bottom: 12px;
283
+ padding: 2px 8px;
284
+ border-radius: 14px !important;
285
+ }
286
+ #advanced-options {
287
+ margin-bottom: 20px;
288
+ }
289
+ .footer {
290
+ margin-bottom: 45px;
291
+ margin-top: 35px;
292
+ text-align: center;
293
+ border-bottom: 1px solid #e5e5e5;
294
+ }
295
+ .footer>p {
296
+ font-size: .8rem;
297
+ display: inline-block;
298
+ padding: 0 10px;
299
+ transform: translateY(10px);
300
+ background: white;
301
+ }
302
+ .dark .footer {
303
+ border-color: #303030;
304
+ }
305
+ .dark .footer>p {
306
+ background: #0b0f19;
307
+ }
308
+ .acknowledgments h4{
309
+ margin: 1.25em 0 .25em 0;
310
+ font-weight: bold;
311
+ font-size: 115%;
312
+ }
313
+ #container-advanced-btns{
314
+ display: flex;
315
+ flex-wrap: wrap;
316
+ justify-content: space-between;
317
+ align-items: center;
318
+ }
319
+ .animate-spin {
320
+ animation: spin 1s linear infinite;
321
+ }
322
+ @keyframes spin {
323
+ from {
324
+ transform: rotate(0deg);
325
+ }
326
+ to {
327
+ transform: rotate(360deg);
328
+ }
329
+ }
330
+ #share-btn-container {
331
+ display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
332
+ margin-top: 10px;
333
+ margin-left: auto;
334
+ }
335
+ #share-btn {
336
+ all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;right:0;
337
+ }
338
+ #share-btn * {
339
+ all: unset;
340
+ }
341
+ #share-btn-container div:nth-child(-n+2){
342
+ width: auto !important;
343
+ min-height: 0px !important;
344
+ }
345
+ #share-btn-container .wrap {
346
+ display: none !important;
347
+ }
348
+ .gr-form{
349
+ flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
350
+ }
351
+ #prompt-container{
352
+ gap: 0;
353
+ }
354
+ #generated_id{
355
+ min-height: 700px
356
+ }
357
+ #setting_id{
358
+ margin-bottom: 12px;
359
+ text-align: center;
360
+ font-weight: 900;
361
+ }
362
+ """
363
+
364
+ with gr.Blocks(css=css) as interface:
365
+
366
+ gr.HTML(
367
+ """
368
+ <div style="text-align: center; max-width: 700px; margin: 0 auto;">
369
+ <div
370
+ style="
371
+ display: inline-flex;
372
+ align-items: center;
373
+ gap: 0.8rem;
374
+ font-size: 1.75rem;
375
+ "
376
+ >
377
+ <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
378
+ WavJourney: Compositional Audio Creation with LLMs
379
+ </h1>
380
+ </div>
381
+ <p style="margin-bottom: 10px; margin-top: 10px; font-size: 94%">
382
+ <a href="https://arxiv.org/abs/2307.14335">[Paper]</a> <a href="https://audio-agi.github.io/WavJourney_demopage/">[Demo Page]</a> <a href="https://github.com/Audio-AGI/WavJourney">[GitHub]</a> <a href="https://discord.com/invite/5Hqu9NmA8V">[Join Discord]</a>
383
+ </p>
384
+ </div>
385
+ """
386
+ )
387
+ gr.HTML(
388
+ """
389
+ <p>Due to the high user demand we are facing from our community, we will be offering free access to WavJourney for a few more days. You can also access WavJourney in this space later by providing your OPENAI_KEY.<p/>
390
+ <p>For faster inference without waiting in the queue, you can duplicate the space and upgrade to GPU (VRAM>16G) and provide OPENAI_KEY to access GPT-4 in settings.
391
+ <br>
392
+ <a href="https://huggingface.co/spaces/Audio-AGI/WavJourney?duplicate=true">
393
+ <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
394
+ <p/>
395
+ """
396
+ )
397
+
398
+ # gr.HTML(
399
+ # """
400
+ # <p>Begin with a text prompt, and let WavJourney transform it into captivating audio content. Experience engaging audio storylines, personalized voices, lifelike speech, emotionally resonant musical compositions, and immersive sound effects!
401
+ # <p/>
402
+ # """
403
+ # )
404
+
405
+ gr.Markdown(value='## WavJourney Pipeline:')
406
+
407
+ gr.Markdown(value='Begin with a text prompt, and let WavJourney transform it into captivating audio content. Experience engaging audio storylines, personalized voices, lifelike speech, emotionally resonant musical compositions, and immersive sound effects!')
408
+
409
+ gr.HTML(
410
+ """
411
+ <ul>
412
+ <li>Stage 0 (optional): add your customized voice presets for a more personalized audio creation experience. Users also often share presets in <a href="https://discord.com/invite/5Hqu9NmA8V">Discord</a>.</li>
413
+ <li>Stage 1: generate the audio script based on the input text instruction (the default language is English, but you can also type in your own language).</li>
414
+ <li>Stage 2: select a suitable voice from the multilingual voice presets for each character in the audio script & generate the audio.</li>
415
+ </ul>
416
+
417
+
418
+ """
419
+ )
420
+
421
+
422
+
423
+ system_voice_presets = get_system_voice_presets()
424
+ # State
425
+ ui_state = gr.State({})
426
+ selected_voice_presets = gr.State(value={'selected_voice_preset': None})
427
+ added_voice_preset_state = gr.State(value={'added_file': None, 'count': 0})
428
+ # UI Component
429
+ # gr.Markdown(
430
+ # """
431
+ # How can I access GPT-4? <a href="https://platform.openai.com/account/api-keys">[Ref1]</a><a href="https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4">[Ref2]</a>
432
+ # """
433
+ # )
434
+ # key_text_input = gr.Textbox(label='Please Enter OPENAI Key for accessing GPT-4 API', lines=1, placeholder="OPENAI Key here.",
435
+ # value=utils.get_key())
436
+ text_input_value = '' if DEBUG is False else "an audio introduction to quantum mechanics"
437
+
438
+ text_input = gr.Textbox(
439
+ label='Input Text Instruction',
440
+ lines=2,
441
+ placeholder="Input instruction here (e.g., An introduction to AI-assisted audio content creation).",
442
+ value=text_input_value,
443
+ elem_id="prompt-in",)
444
+
445
+ gr.Markdown(
446
+ """
447
+ After clicking the 'Generate Script' button, the generated audio script will be displayed below.
448
+ """
449
+ )
450
+ audio_script_markdown = gr.Markdown(label='Audio Script')
451
+ generate_script_btn = gr.Button(value='Generate Script', interactive=False)
452
+
453
+ gr.Markdown(
454
+ """
455
+ After clicking the 'Generate Audio' button, the voice mapping results & generated audio will be displayed below (this might take some time).
456
+ """
457
+ )
458
+ char_voice_map_markdown = gr.Markdown(label='Character-to-voice Map')
459
+
460
+ audio_output = gr.Video(elem_id="output-video")
461
+
462
+ generate_audio_btn = gr.Button(value='Generate Audio', interactive=False)
463
+
464
+ # share to community
465
+ with gr.Group(elem_id="share-btn-container", visible=False):
466
+ community_icon = gr.HTML(community_icon_html)
467
+ loading_icon = gr.HTML(loading_icon_html)
468
+ share_button = gr.Button(value="Share to community", elem_id="share-btn")
469
+
470
+ gr.Markdown(value='### Share your creation with the community!')
471
+ gr.HTML(
472
+ """
473
+ <ul>
474
+ <li> You can share with the HuggingFace community by clicking the "Share to community" button.</li>
475
+ <li> You can share your generations to our <a href="https://discord.com/invite/5Hqu9NmA8V">Discord</a> channel!</li>
476
+ <li> You can also share the voice presets (along with descriptions) you found in <a href="https://discord.com/invite/5Hqu9NmA8V">Discord</a>.</li>
477
+ </ul>
478
+ """
479
+ )
480
+
481
+
482
+
483
+
484
+ gr.Markdown(value='### Useful tips for prompting WavJourney:')
485
+
486
+ gr.HTML(
487
+ """
488
+ <ul>
489
+ <li>You can use vague or specific descriptions or a combination of them. For example: "male speech about pizza" or "a man is saying: I love pizza!"</li>
490
+ <li> You can control the length of the audio script by simply adding a restriction. For example: "generate an audio script around 10-15 lines (max length has been set to 30)"</li>
491
+ <li> You can specify the language of the speaker. For example: "a boy is playing with a girl, boy's speech is in Chinese while girl's speech in Japanese"</li>
492
+ <li> Explore more prompting techniques by yourself! 🤗</li>
493
+ </ul>
494
+
495
+ """
496
+ )
497
+
498
+ # add examples
499
+ from examples.examples import examples as WJExamples
500
+ def example_fn(idx, _text_input):
501
+ print('from example', idx, _text_input)
502
+ example = WJExamples[int(idx)-1]
503
+ print(example['table_script'], example['table_voice'], gr.make_waveform(example['wav_file']))
504
+ return example['table_script'], example['table_voice'], gr.make_waveform(example['wav_file'])
505
+
506
+ _idx_input = gr.Textbox(label='Example No.')
507
+ _idx_input.visible=False
508
+ gr.Examples(
509
+ [[idx+1, x['text']] for idx, x in enumerate(WJExamples)],
510
+ fn=example_fn,
511
+ inputs=[_idx_input, text_input],
512
+ outputs=[audio_script_markdown, char_voice_map_markdown, audio_output],
513
+ cache_examples=True,
514
+ )
515
+
516
+ # System Voice Presets
517
+ gr.Markdown(label='System Voice Presets', value='### System Voice Presets')
518
+ with gr.Accordion("Click to display system speakers", open=False):
519
+ gr.Markdown('Supported Language: English, Chinese, French, German, Hindi, Italian, Japanese, Korean, Russian, Spanish, Turkish, Polish, Portuguese')
520
+
521
+ system_markdown_voice_presets = gr.Dataframe(label='System Voice Presets', headers=VOICE_PRESETS_HEADERS,
522
+ value=system_voice_presets)
523
+ # User Voice Preset Related
524
+ gr.Markdown('## (Optional) Speaker Customization ')
525
+ with gr.Accordion("Click to add speakers", open=False):
526
+ gr.Markdown(label='User Voice Presets', value='### User Voice Presets')
527
+ get_voice_preset_to_list(ui_state)
528
+ voice_presets_df = gr.Dataframe(headers=VOICE_PRESETS_HEADERS, col_count=len(VOICE_PRESETS_HEADERS),
529
+ value=get_voice_preset_to_list(ui_state), interactive=False, visible=False)
530
+ # voice_presets_ds = gr.Dataset(components=[gr.Dataframe(visible=True)], samples=get_voice_preset_to_list(ui_state))
531
+ del_voice_btn = gr.Button(value='Delete Selected Voice Preset', visible=False)
532
+ gr.Markdown(label='Add Voice Preset', value='### Add Voice Preset')
533
+ gr.Markdown(
534
+ """
535
+ What makes for good voice prompt? See detailed instructions <a href="https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer">here</a>.
536
+ """
537
+ )
538
+ vp_text_id = gr.Textbox(label='Id', lines=1, placeholder="Input voice preset id here.")
539
+ vp_text_desc = gr.Textbox(label='Desc', lines=1, placeholder="Input description here.")
540
+ vp_file = gr.File(label='Wav File', type='file', file_types=['.wav'],
541
+ interactive=True)
542
+ vp_submit = gr.Button(label='Upload Voice Preset', value="Upload Voice Preset")
543
+
544
+ # clear btn, will re-new a session
545
+ clear_btn = gr.ClearButton(value='Clear All')
546
+
547
+ # disclaimer
548
+ gr.Markdown(
549
+ """
550
+ ## Disclaimer
551
+ We are not responsible for audio generated using semantics created by WavJourney. Just don't use it for illegal purposes.
552
+ """
553
+ )
554
+
555
+ # events
556
+ # key_text_input.change(fn=set_openai_key, inputs=[key_text_input, ui_state], outputs=[key_text_input])
557
+ text_input.change(fn=textbox_listener, inputs=[text_input], outputs=[generate_script_btn])
558
+ generate_audio_btn.click(
559
+ fn=generate_audio_fn,
560
+ inputs=[ui_state],
561
+ outputs=[
562
+ char_voice_map_markdown,
563
+ audio_output,
564
+ generate_audio_btn,
565
+ generate_script_btn,
566
+ clear_btn,
567
+ vp_submit,
568
+ ],
569
+ api_name='audio_journey',
570
+ )
571
+ generate_audio_btn.click(
572
+ fn=lambda: [
573
+ gr.Button.update(interactive=False),
574
+ gr.Button.update(interactive=False),
575
+ gr.Button.update(interactive=False),
576
+ gr.Button.update(interactive=False),
577
+ ],
578
+ outputs=[
579
+ generate_audio_btn,
580
+ generate_script_btn,
581
+ clear_btn,
582
+ vp_submit,
583
+ ]
584
+ )
585
+ clear_btn.click(fn=clear_fn, inputs=ui_state,
586
+ outputs=[char_voice_map_markdown, text_input, audio_output, audio_script_markdown, generate_audio_btn, generate_script_btn,
587
+ ui_state, voice_presets_df, del_voice_btn,
588
+ vp_text_id, vp_text_desc, vp_file])
589
+ generate_script_btn.click(
590
+ fn=generate_script_fn, inputs=[text_input, ui_state],
591
+ outputs=[
592
+ audio_script_markdown,
593
+ ui_state,
594
+ generate_audio_btn,
595
+ generate_script_btn,
596
+ clear_btn,
597
+ vp_submit,
598
+ ]
599
+ )
600
+ generate_script_btn.click(
601
+ fn=lambda: [
602
+ gr.Button.update(interactive=False),
603
+ gr.Button.update(interactive=False),
604
+ gr.Button.update(interactive=False),
605
+ gr.Button.update(interactive=False),
606
+ ],
607
+ outputs=[
608
+ generate_audio_btn,
609
+ generate_script_btn,
610
+ clear_btn,
611
+ vp_submit,
612
+ ]
613
+ )
614
+ voice_presets_df.select(df_on_select, outputs=[selected_voice_presets])
615
+ voice_presets_df.update(lambda x: print(x))
616
+ del_voice_btn.click(del_voice_preset, inputs=[selected_voice_presets, ui_state, voice_presets_df],
617
+ outputs=[voice_presets_df, voice_presets_df, del_voice_btn, selected_voice_presets])
618
+ # user voice preset upload
619
+ vp_submit.click(add_voice_preset, inputs=[vp_text_id, vp_text_desc, vp_file, ui_state, added_voice_preset_state],
620
+ outputs=[vp_text_id, vp_text_desc, vp_file, ui_state, added_voice_preset_state, voice_presets_df,
621
+ vp_submit,
622
+ voice_presets_df, del_voice_btn])
623
+ vp_submit.click(lambda: gr.Button.update(interactive=False), outputs=[vp_submit])  # disable upload button while adding
624
+
625
+ # share to HF community
626
+ share_button.click(None, [], [], _js=share_js)
627
+
628
+ # debug only
629
+ # print_state_btn = gr.Button(value='Print State')
630
+ # print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
631
+ interface.queue(concurrency_count=2, max_size=20)
632
+ interface.launch()
utils.py ADDED
@@ -0,0 +1,82 @@
1
+ import os
2
+ import re
3
+ import torch
4
+ import numpy as np
5
+ import yaml
6
+ from pathlib import Path
7
+
8
+
9
+ #### path related code BEGIN ####
10
+ def get_session_path(session_id):
11
+ return Path(f'output/sessions/{session_id}')
12
+
13
+ def get_system_voice_preset_path():
14
+ return Path('data/voice_presets')
15
+
16
+ def get_session_voice_preset_path(session_id):
17
+ return Path(f'{get_session_path(session_id)}/voice_presets')
18
+
19
+ def get_session_audio_path(session_id):
20
+ return Path(f'{get_session_path(session_id)}/audio')
21
+
22
+ #### path related code END ####
23
+ def rescale_to_match_energy(segment1, segment2):
24
+ ratio = get_energy_ratio(segment1, segment2)
25
+ rescaled_segment1 = segment1 / ratio
26
+ return rescaled_segment1.numpy()
27
+
28
+ def text_to_abbrev_prompt(input_text):
29
+ return re.sub(r'[^a-zA-Z_]', '', '_'.join(input_text.split()[:5]))
30
+
31
+ def get_energy(x):
32
+ return np.mean(x ** 2)
33
+
34
+
35
+ def get_energy_ratio(segment1, segment2):
36
+ energy1 = get_energy(segment1)
37
+ energy2 = max(get_energy(segment2), 1e-10)
38
+ ratio = (energy1 / energy2) ** 0.5
39
+ ratio = torch.tensor(ratio)
40
+ ratio = torch.clamp(ratio, 0.02, 50)
41
+ return ratio
42
+
43
+ def fade(audio_data, fade_duration=2, sr=32000):
44
+ audio_duration = audio_data.shape[0] / sr
45
+
46
+ # automatically choose the fade duration
47
+ if audio_duration >= 8:
48
+ # keep fade_duration 2
49
+ pass
50
+ else:
51
+ fade_duration = audio_duration / 5
52
+
53
+ fade_samples = int(sr * fade_duration)
54
+ fade_in = np.linspace(0, 1, fade_samples)
55
+ fade_out = np.linspace(1, 0, fade_samples)
56
+
57
+ audio_data_fade_in = audio_data[:fade_samples] * fade_in
58
+ audio_data_fade_out = audio_data[-fade_samples:] * fade_out
59
+
60
+ audio_data_faded = np.concatenate((audio_data_fade_in, audio_data[len(fade_in):-len(fade_out)], audio_data_fade_out))
61
+ return audio_data_faded
62
+
63
+ # def get_key(config='config.yaml'):
64
+ # with open('config.yaml', 'r') as file:
65
+ # config = yaml.safe_load(file)
66
+ # return config['OpenAI-Key'] if 'OpenAI-Key' in config else None
67
+
68
+ def get_service_port():
69
+ service_port = os.environ.get('WAVJOURNEY_SERVICE_PORT')
70
+ return service_port
71
+
72
+ def get_service_url():
73
+ service_url = os.environ.get('WAVJOURNEY_SERVICE_URL')
74
+ return service_url
75
+
76
+ def get_api_key():
77
+ api_key = os.environ.get('WAVJOURNEY_OPENAI_KEY')
78
+ return api_key
79
+
80
+ def get_max_script_lines():
81
+ max_lines = int(os.environ.get('WAVJOURNEY_MAX_SCRIPT_LINES', 999))
82
+ return max_lines
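A small usage sketch for fade(): applying the automatic fade-in/out to a one-second 440 Hz tone at the function's default 32 kHz sample rate.
```
import numpy as np
from utils import fade

sr = 32000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)

# duration < 8 s, so fade() picks fade_duration = duration / 5 = 0.2 s
faded = fade(tone, sr=sr)
assert faded.shape == tone.shape                       # length is preserved
assert abs(faded[0]) < 1e-6 and abs(faded[-1]) < 1e-6  # silent endpoints
```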
voice_presets.py ADDED
@@ -0,0 +1,96 @@
1
+ import os
2
+ import json, json5
3
+ from pathlib import Path
4
+
5
+ import utils
6
+ from APIs import VP
7
+
8
+
9
+ def save_voice_presets_metadata(voice_presets_path, metadata):
10
+ with open(voice_presets_path / 'metadata.json', 'w') as f:
11
+ json.dump(metadata, f, indent=4)
12
+
13
+ def load_voice_presets_metadata(voice_presets_path, safe_if_metadata_not_exist=False):
14
+ metadata_full_path = Path(voice_presets_path) / 'metadata.json'
15
+
16
+ if safe_if_metadata_not_exist:
17
+ if not os.path.exists(metadata_full_path):
18
+ return {}
19
+
20
+ with open(metadata_full_path, 'r') as f:
21
+ presets = json5.load(f)
22
+
23
+ return presets
24
+
25
+ # return system voice presets and session voice presets separately, each as a dict
26
+ def get_voice_presets(session_id):
27
+ system_presets, session_presets = [], []
28
+
29
+ # Load system presets
30
+ system_presets = load_voice_presets_metadata(utils.get_system_voice_preset_path())
31
+
32
+ # Load session presets
33
+ session_presets = load_voice_presets_metadata(
34
+ utils.get_session_voice_preset_path(session_id),
35
+ safe_if_metadata_not_exist=True
36
+ )
37
+
38
+ return system_presets, session_presets
39
+
40
+ # return merged voice presets in a {voice_preset_name: voice_preset} dict
41
+ def get_merged_voice_presets(session_id):
42
+ system_presets, session_presets = get_voice_presets(session_id)
43
+ res = {}
44
+ for preset in list(system_presets.values()) + list(session_presets.values()):
45
+ res[preset['id']] = preset # session presets with the same id override system presets
46
+ return res
47
+
48
+ def add_voice_preset(voice_presets_path, presets, id, desc, wav_file_path):
49
+ if id in presets:
50
+ raise KeyError(f'{id} already in voice preset, path={voice_presets_path}!')
51
+
52
+ # Convert wav to npz
53
+ npz_path = voice_presets_path / 'npz'
54
+ VP(wav_file_path, npz_path)
55
+ npz_file_path = npz_path / f'{Path(wav_file_path).stem}.npz'
56
+
57
+ presets[id] = {
58
+ 'id': id,
59
+ 'desc': desc,
60
+ 'npz_path': str(npz_file_path)
61
+ }
62
+ save_voice_presets_metadata(voice_presets_path, presets)
63
+ return presets[id]
64
+
65
+ def add_session_voice_preset(id, desc, wav_file_path, session_id):
66
+ voice_presets_path = utils.get_session_voice_preset_path(session_id)
67
+ os.makedirs(voice_presets_path / 'npz', exist_ok=True)
68
+ presets = load_voice_presets_metadata(voice_presets_path, safe_if_metadata_not_exist=True)
69
+ if len(presets) >= 3:
70
+ raise ValueError('session voice presets size exceeds 3')
71
+ if id in presets:
72
+ raise KeyError(f'{id} already in voice preset, path={voice_presets_path}!')
73
+
74
+ return add_voice_preset(voice_presets_path, presets, id, desc, wav_file_path)
75
+
76
+ def add_system_voice_preset(id, desc, wav_file_path):
77
+ voice_presets_path = utils.get_system_voice_preset_path()
78
+ presets = load_voice_presets_metadata(voice_presets_path)
79
+ return add_voice_preset(voice_presets_path, presets, id, desc, wav_file_path)
80
+
81
+ # if session_id set to '', we are removing system voice presets
82
+ def remove_session_voice_preset(id, session_id):
83
+ voice_presets_path = utils.get_session_voice_preset_path(session_id)
84
+ presets = load_voice_presets_metadata(
85
+ voice_presets_path,
86
+ safe_if_metadata_not_exist=True
87
+ )
88
+ preset = presets.pop(id)
89
+ npz_path = preset['npz_path']
90
+
91
+ try:
92
+ os.remove(npz_path)
93
+ except FileNotFoundError:
94
+ print(f"INFO: trying to delete {npz_path} which does not exist, path={voice_presets_path}.")
95
+
96
+ save_voice_presets_metadata(voice_presets_path, presets)
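A usage sketch tying these helpers together. It assumes the WavJourney services are running (APIs.VP posts the wav to the voice-parser endpoint) and uses hypothetical ids, descriptions, and file names.
```
import voice_presets

session_id = 'demo-session'  # assumed created earlier via pipeline.init_session()

# register a custom speaker for this session (at most 3 per session)
preset = voice_presets.add_session_voice_preset(
    id='narrator', desc='calm adult male narrator',
    wav_file_path='narrator_sample.wav', session_id=session_id)
print(preset['npz_path'])

# system + session presets merged; session entries override system ones by id
merged = voice_presets.get_merged_voice_presets(session_id)
print(sorted(merged))

# clean up the session preset again
voice_presets.remove_session_voice_preset('narrator', session_id)
```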