ALeLacheur committed on
Commit 957e2dc
1 Parent(s): 7ca4ec1

Voiceblock demo: Attempt 8

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .DS_Store +0 -0
  2. __pycache__/gradio.cpython-310.pyc +0 -0
  3. __pycache__/model.cpython-310.pyc +0 -0
  4. __pycache__/model.cpython-39.pyc +0 -0
  5. app.py +67 -0
  6. example.wav +0 -0
  7. requirements.txt +28 -0
  8. voicebox/.DS_Store +0 -0
  9. voicebox/LICENSE +0 -0
  10. voicebox/README.md +136 -0
  11. voicebox/cache/.gitkeep +0 -0
  12. voicebox/data/.gitkeep +0 -0
  13. voicebox/figures/demo_thumbnail.png +0 -0
  14. voicebox/figures/use_diagram_embeddings.png +0 -0
  15. voicebox/figures/vb_color_logo.png +0 -0
  16. voicebox/figures/voicebox_untargeted_conditioning_draft.png +0 -0
  17. voicebox/pretrained/denoiser/demucs/dns_48.pt +3 -0
  18. voicebox/pretrained/phoneme/causal_ppg_128_hidden_128_hop.pt +3 -0
  19. voicebox/pretrained/phoneme/causal_ppg_256_hidden.pt +3 -0
  20. voicebox/pretrained/phoneme/causal_ppg_256_hidden_256_hop.pt +3 -0
  21. voicebox/pretrained/phoneme/causal_ppg_256_hidden_512_hop.pt +3 -0
  22. voicebox/pretrained/phoneme/ppg_causal_small.pt +3 -0
  23. voicebox/pretrained/speaker/resemblyzer/resemblyzer.pt +3 -0
  24. voicebox/pretrained/speaker/resnetse34v2/resnetse34v2.pt +3 -0
  25. voicebox/pretrained/speaker/yvector/yvector.pt +3 -0
  26. voicebox/pretrained/universal/universal_final.pt +3 -0
  27. voicebox/pretrained/voicebox/voicebox_final.pt +3 -0
  28. voicebox/pretrained/voicebox/voicebox_final.yaml +20 -0
  29. voicebox/requirements.txt +28 -0
  30. voicebox/scripts/downloads/download_librispeech_eval.sh +25 -0
  31. voicebox/scripts/downloads/download_librispeech_train.sh +54 -0
  32. voicebox/scripts/downloads/download_rir_noise.sh +73 -0
  33. voicebox/scripts/downloads/download_voxceleb.py +189 -0
  34. voicebox/scripts/downloads/ff_rir.txt +132 -0
  35. voicebox/scripts/downloads/voxceleb1_file_parts.txt +5 -0
  36. voicebox/scripts/downloads/voxceleb1_files.txt +1 -0
  37. voicebox/scripts/downloads/voxceleb2_file_parts.txt +9 -0
  38. voicebox/scripts/downloads/voxceleb2_files.txt +1 -0
  39. voicebox/scripts/experiments/evaluate.py +915 -0
  40. voicebox/scripts/experiments/train.py +282 -0
  41. voicebox/scripts/experiments/train_phoneme_predictor.py +205 -0
  42. voicebox/scripts/streamer/benchmark_streamer.py +97 -0
  43. voicebox/scripts/streamer/enroll.py +105 -0
  44. voicebox/scripts/streamer/stream.py +135 -0
  45. voicebox/setup.py +20 -0
  46. voicebox/src.egg-info/PKG-INFO +148 -0
  47. voicebox/src.egg-info/SOURCES.txt +9 -0
  48. voicebox/src.egg-info/dependency_links.txt +1 -0
  49. voicebox/src.egg-info/top_level.txt +1 -0
  50. voicebox/src/__init__.py +0 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
__pycache__/gradio.cpython-310.pyc ADDED
Binary file (1.04 kB).
 
__pycache__/model.cpython-310.pyc ADDED
Binary file (1.43 kB).
 
__pycache__/model.cpython-39.pyc ADDED
Binary file (1.42 kB).
 
app.py ADDED
@@ -0,0 +1,67 @@
+ import torch
+ import torchaudio
+ import numpy as np
+ import gradio as gr
+ import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb  # to access the VoiceBox class
+ # import voicebox.src.attacks.online.voicebox_streamer as streamer  # to access the VoiceBoxStreamer class
+ from voicebox.src.constants import PPG_PRETRAINED_PATH
+
+ # Set VoiceBox default parameters
+ LOOKAHEAD = 5
+ voicebox_kwargs = {'win_length': 256,
+                    'ppg_encoder_hidden_size': 256,
+                    'use_phoneme_encoder': True,
+                    'use_pitch_encoder': True,
+                    'use_loudness_encoder': True,
+                    'spec_encoder_lookahead_frames': 0,
+                    'spec_encoder_type': 'mel',
+                    'spec_encoder_mlp_depth': 2,
+                    'bottleneck_lookahead_frames': LOOKAHEAD,
+                    'ppg_encoder_path': PPG_PRETRAINED_PATH,
+                    'n_bands': 128,
+                    'spec_encoder_hidden_size': 512,
+                    'bottleneck_skip': True,
+                    'bottleneck_hidden_size': 512,
+                    'bottleneck_feedforward_size': 512,
+                    'bottleneck_type': 'lstm',
+                    'bottleneck_depth': 2,
+                    'control_eps': 0.5,
+                    'projection_norm': float('inf'),
+                    'conditioning_dim': 512}
+
+ # Load the pretrained model
+ model = vb.VoiceBox(**voicebox_kwargs)
+ model.load_state_dict(
+     torch.load('voicebox/pretrained/voicebox/voicebox_final.pt',
+                map_location=torch.device('cpu')),
+     strict=True)
+ model.eval()
+
+ # Convert model output into 16-bit integer audio
+ def float32_to_int16(waveform):
+     peak = np.abs(waveform).max()
+     if peak > 0:  # avoid division by zero on silent input
+         waveform = waveform / peak
+     waveform = waveform * 32767  # scale to the int16 range
+     waveform = waveform.astype(np.int16)
+     return waveform.ravel()
+
+ # Prediction function: apply VoiceBox to an uploaded recording
+ def predict(inp):
+     # load audio from the input file path into a tensor
+     waveform, sample_rate = torchaudio.load(inp)
+
+     # run the model without gradient tracking
+     with torch.no_grad():
+         waveform = model(waveform)
+
+     # convert output audio into a Gradio-readable format
+     waveform = waveform.numpy()
+     waveform = float32_to_int16(waveform)
+     return sample_rate, waveform
+
+ # Set up the Gradio interface
+ interface = gr.Interface(
+     fn=predict,
+     inputs=gr.Audio(type="filepath"),
+     outputs=gr.Audio()
+ )
+
+ interface.launch()
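
For reference, the following is a minimal smoke test of the demo's `predict` function (a hypothetical sketch, not part of the commit; it assumes the definitions in app.py above are in scope, e.g. in a notebook session before `interface.launch()` is called, and uses the bundled `example.wav`):

```python
# Hypothetical smoke test; assumes model and predict() from app.py are defined.
from scipy.io import wavfile

sample_rate, protected = predict("example.wav")   # flattened int16 samples
wavfile.write("example_protected.wav", sample_rate, protected)
print(f"wrote {protected.shape[0]} samples at {sample_rate} Hz")
```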
example.wav ADDED
Binary file (218 kB).
 
requirements.txt ADDED
@@ -0,0 +1,28 @@
+ torch==1.10.0
+ torchaudio==0.10.0
+ torchvision
+ torchcrepe
+ tensorboard
+ textgrid
+ Pillow
+ numpy
+ tqdm
+ jiwer
+ librosa
+ pandas
+ protobuf==3.20.0
+ git+https://github.com/ludlows/python-pesq#egg=pesq
+ psutil
+ pystoi
+ pytest
+ pyworld
+ pyyaml
+ matplotlib
+ seaborn
+ ipython
+ scipy
+ scikit-learn
+ ipywebrtc
+ argbind
+ sounddevice
+ keyboard
voicebox/.DS_Store ADDED
Binary file (6.15 kB).
 
voicebox/LICENSE ADDED
File without changes
voicebox/README.md ADDED
@@ -0,0 +1,136 @@
+ <h1 align="center">VoiceBlock</h1>
+ <h4 align="center">Privacy through Real-Time Adversarial Attacks with Audio-to-Audio Models</h4>
+ <div align="center">
+
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/???/???.ipynb)
+ [![Demo](https://img.shields.io/badge/Web-Demo-blue)](https://master.d3hvhbnf7qxjtf.amplifyapp.com/)
+ [![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](/LICENSE)
+
+ </div>
+ <p align="center"><img src="./figures/vb_color_logo.png" width="200"/></p>
+
+ ## Contents
+
+ * <a href="#install">Installation</a>
+ * <a href="#reproduce">Reproducing Results</a>
+ * <a href="#streamer">Streaming Implementation</a>
+ * <a href="#citation">Citation</a>
+
+ <h2 id="install">Installation</h2>
+
+ 1. Clone the repository:
+
+        git clone https://github.com/voiceboxneurips/voicebox.git
+
+ 2. We recommend working from a clean environment, e.g. using `conda`:
+
+        conda create --name voicebox python=3.9
+        source activate voicebox
+
+ 3. Install dependencies:
+
+        cd voicebox
+        pip install -r requirements.txt
+        pip install -e .
+
+ 4. Grant execute permissions:
+
+        chmod -R u+x scripts/
+
+ <h2 id="reproduce">Reproducing Results</h2>
+
+ To reproduce our results, first download the corresponding data. Note that to download the [VoxCeleb1 dataset](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html), you must register and obtain a username and password.
+
+ | Task | Dataset (Size) | Command |
+ |---|---|---|
+ | Objective evaluation | VoxCeleb1 (39G) | `python scripts/downloads/download_voxceleb.py --subset=1 --username=<VGG_USERNAME> --password=<VGG_PASSWORD>` |
+ | WER / supplemental evaluations | LibriSpeech `train-clean-360` (23G) | `./scripts/downloads/download_librispeech_eval.sh` |
+ | Train attacks | LibriSpeech `train-clean-100` (11G) | `./scripts/downloads/download_librispeech_train.sh` |
+
+ We provide scripts to reproduce our experiments and save results, including generated audio, to named and time-stamped subdirectories within `runs/`. To reproduce our objective evaluation experiments using pre-trained attacks, run:
+
+ ```
+ python scripts/experiments/evaluate.py
+ ```
+
+ To reproduce our training, run:
+
+ ```
+ python scripts/experiments/train.py
+ ```
+
+ <h2 id="streamer">Streaming Implementation</h2>
+
+ As a proof of concept, we provide a streaming implementation of VoiceBox capable of modifying user audio in real time. Here, we provide setup instructions for macOS and Ubuntu 20.04.
+
+ <h3 id="streamer-mac">macOS</h3>
+
+ See the video below:
+
+ <a href="https://youtu.be/LcNjO5E7F3E">
+ <p align="center"><img src="./figures/demo_thumbnail.png" width="500"/></p>
+ </a>
+
+ <h3 id="streamer-ubuntu">Ubuntu 20.04</h3>
+
+ 1. Open a terminal and follow the [installation instructions](#install) above. Change directory to the root of this repository.
+
+ 2. Run the following command:
+
+        pacmd load-module module-null-sink sink_name=voicebox sink_properties=device.description=voicebox
+
+    If you are using PipeWire instead of PulseAudio:
+
+        pactl load-module module-null-sink media.class=Audio/Sink sink_name=voicebox sink_properties=device.description=voicebox
+
+    PulseAudio is the default on Ubuntu; if you haven't changed your system defaults, you are probably using it. This command adds "voicebox" as an output device. Select it as the input to your chosen audio software.
+
+ 3. Find which audio devices to read from and write to. In your conda environment, run:
+
+        python -m sounddevice
+
+    You will get output similar to this:
+
+        0 HDA Intel HDMI: 0 (hw:0,3), ALSA (0 in, 8 out)
+        1 HDA Intel HDMI: 1 (hw:0,7), ALSA (0 in, 8 out)
+        2 HDA Intel HDMI: 2 (hw:0,8), ALSA (0 in, 8 out)
+        3 HDA Intel HDMI: 3 (hw:0,9), ALSA (0 in, 8 out)
+        4 HDA Intel HDMI: 4 (hw:0,10), ALSA (0 in, 8 out)
+        5 hdmi, ALSA (0 in, 8 out)
+        6 jack, ALSA (2 in, 2 out)
+        7 pipewire, ALSA (64 in, 64 out)
+        8 pulse, ALSA (32 in, 32 out)
+        * 9 default, ALSA (32 in, 32 out)
+
+    In this example, we are going to route the audio through PipeWire (device 7). This index will serve as both our INPUT_NUM and OUTPUT_NUM.
+
+ 4. First, we need to create a conditioning embedding. To do this, run the enrollment script and follow its on-screen instructions:
+
+        python scripts/streamer/enroll.py --input INPUT_NUM
+
+ 5. We can now use the streamer. Run:
+
+        python scripts/streamer/stream.py --input INPUT_NUM --output OUTPUT_NUM
+
+ 6. Once the streamer is running, open `pavucontrol`.
+
+    a. In `pavucontrol`, go to the "Playback" tab and find "ALSA plug-in [python3.9]: ALSA Playback on". Set the output to "voicebox".
+
+    b. Then, go to the "Recording" tab and find "ALSA plug-in [python3.9]: ALSA Playback from", and set the input to your desired microphone device.
+
+ <h2 id="citation">Citation</h2>
+
+ If you use this work in your academic research, please cite the following:
+
+ ```
+ @inproceedings{authors2022voicelock,
+   title={VoiceBlock: Privacy through Real-Time Adversarial Attacks with Audio-to-Audio Models},
+   author={Patrick O'Reilly and Andreas Bugler and Keshav Bhandari and Max Morrison and Bryan Pardo},
+   booktitle={Neural Information Processing Systems},
+   month={November},
+   year={2022}
+ }
+ ```
voicebox/cache/.gitkeep ADDED
File without changes
voicebox/data/.gitkeep ADDED
File without changes
voicebox/figures/demo_thumbnail.png ADDED
voicebox/figures/use_diagram_embeddings.png ADDED
voicebox/figures/vb_color_logo.png ADDED
voicebox/figures/voicebox_untargeted_conditioning_draft.png ADDED
voicebox/pretrained/denoiser/demucs/dns_48.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4cfd4151600ed611d4af05083f4633d4fc31b53761cff8a185293346df745988
+ size 75486933
voicebox/pretrained/phoneme/causal_ppg_128_hidden_128_hop.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be4c7a60c9af77e50af86924df8b73eb0c861a46f461e3bfe825c523a0a1a969
+ size 1175695
voicebox/pretrained/phoneme/causal_ppg_256_hidden.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e8f20e4973a6b91002c97605f993cf6e16a24ca9d0d39e183438a8c16d85c87
+ size 4556495
voicebox/pretrained/phoneme/causal_ppg_256_hidden_256_hop.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0836df2f8465b53d4e0b5b14f1d1ef954b3570d6f95f1af22c3ac19b3e10099
+ size 4573903
voicebox/pretrained/phoneme/causal_ppg_256_hidden_512_hop.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a860d6f01058dc14b984845d27e681b5fe7c3bfffe41350e2e6e0f92e72778ad
+ size 4608719
voicebox/pretrained/phoneme/ppg_causal_small.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4627bc2b63798df3391fe5c9ccbd72b929dc146b84f0fe61d1aa22848d107973
+ size 18002639
voicebox/pretrained/speaker/resemblyzer/resemblyzer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:afb2230a894f5a8f91263ff0b4811bde1ea5981bedda45a579c225e5a602ada3
+ size 5697307
voicebox/pretrained/speaker/resnetse34v2/resnetse34v2.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d96a4dad0118e9945bc7e676d8e5ff34d493ca2209fe188b3f982005132369bc
+ size 32311667
voicebox/pretrained/speaker/yvector/yvector.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2b4228cc772e689f800f1f9dc91d4ef4ee289e7e62f2822805edfc5b7faf399
+ size 57703939
voicebox/pretrained/universal/universal_final.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f435535934f6c8c24fda42c251e65f41627b0660d3420ba1c694e25a82be033e
+ size 128811
voicebox/pretrained/voicebox/voicebox_final.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb26234cc493182545dbfcc74501f6df7e90347ca3e2a94a7966978325a34ccd
+ size 30232012
voicebox/pretrained/voicebox/voicebox_final.yaml ADDED
@@ -0,0 +1,20 @@
+ win_length: 256
+ ppg_encoder_hidden_size: 256
+ use_phoneme_encoder: True
+ use_pitch_encoder: True
+ use_loudness_encoder: True
+ spec_encoder_lookahead_frames: 0
+ spec_encoder_type: 'mel'
+ spec_encoder_mlp_depth: 2
+ bottleneck_lookahead_frames: 5
+ ppg_encoder_path: 'pretrained/phoneme/causal_ppg_256_hidden.pt'
+ n_bands: 128
+ spec_encoder_hidden_size: 512
+ bottleneck_skip: True
+ bottleneck_hidden_size: 512
+ bottleneck_feedforward_size: 512
+ bottleneck_type: 'lstm'
+ bottleneck_depth: 2
+ control_eps: 0.5
+ projection_norm: 'inf'
+ conditioning_dim: 512
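
This config mirrors the keyword arguments hard-coded in app.py above. As a sketch (not part of the commit), the pretrained model could instead be instantiated from the YAML, assuming the relative `ppg_encoder_path` resolves against the `voicebox/` repository root:

```python
# Hypothetical loader. Note the YAML stores projection_norm as the string
# 'inf', while VoiceBox expects a float, so it is converted first.
import yaml
import torch
import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb

with open('voicebox/pretrained/voicebox/voicebox_final.yaml') as f:
    cfg = yaml.safe_load(f)
cfg['projection_norm'] = float(cfg['projection_norm'])  # 'inf' -> float inf

model = vb.VoiceBox(**cfg)
model.load_state_dict(
    torch.load('voicebox/pretrained/voicebox/voicebox_final.pt',
               map_location='cpu'),
    strict=True)
model.eval()
```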
voicebox/requirements.txt ADDED
@@ -0,0 +1,28 @@
+ torch==1.10.0
+ torchaudio==0.10.0
+ torchvision
+ torchcrepe
+ tensorboard
+ textgrid
+ Pillow
+ numpy
+ tqdm
+ jiwer
+ librosa
+ pandas
+ protobuf==3.20.0
+ git+https://github.com/ludlows/python-pesq#egg=pesq
+ psutil
+ pystoi
+ pytest
+ pyworld
+ pyyaml
+ matplotlib
+ seaborn
+ ipython
+ scipy
+ scikit-learn
+ ipywebrtc
+ argbind
+ sounddevice
+ keyboard
voicebox/scripts/downloads/download_librispeech_eval.sh ADDED
@@ -0,0 +1,25 @@
+ #!/bin/bash
+
+ set -e
+
+ DOWNLOADS_SCRIPTS_DIR=$(eval dirname "$(readlink -f "$0")")
+ SCRIPTS_DIR="$(dirname "$DOWNLOADS_SCRIPTS_DIR")"
+ PROJECT_DIR="$(dirname "$SCRIPTS_DIR")"
+
+ DATA_DIR="${PROJECT_DIR}/data/"
+ CACHE_DIR="${PROJECT_DIR}/cache/"
+
+ mkdir -p "${DATA_DIR}"
+ mkdir -p "${CACHE_DIR}"
+
+ # download train-clean-360 subset
+ echo "downloading LibriSpeech train-clean-360..."
+ wget http://www.openslr.org/resources/12/train-clean-360.tar.gz
+
+ # extract train-clean-360 subset
+ echo "extracting LibriSpeech train-clean-360..."
+ tar -xf train-clean-360.tar.gz \
+     -C "${DATA_DIR}"
+
+ # delete archive
+ rm -f "train-clean-360.tar.gz"
voicebox/scripts/downloads/download_librispeech_train.sh ADDED
@@ -0,0 +1,54 @@
+ #!/bin/bash
+
+ set -e
+
+ DOWNLOADS_SCRIPTS_DIR=$(eval dirname "$(readlink -f "$0")")
+ SCRIPTS_DIR="$(dirname "$DOWNLOADS_SCRIPTS_DIR")"
+ PROJECT_DIR="$(dirname "$SCRIPTS_DIR")"
+
+ DATA_DIR="${PROJECT_DIR}/data/"
+ CACHE_DIR="${PROJECT_DIR}/cache/"
+
+ mkdir -p "${DATA_DIR}"
+ mkdir -p "${CACHE_DIR}"
+
+ # download test-clean subset
+ echo "downloading LibriSpeech test-clean..."
+ wget http://www.openslr.org/resources/12/test-clean.tar.gz
+
+ # extract test-clean subset
+ echo "extracting LibriSpeech test-clean..."
+ tar -xf test-clean.tar.gz \
+     -C "${DATA_DIR}"
+
+ # delete archive
+ rm -f "test-clean.tar.gz"
+
+ # download test-other subset
+ echo "downloading LibriSpeech test-other..."
+ wget http://www.openslr.org/resources/12/test-other.tar.gz
+
+ # extract test-other subset
+ echo "extracting LibriSpeech test-other..."
+ tar -xf test-other.tar.gz \
+     -C "${DATA_DIR}"
+
+ # delete archive
+ rm -f "test-other.tar.gz"
+
+ # download train-clean-100 subset
+ echo "downloading LibriSpeech train-clean-100..."
+ wget http://www.openslr.org/resources/12/train-clean-100.tar.gz
+
+ # extract train-clean-100 subset
+ echo "extracting LibriSpeech train-clean-100..."
+ tar -xf train-clean-100.tar.gz \
+     -C "${DATA_DIR}"
+
+ # delete archive
+ rm -f "train-clean-100.tar.gz"
+
+ # download LibriSpeech alignments dataset
+ wget -O alignments.zip https://zenodo.org/record/2619474/files/librispeech_alignments.zip?download=1
+ unzip -d "${DATA_DIR}/LibriSpeech/" alignments.zip
+ rm -f alignments.zip
voicebox/scripts/downloads/download_rir_noise.sh ADDED
@@ -0,0 +1,73 @@
+ #!/bin/bash
+
+ set -e
+
+ DOWNLOADS_SCRIPTS_DIR=$(eval dirname "$(readlink -f "$0")")
+ SCRIPTS_DIR="$(dirname "$DOWNLOADS_SCRIPTS_DIR")"
+ PROJECT_DIR="$(dirname "$SCRIPTS_DIR")"
+
+ DATA_DIR="${PROJECT_DIR}/data/"
+ CACHE_DIR="${PROJECT_DIR}/cache/"
+
+ REAL_RIR_DIR="${DATA_DIR}/rir/real/"
+ SYNTHETIC_RIR_DIR="${DATA_DIR}/rir/synthetic/"
+ ROOM_NOISE_DIR="${DATA_DIR}/noise/room/"
+ PS_NOISE_DIR="${DATA_DIR}/noise/pointsource/"
+
+ mkdir -p "${REAL_RIR_DIR}"
+ mkdir -p "${SYNTHETIC_RIR_DIR}"
+ mkdir -p "${ROOM_NOISE_DIR}"
+ mkdir -p "${PS_NOISE_DIR}"
+
+ # download RIR/noise composite dataset
+ echo "downloading RIR/noise dataset..."
+ wget -O "${DATA_DIR}/rirs_noises.zip" https://www.openslr.org/resources/28/rirs_noises.zip
+
+ # extract RIR/noise composite dataset
+ echo "unzipping RIR/noise dataset..."
+ unzip "${DATA_DIR}/rirs_noises.zip" -d "${DATA_DIR}/"
+
+ # delete archive
+ rm -f "${DATA_DIR}/rirs_noises.zip"
+
+ # organize pointsource noise data
+ echo "extracting point-source noise data"
+ cp -a "${DATA_DIR}/RIRS_NOISES/pointsource_noises"/. "${PS_NOISE_DIR}"
+
+ # organize room noise data
+ echo "extracting room noise data"
+ room_noises=($(find "${DATA_DIR}/RIRS_NOISES/real_rirs_isotropic_noises/" -maxdepth 1 -name '*noise*' -type f))
+ cp -- "${room_noises[@]}" "${ROOM_NOISE_DIR}"
+
+ # organize real RIR data
+ echo "extracting recorded RIR data"
+ rirs=($(find "${DATA_DIR}/RIRS_NOISES/real_rirs_isotropic_noises/" ! -name '*noise*' ))
+ cp -- "${rirs[@]}" "${REAL_RIR_DIR}"
+
+ # organize synthetic RIR data
+ echo "extracting synthetic RIR data"
+ cp -a "${DATA_DIR}/RIRS_NOISES/simulated_rirs"/. "${SYNTHETIC_RIR_DIR}"
+
+ # delete redundant data
+ rm -rf "${DATA_DIR}/RIRS_NOISES/"
+
+ # separate near-field and far-field RIRs
+ NEARFIELD_RIR_DIR="${REAL_RIR_DIR}/nearfield/"
+ FARFIELD_RIR_DIR="${REAL_RIR_DIR}/farfield/"
+
+ mkdir -p "${NEARFIELD_RIR_DIR}"
+ mkdir -p "${FARFIELD_RIR_DIR}"
+
+ # read list of far-field RIRs
+ readarray -t FF_RIR_LIST < "${DOWNLOADS_SCRIPTS_DIR}/ff_rir.txt"
+
+ # move far-field RIRs
+ for name in "${FF_RIR_LIST[@]}"; do
+     mv "$name" "${FARFIELD_RIR_DIR}/$(basename "$name")"
+ done
+
+ # move remaining near-field RIRs
+ for name in "${REAL_RIR_DIR}"/*.wav; do
+     mv "$name" "${NEARFIELD_RIR_DIR}/$(basename "$name")"
+ done
voicebox/scripts/downloads/download_voxceleb.py ADDED
@@ -0,0 +1,189 @@
+ import argparse
+ from pathlib import Path
+ import subprocess
+ import hashlib
+ import tarfile
+ from zipfile import ZipFile
+
+ from src.constants import VOXCELEB1_DATA_DIR, VOXCELEB2_DATA_DIR
+ from src.utils import ensure_dir
+
+ ################################################################################
+ # Download VoxCeleb1 dataset using valid credentials
+ ################################################################################
+
+
+ def parse_args():
+     """Parse command-line arguments"""
+     parser = argparse.ArgumentParser(add_help=False)
+
+     parser.add_argument(
+         '--subset',
+         type=int,
+         default=1,
+         help='Specify which VoxCeleb subset to download: 1 or 2'
+     )
+
+     parser.add_argument(
+         '--username',
+         type=str,
+         default=None,
+         help='User name provided by VGG to access VoxCeleb dataset'
+     )
+
+     parser.add_argument(
+         '--password',
+         type=str,
+         default=None,
+         help='Password provided by VGG to access VoxCeleb dataset'
+     )
+
+     return parser.parse_args()
+
+
+ def md5(f: str):
+     """
+     Return MD5 checksum for file. Code adapted from voxceleb_trainer repository:
+     https://github.com/clovaai/voxceleb_trainer/blob/master/dataprep.py
+     """
+     hash_md5 = hashlib.md5()
+     with open(f, "rb") as f:
+         for chunk in iter(lambda: f.read(4096), b""):
+             hash_md5.update(chunk)
+     return hash_md5.hexdigest()
+
+
+ def download(username: str,
+              password: str,
+              save_path: str,
+              lines: list):
+     """
+     Given a list of dataset shards formatted as <URL, MD5>, download
+     each using `wget` and verify checksums. Code adapted from voxceleb_trainer
+     repository:
+     https://github.com/clovaai/voxceleb_trainer/blob/master/dataprep.py
+     """
+     for line in lines:
+         url = line.split()[0]
+         md5gt = line.split()[1]
+         outfile = url.split('/')[-1]
+
+         # download files
+         out = subprocess.call(
+             f'wget {url} --user {username} --password {password} -O {save_path}'
+             f'/{outfile}', shell=True)
+         if out != 0:
+             raise ValueError(f'Download failed for {url}')
+
+         # verify checksum
+         md5ck = md5(f'{save_path}/{outfile}')
+         if md5ck == md5gt:
+             print(f'Checksum successful for {outfile}')
+         else:
+             raise Warning(f'Checksum failed for {outfile}')
+
+
+ def concatenate(save_path: str, lines: list):
+     """
+     Given a specification in the format <FMT, FILENAME, MD5>, concatenate all
+     downloaded data shards matching FMT into the file FILENAME and verify
+     checksums. Code adapted from voxceleb_trainer repository:
+     https://github.com/clovaai/voxceleb_trainer/blob/master/dataprep.py
+     """
+     for line in lines:
+         infile = line.split()[0]
+         outfile = line.split()[1]
+         md5gt = line.split()[2]
+
+         # concatenate shards
+         out = subprocess.call(
+             f'cat {save_path}/{infile} > {save_path}/{outfile}', shell=True)
+
+         # verify checksum
+         md5ck = md5(f'{save_path}/{outfile}')
+         if md5ck == md5gt:
+             print(f'Checksum successful for {outfile}')
+         else:
+             raise Warning(f'Checksum failed for {outfile}')
+
+         # delete shards
+         out = subprocess.call(
+             f'rm {save_path}/{infile}', shell=True)
+
+
+ def full_extract(save_path: str, f: str):
+     """
+     Extract contents of compressed archive to data directory
+     """
+     save_path = str(save_path)
+     f = str(f)
+
+     print(f'Extracting {f}')
+
+     if f.endswith(".tar.gz"):
+         with tarfile.open(f, "r:gz") as tar:
+             tar.extractall(save_path)
+
+     elif f.endswith(".zip"):
+         with ZipFile(f, 'r') as zf:
+             zf.extractall(save_path)
+
+
+ def main():
+
+     args = parse_args()
+
+     # prepare to load dataset file paths
+     downloads_dir = Path(__file__).parent
+
+     if args.subset == 1:
+         data_dir = VOXCELEB1_DATA_DIR
+     elif args.subset == 2:
+         data_dir = VOXCELEB2_DATA_DIR
+     else:
+         raise ValueError(f'Invalid VoxCeleb subset {args.subset}')
+
+     ensure_dir(data_dir)
+
+     # load dataset file paths
+     with open(downloads_dir / f'voxceleb{args.subset}_file_parts.txt', 'r') as f:
+         file_parts_list = f.readlines()
+
+     # load output file paths
+     with open(downloads_dir / f'voxceleb{args.subset}_files.txt', 'r') as f:
+         files_list = f.readlines()
+
+     # download subset
+     download(
+         username=args.username,
+         password=args.password,
+         save_path=data_dir,
+         lines=file_parts_list
+     )
+
+     # merge shards
+     concatenate(save_path=data_dir, lines=files_list)
+
+     # account for test data
+     archives = [file.split()[1] for file in files_list]
+     test = f"vox{args.subset}_test_{'wav' if args.subset == 1 else 'aac'}.zip"
+     archives.append(test)
+
+     # extract all compressed data
+     for file in archives:
+         full_extract(data_dir, data_dir / file)
+
+     # organize extracted data
+     out = subprocess.call(f'mv {data_dir}/dev/aac/* {data_dir}/aac/ && rm -r '
+                           f'{data_dir}/dev', shell=True)
+     out = subprocess.call(f'mv -v {data_dir}/{"wav" if args.subset == 1 else "aac"}/*'
+                           f' {data_dir}/voxceleb{args.subset}', shell=True)
+
+
+ if __name__ == "__main__":
+     main()
voicebox/scripts/downloads/ff_rir.txt ADDED
@@ -0,0 +1,132 @@
+ data/rir/real/air_type1_air_binaural_lecture_0_1.wav
+ data/rir/real/RWCP_type3_rir_cirline_ofc_imp_rev.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr2_imp110.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_4_90_3.wav
+ data/rir/real/RVB2014_type1_rir_largeroom2_far_anglb.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_60.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_3_90_3.wav
+ data/rir/real/air_type1_air_binaural_lecture_0_5.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_30.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_30.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_15.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_165.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_75.wav
+ data/rir/real/air_type1_air_binaural_lecture_0_3.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_0.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_0.wav
+ data/rir/real/RWCP_type2_rir_cirline_jr1_imp110.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_5_90_3.wav
+ data/rir/real/RVB2014_type1_rir_largeroom1_far_anglb.wav
+ data/rir/real/air_type1_air_binaural_lecture_1_1.wav
+ data/rir/real/RVB2014_type1_rir_largeroom1_far_angla.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_7_90_3.wav
+ data/rir/real/RWCP_type2_rir_cirline_ofc_imp070.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr1_imp070.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_150.wav
+ data/rir/real/air_type1_air_binaural_lecture_1_5.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr1_imp100.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr2_imp100.wav
+ data/rir/real/RWCP_type1_rir_cirline_e2b_imp130.wav
+ data/rir/real/air_type1_air_phone_corridor_hfrp.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr1_imp130.wav
+ data/rir/real/RVB2014_type1_rir_largeroom1_near_angla.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_75.wav
+ data/rir/real/RWCP_type1_rir_cirline_e2b_imp150.wav
+ data/rir/real/air_type1_air_phone_lecture_hhp.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_105.wav
+ data/rir/real/air_type1_air_phone_stairway_hfrp.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_105.wav
+ data/rir/real/RWCP_type2_rir_cirline_jr1_imp090.wav
+ data/rir/real/RWCP_type1_rir_cirline_e2b_imp050.wav
+ data/rir/real/air_type1_air_phone_stairway2_hfrp.wav
+ data/rir/real/air_type1_air_phone_stairway2_hhp.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr2_imp060.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_90.wav
+ data/rir/real/RWCP_type2_rir_cirline_jr1_imp130.wav
+ data/rir/real/RWCP_type1_rir_cirline_e2b_imp030.wav
+ data/rir/real/RVB2014_type1_rir_largeroom2_near_angla.wav
+ data/rir/real/air_type1_air_binaural_lecture_0_6.wav
+ data/rir/real/RWCP_type1_rir_cirline_e2b_imp070.wav
+ data/rir/real/air_type1_air_phone_stairway1_hhp.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_45.wav
+ data/rir/real/RWCP_type1_rir_cirline_ofc_imp090.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_135.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_180.wav
+ data/rir/real/RWCP_type1_rir_cirline_ofc_imp100.wav
+ data/rir/real/RWCP_type1_rir_cirline_ofc_imp080.wav
+ data/rir/real/RWCP_type2_rir_cirline_ofc_imp090.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr2_imp080.wav
+ data/rir/real/air_type1_air_binaural_lecture_1_2.wav
+ data/rir/real/RWCP_type1_rir_cirline_ofc_imp070.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_150.wav
+ data/rir/real/air_type1_air_binaural_lecture_1_4.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_3_0_3.wav
+ data/rir/real/RVB2014_type1_rir_largeroom1_near_anglb.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_15.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_120.wav
+ data/rir/real/RWCP_type1_rir_cirline_ofc_imp050.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_1_90_3.wav
+ data/rir/real/air_type1_air_phone_stairway_hhp.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr2_imp120.wav
+ data/rir/real/RWCP_type2_rir_cirline_e2b_imp110.wav
+ data/rir/real/RWCP_type1_rir_cirline_e2b_imp010.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_15.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_135.wav
+ data/rir/real/air_type1_air_phone_bt_stairway_hhp.wav
+ data/rir/real/RWCP_type2_rir_cirline_e2b_imp070.wav
+ data/rir/real/RWCP_type1_rir_cirline_ofc_imp120.wav
+ data/rir/real/RWCP_type1_rir_cirline_ofc_imp110.wav
+ data/rir/real/air_type1_air_binaural_lecture_0_4.wav
+ data/rir/real/RWCP_type2_rir_cirline_ofc_imp050.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_90.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr2_imp090.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_0.wav
+ data/rir/real/air_type1_air_phone_stairway1_hfrp.wav
+ data/rir/real/air_type1_air_binaural_lecture_1_3.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr1_imp050.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr1_imp080.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_165.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_45.wav
+ data/rir/real/air_type1_air_phone_bt_corridor_hhp.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_2_90_3.wav
+ data/rir/real/RWCP_type2_rir_cirline_ofc_imp110.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_120.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_3_180_3.wav
+ data/rir/real/RWCP_type1_rir_cirline_e2b_imp110.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr1_imp060.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_45.wav
+ data/rir/real/RVB2014_type1_rir_largeroom2_far_angla.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_60.wav
+ data/rir/real/RWCP_type2_rir_cirline_jr1_imp070.wav
+ data/rir/real/RWCP_type1_rir_cirline_ofc_imp130.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_3_135_3.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_75.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_180.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr1_imp120.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_60.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_105.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_135.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_3_45_3.wav
+ data/rir/real/air_type1_air_binaural_lecture_1_6.wav
+ data/rir/real/RWCP_type2_rir_cirline_e2b_imp090.wav
+ data/rir/real/RWCP_type1_rir_cirline_e2b_imp170.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_90.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr2_imp070.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr1_imp110.wav
+ data/rir/real/air_type1_air_phone_lecture_hfrp.wav
+ data/rir/real/RVB2014_type1_rir_largeroom2_near_anglb.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_165.wav
+ data/rir/real/RWCP_type2_rir_cirline_ofc_imp130.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_1_150.wav
+ data/rir/real/RWCP_type1_rir_cirline_jr1_imp090.wav
+ data/rir/real/RWCP_type2_rir_cirline_e2b_imp130.wav
+ data/rir/real/RWCP_type1_rir_cirline_ofc_imp060.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_180.wav
+ data/rir/real/RWCP_type2_rir_cirline_jr1_imp050.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_3_30.wav
+ data/rir/real/air_type1_air_binaural_lecture_0_2.wav
+ data/rir/real/air_type1_air_binaural_aula_carolina_1_6_90_3.wav
+ data/rir/real/RWCP_type2_rir_cirline_e2b_imp050.wav
+ data/rir/real/RWCP_type1_rir_cirline_e2b_imp090.wav
+ data/rir/real/air_type1_air_phone_corridor_hhp.wav
+ data/rir/real/air_type1_air_binaural_stairway_1_2_120.wav
@@ -0,0 +1,5 @@
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox1_dev_wav_partaa e395d020928bc15670b570a21695ed96
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox1_dev_wav_partab bbfaaccefab65d82b21903e81a8a8020
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox1_dev_wav_partac 017d579a2a96a077f40042ec33e51512
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox1_dev_wav_partad 7bb1e9f70fddc7a678fa998ea8b3ba19
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102
voicebox/scripts/downloads/voxceleb1_files.txt ADDED
@@ -0,0 +1 @@
+ vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f532ba230b
voicebox/scripts/downloads/voxceleb2_file_parts.txt ADDED
@@ -0,0 +1,9 @@
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partaa da070494c573e5c0564b1d11c3b20577
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partab 17fe6dab2b32b48abaf1676429cdd06f
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partac 1de58e086c5edf63625af1cb6d831528
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partad 5a043eb03e15c5a918ee6a52aad477f9
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partae cea401b624983e2d0b2a87fb5d59aa60
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partaf fc886d9ba90ab88e7880ee98effd6ae9
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partag d160ecc3f6ee3eed54d55349531cb42e
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partah 6b84a81b9af72a9d9eecbb3b1f602e65
+ http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_test_aac.zip 0d2b3ea430a821c33263b5ea37ede312
voicebox/scripts/downloads/voxceleb2_files.txt ADDED
@@ -0,0 +1 @@
+ vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402
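
Each line in these manifests is read by `download_voxceleb.py` as `<FMT, FILENAME, MD5>`: downloaded shards matching `FMT` are concatenated into `FILENAME`, whose checksum is then verified against `MD5`. A standalone sketch of that step for the VoxCeleb1 entry above (hypothetical; the repository performs this inside `concatenate()`):

```python
# Hypothetical standalone equivalent of the concatenate() step.
import glob
import hashlib

# merge the downloaded dev shards in lexicographic (partaa, partab, ...) order
with open('vox1_dev_wav.zip', 'wb') as out:
    for shard in sorted(glob.glob('vox1_dev_wav_parta*')):
        with open(shard, 'rb') as src:
            for chunk in iter(lambda: src.read(1 << 20), b''):
                out.write(chunk)

# verify the combined archive against the expected MD5
h = hashlib.md5()
with open('vox1_dev_wav.zip', 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
        h.update(chunk)
assert h.hexdigest() == 'ae63e55b951748cc486645f532ba230b', 'checksum mismatch'
```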
voicebox/scripts/experiments/evaluate.py ADDED
@@ -0,0 +1,915 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import torchaudio
7
+ import psutil
8
+ import pickle
9
+
10
+ import random
11
+ import argparse
12
+
13
+ import librosa as li
14
+ from sklearn.utils import shuffle
15
+ from sklearn.neighbors import NearestNeighbors
16
+
17
+ from pesq import pesq, NoUtterancesError
18
+
19
+ from tqdm import tqdm
20
+ from sklearn.preprocessing import LabelEncoder
21
+ import numpy as np
22
+ from pathlib import Path
23
+ from tqdm import tqdm
24
+ import builtins
25
+ import math
26
+ import jiwer
27
+ from jiwer import wer, cer
28
+
29
+ from typing import Iterable
30
+ from copy import deepcopy
31
+
32
+ from distutils.util import strtobool
33
+
34
+ from src.data import *
35
+ from src.constants import *
36
+ from src.models import *
37
+ from src.simulation import *
38
+ from src.preprocess import *
39
+ from src.attacks.offline import *
40
+ from src.loss import *
41
+ from src.pipelines import *
42
+ from src.utils import *
43
+
44
+ ################################################################################
45
+ # Evaluate attacks on speaker recognition systems
46
+ ################################################################################
47
+
48
+ EVAL_DATASET = "voxceleb" # "librispeech"
49
+ LOOKAHEAD = 5
50
+ VOICEBOX_PATH = VOICEBOX_PRETRAINED_PATH
51
+ UNIVERSAL_PATH = UNIVERSAL_PRETRAINED_PATH
52
+ BATCH_SIZE = 20 # evaluation batch size
53
+ N_QUERY = 15 # number of query utterances per speaker
54
+ N_CONDITION = 10 # number of conditioning utterances per speaker
55
+ N_ENROLL = 20 # number of enrolled utterances per speaker
56
+ ADV_ENROLL = False # evaluate under assumption adversarial audio is enrolled
57
+ TARGETS_TRAIN = 'centroid' # 'random', 'same', 'single', 'median'
58
+ TARGETS_TEST = 'centroid' # 'random', 'same', 'single', 'median'
59
+ TRANSFER = True # evaluate attacks on unseen model
60
+ DENOISER = False # evaluate with unseen denoiser defense applied to queries
61
+ SIMULATION = False # apply noisy channel simulation to all queries in evaluation
62
+ COMPUTE_OBJECTIVE_METRICS = True # PESQ, STOI
63
+
64
+
65
+ def set_random_seed(seed: int = 123):
66
+ """Set random seed to allow for reproducibility"""
67
+ random.seed(seed)
68
+ torch.manual_seed(seed)
69
+
70
+ if torch.backends.cudnn.is_available():
71
+ # torch.backends.cudnn.benchmark = True
72
+ torch.backends.cudnn.deterministic = True
73
+
74
+
75
+ def param_count(m: nn.Module, trainable: bool = False):
76
+ """Count the number of trainable parameters (weights) in a model"""
77
+ if trainable:
78
+ return builtins.sum(
79
+ [p.shape.numel() for p in m.parameters() if p.requires_grad])
80
+ else:
81
+ return builtins.sum([p.shape.numel() for p in m.parameters()])
82
+
83
+
84
+ def pad_sequence(sequences: list):
85
+
86
+ max_len = max([s.shape[-1] for s in sequences])
87
+
88
+ padded = torch.zeros(
89
+ (len(sequences), 1, max_len),
90
+ dtype=sequences[0].dtype,
91
+ device=sequences[0].device)
92
+
93
+ for i, s in enumerate(sequences):
94
+ padded[i, :, :s.shape[-1]] = s
95
+
96
+ return padded
97
+
98
+
99
+ @torch.no_grad()
100
+ def compute_embeddings_batch(audio: list,
101
+ p: Pipeline,
102
+ defense: nn.Module = nn.Identity()):
103
+ """Compute batched speaker embeddings"""
104
+
105
+ assert isinstance(p.model, SpeakerVerificationModel)
106
+ emb = [p(defense(audio[i].to(p.device))).to('cpu') for i in range(len(audio))]
107
+ emb = torch.cat(emb, dim=0)
108
+ return emb
109
+
110
+
111
+ @torch.no_grad()
112
+ def compute_transcripts_batch(audio: list, p: Pipeline):
113
+ """Compute batched transcripts"""
114
+
115
+ assert isinstance(p.model, SpeechRecognitionModel)
116
+ transcripts = []
117
+ for i in range(len(audio)):
118
+ t = p.model.transcribe(audio[i].to(p.device))
119
+ if isinstance(t, str):
120
+ transcripts.append(t)
121
+ elif isinstance(t, list):
122
+ transcripts.extend(t)
123
+
124
+ assert len(transcripts) == len(audio), f'Transcript format error'
125
+
126
+ return transcripts
127
+
128
+
129
+ @torch.no_grad()
130
+ def compute_attack_batch(audio: list,
131
+ a: TrainableAttack,
132
+ c: torch.Tensor):
133
+
134
+ if len(c) < len(audio):
135
+ c = c.repeat(len(audio), 1, 1)
136
+ adv = [a.perturbation(audio[i].to(a.pipeline.device),
137
+ y=c[i:i+1].to(a.pipeline.device)).to('cpu').reshape(1, 1, -1)
138
+ for i in range(len(audio))]
139
+ return adv
140
+
141
+
142
+ @torch.no_grad()
143
+ def compute_pesq(audio1: list, audio2: list, mode: str = 'wb'):
144
+
145
+ assert len(audio1) == len(audio2)
146
+ scores = []
147
+
148
+ for i in range(len(audio1)):
149
+ try:
150
+ scores.append(
151
+ pesq(DataProperties.get('sample_rate'),
152
+ tensor_to_np(audio1[i]).flatten(),
153
+ tensor_to_np(audio2[i]).flatten(),
154
+ mode)
155
+ )
156
+ except NoUtterancesError:
157
+ print("PESQ error, skipping audio file...")
158
+ return scores
159
+
160
+
161
+ @torch.no_grad()
162
+ def compute_stoi(audio1: list, audio2: list, extended: bool = False):
163
+
164
+ assert len(audio1) == len(audio2)
165
+ scores = []
166
+ for i in range(len(audio1)):
167
+ scores.append(
168
+ stoi(tensor_to_np(audio1[i]).flatten(),
169
+ tensor_to_np(audio2[i]).flatten(),
170
+ DataProperties.get('sample_rate'),
171
+ extended=extended)
172
+ )
173
+ return scores
174
+
175
+
176
+ @torch.no_grad()
177
+ def build_ls_dataset(pipelines: dict):
178
+ """
179
+ Build LibriSpeech evaluation dataset on disk holding:
180
+ * query audio
181
+ * query embeddings
182
+ * conditioning embeddings
183
+ * enrolled embeddings
184
+ * ground-truth query transcripts
185
+ """
186
+
187
+ # locate dataset
188
+ data_dir = LIBRISPEECH_DATA_DIR / 'train-clean-360'
189
+ cache_dir = CACHE_DIR / 'ls_wer_eval'
190
+ ensure_dir(cache_dir)
191
+
192
+ assert os.path.isdir(data_dir), \
193
+ f'LibriSpeech `train-clean-360` subset required for evaluation'
194
+
195
+ spkr_dirs = list(data_dir.glob("*/"))
196
+ spkr_dirs = [s_d for s_d in spkr_dirs if os.path.isdir(s_d)]
197
+
198
+ # catalog audio and load transcripts
199
+ for spkr_dir in tqdm(spkr_dirs, total=len(spkr_dirs), desc='Building dataset'):
200
+
201
+ # identify speaker
202
+ spkr_id = spkr_dir.parts[-1]
203
+
204
+ # check whether cached data exists for speaker
205
+ spkr_cache_dir = cache_dir / spkr_id
206
+ if os.path.isdir(spkr_cache_dir):
207
+ continue
208
+
209
+ # each recording session has a separate subdirectory
210
+ rec_dirs = list(spkr_dir.glob("*/"))
211
+ rec_dirs = [r_d for r_d in rec_dirs if os.path.isdir(r_d)]
212
+
213
+ # for each speaker, process & store necessary (non-adversarial) data
214
+ all_audio = []
215
+ all_transcripts = []
216
+
217
+ # for each recording session, extract all audio files and transcripts
218
+ for rec_dir in rec_dirs:
219
+
220
+ rec_id = rec_dir.parts[-1]
221
+ trans_fn = rec_dir / f"{spkr_id}-{rec_id}.trans.txt"
222
+
223
+ # open transcript file
224
+ with open(trans_fn, "r") as f:
225
+ trans_idx = f.readlines()
226
+
227
+ if len(trans_idx) == 0:
228
+ print(f"Error: empty transcript {trans_fn}")
229
+ continue
230
+
231
+ for line in trans_idx:
232
+
233
+ split_line = line.strip().split(" ")
234
+ audio_fn = rec_dir / f'{split_line[0]}.{LIBRISPEECH_EXT}'
235
+ transcript = " ".join(split_line[1:]).replace(" ", "|")
236
+
237
+ x, _ = li.load(audio_fn, mono=True, sr=16000)
238
+ all_audio.append(torch.as_tensor(x).reshape(1, 1, -1).float())
239
+ all_transcripts.append(transcript)
240
+
241
+ # shuffle audio and transcripts in same random order
242
+ all_audio, all_transcripts = shuffle(all_audio, all_transcripts)
243
+
244
+ # divide audio and transcripts
245
+ query_audio = all_audio[:N_QUERY]
246
+ query_transcripts = all_transcripts[:N_QUERY]
247
+ condition_audio = all_audio[N_QUERY:N_QUERY+N_CONDITION]
248
+ enroll_audio = all_audio[N_QUERY+N_CONDITION:][:N_ENROLL]
249
+
250
+ # check for sufficient audio in each category
251
+ if len(query_audio) < N_QUERY:
252
+ print(f"Error: insufficient query audio for speaker {spkr_id}")
253
+ continue
254
+ elif len(condition_audio) < N_CONDITION:
255
+ print(f"Error: insufficient conditioning audio for speaker {spkr_id}")
256
+ continue
257
+ elif len(enroll_audio) < N_ENROLL:
258
+ print(f"Error: insufficient enrollment audio for speaker {spkr_id}")
259
+ continue
260
+
261
+ # compute and save embeddings
262
+ for p_name, p in pipelines.items():
263
+
264
+ # compute and save query embeddings
265
+ query_emb = compute_embeddings_batch(query_audio, p)
266
+ f_query = spkr_cache_dir / p_name / 'query_emb.pt'
267
+ ensure_dir_for_filename(f_query)
268
+
269
+ # compute and save conditioning embeddings
270
+ condition_emb = compute_embeddings_batch(condition_audio, p)
271
+ f_condition = spkr_cache_dir / p_name / 'condition_emb.pt'
272
+ ensure_dir_for_filename(f_condition)
273
+
274
+ # compute and save enrolled embeddings
275
+ enroll_emb = compute_embeddings_batch(enroll_audio, p)
276
+ f_enroll = spkr_cache_dir / p_name / 'enroll_emb.pt'
277
+ ensure_dir_for_filename(f_enroll)
278
+
279
+ torch.save(query_emb, f_query)
280
+ torch.save(condition_emb, f_condition)
281
+ torch.save(enroll_emb, f_enroll)
282
+
283
+ # save query audio
284
+ f_audio = spkr_cache_dir / 'query_audio.pt'
285
+ torch.save(query_audio, f_audio)
286
+
287
+ # save query transcripts
288
+ f_transcript = spkr_cache_dir / 'query_trans.pt'
289
+ torch.save(query_transcripts, f_transcript)
290
+
291
+ @torch.no_grad()
292
+ def build_vc_dataset(pipelines: dict):
293
+ """
294
+ Build VoxCeleb evaluation dataset on disk holding:
295
+ * query audio
296
+ * query embeddings
297
+ * conditioning embeddings
298
+ * enrolled embeddings
299
+ """
300
+
301
+ # locate dataset
302
+ data_dir = VOXCELEB1_DATA_DIR / 'voxceleb1'
303
+ cache_dir = CACHE_DIR / 'vc_wer_eval'
304
+ ensure_dir(cache_dir)
305
+
306
+ assert os.path.isdir(data_dir), \
307
+ f'VoxCeleb1 dataset required for evaluation'
308
+
309
+ spkr_dirs = list(data_dir.glob("*/"))
310
+ spkr_dirs = [s_d for s_d in spkr_dirs if os.path.isdir(s_d)]
311
+
312
+ # catalog audio
313
+ for spkr_dir in tqdm(spkr_dirs, total=len(spkr_dirs), desc='Building dataset'):
314
+
315
+ # identify speaker
316
+ spkr_id = spkr_dir.parts[-1]
317
+
318
+ # check whether cached data exists for speaker
319
+ spkr_cache_dir = cache_dir / spkr_id
320
+ if os.path.isdir(spkr_cache_dir):
321
+ continue
322
+
323
+ # each recording session has a separate subdirectory
324
+ rec_dirs = list(spkr_dir.glob("*/"))
325
+ rec_dirs = [r_d for r_d in rec_dirs if os.path.isdir(r_d)]
326
+
327
+ # for each speaker, process & store necessary (non-adversarial) data
328
+ all_audio = []
329
+
330
+ # for each recording session, extract all audio files and transcripts
331
+ for rec_dir in rec_dirs:
332
+ for audio_fn in rec_dir.glob(f"*.{VOXCELEB1_EXT}"):
333
+ x, _ = li.load(audio_fn, mono=True, sr=16000)
334
+ all_audio.append(torch.as_tensor(x).reshape(1, 1, -1).float())
335
+
336
+ # shuffle audio in random order
337
+ all_audio = shuffle(all_audio)
338
+
339
+ # divide audio and transcripts
340
+ query_audio = all_audio[:N_QUERY]
341
+ condition_audio = all_audio[N_QUERY:N_QUERY+N_CONDITION]
342
+ enroll_audio = all_audio[N_QUERY+N_CONDITION:][:N_ENROLL]
343
+
344
+ # check for sufficient audio in each category
345
+ if len(query_audio) < N_QUERY:
346
+ print(f"Error: insufficient query audio for speaker {spkr_id}")
347
+ continue
348
+ elif len(condition_audio) < N_CONDITION:
349
+ print(f"Error: insufficient conditioning audio for speaker {spkr_id}")
350
+ continue
351
+ elif len(enroll_audio) < N_ENROLL:
352
+ print(f"Error: insufficient enrollment audio for speaker {spkr_id}")
353
+ continue
354
+
355
+ # compute and save embeddings
356
+ for p_name, p in pipelines.items():
357
+
358
+ # compute and save query embeddings
359
+ query_emb = compute_embeddings_batch(query_audio, p)
360
+ f_query = spkr_cache_dir / p_name / 'query_emb.pt'
361
+ ensure_dir_for_filename(f_query)
362
+
363
+ # compute and save conditioning embeddings
364
+ condition_emb = compute_embeddings_batch(condition_audio, p)
365
+ f_condition = spkr_cache_dir / p_name / 'condition_emb.pt'
366
+ ensure_dir_for_filename(f_condition)
367
+
368
+ # compute and save enrolled embeddings
369
+ enroll_emb = compute_embeddings_batch(enroll_audio, p)
370
+ f_enroll = spkr_cache_dir / p_name / 'enroll_emb.pt'
371
+ ensure_dir_for_filename(f_enroll)
372
+
373
+ torch.save(query_emb, f_query)
374
+ torch.save(condition_emb, f_condition)
375
+ torch.save(enroll_emb, f_enroll)
376
+
377
+ # save query audio
378
+ f_audio = spkr_cache_dir / 'query_audio.pt'
379
+ torch.save(query_audio, f_audio)
380
+
381
+ @torch.no_grad()
382
+ def asr_metrics(true: list, hypothesis: list, batch_size: int = 5):
383
+ """
384
+ Compute word and character error rates between two lists of corresponding
385
+ transcripts
386
+ """
387
+
388
+ assert len(true) == len(hypothesis)
389
+
390
+ n_batches = math.ceil(len(true) / batch_size)
391
+
392
+ transform_wer = jiwer.Compose([
393
+ jiwer.ToLowerCase(),
394
+ jiwer.RemoveWhiteSpace(replace_by_space=True),
395
+ jiwer.RemoveMultipleSpaces(),
396
+ jiwer.ReduceToSingleSentence(word_delimiter="|"),
397
+ jiwer.ReduceToListOfListOfWords(word_delimiter="|"),
398
+ ])
399
+
400
+ wer_score = 0.0
401
+ cer_score = 0.0
402
+
403
+ wer_n = 0
404
+ cer_n = 0
405
+
406
+ for i in range(n_batches):
407
+
408
+ batch_true = true[i*batch_size:(i+1)*batch_size]
409
+ batch_hypothesis = hypothesis[i*batch_size:(i+1)*batch_size]
410
+
411
+ wer_n_batch = builtins.sum([len(s.split('|')) for s in batch_true])
412
+ cer_n_batch = builtins.sum([len(s) for s in batch_true])
413
+
414
+ attack_cer = cer(batch_true, batch_hypothesis)
415
+ attack_wer = wer(batch_true, batch_hypothesis,
416
+ truth_transform=transform_wer,
417
+ hypothesis_transform=transform_wer)
418
+
419
+ wer_score += wer_n_batch*attack_wer
420
+ cer_score += cer_n_batch*attack_cer
421
+
422
+ wer_n += wer_n_batch
423
+ cer_n += cer_n_batch
424
+
425
+ wer_score /= wer_n
426
+ cer_score /= cer_n
427
+
428
+ return wer_score, cer_score
429
+
430
+
431
+ @torch.no_grad()
432
+ def top_k(query: dict, enrolled: dict, k: int):
433
+ """
434
+ Compute portion of queries for which 'correct' ID appears in k-closest
435
+ enrolled entries
436
+ """
437
+
438
+ # concatenate query embeddings into single tensor
439
+ query_array = []
440
+ query_ids = []
441
+
442
+ for s_l in query.keys():
443
+ query_array.append(query[s_l])
444
+ query_ids.extend([s_l] * len(query[s_l]))
445
+
446
+ query_array = torch.cat(query_array, dim=0).squeeze().cpu().numpy()
447
+ query_ids = torch.as_tensor(query_ids).cpu().numpy()
448
+
449
+ # concatenate enrolled embeddings into single tensor
450
+ enrolled_array = []
451
+ enrolled_ids = []
452
+
453
+ for s_l in enrolled.keys():
454
+ enrolled_array.append(enrolled[s_l])
455
+ enrolled_ids.extend([s_l] * len(enrolled[s_l]))
456
+
457
+ enrolled_array = torch.cat(enrolled_array, dim=0).squeeze().cpu().numpy()
458
+ enrolled_ids = torch.as_tensor(enrolled_ids).cpu().numpy()
459
+
460
+ # embedding dimension
461
+ assert query_array.shape[-1] == enrolled_array.shape[-1]
462
+ d = query_array.shape[-1]
463
+
464
+ # index enrolled embeddings
465
+ knn = NearestNeighbors(n_neighbors=k, metric="cosine").fit(enrolled_array)
466
+
467
+ # `I` is a (n_queries, k) array holding the indices of the k-closest enrolled
468
+ # embeddings for each query; `D` is a (n_queries, k) array holding the corresponding
469
+ # embedding-space distances
470
+ D, I = knn.kneighbors(query_array, k, return_distance=True)
471
+
472
+ # for each row, see if at least one of the k nearest enrolled indices maps
473
+ # to a speaker ID that matches the query index's speaker id
474
+ targets = np.tile(query_ids.reshape(-1, 1), (1, k))
475
+
476
+ predictions = enrolled_ids[I]
477
+ matches = (targets == predictions).sum(axis=-1) > 0
478
+
479
+ return np.mean(matches)
480
+
481
+
+ def init_attacks():
+     """
+     Initialize pre-trained speaker recognition pipelines and de-identification
+     attacks
+     """
+
+     # channel simulation
+     if SIMULATION:
+         sim = [
+             Offset(length=[-.15, .15]),
+             Noise(type='gaussian', snr=[30.0, 50.0]),
+             Bandpass(low=[300, 500], high=[3400, 7400]),
+             Dropout(rate=0.001)
+         ]
+     else:
+         sim = None
+
+     pipelines = {}
+
+     model_resnet = SpeakerVerificationModel(
+         model=ResNetSE34V2(nOut=512, encoder_type='ASP'),
+         n_segments=1,
+         segment_select='lin',
+         distance_fn='cosine',
+         threshold=0.0
+     )
+     model_resnet.load_weights(
+         MODELS_DIR / 'speaker' / 'resnetse34v2' / 'resnetse34v2.pt')
+
+     model_yvector = SpeakerVerificationModel(
+         model=YVector(),
+         n_segments=1,
+         segment_select='lin',
+         distance_fn='cosine',
+         threshold=0.0
+     )
+     model_yvector.load_weights(
+         MODELS_DIR / 'speaker' / 'yvector' / 'yvector.pt')
+
+     pipelines['resnet'] = Pipeline(
+         simulation=sim,
+         preprocessor=Preprocessor(Normalize(method='peak')),
+         model=model_resnet,
+         device='cuda' if torch.cuda.is_available() else 'cpu'
+     )
+
+     if TRANSFER:
+         pipelines['yvector'] = Pipeline(
+             simulation=sim,
+             preprocessor=Preprocessor(Normalize(method='peak')),
+             model=model_yvector,
+             device='cuda' if torch.cuda.is_available() else 'cpu'
+         )
+     else:
+         del model_yvector
+
+     # prepare to log attack progress
+     writer = Writer(
+         root_dir=RUNS_DIR,
+         name='evaluate-attacks',
+         use_timestamp=True,
+         log_iter=300,
+         use_tb=True
+     )
+
+     attacks = {}
+
+     # use consistent adversarial loss
+     adv_loss = SpeakerEmbeddingLoss(
+         targeted=False,
+         confidence=0.1,
+         threshold=0.0
+     )
+
+     # use consistent auxiliary loss across attacks
+     aux_loss = SumLoss().add_loss_function(
+         DemucsMRSTFTLoss(), 1.0
+     ).add_loss_function(L1Loss(), 1.0).to('cuda')
+
+     attacks['voicebox'] = VoiceBoxAttack(
+         pipeline=pipelines['resnet'],
+         adv_loss=adv_loss,
+         aux_loss=aux_loss,
+         lr=1e-4,
+         epochs=1,
+         batch_size=BATCH_SIZE,
+         voicebox_kwargs={
+             'win_length': 256,
+             'ppg_encoder_hidden_size': 256,
+             'use_phoneme_encoder': True,
+             'use_pitch_encoder': True,
+             'use_loudness_encoder': True,
+             'spec_encoder_lookahead_frames': 0,
+             'spec_encoder_type': 'mel',
+             'spec_encoder_mlp_depth': 2,
+             'bottleneck_lookahead_frames': LOOKAHEAD,
+             'ppg_encoder_path': PPG_PRETRAINED_PATH,
+             'n_bands': 128,
+             'spec_encoder_hidden_size': 512,
+             'bottleneck_skip': True,
+             'bottleneck_hidden_size': 512,
+             'bottleneck_feedforward_size': 512,
+             'bottleneck_type': 'lstm',
+             'bottleneck_depth': 2,
+             'control_eps': 0.5,
+             'projection_norm': float('inf'),
+             'conditioning_dim': 512
+         },
+         writer=writer,
+         checkpoint_name='voicebox-attack'
+     )
+     attacks['voicebox'].load(VOICEBOX_PATH)
+
+     attacks['universal'] = AdvPulseAttack(
+         pipeline=pipelines['resnet'],
+         adv_loss=adv_loss,
+         pgd_norm=float('inf'),
+         pgd_variant=None,
+         scale_grad=None,
+         eps=0.08,
+         length=2.0,
+         align='start',
+         lr=1e-4,
+         normalize=True,
+         loop=True,
+         aux_loss=aux_loss,
+         epochs=1,
+         batch_size=BATCH_SIZE,
+         writer=writer,
+         checkpoint_name='universal-attack'
+     )
+     attacks['universal'].load(UNIVERSAL_PATH)
+
+     attacks['kenansville'] = KenansvilleAttack(
+         pipeline=pipelines['resnet'],
+         batch_size=BATCH_SIZE,
+         adv_loss=adv_loss,
+         threshold_db_low=4.0,  # fix threshold
+         threshold_db_high=4.0,
+         win_length=512,
+         writer=writer,
+         step_size=1.0,
+         search='bisection',
+         min_success_rate=0.2,
+         checkpoint_name='kenansville-attack'
+     )
+
+     attacks['noise'] = WhiteNoiseAttack(
+         pipeline=pipelines['resnet'],
+         adv_loss=adv_loss,
+         aux_loss=aux_loss,
+         snr_low=-10.0,  # fix threshold
+         snr_high=-10.0,
+         writer=writer,
+         step_size=1,
+         search='bisection',
+         min_success_rate=0.2,
+         checkpoint_name='noise-perturbation'
+     )
+
+     return attacks, pipelines, writer
+
+
+ @torch.no_grad()
+ def evaluate_attack(attack: TrainableAttack,
+                     speaker_pipeline: Pipeline,
+                     asr_pipeline: Pipeline):
+
+     if DENOISER:
+         from src.models.denoiser.demucs import load_demucs
+         defense = load_demucs('dns_48').to(
+             'cuda' if torch.cuda.is_available() else 'cpu')
+         defense.eval()
+     else:
+         defense = nn.Identity()
+
+     # prepare for GPU inference
+     if torch.cuda.is_available():
+
+         attack.pipeline.set_device('cuda')
+         speaker_pipeline.set_device('cuda')
+         asr_pipeline.set_device('cuda')
+         attack.perturbation.to('cuda')
+
+     # locate dataset
+     if EVAL_DATASET == "librispeech":
+         cache_dir = CACHE_DIR / 'ls_wer_eval'
+     else:
+         cache_dir = CACHE_DIR / 'vc_wer_eval'
+     assert os.path.isdir(cache_dir), \
+         'Dataset must be built/cached before evaluation'
+
+     # prepare for PESQ/STOI calculations
+     all_pesq_scores = []
+     all_stoi_scores = []
+
+     # prepare for WER/CER computations
+     all_query_transcripts = []
+     all_pred_query_transcripts = []
+     all_adv_query_transcripts = []
+
+     # prepare for accuracy computations
+     all_query_emb = {}
+     all_adv_query_emb = {}
+     all_enroll_emb = {}
+     all_enroll_emb_centroid = {}
+
+     spkr_dirs = list(cache_dir.glob("*/"))
+     spkr_dirs = [s_d for s_d in spkr_dirs if os.path.isdir(s_d)]
+     for spkr_dir in tqdm(spkr_dirs, total=len(spkr_dirs), desc='Running evaluation'):
+
+         # identify speaker
+         spkr_id = spkr_dir.parts[-1]
+
+         # use integer IDs
+         if EVAL_DATASET != "librispeech":
+             spkr_id = spkr_id.split("id")[-1]
+
+         # identify speaker recognition model
+         if isinstance(speaker_pipeline.model.model, ResNetSE34V2):
+             model_name = 'resnet'
+         elif isinstance(speaker_pipeline.model.model, YVector):
+             model_name = 'yvector'
+         else:
+             raise ValueError('Invalid speaker recognition model')
+
+         # load clean embeddings
+         query_emb = torch.load(spkr_dir / model_name / 'query_emb.pt')
+         condition_emb = torch.load(spkr_dir / 'resnet' / 'condition_emb.pt')
+         enroll_emb = torch.load(spkr_dir / model_name / 'enroll_emb.pt')
+
+         # load clean audio
+         query_audio = torch.load(spkr_dir / 'query_audio.pt')
+
+         # if defense is in use, re-compute query embeddings
+         if DENOISER:
+             query_emb = compute_embeddings_batch(
+                 query_audio, speaker_pipeline, defense=defense
+             )
+
+         # load clean transcript
+         if EVAL_DATASET == "librispeech":
+             query_transcripts = torch.load(spkr_dir / 'query_trans.pt')
+         else:
+             query_transcripts = None
+
+         # compute conditioning embedding centroid
+         condition_centroid = condition_emb.mean(dim=(0, 1), keepdim=True)
+
+         # compute enrolled embedding centroid
+         enroll_centroid = enroll_emb.mean(dim=(0, 1), keepdim=True)
+
+         # compute adversarial query audio
+         adv_query_audio = compute_attack_batch(
+             query_audio, attack, condition_centroid)
+
+         # compute adversarial query embeddings; optionally, pass through
+         # unseen denoiser defense
+         adv_query_emb = compute_embeddings_batch(
+             adv_query_audio, speaker_pipeline, defense=defense
+         )
+
+         if EVAL_DATASET == "librispeech":
+
+             # compute clean predicted transcripts
+             pred_query_transcripts = compute_transcripts_batch(
+                 query_audio, asr_pipeline
+             )
+
+             # compute adversarial transcripts
+             adv_query_transcripts = compute_transcripts_batch(
+                 adv_query_audio, asr_pipeline
+             )
+
+         # compute objective quality metric scores
+         if COMPUTE_OBJECTIVE_METRICS:
+             pesq_scores = compute_pesq(query_audio, adv_query_audio)
+             stoi_scores = compute_stoi(query_audio, adv_query_audio)
+         else:
+             pesq_scores = np.zeros(len(query_audio))
+             stoi_scores = np.zeros(len(query_audio))
+
+         # store all objective quality metric scores
+         all_pesq_scores.extend(pesq_scores)
+         all_stoi_scores.extend(stoi_scores)
+
+         # store all unit-normalized clean, adversarial, and enrolled centroid
+         # embeddings
+         all_query_emb[int(spkr_id)] = F.normalize(query_emb.clone(), dim=-1)
+         all_adv_query_emb[int(spkr_id)] = F.normalize(adv_query_emb.clone(), dim=-1)
+         all_enroll_emb[int(spkr_id)] = F.normalize(enroll_emb.clone(), dim=-1)
+         all_enroll_emb_centroid[int(spkr_id)] = F.normalize(enroll_centroid.clone(), dim=-1)
+
+         # store all transcripts
+         if EVAL_DATASET == "librispeech":
+             all_query_transcripts.extend(query_transcripts)
+             all_pred_query_transcripts.extend(pred_query_transcripts)
+             all_adv_query_transcripts.extend(adv_query_transcripts)
+
+     # free GPU memory for similarity search
+     attack.pipeline.set_device('cpu')
+     speaker_pipeline.set_device('cpu')
+     asr_pipeline.set_device('cpu')
+     attack.perturbation.to('cpu')
+     torch.cuda.empty_cache()
+
+     # compute and display final objective quality metrics
+     print(f"PESQ (mean/std): {np.mean(all_pesq_scores)}/{np.std(all_pesq_scores)}")
+     print(f"STOI (mean/std): {np.mean(all_stoi_scores)}/{np.std(all_stoi_scores)}")
+
+     if EVAL_DATASET == "librispeech":
+
+         # compute and display final WER/CER metrics
+         wer, cer = asr_metrics(all_query_transcripts, all_adv_query_transcripts)
+         print(f"Adversarial WER / CER: {wer} / {cer}")
+
+         wer, cer = asr_metrics(all_query_transcripts, all_pred_query_transcripts)
+         print(f"Clean WER / CER: {wer} / {cer}")
+
+     else:
+         wer, cer = None, None
+
+     del (wer, cer, all_pesq_scores, all_stoi_scores,
+          all_query_transcripts, all_adv_query_transcripts, all_pred_query_transcripts)
+
+     # embedding-space cosine distance calculations
+     cos_dist_fn = EmbeddingDistance(distance_fn='cosine')
+
+     # mean clean-to-adversarial query embedding distance
+     total_query_dist = 0.0
+     n = 0
+     for spkr_id in all_query_emb.keys():
+         dist = cos_dist_fn(all_query_emb[spkr_id],
+                            all_adv_query_emb[spkr_id]).mean()
+         total_query_dist += len(all_query_emb[spkr_id]) * dist.item()
+         n += len(all_query_emb[spkr_id])
+     mean_query_dist = total_query_dist / n
+     print(f"\n\t\tMean cosine distance between clean and adversarial query "
+           f"embeddings: {mean_query_dist :0.4f}")
+
+     # mean adversarial-query-to-enrolled-centroid embedding distance
+     total_centroid_dist = 0.0
+     n = 0
+     for spkr_id in all_query_emb.keys():
+         n_queries = len(all_adv_query_emb[spkr_id])
+         dist = 0.0
+         for i in range(n_queries):
+             dist += cos_dist_fn(all_enroll_emb_centroid[spkr_id],
+                                 all_adv_query_emb[spkr_id][i:i+1]).item()
+         total_centroid_dist += dist
+         n += n_queries
+     mean_centroid_dist = total_centroid_dist / n
+     print(f"\t\tMean cosine distance between clean enrolled centroids and "
+           f"adversarial query embeddings: {mean_centroid_dist :0.4f}")
+
+     # top-1 accuracy for clean queries (closest embedding)
+     top_1_clean_single = top_k(all_query_emb, all_enroll_emb, k=1)
+
+     # top-1 accuracy for clean queries (centroid embedding)
+     top_1_clean_centroid = top_k(all_query_emb, all_enroll_emb_centroid, k=1)
+
+     # top-10 accuracy for clean queries (closest embedding)
+     top_10_clean_single = top_k(all_query_emb, all_enroll_emb, k=10)
+
+     # top-10 accuracy for clean queries (centroid embedding)
+     top_10_clean_centroid = top_k(all_query_emb, all_enroll_emb_centroid, k=10)
+
+     # top-1 accuracy for adversarial queries (closest embedding)
+     top_1_adv_single = top_k(all_adv_query_emb, all_enroll_emb, k=1)
+
+     # top-1 accuracy for adversarial queries (centroid embedding)
+     top_1_adv_centroid = top_k(all_adv_query_emb, all_enroll_emb_centroid, k=1)
+
+     # top-10 accuracy for adversarial queries (closest embedding)
+     top_10_adv_single = top_k(all_adv_query_emb, all_enroll_emb, k=10)
+
+     # top-10 accuracy for adversarial queries (centroid embedding)
+     top_10_adv_centroid = top_k(all_adv_query_emb, all_enroll_emb_centroid, k=10)
+
+     print(f"\n\t\tTop-1 accuracy (clean embedding / nearest enrolled embedding) {top_1_clean_single :0.4f}",
+           f"\n\t\tTop-1 accuracy (clean embedding / nearest enrolled centroid) {top_1_clean_centroid :0.4f}",
+           f"\n\t\tTop-10 accuracy (clean embedding / nearest enrolled embedding) {top_10_clean_single :0.4f}",
+           f"\n\t\tTop-10 accuracy (clean embedding / nearest enrolled centroid) {top_10_clean_centroid :0.4f}",
+           f"\n\t\tTop-1 accuracy (adversarial embedding / nearest enrolled embedding) {top_1_adv_single :0.4f}",
+           f"\n\t\tTop-1 accuracy (adversarial embedding / nearest enrolled centroid) {top_1_adv_centroid :0.4f}",
+           f"\n\t\tTop-10 accuracy (adversarial embedding / nearest enrolled embedding) {top_10_adv_single :0.4f}",
+           f"\n\t\tTop-10 accuracy (adversarial embedding / nearest enrolled centroid) {top_10_adv_centroid :0.4f}"
+           )
+
+
+ @torch.no_grad()
+ def evaluate_attacks(attacks: dict,
+                      speaker_pipelines: dict,
+                      asr_pipeline: Pipeline):
+
+     for attack_name, attack in attacks.items():
+         for sp_name, sp in speaker_pipelines.items():
+             print(f'Evaluating {attack_name} against model {sp_name} '
+                   f'{"with" if DENOISER else "without"} denoiser defense')
+             evaluate_attack(attack, sp, asr_pipeline)
+
+
+ def main():
+
+     # initial random seed (keep dataset order consistent)
+     set_random_seed(0)
+
+     # initialize attacks and pipelines
+     attacks, pipelines, writer = init_attacks()
+
+     # ensure that necessary data is cached
+     if EVAL_DATASET == "librispeech":
+         build_ls_dataset(pipelines)
+     else:
+         build_vc_dataset(pipelines)
+
+     # initialize ASR model
+     asr_model = SpeechRecognitionModel(
+         model=Wav2Vec2(),
+     )
+     asr_pipeline = Pipeline(
+         model=asr_model,
+         preprocessor=Preprocessor(Normalize(method='peak')),
+         device='cuda' if torch.cuda.is_available() else 'cpu'
+     )
+
+     writer.log_cuda_memory()
+
+     evaluate_attacks(attacks, pipelines, asr_pipeline)
+
+
+ if __name__ == "__main__":
+     main()
+
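For reference, the closed-set identification metric above is a k-nearest-neighbor lookup over unit-normalized speaker embeddings. A minimal, self-contained sketch of the same computation on random data (speaker counts and shapes are illustrative only, not taken from the evaluation cache):

```python
import numpy as np
import torch
from sklearn.neighbors import NearestNeighbors

# toy data: 3 speakers, 4 query and 8 enrolled embeddings each, dimension 512
query = {s: torch.randn(4, 512) for s in range(3)}
enrolled = {s: torch.randn(8, 512) for s in range(3)}

k = 1
q = torch.cat(list(query.values())).numpy()
q_ids = np.repeat(list(query.keys()), [len(v) for v in query.values()])
e = torch.cat(list(enrolled.values())).numpy()
e_ids = np.repeat(list(enrolled.keys()), [len(v) for v in enrolled.values()])

# index enrolled embeddings, then find the k nearest neighbors of each query
knn = NearestNeighbors(n_neighbors=k, metric="cosine").fit(e)
idx = knn.kneighbors(q, k, return_distance=False)

# a query counts as correct if any of its k neighbors shares its speaker ID
top_k_acc = ((e_ids[idx] == q_ids.reshape(-1, 1)).sum(axis=-1) > 0).mean()
print(f"top-{k} accuracy on random embeddings: {top_k_acc:.3f}")  # near chance
```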
voicebox/scripts/experiments/train.py ADDED
@@ -0,0 +1,282 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torchaudio
+ import psutil
+ import pickle
+ import librosa as li
+
+ from torch.utils.data import TensorDataset
+
+ import time
+ import random
+ import argparse
+ from datetime import datetime
+
+ import numpy as np
+ import pandas as pd
+ from typing import Dict
+ from pathlib import Path
+ from tqdm import tqdm
+ import builtins
+
+ from typing import Iterable
+ from copy import deepcopy
+
+ from distutils.util import strtobool
+
+ from src.data import *
+ from src.constants import *
+ from src.models import *
+ from src.simulation import *
+ from src.preprocess import *
+ from src.attacks.offline import *
+ from src.loss import *
+ from src.pipelines import *
+ from src.utils import *
+
+ ################################################################################
+ # Train VoiceBox attack
+ ################################################################################
+
+ BATCH_SIZE = 20   # training batch size
+ EPOCHS = 10       # training epochs
+ TARGET_PCTL = 25  # de-identification strength; in [1,5,10,15,20,25,50,90,100]
+ N_EMBEDDINGS_TRAIN = 15
+ TARGETED = False
+ TARGETS_TRAIN = 'centroid'  # 'random', 'same', 'single', 'median'
+ TARGETS_TEST = 'centroid'   # 'random', 'same', 'single', 'median'
+
+ # distributions of inter- ('targeted') and intra- ('untargeted') speaker
+ # distances in each pre-trained model's embedding space, as measured between
+ # individual utterances and their speaker centroid ('single-centroid') or
+ # between all pairs of individual utterances ('single-single') over the
+ # LibriSpeech test-clean dataset. This allows specification of attack strength
+ # during the training process
+ percentiles = {
+     'resnet': {
+         'targeted': {
+             'single-centroid': {1: .495, 5: .572, 10: .617, 15: .648, 20: .673, 25: .695, 50: .773, 90: .892, 100: 1.127},
+             'single-single': {1: .560, 5: .630, 10: .672, 15: .700, 20: .722, 25: .742, 50: .813, 90: .924, 100: 1.194}
+         },
+         'untargeted': {
+             'single-centroid': {1: .099, 5: .117, 10: .126, 15: .133, 20: .139, 25: .145, 50: .170, 90: .253, 100: .587},
+             'single-single': {1: .181, 5: .215, 10: .235, 15: .249, 20: .262, 25: .272, 50: .323, 90: .464, 100: .817}
+         },
+     },
+     'yvector': {
+         'targeted': {
+             'single-centroid': {1: .665, 5: .757, 10: .801, 15: .830, 20: .851, 25: .868, 50: .936, 90: 1.056, 100: 1.312},
+             'single-single': {1: .695, 5: .779, 10: .821, 15: .847, 20: .868, 25: .885, 50: .952, 90: 1.072, 100: 1.428}
+         },
+         'untargeted': {
+             'single-single': {1: .218, 5: .268, 10: .301, 15: .325, 20: .345, 25: .365, 50: .455, 90: .684, 100: 1.156},
+             'single-centroid': {1: .114, 5: .143, 10: .159, 15: .170, 20: .180, 25: .190, 50: .242, 90: .413, 100: .874}
+         }
+     },
+ }
+
+
+ def set_random_seed(seed: int = 123):
+     """Set random seed to allow for reproducibility"""
+     random.seed(seed)
+     torch.manual_seed(seed)
+
+     if torch.backends.cudnn.is_available():
+         # torch.backends.cudnn.benchmark = True
+         torch.backends.cudnn.deterministic = True
+
+
+ def param_count(m: nn.Module, trainable: bool = False):
+     """Count the number of parameters (weights) in a model, optionally
+     counting only those that are trainable"""
+     if trainable:
+         return builtins.sum(
+             [p.shape.numel() for p in m.parameters() if p.requires_grad])
+     else:
+         return builtins.sum([p.shape.numel() for p in m.parameters()])
+
+
+ def main():
+
+     set_random_seed(0)
+
+     model = SpeakerVerificationModel(
+         model=ResNetSE34V2(nOut=512, encoder_type='ASP'),
+         n_segments=1,
+         segment_select='lin',
+         distance_fn='cosine',
+         threshold=percentiles['resnet']['targeted'][
+             'single-centroid' if TARGETS_TRAIN == 'centroid' else 'single-single'
+         ][TARGET_PCTL]
+     )
+     model.load_weights(MODELS_DIR / 'speaker' / 'resnetse34v2' / 'resnetse34v2.pt')
+
+     # instantiate training pipeline
+     pipeline = Pipeline(
+         simulation=None,
+         preprocessor=Preprocessor(Normalize(method='peak')),
+         model=model,
+         device='cuda' if torch.cuda.is_available() else 'cpu'
+     )
+
+     attacks = {}
+
+     # log training progress
+     writer = Writer(
+         root_dir=RUNS_DIR,
+         name='train-attacks',
+         use_timestamp=True,
+         log_iter=300,
+         use_tb=True
+     )
+
+     # adversarial training loss
+     adv_loss = SpeakerEmbeddingLoss(
+         targeted=TARGETED,
+         confidence=0.1,
+         threshold=pipeline.model.threshold
+     )
+
+     # auxiliary loss
+     aux_loss = SumLoss().add_loss_function(
+         DemucsMRSTFTLoss(), 1.0
+     ).add_loss_function(L1Loss(), 1.0).to('cuda')
+
+     # speech features loss actually seems to do better...
+     # aux_loss = SumLoss().add_loss_function(SpeechFeatureLoss(), 1e-6).to('cuda')
+
+     attacks['voicebox'] = VoiceBoxAttack(
+         pipeline=pipeline,
+         adv_loss=adv_loss,
+         aux_loss=aux_loss,
+         lr=1e-4,
+         epochs=EPOCHS,
+         batch_size=BATCH_SIZE,
+         voicebox_kwargs={
+             'win_length': 256,
+             'ppg_encoder_hidden_size': 256,
+             'use_phoneme_encoder': True,
+             'use_pitch_encoder': True,
+             'use_loudness_encoder': True,
+             'spec_encoder_lookahead_frames': 0,
+             'spec_encoder_type': 'mel',
+             'spec_encoder_mlp_depth': 2,
+             'bottleneck_lookahead_frames': 5,
+             'ppg_encoder_path': PPG_PRETRAINED_PATH,
+             'n_bands': 128,
+             'spec_encoder_hidden_size': 512,
+             'bottleneck_skip': True,
+             'bottleneck_hidden_size': 512,
+             'bottleneck_feedforward_size': 512,
+             'bottleneck_type': 'lstm',
+             'bottleneck_depth': 2,
+             'control_eps': 0.5,
+             'projection_norm': float('inf'),
+             'conditioning_dim': 512
+         },
+         writer=writer,
+         checkpoint_name='voicebox-attack'
+     )
+
+     attacks['universal'] = AdvPulseAttack(
+         pipeline=pipeline,
+         adv_loss=adv_loss,
+         pgd_norm=float('inf'),
+         pgd_variant=None,
+         scale_grad=None,
+         eps=0.08,
+         length=2.0,
+         align='random',  # 'start',
+         lr=1e-4,
+         normalize=True,
+         loop=True,
+         aux_loss=aux_loss,
+         epochs=EPOCHS,
+         batch_size=BATCH_SIZE,
+         writer=writer,
+         checkpoint_name='universal-attack'
+     )
+
+     if torch.cuda.is_available():
+
+         # prepare for multi-GPU training
+         device_ids = get_cuda_device_ids()
+
+         # wrap pipeline for multi-GPU training
+         pipeline = wrap_pipeline_multi_gpu(pipeline, device_ids)
+
+     # load training and validation datasets. Features will be computed and
+     # cached to disk, which may take some time
+     data_train = LibriSpeechDataset(
+         split='train-clean-100', features=['pitch', 'periodicity', 'loudness'])
+     data_test = LibriSpeechDataset(
+         split='test-clean', features=['pitch', 'periodicity', 'loudness'])
+
+     # reassign targets if necessary
+     compiled_train, compiled_test = create_embedding_dataset(
+         pipeline=pipeline,
+         select_train=TARGETS_TRAIN,
+         select_test=TARGETS_TEST,
+         data_train=data_train,
+         data_test=data_test,
+         targeted=TARGETED,
+         target_class=None,
+         num_embeddings_train=N_EMBEDDINGS_TRAIN,
+         batch_size=20
+     )
+
+     # extract embedding datasets
+     data_train = compiled_train['dataset']
+     data_test = compiled_test['dataset']
+
+     # log memory use prior to training
+     writer.log_info(f'Training data ready; memory use: '
+                     f'{psutil.virtual_memory().percent :0.3f}%')
+     writer.log_cuda_memory()
+
+     for attack_name, attack in attacks.items():
+
+         writer.log_info(f'Preparing {attack_name}...')
+
+         if torch.cuda.is_available():
+
+             attack.perturbation.to('cuda')
+             attack.pipeline.to('cuda')
+
+             # wrap attack for multi-GPU training
+             attack = wrap_attack_multi_gpu(attack, device_ids)
+
+         # benchmark perturbation processing speed
+         with torch.no_grad():
+             x_example = next(iter(data_train))['x'].to(pipeline.device)
+             st = time.time()
+             outs = attack.perturbation(x_example)
+             dur = time.time() - st
+
+         writer.log_info(
+             f'Processing time per input (device: '
+             f'{pipeline.device}): {dur/x_example.shape[0] :0.4f} (s)'
+         )
+         writer.log_info(f'Trainable parameters: '
+                         f'{param_count(attack.perturbation, trainable=True)}')
+         writer.log_info(f'Total parameters: '
+                         f'{param_count(attack.perturbation, trainable=False)}')
+
+         # train
+         writer.log_info('Training attack...')
+         attack.train(data_train=data_train, data_val=data_test)
+
+         # evaluate
+         writer.log_info('Evaluating attack...')
+         x_adv, success, detection = attack.evaluate(
+             dataset=data_test
+         )
+
+         # log results summary: success rate in achieving target threshold
+         writer.log_info(
+             f'Success rate in meeting embedding distance threshold {pipeline.model.threshold}'
+             f' ({TARGET_PCTL}%): '
+             f'{success.flatten().mean().item()}'
+         )
+
+
+ if __name__ == "__main__":
+     main()
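The percentile table above is what lets `TARGET_PCTL` act as an interpretable de-identification strength: `main()` converts it into an embedding-distance threshold for the success criterion. A minimal sketch of the lookup, with values copied from the table (ResNet, inter-speaker distances, utterance-to-centroid):

```python
# trimmed copy of percentiles['resnet']['targeted']['single-centroid']
dist_percentiles = {1: .495, 25: .695, 50: .773, 100: 1.127}

TARGET_PCTL = 25  # de-identification strength
threshold = dist_percentiles[TARGET_PCTL]
print(threshold)  # 0.695: the attack "succeeds" on an utterance once its
                  # adversarial embedding sits at least this far, in cosine
                  # distance, from the speaker's enrolled centroid
```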
voicebox/scripts/experiments/train_phoneme_predictor.py ADDED
@@ -0,0 +1,205 @@
+ import builtins
+
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import DataLoader
+
+ from src.models.phoneme import PPGEncoder
+ from src.constants import LIBRISPEECH_NUM_PHONEMES, LIBRISPEECH_PHONEME_DICT
+ from src.data import LibriSpeechDataset
+ from src.utils.writer import Writer
+
+ import numpy as np
+ from sklearn.metrics import confusion_matrix, classification_report
+ import seaborn as sn
+ import pandas as pd
+ import matplotlib.pyplot as plt
+
+ ################################################################################
+ # Train a simple model to produce phonetic posteriorgrams (PPGs)
+ ################################################################################
+
+
+ def main():
+
+     # training hyperparameters
+     lr = .001
+     epochs = 60
+     batch_size = 250
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+     # phoneme encoder hyperparameters
+     lstm_depth = 2
+     hidden_size = 128  # 512
+     win_length = 256
+     hop_length = 128
+     n_mels = 32
+     n_mfcc = 19
+     lookahead_frames = 0  # 1
+
+     # datasets and loaders
+     train_data = LibriSpeechDataset(
+         split='train-clean-100',
+         target='phoneme',
+         features=None,
+         hop_length=hop_length
+     )
+     val_data = LibriSpeechDataset(
+         split='test-clean',
+         target='phoneme',
+         features=None,
+         hop_length=hop_length
+     )
+     train_loader = DataLoader(
+         train_data,
+         batch_size=batch_size,
+         shuffle=True)
+     val_loader = DataLoader(
+         val_data,
+         batch_size=batch_size)
+
+     # initialize phoneme encoder
+     encoder = PPGEncoder(
+         win_length=win_length,
+         hop_length=hop_length,
+         win_func=torch.hann_window,
+         n_mels=n_mels,
+         n_mfcc=n_mfcc,
+         lstm_depth=lstm_depth,
+         hidden_size=hidden_size,
+     )
+
+     # initialize classification layer and wrap as single module
+     classifier = nn.Sequential(
+         encoder,
+         nn.Linear(hidden_size, LIBRISPEECH_NUM_PHONEMES)
+     ).to(device)
+
+     # log training progress
+     writer = Writer(
+         name=f"phoneme_lookahead_{lookahead_frames}",
+         use_tb=True,
+         log_iter=len(train_loader)
+     )
+
+     parameter_count = builtins.sum([
+         p.shape.numel()
+         for p in classifier[0].parameters()
+         if p.requires_grad
+     ])
+
+     writer.log_info(f'Training PPG model with lookahead {lookahead_frames}'
+                     f' ({parameter_count} parameters)')
+
+     # initialize optimizer and loss function
+     optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)
+     loss_fn = nn.CrossEntropyLoss()
+
+     iter_id = 0
+     min_val_loss = float('inf')
+
+     for epoch in range(epochs):
+
+         print(f'beginning epoch {epoch}')
+
+         classifier.train()
+         for batch in train_loader:
+
+             optimizer.zero_grad(set_to_none=True)
+
+             x, y = batch['x'].to(device), batch['y'].to(device)
+
+             preds = classifier(x)
+
+             # offset labels to incorporate lookahead
+             y = y[:, :-lookahead_frames if lookahead_frames else None]
+
+             # offset predictions correspondingly
+             preds = preds[:, lookahead_frames:]
+
+             # compute cross-entropy loss
+             loss = loss_fn(
+                 preds.reshape(-1, LIBRISPEECH_NUM_PHONEMES), y.reshape(-1)
+             )
+
+             loss.backward()
+             optimizer.step()
+
+             writer.log_scalar(loss, tag="CrossEntropyLoss-Train", global_step=iter_id)
+             iter_id += 1
+
+         val_loss, val_acc, n = 0.0, 0.0, 0
+         classifier.eval()
+         with torch.no_grad():
+             for batch in val_loader:
+
+                 x, y = batch['x'].to(device), batch['y'].to(device)
+
+                 preds = classifier(x)
+
+                 # offset labels to incorporate lookahead
+                 y = y[:, :-lookahead_frames if lookahead_frames else None]
+
+                 # offset predictions correspondingly
+                 preds = preds[:, lookahead_frames:]
+
+                 n += len(x)
+                 val_loss += loss_fn(
+                     preds.reshape(-1, LIBRISPEECH_NUM_PHONEMES), y.reshape(-1)
+                 ) * len(x)
+                 val_acc += len(x) * (torch.argmax(preds, dim=2) == y).flatten().float().mean()
+
+         val_loss /= n
+         val_acc /= n
+         writer.log_scalar(val_loss, tag="CrossEntropyLoss-Val", global_step=iter_id)
+         writer.log_scalar(val_acc, tag="Accuracy-Val", global_step=iter_id)
+
+         # save weights
+         if val_loss < min_val_loss:
+             min_val_loss = val_loss
+             print(f'new best val loss {val_loss}; saving weights')
+             writer.checkpoint(classifier[0].state_dict(), 'phoneme_classifier')
+
+     # generate confusion matrix
+     classifier.eval()
+
+     # compute accuracy on validation data
+     all_preds = []
+     all_true = []
+     with torch.no_grad():
+         for batch in val_loader:
+
+             x, y = batch['x'].to(device), batch['y'].to(device)
+
+             preds = classifier(x)
+
+             # offset labels to incorporate lookahead
+             y = y[:, :-lookahead_frames if lookahead_frames else None]
+
+             # offset predictions correspondingly
+             preds = preds[:, lookahead_frames:]
+
+             all_preds.append(preds.argmax(dim=2).reshape(-1))
+             all_true.append(y.reshape(-1))
+
+     # compile predictions and targets
+     all_preds = torch.cat(all_preds, dim=0).cpu().numpy()
+     all_true = torch.cat(all_true, dim=0).cpu().numpy()
+
+     reverse_dict = {v: k for (k, v) in LIBRISPEECH_PHONEME_DICT.items() if v != 0}
+     reverse_dict[0] = 'sil'
+
+     class_report = classification_report(all_true, all_preds)
+     writer.log_info(class_report)
+
+     cm = confusion_matrix(all_true, all_preds, labels=list(range(len(reverse_dict))))
+     df_cm = pd.DataFrame(cm, index=[i for i in sorted(list(reverse_dict.keys()))],
+                          columns=[i for i in sorted(list(reverse_dict.keys()))])
+     plt.figure(figsize=(40, 28))
+     sn.set(font_scale=1.0)  # for label size
+     sn.heatmap(df_cm, annot=True, annot_kws={"size": 35 / np.sqrt(len(cm))}, fmt='g')
+
+     plt.savefig("phoneme_cm.png", dpi=200)
+
+
+ if __name__ == '__main__':
+     main()
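The paired slicing of `y` and `preds` above implements the label/prediction alignment for lookahead: the prediction at frame t + `lookahead_frames` is scored against the label at frame t, so the encoder may use that many future frames of context. A toy check of the alignment, using the same slicing:

```python
import torch

lookahead_frames = 2
T = 6

# per-frame labels 0..5, and "predictions" that echo the label from
# lookahead_frames frames in the past (a perfect lookahead model)
y = torch.arange(T).unsqueeze(0)  # shape (batch=1, T)
preds = torch.roll(y, shifts=lookahead_frames, dims=1)

# identical slicing to the training and validation loops above
y_aligned = y[:, :-lookahead_frames if lookahead_frames else None]
preds_aligned = preds[:, lookahead_frames:]

assert torch.equal(y_aligned, preds_aligned)  # frames now line up exactly
```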
voicebox/scripts/streamer/benchmark_streamer.py ADDED
@@ -0,0 +1,97 @@
+ import torch
+ import librosa
+ import soundfile as sf
+
+ from tqdm import tqdm
+ from src.attacks.offline.perturbation.voicebox import projection
+ from src.attacks.online import Streamer, VoiceBoxStreamer
+ from src.models import ResNetSE34V2, SpeakerVerificationModel
+ from src.constants import MODELS_DIR, TEST_DIR, PPG_PRETRAINED_PATH
+
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ torch.set_num_threads(1)
+
+ device = 'cpu'
+
+ lookahead = 5
+
+ signal_length = 64_000
+ chunk_size = 640
+
+ test_audio = torch.Tensor(
+     librosa.load(TEST_DIR / 'data' / 'test.wav', sr=16_000, mono=True)[0]
+ ).unsqueeze(0).unsqueeze(0)
+
+ tests = [
+     (512, 512, 512)
+ ]
+ resnet_model = SpeakerVerificationModel(model=ResNetSE34V2())
+ condition_vector = resnet_model(test_audio)
+ for (bottleneck_hidden_size,
+      bottleneck_feedforward_size,
+      spec_encoder_hidden_size) in tests:
+     print(
+         f"""
+         ====================================
+         bottleneck_hidden_size: {bottleneck_hidden_size}
+         bottleneck_feedforward_size: {bottleneck_feedforward_size}
+         spec_encoder_hidden_size: {spec_encoder_hidden_size}
+         """
+     )
+
+     streamer = Streamer(
+         VoiceBoxStreamer(
+             win_length=256,
+             bottleneck_type='lstm',
+             bottleneck_skip=True,
+             bottleneck_depth=2,
+             bottleneck_lookahead_frames=lookahead,
+             bottleneck_hidden_size=bottleneck_hidden_size,
+             bottleneck_feedforward_size=bottleneck_feedforward_size,
+
+             conditioning_dim=512,
+
+             spec_encoder_mlp_depth=2,
+             spec_encoder_hidden_size=spec_encoder_hidden_size,
+             spec_encoder_lookahead_frames=0,
+             ppg_encoder_path=PPG_PRETRAINED_PATH,
+
+             ppg_encoder_depth=2,
+             ppg_encoder_hidden_size=256,
+             projection_norm='inf',
+             control_eps=0.5,
+             n_bands=128
+         ),
+         device,
+         hop_length=128,
+         window_length=256,
+         win_type='hann',
+         lookahead_frames=lookahead,
+         recurrent=True
+     )
+     streamer.model.load_state_dict(torch.load(MODELS_DIR / 'voicebox' / 'voicebox_final.pt'))
+     streamer.condition_vector = condition_vector
+
+     output_chunks = []
+     for i in tqdm(range(0, signal_length, chunk_size)):
+         signal_chunk = test_audio[..., i:i+chunk_size]
+         out = streamer.feed(signal_chunk)
+         output_chunks.append(out)
+     output_chunks.append(streamer.flush())
+     output_audio = torch.cat(output_chunks, dim=-1)
+     output_embedding = resnet_model(output_audio)
+
+     print(
+         f"""
+         RTF: {streamer.real_time_factor}
+         Embedding Distance: {resnet_model.distance_fn(output_embedding, condition_vector)}
+         ====================================
+         """
+     )
+     sf.write(
+         'output.wav',
+         output_audio.numpy().squeeze(),
+         16_000,
+     )
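The real-time factor (RTF) printed above is assumed here to follow the usual convention of processing time divided by audio duration, so values below 1.0 indicate faster-than-real-time operation. A standalone sketch of measuring it for any chunk-wise processor (`process_chunk` is a stand-in, not a repository function):

```python
import time
import torch

SAMPLE_RATE = 16_000

def process_chunk(chunk: torch.Tensor) -> torch.Tensor:
    return chunk  # stand-in for streamer.feed(chunk)

audio = torch.randn(1, 1, 64_000)  # 4 seconds at 16 kHz
chunk_size = 640

elapsed = 0.0
for i in range(0, audio.shape[-1], chunk_size):
    start = time.perf_counter()
    process_chunk(audio[..., i:i + chunk_size])
    elapsed += time.perf_counter() - start

rtf = elapsed / (audio.shape[-1] / SAMPLE_RATE)
print(f"RTF: {rtf:.4f}")  # < 1.0 means faster than real time
```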
voicebox/scripts/streamer/enroll.py ADDED
@@ -0,0 +1,105 @@
+ """
+ Pipeline for enrolling:
+     1. Provide a recording
+     2. Convert to 16 kHz
+     3. Divide into segments
+     4. Get embeddings for each segment
+     5. Find centroid
+     6. Save the conditioning embedding to disk
+ """
+ import os
+ import argbind
+ import sounddevice as sd
+ import soundfile
+ import torch
+ import numpy as np
+
+ import sys
+
+ sys.path.append('.')
+
+ from src.constants import CONDITIONING_FILENAME, CONDITIONING_FOLDER
+ from src.data import DataProperties
+ from src.models import ResNetSE34V2
+
+
+ MIN_WINDOWS = 10
+ WINDOW_SIZE = 64_000
+ BLOCK_SIZE = 256
+
+ RECORDING_TEXT = """
+ This script will record you speaking, and will create an embedding
+ to be used for conditioning VoiceBox. This will overwrite any previous
+ embeddings. We recommend at least 10 seconds of non-stop voice recording.
+ Press enter to begin recording. To stop recording, press ctrl-C.
+ """
+
+
+ def get_streams(input_name: str, block_size: int) -> sd.InputStream:
+     """
+     Get an input stream object
+     """
+     try:
+         input_name = int(input_name)
+     except (ValueError, TypeError):
+         # keep a non-numeric device name (or None for the system default)
+         pass
+     return sd.InputStream(device=input_name,
+                           samplerate=DataProperties.get('sample_rate'),
+                           channels=1,
+                           blocksize=block_size)
+
+
+ def record_from_user(input_name: str) -> torch.Tensor:
+     input_stream = get_streams(input_name, BLOCK_SIZE)
+     input(RECORDING_TEXT)
+     input_stream.start()
+     all_frames = []
+     try:
+         print("Recording...")
+         while True:
+             frames, _ = input_stream.read(BLOCK_SIZE)
+             all_frames.append(frames)
+     except KeyboardInterrupt:
+         print("Stopped recording.")
+     all_frames = torch.Tensor(np.array(all_frames))
+     recording = all_frames.reshape(-1)
+     return recording
+
+
+ def get_embedding(recording) -> torch.Tensor:
+     model = ResNetSE34V2(nOut=512, encoder_type='ASP')
+     recording = recording.view(1, -1)
+     embedding = model(recording)
+     return embedding
+
+
+ def save(embedding, audio) -> None:
+     os.makedirs(CONDITIONING_FOLDER, exist_ok=True)
+     torch.save(embedding, CONDITIONING_FILENAME)
+     soundfile.write(
+         CONDITIONING_FOLDER / 'conditioning_audio.wav',
+         audio.detach().cpu(),
+         DataProperties.get('sample_rate')
+     )
+
+
+ @argbind.bind(positional=True, without_prefix=True)
+ def main(input: str = None):
+     """
+     Create a conditioning vector for VoiceBox from your voice
+
+     :param input: Index or name of input audio interface. Defaults to the
+         system default device
+     :type input: str, optional
+     """
+     recording = record_from_user(input)
+     embedding = get_embedding(recording)
+     save(embedding, recording)
+
+
+ if __name__ == "__main__":
+     args = argbind.parse_args()
+     with argbind.scope(args):
+         main()
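The module docstring describes splitting the enrollment recording into fixed-size segments and averaging per-segment embeddings into a centroid, while the code shown embeds the full recording in a single pass. For reference, a sketch of the windowed-centroid variant under that reading, reusing the file's `WINDOW_SIZE`/`MIN_WINDOWS` constants (`model` is any embedding network, e.g. `ResNetSE34V2`; this is an assumption-laden illustration, not the shipped behavior):

```python
import torch

WINDOW_SIZE = 64_000  # 4 seconds at 16 kHz
MIN_WINDOWS = 10

def windowed_centroid(recording: torch.Tensor, model) -> torch.Tensor:
    """Average per-window embeddings of a 1-D recording into a centroid."""
    # trim to a whole number of windows, then fold into (n_windows, WINDOW_SIZE)
    n_windows = recording.numel() // WINDOW_SIZE
    if n_windows < MIN_WINDOWS:
        raise ValueError("recording too short for a reliable centroid")
    windows = recording[: n_windows * WINDOW_SIZE].reshape(n_windows, WINDOW_SIZE)
    with torch.no_grad():
        emb = torch.stack([model(w.view(1, -1)) for w in windows])
    return emb.mean(dim=0)
```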
voicebox/scripts/streamer/stream.py ADDED
@@ -0,0 +1,135 @@
+ import argbind
+ import sounddevice as sd
+ import numpy as np
+ import yaml
+ import torch
+ import os
+ from typing import Union
+
+ import sys
+ import warnings
+
+ sys.path.append('.')
+ warnings.filterwarnings('ignore', category=UserWarning)
+
+ from src.data.dataproperties import DataProperties
+ from src.attacks.online import Streamer, VoiceBoxStreamer
+ from src.constants import MODELS_DIR, CONDITIONING_FILENAME
+
+
+ def get_streams(input_name: str, output_name: str, block_size: int) -> tuple[sd.InputStream, sd.OutputStream]:
+     """
+     Get input and output stream objects
+     """
+     try:
+         input_name = int(input_name)
+     except (ValueError, TypeError):
+         # keep a non-numeric device name (or None for the system default)
+         pass
+     try:
+         output_name = int(output_name)
+     except (ValueError, TypeError):
+         pass
+     return (
+         sd.InputStream(device=input_name,
+                        samplerate=DataProperties.get('sample_rate'),
+                        channels=1,
+                        blocksize=block_size),
+         sd.OutputStream(device=output_name,
+                         samplerate=DataProperties.get('sample_rate'),
+                         channels=1,
+                         blocksize=block_size)
+     )
+
+
+ def get_model_streamer(device: str, conditioning_path: str) -> Streamer:
+     # TODO: Make a good way to query an attack type. For now, I'm going to hard code this.
+     model_dir = os.path.join(MODELS_DIR, 'voicebox')
+     checkpoint_path = os.path.join(model_dir, 'voicebox_final.pt')
+     config_path = os.path.join(model_dir, 'voicebox_final.yaml')
+
+     with open(config_path) as f:
+         config = yaml.safe_load(f)
+
+     state_dict = torch.load(checkpoint_path, map_location=device)
+     condition_tensor = torch.load(conditioning_path, map_location=device)
+     model = VoiceBoxStreamer(
+         **config
+     )
+     model.load_state_dict(state_dict)
+     model.condition_vector = condition_tensor.reshape(1, 1, -1)
+
+     streamer = Streamer(
+         model=model,
+         device=device,
+         lookahead_frames=config['bottleneck_lookahead_frames'],
+         recurrent=True
+     )
+     return streamer
+
+
+ def to_model(x: np.ndarray, device: str) -> torch.Tensor:
+     return torch.Tensor(x).view(1, 1, -1).to(device)
+
+
+ def from_model(x: torch.Tensor) -> np.ndarray:
+     return x.detach().cpu().view(-1, 1).numpy()
+
+
+ @argbind.bind(without_prefix=True)
+ def main(
+     input: str = None,
+     output: str = '',
+     device: str = 'cpu',
+     num_frames: int = 4,
+     pass_through: bool = False,
+     conditioning_path: str = CONDITIONING_FILENAME
+ ):
+     f"""
+     Use a streaming implementation of an attack to perturb incoming audio
+
+     :param input: Index or name of input audio interface. Defaults to the
+         system default device
+     :type input: str, optional
+     :param output: Index or name of output audio interface. Defaults to 0
+     :type output: str, optional
+     :param device: Device on which to process the attack. Should be either
+         'cpu' or 'cuda:X'. Defaults to 'cpu'
+     :type device: str, optional
+     :param num_frames: Number of overlapping model frames to process at one
+         iteration. Defaults to 4
+     :type num_frames: int
+     :param pass_through: If True, the VoiceBox perturbation is not applied and
+         the input will be identical to the output. This is for demo purposes.
+         The input and output audio will remain at 16 kHz
+     :type pass_through: bool, optional
+     :param conditioning_path: Path to conditioning tensor. Default: {CONDITIONING_FILENAME}
+     :type conditioning_path: str
+     """
+     streamer = get_model_streamer(device, conditioning_path)
+     input_stream, output_stream = get_streams(input, output, streamer.hop_length)
+     if streamer.win_type in ['hann', 'triangular']:
+         input_samples = (num_frames - 1) * streamer.hop_length + streamer.window_length
+     else:
+         input_samples = streamer.hop_length
+     print("Ready to process audio")
+     input_stream.start()
+     output_stream.start()
+     try:
+         while True:
+             frames, overflow = input_stream.read(input_samples)
+             if pass_through:
+                 output_stream.write(frames)
+                 continue
+             out = streamer.feed(to_model(frames, device))
+             out = from_model(out)
+             underflow = output_stream.write(out)
+     except KeyboardInterrupt:
+         print("Stopping")
+         input_stream.stop()
+         output_stream.stop()
+
+
+ if __name__ == "__main__":
+     args = argbind.parse_args()
+     with argbind.scope(args):
+         main()
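The `input_samples` arithmetic above is standard overlap-add framing: with hop length h and window length w, reading (n − 1)·h + w samples per iteration yields exactly n full analysis frames. With the hop and window sizes used by the scripts above (128 and 256 samples) and the default `num_frames=4`:

```python
hop_length = 128
window_length = 256
num_frames = 4

input_samples = (num_frames - 1) * hop_length + window_length
print(input_samples)  # 640 samples per read = 40 ms of audio at 16 kHz
```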
voicebox/setup.py ADDED
@@ -0,0 +1,20 @@
+ from setuptools import setup
+
+ with open('README.md') as file:
+     long_description = file.read()
+
+ setup(
+     name='src',
+     description='Code for VoiceBox',
+     version='0.0.1',
+     author='',
+     author_email='',
+     url='',
+     install_requires=[],
+     packages=['src'],
+     long_description=long_description,
+     long_description_content_type='text/markdown',
+     keywords=[],
+     classifiers=['License :: OSI Approved :: MIT License'],
+     license='MIT'
+ )
voicebox/src.egg-info/PKG-INFO ADDED
@@ -0,0 +1,148 @@
+ Metadata-Version: 2.1
+ Name: src
+ Version: 0.0.1
+ Summary: Code for VoiceBox
+ Home-page:
+ Author:
+ Author-email:
+ License: MIT
+ Classifier: License :: OSI Approved :: MIT License
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+
+ <h1 align="center">VoiceBlock</h1>
+ <h4 align="center">Privacy through Real-Time Adversarial Attacks with Audio-to-Audio Models</h4>
+ <div align="center">
+
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/???/???.ipynb)
+ [![Demo](https://img.shields.io/badge/Web-Demo-blue)](https://master.d3hvhbnf7qxjtf.amplifyapp.com/)
+ [![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](/LICENSE)
+
+ </div>
+ <p align="center"><img src="./figures/vb_color_logo.png" width="200"/></p>
+
+
+ ## Contents
+
+ * <a href="#install">Installation</a>
+ * <a href="#reproduce">Reproducing Results</a>
+ * <a href="#streamer">Streaming Implementation</a>
+ * <a href="#citation">Citation</a>
+
+ <h2 id="install">Installation</h2>
+
+ 1. Clone the repository:
+
+        git clone https://github.com/voiceboxneurips/voicebox.git
+
+ 2. We recommend working from a clean environment, e.g. using `conda`:
+
+        conda create --name voicebox python=3.9
+        source activate voicebox
+
+ 3. Install dependencies:
+
+        cd voicebox
+        pip install -r requirements.txt
+        pip install -e .
+
+ 4. Grant execute permissions:
+
+        chmod -R u+x scripts/
+
+ <h2 id="reproduce">Reproducing Results</h2>
+
+ To reproduce our results, first download the corresponding data. Note that to download the [VoxCeleb1 dataset](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html), you must register and obtain a username and password.
+
+ | Task | Dataset (Size) | Command |
+ |---|---|---|
+ | Objective evaluation | VoxCeleb1 (39G) | `python scripts/downloads/download_voxceleb.py --subset=1 --username=<VGG_USERNAME> --password=<VGG_PASSWORD>` |
+ | WER / supplemental evaluations | LibriSpeech `train-clean-360` (23G) | `./scripts/downloads/download_librispeech_eval.sh` |
+ | Train attacks | LibriSpeech `train-clean-100` (11G) | `./scripts/downloads/download_librispeech_train.sh` |
+
+ We provide scripts to reproduce our experiments and save results, including generated audio, to named and time-stamped subdirectories within `runs/`. To reproduce our objective evaluation experiments using pre-trained attacks, run:
+
+ ```
+ python scripts/experiments/evaluate.py
+ ```
+
+ To reproduce our training, run:
+
+ ```
+ python scripts/experiments/train.py
+ ```
+
+ <h2 id="streamer">Streaming Implementation</h2>
+
+ As a proof of concept, we provide a streaming implementation of VoiceBox capable of modifying user audio in real time. Here, we provide installation instructions for MacOS and Ubuntu 20.04.
+
+ <h3 id="streamer-mac">MacOS</h3>
+
+ See the video below:
+
+ <a href="https://youtu.be/LcNjO5E7F3E">
+ <p align="center"><img src="./figures/demo_thumbnail.png" width="500"/></p>
+ </a>
+
+ <h3 id="streamer-ubuntu">Ubuntu 20.04</h3>
+
+ 1. Open a terminal and follow the [installation instructions](#install) above. Change directory to the root of this repository.
+
+ 2. Run the following command:
+
+        pacmd load-module module-null-sink sink_name=voicebox sink_properties=device.description=voicebox
+
+    If you are using PipeWire instead of PulseAudio:
+
+        pactl load-module module-null-sink media.class=Audio/Sink sink_name=voicebox sink_properties=device.description=voicebox
+
+    PulseAudio is the default on Ubuntu; if you haven't changed your system defaults, you are probably using PulseAudio. This will add "voicebox" as an output device. Select it as the input to your chosen audio software.
+
+ 3. Find which audio devices to read from and write to. In your conda environment, run:
+
+        python -m sounddevice
+
+    You will get output similar to this:
+
+        0 HDA Intel HDMI: 0 (hw:0,3), ALSA (0 in, 8 out)
+        1 HDA Intel HDMI: 1 (hw:0,7), ALSA (0 in, 8 out)
+        2 HDA Intel HDMI: 2 (hw:0,8), ALSA (0 in, 8 out)
+        3 HDA Intel HDMI: 3 (hw:0,9), ALSA (0 in, 8 out)
+        4 HDA Intel HDMI: 4 (hw:0,10), ALSA (0 in, 8 out)
+        5 hdmi, ALSA (0 in, 8 out)
+        6 jack, ALSA (2 in, 2 out)
+        7 pipewire, ALSA (64 in, 64 out)
+        8 pulse, ALSA (32 in, 32 out)
+      * 9 default, ALSA (32 in, 32 out)
+
+    In this example, we are going to route the audio through PipeWire (channel 7). This will be our INPUT_NUM and OUTPUT_NUM.
+
+ 4. First, we need to create a conditioning embedding. To do this, run the enrollment script and follow its on-screen instructions:
+
+        python scripts/streamer/enroll.py --input INPUT_NUM
+
+ 5. We can now use the streamer. Run:
+
+        python scripts/streamer/stream.py --input INPUT_NUM --output OUTPUT_NUM
+
+ 6. Once the streamer is running, open `pavucontrol`.
+
+    a. In `pavucontrol`, go to the "Playback" tab and find "ALSA plug-in [python3.9]: ALSA Playback on". Set the output to "voicebox".
+
+    b. Then, go to "Recording" and find "ALSA plug-in [python3.9]: ALSA Playback from", and set the input to your desired microphone device.
+
+ <h2 id="citation">Citation</h2>
+
+ If you use this in your academic research, please cite the following:
+
+ ```
+ @inproceedings{authors2022voiceblock,
+   title={VoiceBlock: Privacy through Real-Time Adversarial Attacks with Audio-to-Audio Models},
+   author={O'Reilly, Patrick and Bugler, Andreas and Bhandari, Keshav and Morrison, Max and Pardo, Bryan},
+   booktitle={Neural Information Processing Systems},
+   month={November},
+   year={2022}
+ }
+ ```
voicebox/src.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,9 @@
+ LICENSE
+ README.md
+ setup.py
+ src/__init__.py
+ src/constants.py
+ src.egg-info/PKG-INFO
+ src.egg-info/SOURCES.txt
+ src.egg-info/dependency_links.txt
+ src.egg-info/top_level.txt
voicebox/src.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
voicebox/src.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ src
voicebox/src/__init__.py ADDED
File without changes