laurent commited on
Commit
fe98679
·
1 Parent(s): 874d788

Add some content.

Browse files
Files changed (3) hide show
  1. README.md +3 -4
  2. helper.js +209 -0
  3. index.html +249 -17
README.md CHANGED
@@ -1,10 +1,9 @@
1
  ---
2
  title: Hibiki Samples
3
- emoji: 😻
4
  colorFrom: green
5
- colorTo: pink
6
  sdk: static
 
7
  pinned: false
8
  ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Hibiki Samples
3
+ emoji: 🤗
4
  colorFrom: green
5
+ colorTo: green
6
  sdk: static
7
+ app_file: index.html
8
  pinned: false
9
  ---
 
 
helper.js ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Build the markup for an inline audio player pointing at a WAV file.
 *
 * @param {string} path - URL of the audio file, used as the <source> src.
 * @returns {string} HTML for an <audio> element with native controls and the
 *     download control disabled.
 */
function createAudioHTML(path) {
  // The src value is quoted and escaped: an unquoted attribute value is cut
  // short by whitespace and is a parse error when it contains '=', '"', '<'
  // or similar characters.
  const src = String(path)
    .replace(/&/g, '&amp;')
    .replace(/"/g, '&quot;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  return '<audio controls controlslist="nodownload" class="px-1"> <source src="' +
    src +
    '" type="audio/wav">Your browser does not support the audio element.</audio>';
}
6
+
7
/**
 * Fill one table row with a sample per directory.
 *
 * For each entry of `dirs`, the file `base_dir/<dir>/<filename>` is rendered
 * into the cell at index `col_offset + i` of `table_row`: paths ending in
 * "txt" are fetched and shown as small text, every other path gets an audio
 * player appended to the cell's existing content.
 *
 * @param {HTMLTableRowElement} table_row - Row whose cells are filled.
 * @param {string} base_dir - Directory containing one sub-directory per column.
 * @param {string[]} dirs - Sub-directory names, one per column.
 * @param {string} filename - File name looked up inside each sub-directory.
 * @param {number} col_offset - Index of the first cell to fill.
 */
function generateExampleRow(table_row, base_dir, dirs, filename, col_offset) {
  for (let i = 0; i < dirs.length; i++) {
    const cell = table_row.cells[col_offset + i];
    const p = base_dir + '/' + dirs[i] + '/' + filename;
    if (p.endsWith('txt')) {
      // One block-scoped request per column. A function-scoped `var` would be
      // shared by every callback, so each cell would display the response of
      // whichever request was created last.
      const req = new XMLHttpRequest();
      req.onreadystatechange = function() {
        if (req.readyState === req.DONE) {
          // NOTE(review): the response is inserted as HTML, not as plain text.
          cell.innerHTML = '<font size="-1">' + req.responseText + '</font>';
        }
      };
      req.open('GET', p);
      req.send(null);
    } else {
      cell.innerHTML = cell.innerHTML + createAudioHTML(p);
    }
  }
}
25
+
26
+
27
/**
 * Populate the CVSS sample table: one body row per sample file, with one
 * audio player per system (source, hibiki, seamless).
 *
 * @param {string} tableId - id of the table element; row 0 is assumed to be
 *     the header, so samples are written starting at row 1.
 */
function generateCVSS(tableId) {
  const base_dir = 'data/cvss_c_test';
  const dirs = ['source', 'hibiki', 'seamless'];
  const filenames = [
    "cvss-fr2en-test-idx14345-20007437.wav",
    "cvss-fr2en-test-idx14410-20011543.wav",
    "cvss-fr2en-test-idx14603-20030929.wav",
    "cvss-fr2en-test-idx14695-20041791.wav",
    "cvss-fr2en-test-idx4562-19004869.wav",
  ];
  const table = document.getElementById(tableId);

  filenames.forEach((filename, index) => {
    generateExampleRow(table.rows[index + 1], base_dir, dirs, filename, 0);
  });
}
43
+
44
/**
 * Populate the NTREX sample table: one body row per sample file, with one
 * audio player per system (source, hibiki, seamless).
 *
 * @param {string} tableId - id of the table element; row 0 is assumed to be
 *     the header, so samples are written starting at row 1.
 */
function generateNTREX(tableId) {
  const base_dir = 'data/audio_ntrex_long';
  const dirs = ['source', 'hibiki', 'seamless'];
  const filenames = [
    "10887_ea80c8e6-883d-4afe-841b-598ce7db3779.wav",
    "3120_a63eabfc-d5aa-4353-84d0-9c5c068a1b38.wav",
    "5196_ea80c8e6-883d-4afe-841b-598ce7db3779.wav",
    "6855_f3c3ea82-42ef-4c09-b4aa-544a4c95518b.wav",
    "9605_83f1360e-7775-4d36-89f6-60649041c935.wav"
  ];
  const table = document.getElementById(tableId);

  filenames.forEach((filename, index) => {
    generateExampleRow(table.rows[index + 1], base_dir, dirs, filename, 0);
  });
}
60
+
61
/**
 * Populate the VoxPopuli sample table: one body row per sample file, with one
 * audio player per column (source, three Hibiki CFG settings, seamless).
 *
 * @param {string} tableId - id of the table element; row 0 is assumed to be
 *     the header, so samples are written starting at row 1.
 */
function generateVoxPopuli(tableId) {
  const base_dir = 'data/voxpopuli';
  const dirs = ['source', 'hibiki_cfg=1', 'hibiki_cfg=3', 'hibiki_cfg=10', 'seamless'];
  const filenames = [
    "20090422-0900-PLENARY-3_20090422-09:53:50_7.wav",
    "20090506-0900-PLENARY-12_20090506-17:43:49_4.wav",
    "20090914-0900-PLENARY-15_20090914-20:43:54_7.wav",
    "20090916-0900-PLENARY-4_20090916-10:55:02_12.wav",
  ];
  const table = document.getElementById(tableId);

  filenames.forEach((filename, index) => {
    generateExampleRow(table.rows[index + 1], base_dir, dirs, filename, 0);
  });
}
76
+
77
+
78
// Fill the three static sample tables, in the same order as before:
// NTREX, then CVSS, then VoxPopuli.
{
  const tableFillers = [
    [generateNTREX, 'ntrex-table'],
    [generateCVSS, 'cvss-table'],
    [generateVoxPopuli, 'voxpopuli-table'],
  ];
  for (const [fillTable, tableId] of tableFillers) {
    fillTable(tableId);
  }
}
81
+
82
+ // Borrowed from https://nu-dialogue.github.io/j-moshi/
83
$(document).ready(function() {
  /**
   * Build a table of WaveSurfer players: a header row, then one cell per
   * audio URL containing a waveform container and a play/pause button.
   *
   * @param {jQuery} table - The (empty) table to fill.
   * @param {string} idSuffix - Inserted into every generated element id
   *     (`waveform<suffix>-i-j`, `play-pause<suffix>-i-j`). It must differ
   *     between tables: an id lookup only returns the first matching element,
   *     so shared ids would bind this table's click handlers to another
   *     table's buttons.
   * @param {string[]} columns - Header labels.
   * @param {string[][]} urlRows - Audio URLs, one inner array per table row.
   * @param {Object} waveOptions - Extra options merged into WaveSurfer.create.
   */
  function buildWaveformTable(table, idSuffix, columns, urlRows, waveOptions) {
    // Add header
    const thead = $('<thead>');
    const headerRow = $('<tr>');
    columns.forEach(header => {
      headerRow.append($('<th style="text-align: center">').text(header));
    });
    thead.append(headerRow);
    table.append(thead);

    // Add rows
    const tbody = $('<tbody>');
    urlRows.forEach((urls, i) => {
      const row = $('<tr>');
      urls.forEach((url, j) => {
        // Add waveform cell
        const waveCell = $('<td style="text-align: center">');
        const waveform = $('<div>').attr('id', `waveform${idSuffix}-${i}-${j}`);
        waveCell.append(waveform);
        const playPauseButton = `
          <button class="btn btn-secondary" data-action="play" id="play-pause${idSuffix}-${i}-${j}">
            <i class="bi bi-play-fill"></i> Play / <i class="bi bi-pause-fill"></i> Pause
          </button>
        `;
        waveCell.append(playPauseButton);
        row.append(waveCell);
      });
      tbody.append(row);
    });
    table.append(tbody);

    // Create wavesurfer instances only after the rows are attached, because
    // the containers are looked up by selector.
    urlRows.forEach((urls, i) => {
      urls.forEach((url, j) => {
        const wavesurfer = WaveSurfer.create(Object.assign({
          container: `#waveform${idSuffix}-${i}-${j}`,
          url: url,
          barWidth: 2,
          height: 55,
        }, waveOptions));
        $(`#play-pause${idSuffix}-${i}-${j}`).click(() => {
          wavesurfer.playPause();
        });
      });
    });
  }

  // Stereo samples: each channel is drawn with its own colors.
  buildWaveformTable(
    $('#vis-table'),
    '',
    ['Hibiki', 'Seamless'],
    [
      ['data-stereo/hibiki1.wav', 'data-stereo/seamless1.wav'],
      ['data-stereo/hibiki2.wav', 'data-stereo/seamless2.wav'],
      ['data-stereo/hibiki3.wav', 'data-stereo/seamless3.wav'],
    ],
    {
      splitChannels: [
        {
          waveColor: '#2E7D9E',
          progressColor: '#173E4E',
        },
        {
          waveColor: '#E57872',
          progressColor: '#2A0908',
        }
      ],
      width: 700,
    }
  );

  // VoxPopuli samples: the same file name is looked up in each directory.
  const dirs = [
    "data/voxpopuli/gt_with_fr_background",
    "data/voxpopuli/hibiki_cfg=3_with_fr_background",
    "data/voxpopuli/seamless_with_fr_background",
  ];
  const filenames = [
    "20090422-0900-PLENARY-3_20090422-09:53:50_7.wav",
    "20090506-0900-PLENARY-12_20090506-17:43:49_4.wav",
    "20090914-0900-PLENARY-15_20090914-20:43:54_7.wav",
    "20090916-0900-PLENARY-4_20090916-10:55:02_12.wav",
  ];
  buildWaveformTable(
    $('#vis-table2'),
    '2',
    ['Real Human Interpretation', 'Hibiki', 'Seamless'],
    filenames.map(file => dirs.map(dir => dir + '/' + file)),
    {}
  );
});
index.html CHANGED
@@ -1,19 +1,251 @@
1
- <!doctype html>
2
  <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
 
1
+ <!DOCTYPE html>
2
  <html>
3
+ <head>
4
+ <title>Hibiki</title>
5
+ <link
6
+ href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
7
+ rel="stylesheet"
8
+ />
9
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.min.css">
10
+ <meta charset="utf-8" />
11
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
12
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
13
+ <script src="https://unpkg.com/wavesurfer.js@7"></script>
14
+ <script src="helper.js" defer></script>
15
+ <script>
16
/**
 * Make `elem` exclusive: when it starts playing, pause every other element
 * of `elems`.
 *
 * @param {HTMLMediaElement} elem - Element whose "play" event is observed.
 * @param {Iterable<HTMLMediaElement>} elems - All elements in the group,
 *     which may include `elem` itself.
 */
function _setup_callback(elem, elems) {
  elem.addEventListener("play", function () {
    // `const` keeps the loop variable local. Without a declaration the loop
    // writes a global `other` and throws in strict mode.
    for (const other of elems) {
      if (other !== elem) {
        other.pause();
      }
    }
  });
}
25
+
26
// Once the document is parsed, make all <audio> players mutually exclusive:
// starting one pauses the others.
document.addEventListener('DOMContentLoaded', function () {
  const elems = document.body.getElementsByTagName("audio");
  // `const` keeps the loop variable local instead of creating a global `elem`.
  for (const elem of elems) {
    _setup_callback(elem, elems);
  }
});
32
+ </script>
33
+ <style>
34
+ td {
35
+ vertical-align: middle;
36
+ text-align: center;
37
+ }
38
+ audio {
39
+ width: 20vw;
40
+ min-width: 100px;
41
+ max-width: 100%;
42
+ }
43
+ h1, h2, h3, h4, h5, h6, body, b, strong, th {
44
+ color: #595959;
45
+ }
46
+ .ratio-8x5 {
47
+ --bs-aspect-ratio: 62.5%;
48
+ }
49
+ .btn-secondary {
50
+ padding: 0.1rem 0.8rem;
51
+ font-size: small
52
+ }
53
+ .container {
54
+ max-width: 1620px;
55
+ }
56
+ </style>
57
+ </head>
58
+ <body>
59
+ <div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
60
+ <div class="text-center">
61
+ <h1>High-Fidelity Simultaneous Speech-To-Speech Translation</h1>
62
+ <p class="lead">
63
+ <a href="https://kyutai.org">Kyutai</a>
64
+ - code on <a href="https://github.com/kyutai-labs/hibiki">github</a>
65
+ </p>
66
+ </div>
67
+ <p>
68
+ <b>Abstract.</b>
69
+ We introduce <i>Hibiki</i> ('echo' in Japanese), a decoder-only model for simultaneous speech translation.
70
+ Hibiki leverages a multistream language model to synchronously process
71
+ source and target speech, and jointly produces text and audio tokens to
72
+ perform speech-to-text and speech-to-speech translation.
73
+ We furthermore address the fundamental challenge of <i>simultaneous</i> interpretation,
74
+ which unlike its <i>consecutive</i> counterpart&mdash;where one waits for
75
+ the end of the source utterance to start translating&mdash;adapts its flow
76
+ to accumulate just enough context to produce a correct translation in
77
+ real-time, chunk by chunk. <br />
78
+ To do so, we introduce a weakly-supervised method that leverages the
79
+ perplexity of an off-the-shelf text translation system to identify
80
+ optimal delays on a per-word basis and create aligned synthetic data.
81
+ After supervised training, Hibiki performs adaptive, simultaneous
82
+ speech translation with vanilla temperature sampling. On a
83
+ French-English simultaneous speech translation task, Hibiki demonstrates
84
+ state-of-the-art performance in translation quality, speaker fidelity
85
+ and naturalness. Moreover, the simplicity of its inference process
86
+ makes it compatible with batched translation and even real-time
87
+ on-device deployment.
88
+ </p>
89
+ </div>
90
+
91
+ <div class="container shadow p-5 mb-5 bg-white rounded">
92
+ <h3>In the Wild Examples<a id="wild"></a></h3>
93
+ <p class="mb-0">
94
+ </p>
95
+ <div class="container pt-3 table-responsive">
96
+ <table class="table table-hover" width="100%">
97
+ <tr>
98
+ <td width="50%">
99
+ <video class="embed-responsive-item" style="max-width: 80%; min-width: 400px;" controls>
100
+ <source src="videos/RPckvIkNWhE_ss301_to390_babel_numerique_arte.mp4" type="video/mp4">
101
+ Your browser does not support HTML video.
102
+ </video>
103
+ </td>
104
+ <td width="50%">
105
+ <video class="embed-responsive-item" style="max-width: 80%; min-width: 400px;" controls>
106
+ <source src="videos/uNAmODXvAiQ_ss9_message_a_caractere_informatif.mp4" type="video/mp4">
107
+ Your browser does not support HTML video.
108
+ </video>
109
+ </td>
110
+ </tr>
+ <tr>
111
+ <td>
112
+ This example comes from a video explaining automated translation.
113
+ (<a href="https://www.youtube.com/watch?v=RPckvIkNWhE" target="_blank">source</a>, original video (c) Arte)
114
+ </td>
115
+ <td>
116
+ This example comes from a humorous video. The source voice is deliberately high-pitched;
117
+ it is a good showcase of how well Hibiki replicates pitch and prosody and how robust it is to
118
+ background noise <b>as no denoising is applied to the audio which is fed raw to Hibiki</b>.
119
+ (<a href="https://www.youtube.com/watch?v=uNAmODXvAiQ" target="_blank">source</a>, original video (c) Canal+)
120
+ </td>
121
+ </tr>
122
+ </table>
123
+ </div>
124
+ </div>
125
+
126
+ <div class="container shadow p-5 mb-5 bg-white rounded">
127
+ <h3>Examples with Ground Truth Interpretation<a id="ground-truth"></a></h3>
128
+ <p class="mb-0">
129
+ These samples come from the VoxPopuli dataset where the ground truth is real human
130
+ interpretation.
131
+ The volume for the sources has been reduced so that it's easier to hear the translations.
132
+ </p>
133
+ <div class="container pt-3 table-responsive">
134
+ <table class="table table-hover" id="vis-table2"></table>
135
+ </div>
136
+ </div>
137
+
138
+ <div class="container shadow p-5 mb-5 bg-white rounded">
139
+ <h3>Multistream Visualization<a id="vis"/></h3>
140
+ <p class="mb-0">
141
+ The audio for the source and translated versions are on different channels. Use headphones
142
+ to hear both at the same time. These samples are the same as in the voxpopuli section with CFG
143
+ set to 3.
144
+ </p>
145
+ <div class="container pt-3 table-responsive">
146
+ <table class="table table-hover" id="vis-table"></table>
147
+ </div>
148
+ </div>
149
+
150
+
151
+ <div class="container shadow p-5 mb-5 bg-white rounded">
152
+ <h3>Impact of Classifier-Free Guidance<a id="voxpopuli"/></h3>
153
+ <p class="mb-0">
154
+ Samples taken from the VoxPopuli dataset. The Hibiki samples are presented with different levels
155
+ of classifier-free guidance (CFG). The higher the CFG value, the closer the generated voice will
156
+ be to the original voice. This results in very strong accents for the generations with the higher
157
+ values.
158
+ </p>
159
+
160
+ <div class="container pt-3 table-responsive">
161
+ <table
162
+ class="table table-hover"
163
+ id="voxpopuli-table"
164
+ >
165
+ <thead>
166
+ <tr>
167
+ <th style="text-align: center">Source</th>
168
+ <th style="text-align: center">Hibiki CFG-1</th>
169
+ <th style="text-align: center">Hibiki CFG-3</th>
170
+ <th style="text-align: center">Hibiki CFG-10</th>
171
+ <th style="text-align: center">Seamless</th>
172
+ </tr>
173
+ </thead>
174
+ <tbody>
175
+ <tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
176
+ <tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
177
+ <tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
178
+ <tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
179
+ <tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
180
+ </tbody>
181
+ </table>
182
+ </div>
183
+ </div>
184
+ <div class="container shadow p-5 mb-5 bg-white rounded">
185
+ <h3>Long-form Simultaneous Translations<a id="ntrex"/></h3>
186
+ <p class="mb-0">
187
+ Samples taken from the audio NTREX dataset.
188
+ </p>
189
+
190
+ <div class="container pt-3 table-responsive">
191
+ <table
192
+ class="table table-hover"
193
+ id="ntrex-table"
194
+ >
195
+ <thead>
196
+ <tr>
197
+ <th style="text-align: center;min-width: 200px;">Source</th>
198
+ <th style="text-align: center;">Hibiki</th>
199
+ <th style="text-align: center">Seamless</th>
200
+ </tr>
201
+ </thead>
202
+ <tbody>
203
+ <tr> <td></td> <td></td> <td></td></tr>
204
+ <tr> <td></td> <td></td> <td></td></tr>
205
+ <tr> <td></td> <td></td> <td></td></tr>
206
+ <tr> <td></td> <td></td> <td></td></tr>
207
+ <tr> <td></td> <td></td> <td></td></tr>
208
+ </tbody>
209
+ </table>
210
+ </div>
211
+ </div>
212
+
213
+ <div class="container shadow p-5 mb-5 bg-white rounded">
214
+ <h3>Short-form Simultaneous Translations<a id="cvss-c"/></h3>
215
+ <p class="mb-0">
216
+ Samples taken from the CVSS-C dataset.
217
+ </p>
218
+
219
+ <div class="container pt-3 table-responsive">
220
+ <table
221
+ class="table table-hover"
222
+ id="cvss-table"
223
+ >
224
+ <thead>
225
+ <tr>
226
+ <th style="text-align: center;min-width: 200px;">Source</th>
227
+ <th style="text-align: center;">Hibiki</th>
228
+ <th style="text-align: center">Seamless</th>
229
+ </tr>
230
+ </thead>
231
+ <tbody>
232
+ <tr> <td></td> <td></td> <td></td></tr>
233
+ <tr> <td></td> <td></td> <td></td></tr>
234
+ <tr> <td></td> <td></td> <td></td></tr>
235
+ <tr> <td></td> <td></td> <td></td></tr>
236
+ <tr> <td></td> <td></td> <td></td></tr>
237
+ </tbody>
238
+ </table>
239
+ </div>
240
+ </div>
241
+
242
+ <div class="container p-5 mb-5 bg-white rounded">
243
+ <p class="mb-0">
244
+ This page was adapted from the <a href="https://google-research.github.io/seanet/soundstorm/examples">SoundStorm project page</a>.
245
+ </p>
246
+ </div>
247
+
248
+
249
+ </body>
250
  </html>
251
+