Spaces:
Running
Running
laurent
commited on
Commit
·
fe98679
1
Parent(s):
874d788
Add some content.
Browse files- README.md +3 -4
- helper.js +209 -0
- index.html +249 -17
README.md
CHANGED
@@ -1,10 +1,9 @@
|
|
1 |
---
|
2 |
title: Hibiki Samples
|
3 |
-
emoji:
|
4 |
colorFrom: green
|
5 |
-
colorTo:
|
6 |
sdk: static
|
|
|
7 |
pinned: false
|
8 |
---
|
9 |
-
|
10 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Hibiki Samples
|
3 |
+
emoji: 🤗
|
4 |
colorFrom: green
|
5 |
+
colorTo: green
|
6 |
sdk: static
|
7 |
+
app_file: index.html
|
8 |
pinned: false
|
9 |
---
|
|
|
|
helper.js
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Builds the markup for an inline audio player pointing at a WAV file.
 *
 * The `src` attribute is quoted and its value escaped: sample paths in this
 * file contain `=` (e.g. `hibiki_cfg=1`), which is not permitted in an
 * unquoted HTML attribute value.
 *
 * @param {string} path - URL of the audio file, relative to the page.
 * @returns {string} HTML for an `<audio>` element with a single `<source>`.
 */
function createAudioHTML(path) {
  const src = String(path).replace(/&/g, '&amp;').replace(/"/g, '&quot;');
  return '<audio controls controlslist="nodownload" class="px-1"> <source src="' +
      src +
      '" type="audio/wav">Your browser does not support the audio element.</audio>';
}
|
6 |
+
|
7 |
+
/**
 * Fills consecutive cells of one table row with a sample from each directory.
 *
 * For every entry of `dirs`, the resource `base_dir/<dir>/<filename>` is placed
 * in cell `col_offset + index`. Paths ending in `txt` are fetched and shown as
 * small text (replacing the cell content); anything else is appended to the
 * cell as an audio player.
 *
 * @param {HTMLTableRowElement} table_row - Row whose cells are filled.
 * @param {string} base_dir - Directory containing one sub-directory per system.
 * @param {string[]} dirs - Sub-directory names, one per column.
 * @param {string} filename - File name shared by all sub-directories.
 * @param {number} col_offset - Index of the first cell to fill.
 */
function generateExampleRow(table_row, base_dir, dirs, filename, col_offset) {
  dirs.forEach((dir, i) => {
    const cell = table_row.cells[col_offset + i];
    const p = base_dir + '/' + dir + '/' + filename;
    if (p.endsWith('txt')) {
      // `req` must be bound per iteration: with a function-scoped variable
      // every callback would read the response of the last request created.
      const req = new XMLHttpRequest();
      req.onreadystatechange = () => {
        if (req.readyState === req.DONE) {
          cell.innerHTML = '<font size="-1">' + req.responseText + '</font>';
        }
      };
      req.open('GET', p);
      req.send(null);
    } else {
      cell.innerHTML += createAudioHTML(p);
    }
  });
}
|
25 |
+
|
26 |
+
|
27 |
+
/**
 * Populates the CVSS-C sample table with audio players.
 *
 * Row 0 of the table is skipped (it is the header row in index.html); sample
 * `i` goes into row `1 + i`, one column per entry of `dirs`. A missing table
 * or missing row is reported and skipped instead of throwing, so one absent
 * table does not abort the rest of the script.
 *
 * @param {string} tableId - id of the `<table>` element to fill.
 */
function generateCVSS(tableId) {
  const table = document.getElementById(tableId);
  if (!table) {
    console.warn(`generateCVSS: no element with id "${tableId}"`);
    return;
  }
  const base_dir = 'data/cvss_c_test';
  const dirs = ['source', 'hibiki', 'seamless'];
  const filenames = [
    "cvss-fr2en-test-idx14345-20007437.wav",
    "cvss-fr2en-test-idx14410-20011543.wav",
    "cvss-fr2en-test-idx14603-20030929.wav",
    "cvss-fr2en-test-idx14695-20041791.wav",
    "cvss-fr2en-test-idx4562-19004869.wav",
  ];

  filenames.forEach((filename, i) => {
    const row = table.rows[1 + i];
    if (!row) {
      console.warn(`generateCVSS: table "${tableId}" has no row ${1 + i}`);
      return;
    }
    generateExampleRow(row, base_dir, dirs, filename, 0);
  });
}
|
43 |
+
|
44 |
+
/**
 * Populates the NTREX long-form sample table with audio players.
 *
 * Row 0 of the table is skipped (it is the header row in index.html); sample
 * `i` goes into row `1 + i`, one column per entry of `dirs`. A missing table
 * or missing row is reported and skipped instead of throwing, so one absent
 * table does not abort the rest of the script.
 *
 * @param {string} tableId - id of the `<table>` element to fill.
 */
function generateNTREX(tableId) {
  const table = document.getElementById(tableId);
  if (!table) {
    console.warn(`generateNTREX: no element with id "${tableId}"`);
    return;
  }
  const base_dir = 'data/audio_ntrex_long';
  const dirs = ['source', 'hibiki', 'seamless'];
  const filenames = [
    "10887_ea80c8e6-883d-4afe-841b-598ce7db3779.wav",
    "3120_a63eabfc-d5aa-4353-84d0-9c5c068a1b38.wav",
    "5196_ea80c8e6-883d-4afe-841b-598ce7db3779.wav",
    "6855_f3c3ea82-42ef-4c09-b4aa-544a4c95518b.wav",
    "9605_83f1360e-7775-4d36-89f6-60649041c935.wav",
  ];

  filenames.forEach((filename, i) => {
    const row = table.rows[1 + i];
    if (!row) {
      console.warn(`generateNTREX: table "${tableId}" has no row ${1 + i}`);
      return;
    }
    generateExampleRow(row, base_dir, dirs, filename, 0);
  });
}
|
60 |
+
|
61 |
+
/**
 * Populates the VoxPopuli classifier-free-guidance table with audio players.
 *
 * Row 0 of the table is skipped (it is the header row in index.html); sample
 * `i` goes into row `1 + i`, one column per entry of `dirs` (source, three
 * Hibiki CFG settings, Seamless). A missing table or missing row is reported
 * and skipped instead of throwing, so one absent table does not abort the
 * rest of the script.
 *
 * @param {string} tableId - id of the `<table>` element to fill.
 */
function generateVoxPopuli(tableId) {
  const table = document.getElementById(tableId);
  if (!table) {
    console.warn(`generateVoxPopuli: no element with id "${tableId}"`);
    return;
  }
  const base_dir = 'data/voxpopuli';
  const dirs = ['source', 'hibiki_cfg=1', 'hibiki_cfg=3', 'hibiki_cfg=10', 'seamless'];
  const filenames = [
    "20090422-0900-PLENARY-3_20090422-09:53:50_7.wav",
    "20090506-0900-PLENARY-12_20090506-17:43:49_4.wav",
    "20090914-0900-PLENARY-15_20090914-20:43:54_7.wav",
    "20090916-0900-PLENARY-4_20090916-10:55:02_12.wav",
  ];

  filenames.forEach((filename, i) => {
    const row = table.rows[1 + i];
    if (!row) {
      console.warn(`generateVoxPopuli: table "${tableId}" has no row ${1 + i}`);
      return;
    }
    generateExampleRow(row, base_dir, dirs, filename, 0);
  });
}
|
76 |
+
|
77 |
+
|
78 |
+
// Fill the statically declared sample tables, in the same order as before:
// NTREX, then CVSS, then VoxPopuli.
{
  const tablePopulators = [
    [generateNTREX, 'ntrex-table'],
    [generateCVSS, 'cvss-table'],
    [generateVoxPopuli, 'voxpopuli-table'],
  ];
  for (const [populate, tableId] of tablePopulators) {
    populate(tableId);
  }
}
|
81 |
+
|
82 |
+
// Borrowed from https://nu-dialogue.github.io/j-moshi/
// Builds the two WaveSurfer tables (#vis-table and #vis-table2) once the DOM is ready.
$(document).ready(function() {
  /**
   * Renders one table of waveform players.
   *
   * Each cell gets a `waveform<idSuffix>-<row>-<col>` container and a
   * `play-pause<idSuffix>-<row>-<col>` button. `idSuffix` must differ between
   * tables: element ids have to be unique per document, and an id lookup only
   * returns the first match, so shared ids would wire the second table's
   * handlers onto the first table's buttons.
   *
   * @param {string} tableSelector - Selector of the (empty) table element.
   * @param {string} idSuffix - Per-table suffix used in generated element ids.
   * @param {string[]} columns - Header labels, one per column.
   * @param {string[][]} urlRows - Audio URLs, one inner array per table row.
   * @param {Object} extraOptions - Additional WaveSurfer options for this table.
   */
  function renderWaveformTable(tableSelector, idSuffix, columns, urlRows, extraOptions) {
    const table = $(tableSelector);

    // Add header
    const headerRow = $('<tr>');
    columns.forEach(header => {
      headerRow.append($('<th style="text-align: center">').text(header));
    });
    table.append($('<thead>').append(headerRow));

    // Add rows
    const tbody = $('<tbody>');
    urlRows.forEach((urls, i) => {
      const row = $('<tr>');
      urls.forEach((url, j) => {
        // Add waveform cell
        const waveCell = $('<td style="text-align: center">');
        waveCell.append($('<div>').attr('id', `waveform${idSuffix}-${i}-${j}`));
        waveCell.append(`
          <button class="btn btn-secondary" data-action="play" id="play-pause${idSuffix}-${i}-${j}">
            <i class="bi bi-play-fill"></i> Play / <i class="bi bi-pause-fill"></i> Pause
          </button>
        `);
        row.append(waveCell);
      });
      tbody.append(row);
    });
    table.append(tbody);

    // Create wavesurfer instances. This runs only after the rows are attached,
    // because containers and buttons are looked up by selector.
    urlRows.forEach((urls, i) => {
      urls.forEach((url, j) => {
        const wavesurfer = WaveSurfer.create(Object.assign({
          container: `#waveform${idSuffix}-${i}-${j}`,
          url: url,
          barWidth: 2,
          height: 55,
        }, extraOptions));
        $(`#play-pause${idSuffix}-${i}-${j}`).click(() => {
          wavesurfer.playPause();
        });
      });
    });
  }

  // Stereo samples: one colour pair per audio channel.
  renderWaveformTable(
    '#vis-table',
    '',
    ['Hibiki', 'Seamless'],
    [
      ['data-stereo/hibiki1.wav', 'data-stereo/seamless1.wav'],
      ['data-stereo/hibiki2.wav', 'data-stereo/seamless2.wav'],
      ['data-stereo/hibiki3.wav', 'data-stereo/seamless3.wav'],
    ],
    {
      splitChannels: [
        {
          waveColor: '#2E7D9E',
          progressColor: '#173E4E',
        },
        {
          waveColor: '#E57872',
          progressColor: '#2A0908',
        }
      ],
      width: 700,
    }
  );

  // VoxPopuli samples: each file appears once per system directory.
  const dirs = [
    "data/voxpopuli/gt_with_fr_background",
    "data/voxpopuli/hibiki_cfg=3_with_fr_background",
    "data/voxpopuli/seamless_with_fr_background",
  ];
  const files = [
    "20090422-0900-PLENARY-3_20090422-09:53:50_7.wav",
    "20090506-0900-PLENARY-12_20090506-17:43:49_4.wav",
    "20090914-0900-PLENARY-15_20090914-20:43:54_7.wav",
    "20090916-0900-PLENARY-4_20090916-10:55:02_12.wav",
  ];
  renderWaveformTable(
    '#vis-table2',
    '2',
    ['Real Human Interpretation', 'Hibiki', 'Seamless'],
    files.map(file => dirs.map(dir => dir + '/' + file)),
    {}
  );
});
|
index.html
CHANGED
@@ -1,19 +1,251 @@
|
|
1 |
-
<!
|
2 |
<html>
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
</html>
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
<html>
|
3 |
+
<head>
|
4 |
+
<title>Hibiki</title>
|
5 |
+
<link
|
6 |
+
href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
|
7 |
+
rel="stylesheet"
|
8 |
+
/>
|
9 |
+
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.min.css">
|
10 |
+
<meta charset="utf-8" />
|
11 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
12 |
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
|
13 |
+
<script src="https://unpkg.com/wavesurfer.js@7"></script>
|
14 |
+
<script src="helper.js" defer></script>
|
15 |
+
<script>
|
16 |
+
/**
 * Makes playback exclusive for one media element: when `elem` starts
 * playing, every other element in `elems` is paused.
 *
 * @param {HTMLMediaElement} elem - Element whose "play" event is observed.
 * @param {Iterable<HTMLMediaElement>} elems - All elements in the exclusive group.
 */
function _setup_callback(elem, elems) {
  elem.addEventListener("play", function () {
    // `const` keeps the loop variable local; an undeclared `other` would leak
    // onto the global object (and throws in strict mode).
    for (const other of elems) {
      if (other !== elem) {
        other.pause();
      }
    }
  });
}
|
25 |
+
|
26 |
+
// Once the document is parsed, make all <audio> players mutually exclusive.
// helper.js is loaded with `defer`, and deferred scripts run before
// DOMContentLoaded fires, so the players it inserts are already present here.
document.addEventListener('DOMContentLoaded', function () {
  const elems = document.body.getElementsByTagName("audio");
  // `const` keeps the loop variable local instead of creating a global `elem`.
  for (const elem of elems) {
    _setup_callback(elem, elems);
  }
});
|
32 |
+
</script>
|
33 |
+
<style>
|
34 |
+
td {
|
35 |
+
vertical-align: middle;
|
36 |
+
text-align: center;
|
37 |
+
}
|
38 |
+
audio {
|
39 |
+
width: 20vw;
|
40 |
+
min-width: 100px;
|
41 |
+
max-width: 100%;
|
42 |
+
}
|
43 |
+
h1, h2, h3, h4, h5, h6, body, b, strong, th {
|
44 |
+
color: #595959;
|
45 |
+
}
|
46 |
+
.ratio-8x5 {
|
47 |
+
--bs-aspect-ratio: 62.5%;
|
48 |
+
}
|
49 |
+
.btn-secondary {
|
50 |
+
padding: 0.1rem 0.8rem;
|
51 |
+
font-size: small
|
52 |
+
}
|
53 |
+
.container {
|
54 |
+
max-width: 1620px;
|
55 |
+
}
|
56 |
+
</style>
|
57 |
+
</head>
|
58 |
+
<body>
|
59 |
+
<div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
|
60 |
+
<div class="text-center">
|
61 |
+
<h1>High-Fidelity Simultaneous Speech-To-Speech Translation</h1>
|
62 |
+
<p class="lead">
|
63 |
+
<a href="https://kyutai.org">Kyutai</a>
|
64 |
+
- code on <a href="https://github.com/kyutai-labs/hibiki">GitHub</a>
|
65 |
+
</p>
|
66 |
+
</div>
|
67 |
+
<p>
|
68 |
+
<b>Abstract.</b>
|
69 |
+
We introduce <i>Hibiki</i> ('echo' in Japanese), a model for simultaneous speech translation.
|
70 |
+
Hibiki leverages a multistream language model to synchronously process
|
71 |
+
source and target speech, and jointly produces text and audio tokens to
|
72 |
+
perform speech-to-text and speech-to-speech translation.
|
73 |
+
We furthermore address the fundamental challenge of <i>simultaneous</i> interpretation,
|
74 |
+
which unlike its <i>consecutive</i> counterpart---where one waits for
|
75 |
+
the end of the source utterance to start translating--- adapts its flow
|
76 |
+
to accumulate just enough context to produce a correct translation in
|
77 |
+
real-time, chunk by chunk. <br />
|
78 |
+
To do so, we introduce a weakly-supervised method that leverages the
|
79 |
+
perplexity of an off-the-shelf text translation system to identify
|
80 |
+
optimal delays on a per-word basis and create aligned synthetic data.
|
81 |
+
After supervised training, Hibiki performs adaptive, simultaneous
|
82 |
+
speech translation with vanilla temperature sampling. On a
|
83 |
+
French-English simultaneous speech translation task, Hibiki demonstrates
|
84 |
+
state-of-the-art performance in translation quality, speaker fidelity
|
85 |
+
and naturalness. Moreover, the simplicity of its inference process
|
86 |
+
makes it compatible with batched translation and even real-time
|
87 |
+
on-device deployment.
|
88 |
+
</p>
|
89 |
+
</div>
|
90 |
+
|
91 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
92 |
+
<h3>In the Wild Examples<a id="vis"/></h3>
|
93 |
+
<p class="mb-0">
|
94 |
+
</p>
|
95 |
+
<div class="container pt-3 table-responsive">
|
96 |
+
<table class="table table-hover" width="100%">
|
97 |
+
<tr>
|
98 |
+
<td width="50%">
|
99 |
+
<video class="embed-responsive-item" style="max-width: 80%; min-width: 400px;" controls>
|
100 |
+
<source src="videos/RPckvIkNWhE_ss301_to390_babel_numerique_arte.mp4" type="video/mp4">
|
101 |
+
Your browser does not support HTML video.
|
102 |
+
</video>
|
103 |
+
</td>
|
104 |
+
<td width="50%">
|
105 |
+
<video class="embed-responsive-item" style="max-width: 80%; min-width: 400px;" controls>
|
106 |
+
<source src="videos/uNAmODXvAiQ_ss9_message_a_caractere_informatif.mp4" type="video/mp4">
|
107 |
+
Your browser does not support HTML video.
|
108 |
+
</video>
|
109 |
+
</td>
|
110 |
+
</tr>
<tr>
|
111 |
+
<td>
|
112 |
+
This example comes from a video explaining automated translation.
|
113 |
+
(<a href="https://www.youtube.com/watch?v=RPckvIkNWhE" target="_blank">source</a>, original video (c) Arte)
|
114 |
+
</td>
|
115 |
+
<td>
|
116 |
+
This example comes from a humorous video. The source voice is high-pitched on purpose;
|
117 |
+
it is a good showcase of how well Hibiki replicates pitch and prosody and how robust it is to
|
118 |
+
background noise <b>as no denoising is applied to the audio which is fed raw to Hibiki</b>.
|
119 |
+
(<a href="https://www.youtube.com/watch?v=uNAmODXvAiQ" target="_blank">source</a>, original video (c) Canal+)
|
120 |
+
</td>
|
121 |
+
</tr>
|
122 |
+
</table>
|
123 |
+
</div>
|
124 |
+
</div>
|
125 |
+
|
126 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
127 |
+
<h3>Examples with Ground Truth Interpretation<a id="vis"/></h3>
|
128 |
+
<p class="mb-0">
|
129 |
+
These samples come from the VoxPopuli dataset where the ground truth is real human
|
130 |
+
interpretation.
|
131 |
+
The volume for the sources has been reduced so that it's easier to hear the translations.
|
132 |
+
</p>
|
133 |
+
<div class="container pt-3 table-responsive">
|
134 |
+
<table class="table table-hover" id="vis-table2"></table>
|
135 |
+
</div>
|
136 |
+
</div>
|
137 |
+
|
138 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
139 |
+
<h3>Multistream Visualization<a id="vis"/></h3>
|
140 |
+
<p class="mb-0">
|
141 |
+
The audio for the source and translated versions are on different channels. Use headphones
|
142 |
+
to hear both at the same time. These samples are the same as in the voxpopuli section with CFG
|
143 |
+
set to 3.
|
144 |
+
</p>
|
145 |
+
<div class="container pt-3 table-responsive">
|
146 |
+
<table class="table table-hover" id="vis-table"></table>
|
147 |
+
</div>
|
148 |
+
</div>
|
149 |
+
|
150 |
+
|
151 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
152 |
+
<h3>Impact of Classifier-Free Guidance<a id="voxpopuli"/></h3>
|
153 |
+
<p class="mb-0">
|
154 |
+
Samples taken from the VoxPopuli dataset. The Hibiki samples are presented with different levels
|
155 |
+
of classifier-free guidance (CFG). The higher the CFG value, the closer the generated voice will
|
156 |
+
be to the original voice. This results in very strong accents for the generations with the higher
|
157 |
+
values.
|
158 |
+
</p>
|
159 |
+
|
160 |
+
<div class="container pt-3 table-responsive">
|
161 |
+
<table
|
162 |
+
class="table table-hover"
|
163 |
+
id="voxpopuli-table"
|
164 |
+
>
|
165 |
+
<thead>
|
166 |
+
<tr>
|
167 |
+
<th style="text-align: center">Source</th>
|
168 |
+
<th style="text-align: center">Hibiki CFG-1</th>
|
169 |
+
<th style="text-align: center">Hibiki CFG-3</th>
|
170 |
+
<th style="text-align: center">Hibiki CFG-10</th>
|
171 |
+
<th style="text-align: center">Seamless</th>
|
172 |
+
</tr>
|
173 |
+
</thead>
|
174 |
+
<tbody>
|
175 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
176 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
177 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
178 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
179 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
180 |
+
</tbody>
|
181 |
+
</table>
|
182 |
+
</div>
|
183 |
+
</div>
|
184 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
185 |
+
<h3>Long-form Simultaneous Translations<a id="ntrex"/></h3>
|
186 |
+
<p class="mb-0">
|
187 |
+
Samples taken from the audio NTREX dataset.
|
188 |
+
</p>
|
189 |
+
|
190 |
+
<div class="container pt-3 table-responsive">
|
191 |
+
<table
|
192 |
+
class="table table-hover"
|
193 |
+
id="ntrex-table"
|
194 |
+
>
|
195 |
+
<thead>
|
196 |
+
<tr>
|
197 |
+
<th style="text-align: center;min-width: 200px;">Source</th>
|
198 |
+
<th style="text-align: center;">Hibiki</th>
|
199 |
+
<th style="text-align: center">Seamless</th>
|
200 |
+
</tr>
|
201 |
+
</thead>
|
202 |
+
<tbody>
|
203 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
204 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
205 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
206 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
207 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
208 |
+
</tbody>
|
209 |
+
</table>
|
210 |
+
</div>
|
211 |
+
</div>
|
212 |
+
|
213 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
214 |
+
<h3>Short-form Simultaneous Translations<a id="cvss-c"/></h3>
|
215 |
+
<p class="mb-0">
|
216 |
+
Samples taken from the CVSS-C dataset.
|
217 |
+
</p>
|
218 |
+
|
219 |
+
<div class="container pt-3 table-responsive">
|
220 |
+
<table
|
221 |
+
class="table table-hover"
|
222 |
+
id="cvss-table"
|
223 |
+
>
|
224 |
+
<thead>
|
225 |
+
<tr>
|
226 |
+
<th style="text-align: center;min-width: 200px;">Source</th>
|
227 |
+
<th style="text-align: center;">Hibiki</th>
|
228 |
+
<th style="text-align: center">Seamless</th>
|
229 |
+
</tr>
|
230 |
+
</thead>
|
231 |
+
<tbody>
|
232 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
233 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
234 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
235 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
236 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
237 |
+
</tbody>
|
238 |
+
</table>
|
239 |
+
</div>
|
240 |
+
</div>
|
241 |
+
|
242 |
+
<div class="container p-5 mb-5 bg-white rounded">
|
243 |
+
<p class="mb-0">
|
244 |
+
This page was adapted from the <a href="https://google-research.github.io/seanet/soundstorm/examples">SoundStorm project page</a>.
|
245 |
+
</p>
|
246 |
+
</div>
|
247 |
+
|
248 |
+
|
249 |
+
</body>
|
250 |
</html>
|
251 |
+
|