|
<!DOCTYPE html>
|
|
<html lang="en">
|
|
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
<title>Speech-to-Speech Model Comparison</title>
|
|
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet">
|
|
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
|
|
<style>
|
|
body {
|
|
background-color: #f4f6f9;
|
|
font-family: 'Arial', sans-serif;
|
|
}
|
|
|
|
.container {
|
|
background-color: white;
|
|
border-radius: 10px;
|
|
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
|
|
padding: 30px;
|
|
}
|
|
|
|
h3 {
|
|
font-size: 1.2rem;
|
|
|
|
font-weight: bold;
|
|
color: #333;
|
|
}
|
|
|
|
.form-control {
|
|
border-radius: 25px;
|
|
padding: 15px;
|
|
}
|
|
|
|
.btn {
|
|
border-radius: 25px;
|
|
font-size: 0.9rem;
|
|
padding: 8px 16px;
|
|
transition: background-color 0.3s ease;
|
|
}
|
|
|
|
.btn-primary {
|
|
background-color: #007bff;
|
|
border: none;
|
|
}
|
|
|
|
.btn-primary:hover {
|
|
background-color: #0056b3;
|
|
}
|
|
|
|
.btn-success {
|
|
background-color: #28a745;
|
|
border: none;
|
|
}
|
|
|
|
.btn-success:hover {
|
|
background-color: #218838;
|
|
}
|
|
|
|
.btn-selected {
|
|
background-color: #155724 !important;
|
|
color: white !important;
|
|
}
|
|
|
|
.btn-option {
|
|
font-size: 0.9rem;
|
|
padding: 8px 20px;
|
|
margin: 0 10px;
|
|
}
|
|
|
|
#test-content {
|
|
display: none;
|
|
}
|
|
|
|
#category-select,
|
|
#task-select-dropdown {
|
|
width: 120% !important;
|
|
|
|
margin: 0 auto;
|
|
|
|
}
|
|
|
|
#confirm-choice,
|
|
#next-test {
|
|
display: none;
|
|
transition: opacity 0.3s ease;
|
|
}
|
|
|
|
#model-comparison {
|
|
display: none;
|
|
opacity: 0;
|
|
transition: opacity 0.3s ease;
|
|
}
|
|
|
|
#model-comparison.show {
|
|
opacity: 1;
|
|
}
|
|
|
|
#switch-task {
|
|
font-size: 0.8rem;
|
|
padding: 5px 10px;
|
|
position: absolute;
|
|
top: 10px;
|
|
right: 20px;
|
|
display: none;
|
|
}
|
|
#task-description {
|
|
display: none;
|
|
}
|
|
</style>
|
|
</head>
|
|
|
|
<body>
|
|
<div class="container py-5">
|
|
<h3 class="text-center mb-4">Speech-to-Speech Model Comparison</h3>
|
|
|
|
<!--
  Introductory panel shown before the rater enters a username.
  Paragraphs are siblings rather than nested: a <p> may not contain another
  <p>, and nesting them makes the parser close the outer paragraph early and
  emit a stray empty paragraph for the unmatched closing tag.
-->
<div id="evaluation-info" class="mb-5">
  <p class="text-start">
    <strong>Welcome to the Speech-to-Speech (S2S) Model Evaluation!</strong>
    <br><br>
    In this evaluation, you will assess the performance of 4 S2S models:
    <strong>ChatGPT-4o</strong>, <strong>FunAudioLLM</strong>, <strong>SpeechGPT</strong>, and
    <strong>Mini-Omni</strong>.
    The goal is to evaluate how well these models handle various speech tasks across different domains.
    <br><br>
    Once you select a specific domain and task (e.g., <em>Educational Tutoring</em> and <em>Rhythm Control</em>),
    you will proceed to the evaluation stage. In each round, you will be presented with an audio input.
    For example:
    <br><br>

    <!-- Sample input prompt -->
    <span style="vertical-align: middle; line-height: 1.2; display: inline-block;"><strong>Audio Sample:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/input_audio.wav" type="audio/wav">
    </audio>

    <br><br>
    The corresponding text is:
    <em>"Say the following sentence at my speed first, then say it again very slowly:
    'Artificial intelligence is changing the world in many ways.'" </em>
    <small>(Note: the audio plays at 1.5x the normal speed.)</small>
    <br><br>
    The responses of different S2S models will be provided, and your task is to choose which response best follows
    the instructions. For example<small>(Note: During the evaluation process, you will be provided with responses from only the two models that have the most comparative significance.)</small>:
    <br><br>

    <!-- Sample response: ChatGPT-4o -->
    <span><strong>ChatGPT-4o:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/4o_audio.wav" type="audio/wav">
    </audio>
  </p>
  <p class="text-start" style="margin-left: 20px;">
    <strong>Performance:</strong> Speech: Partially followed the instruction on speed. Semantics: Accurately followed the instruction, with no semantic deviation or missing information.
  </p>

  <!-- Sample response: FunAudioLLM -->
  <div>
    <span><strong>FunAudioLLM:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/FunAudio_audio.wav" type="audio/wav">
    </audio>
  </div>
  <p class="text-start" style="margin-left: 20px;">
    <strong>Performance:</strong> Speech: Partially followed the instruction on speed. Semantics: Accurately followed the instruction, with no semantic deviation or missing information.
  </p>

  <!-- Sample response: SpeechGPT -->
  <div>
    <span><strong>SpeechGPT:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/SpeechGPT.wav" type="audio/wav">
    </audio>
  </div>
  <p class="text-start" style="margin-left: 20px;">
    <strong>Performance:</strong> Speech: Did not follow the instruction on speed. Semantics: Partially followed the instruction, with minor semantic deviation and missing information.
  </p>

  <!-- Sample response: Mini-Omni -->
  <div>
    <span><strong>Mini-Omni:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/mini-omni.wav" type="audio/wav">
    </audio>
  </div>
  <p class="text-start" style="margin-left: 20px;">
    <strong>Performance:</strong> Speech: Did not follow the instruction on speed. Semantics: Did not follow the instruction, with significant semantic deviation and missing information.
  </p>

  <p class="text-start">
    After making your choice, you'll proceed to the next round.
  </p>
  <p class="text-start">
    <strong>Please enter your username and start the evaluation!</strong>
  </p>
</div>
|
|
|
|
<div id="user-input" class="text-center">
|
|
<div class="mb-3">
|
|
<input type="text" id="username" class="form-control w-50 mx-auto" placeholder="Your username" />
|
|
</div>
|
|
<button class="btn btn-primary" onclick="startTest()">Start Test</button>
|
|
</div>
|
|
|
|
|
|
<div id="task-select" class="text-center" style="display: none;">
|
|
<h3 class="my-4">Select Test Category:</h3>
|
|
<div class="d-grid gap-2 col-6 mx-auto">
|
|
|
|
<select id="category-select" class="form-select mx-auto" onchange="populateTasks()">
|
|
<option value="" disabled selected>Select Category</option>
|
|
<option value="educational">Educational Tutoring</option>
|
|
<option value="social">Social Companionship</option>
|
|
<option value="entertainment">Entertainment Dubbing</option>
|
|
<option value="medical">Medical Consultation</option>
|
|
</select>
|
|
</div>
|
|
|
|
<h3 class="my-4" id="specific-task-title" style="display: none;">Select Specific Task:</h3>
|
|
<div class="d-grid gap-2 col-6 mx-auto">
|
|
|
|
<select id="task-select-dropdown" class="form-select mx-auto" style="display: none;">
|
|
<option value="" disabled selected>Select Specific Task</option>
|
|
|
|
</select>
|
|
</div>
|
|
|
|
<button class="btn btn-primary mt-4" id="start-task-btn" onclick="selectTaskFromDropdown()"
|
|
style="display: none;">Start Task</button>
|
|
</div>
|
|
|
|
<button id="switch-task" class="btn btn-warning" onclick="switchTask()">Switch Category and Tasks</button>
|
|
|
|
<div id="test-content">
|
|
<div class="text-center">
|
|
|
|
<div class="row justify-content-center">
|
|
<div class="col-md-6 text-start double-text" style="margin-bottom: 10px;">
|
|
<strong>Task description:</strong> <span id="task-description"></span>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<div class="row justify-content-center">
|
|
<div class="col-md-6 d-flex justify-content-center align-items-center mb-4">
|
|
<strong class="me-2">Audio:</strong>
|
|
<audio id="input-audio" controls></audio>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="row justify-content-center">
|
|
<div class="col-md-6 text-start double-text" style="margin-bottom: 10px;">
|
|
<strong>Audio text:</strong> <span id="test-text"></span>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<div class="row justify-content-center">
|
|
<div class="col-md-6 text-start">
|
|
<p><strong>Question:</strong> Which of the two models below gives the better response?</p>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<div class="mb-4 text-center">
|
|
<div class="model-section d-flex align-items-center justify-content-center mb-3">
|
|
<h6 class="me-2" style="margin-bottom: 0; margin-top: 5px; font-weight: bold;">Model A:</h6>
|
|
<audio id="audio-a" controls></audio>
|
|
</div>
|
|
<div class="model-section d-flex align-items-center justify-content-center">
|
|
<h6 class="me-2" style="margin-bottom: 0; margin-top: 5px; font-weight: bold;">Model B:</h6>
|
|
<audio id="audio-b" controls></audio>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<div class="d-flex justify-content-center mt-4">
|
|
<button class="btn btn-success btn-option mx-2" onclick="selectModel('A')">Model A</button>
|
|
<button class="btn btn-success btn-option mx-2" onclick="selectModel('B')">Model B</button>
|
|
</div>
|
|
|
|
<div id="model-comparison" class="text-center mt-4">
|
|
<p>Model A: <span id="model-a"></span></p>
|
|
<p>Model B: <span id="model-b"></span></p>
|
|
<p>Your choice: <span id="chosen-model"></span></p>
|
|
</div>
|
|
|
|
<button id="confirm-choice" class="btn btn-primary mt-4" onclick="confirmChoice()">Confirm
|
|
Selection</button>
|
|
<button id="next-test" class="btn btn-primary mt-4" onclick="loadNextTest()">Next Test</button>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="test-completed" class="text-center" style="display: none;">
|
|
<h3>Thank you for completing the <span id="completed-task"></span> test!</h3>
|
|
<p>Would you like to test another category or task?</p>
|
|
<button class="btn btn-primary" onclick="switchTask()">Yes</button>
|
|
<button class="btn btn-secondary" onclick="endTest()">No</button>
|
|
</div>
|
|
|
|
|
|
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.bundle.min.js"></script>
|
|
<script>
|
|
// ---- Shared page state ----------------------------------------------------

// Name typed on the start screen; assigned by startTest().
let username;

// Identifier of the task currently being evaluated (an <option> value).
let task;

// Letter passed to selectModel() by the option buttons ('A' or 'B');
// loadNextTest() resets it to null at the start of every round.
let chosenModel;

// Display names of the two models compared in the current round.
let modelA;
let modelB;

// Translates the model keys delivered by /next_test (data.model_a / data.model_b)
// into the names shown to the rater.
const modelNames = {
    output_path_speechgpt: "SpeechGPT",
    output_path_miniomni: "Mini-Omni",
    output_path_4o: "ChatGPT-4o",
    output_path_funaudio: "FunAudioLLM",
    output_path_4o_cascade: "Cascade",
    output_path_4o_llama_omni: "LLaMA-Omni"
};
|
|
|
|
/**
 * Click handler for the "Start Test" button.
 *
 * Reads the username field, rejects empty or whitespace-only input with an
 * alert, and otherwise stores the trimmed name in the shared `username`
 * variable and swaps the intro/username panels for the task picker.
 */
function startTest() {
    // Trim first so that "   " is treated like an empty field instead of
    // being sent to the server as a username.
    const enteredName = String($("#username").val() || "").trim();

    if (!enteredName) {
        alert("Please enter a username");
        return;
    }

    // Only overwrite the shared state once the value is known to be usable.
    username = enteredName;

    $("#evaluation-info").hide();
    $("#user-input").hide();
    $("#task-select").show();
}
|
|
|
|
/**
 * Returns the page to the category/task picker.
 *
 * Used by the "Switch Category and Tasks" button and by the "Yes" button on
 * the completion panel. Clears everything the previous round displayed,
 * hides the evaluation and completion panels, and shows the picker again.
 */
function switchTask() {
    // Wipe all text left over from the previous round.
    $("#task-description, #test-text, #chosen-model, #model-a, #model-b").text('');

    // Blank the source of the prompt player and both response players.
    $("#input-audio, #audio-a, #audio-b").attr("src", '');

    // Take the evaluation UI, its action buttons and the completion panel
    // off screen, together with the switch button itself.
    $("#test-content, #confirm-choice, #next-test, #test-completed, #switch-task").hide();

    // The comparison panel also drops its "show" class (opacity 1 in the CSS).
    $("#model-comparison").removeClass('show').hide();

    $("#task-select").show();
}
|
|
|
|
/**
 * Starts an evaluation session for the given task.
 *
 * Stores the task in the shared `task` variable, clears whatever the previous
 * round left on screen, and POSTs the username/task pair to /start_test.
 * The switch from the picker to the evaluation view happens only after the
 * server has accepted the request, mirroring selectTaskFromDropdown(); a
 * failed request therefore leaves the picker visible instead of a blank page.
 *
 * No caller is visible in this page's markup; the signature is kept so any
 * external caller continues to work.
 *
 * @param {string} selectedTask Task identifier sent to the server.
 */
function selectTask(selectedTask) {
    task = selectedTask;

    // Reset everything that belongs to the previous round.
    $("#task-description, #test-text, #chosen-model, #model-a, #model-b").text('');
    $("#input-audio, #audio-a, #audio-b").attr("src", '');
    $("#confirm-choice, #next-test").hide();
    $("#model-comparison").removeClass('show').hide();

    $.ajax({
        url: '/start_test',
        type: 'POST',
        contentType: 'application/json',
        data: JSON.stringify({ username: username, task: task }),
        success: function (data) {
            // #task-description is display:none in the stylesheet, so it has
            // to be revealed explicitly or the description never appears.
            if (data && data.task_description !== undefined) {
                $("#task-description").text(data.task_description);
            }
            $("#task-description").show();

            $("#task-select").hide();
            $("#test-content").show();
            $("#switch-task").show();
            loadNextTest();
        },
        error: function (xhr, status, error) {
            console.error("Error occurred: ", status, error);
        }
    });
}
|
|
|
|
/**
 * Rebuilds the "specific task" dropdown for the selected category.
 *
 * Wired to the category <select>'s onchange attribute. The dropdown is
 * emptied, given its disabled placeholder, and filled with the tasks of the
 * chosen category. The task heading, dropdown and "Start Task" button are
 * shown when a category is selected and hidden otherwise.
 */
function populateTasks() {
    // [option value, visible label] pairs, in display order, per category.
    const catalogue = {
        educational: [
            ["pronunciation", "Correcting pronunciation ability"],
            ["rhythm", "Rhythm control capabilities"],
            ["translation", "Cross-language translation with emotion"],
            ["language", "Language consistency"],
            ["pause", "Pause and segmentation"],
            ["polyphone", "Polyphonic word comprehension"],
            ["stress", "Emphasis control"]
        ],
        social: [
            ["emotion", "Emotion recognition and expression"],
            ["identity", "Identity coping ability"],
            ["humor", "Implications ability"],
            ["irony", "Sarcasm detection"]
        ],
        entertainment: [
            ["natural", "Ability to simulate natural sound"],
            ["singing", "Singing ability"],
            ["tongue", "Tongue twisters capabilities"],
            ["crosstalk", "Crosstalk ability"],
            ["poetry", "Poetry recitation"],
            ["role", "Role-playing"],
            ["story", "Storytelling"]
        ],
        medical: [
            ["healthcare", "Health consultation"],
            ["illness", "Querying symptoms"],
            ["psychological", "Psychological comfort"]
        ]
    };

    const category = $("#category-select").val();
    const dropdown = $("#task-select-dropdown");

    dropdown.empty();
    dropdown.append('<option value="" disabled selected>Select Specific Task</option>');

    // A category without an entry contributes no options beyond the placeholder.
    const entries = Object.prototype.hasOwnProperty.call(catalogue, category)
        ? catalogue[category]
        : [];
    entries.forEach(function (entry) {
        dropdown.append('<option value="' + entry[0] + '">' + entry[1] + '</option>');
    });

    const dependentControls = $("#specific-task-title, #task-select-dropdown, #start-task-btn");
    if (category) {
        dependentControls.show();
    } else {
        dependentControls.hide();
    }
}
|
|
|
|
|
|
/**
 * Click handler for the "Start Task" button.
 *
 * Reads the task chosen in the dropdown; without a choice it alerts and
 * stops. Otherwise it records the task in the shared `task` variable and
 * POSTs username/task to /start_test. On success it shows the task
 * description returned by the server, switches from the picker to the
 * evaluation view and requests the first round.
 */
function selectTaskFromDropdown() {
    const picked = $("#task-select-dropdown").val();

    // Guard: nothing selected yet.
    if (!picked) {
        alert("Please select a specific task.");
        return;
    }

    task = picked;

    // Runs once the server has accepted the session.
    function onStarted(data) {
        const description = $("#task-description");
        description.text(data.task_description);
        description.show();

        $("#task-select").hide();
        $("#test-content").show();
        $("#switch-task").show();
        loadNextTest();
    }

    // Failures are only reported on the console.
    function onFailed(xhr, status, error) {
        console.error("Error occurred: ", status, error);
    }

    $.ajax({
        url: '/start_test',
        type: 'POST',
        contentType: 'application/json',
        data: JSON.stringify({ username: username, task: task }),
        success: onStarted,
        error: onFailed
    });
}
|
|
|
|
|
|
|
|
|
|
/**
 * Fetches the next comparison round from /next_test and renders it.
 *
 * When the response carries message === 'Test completed', the evaluation
 * view is replaced by the completion panel. Otherwise the prompt text, the
 * three audio sources and the two model names are loaded, and the selection
 * controls are returned to their initial state.
 */
function loadNextTest() {
    $.get('/next_test', function (data) {
        if (data.message === 'Test completed') {
            $("#test-content").hide();
            $("#test-completed").show();
            $("#completed-task").text(task);
            sessionStorage.removeItem('current_index');
            return;
        }

        $("#task-description").text(data.task_description);
        $("#test-text").text(data.text);
        $("#input-audio").attr("src", data.input_path);
        $("#audio-a").attr("src", data.audio_a);
        $("#audio-b").attr("src", data.audio_b);

        // Fall back to the raw key (or an empty string) for models missing
        // from modelNames: jQuery's .text(undefined) changes nothing, which
        // would leave the previous round's model name on display.
        modelA = modelNames[data.model_a] || data.model_a || '';
        modelB = modelNames[data.model_b] || data.model_b || '';
        $("#model-a").text(modelA);
        $("#model-b").text(modelB);

        $("#next-test").hide();
        // Drop the "show" class as well, as the other reset paths do, so the
        // panel starts every round from its hidden, transparent state.
        $("#model-comparison").removeClass('show').hide();
        $("#confirm-choice").show();

        // Fresh round: no choice yet, both option buttons enabled and unmarked.
        chosenModel = null;
        $(".btn-option")
            .prop('disabled', false)
            .removeClass("btn-selected")
            .addClass("btn-success");
    }, 'json').fail(function (xhr, status, error) {
        console.error("Failed to load test data:", status, error);
    });
}
|
|
|
|
/**
 * Click handler for the "No" button on the completion panel: thanks the
 * rater with an alert and then navigates to the thank-you page.
 */
function endTest() {
    const farewell = "Thank you for participating in the test!";
    const destination = "/thank_you";

    alert(farewell);
    window.location.href = destination;
}
|
|
|
|
/**
 * Records which response the rater prefers and highlights its button.
 *
 * @param {string} model 'A' or 'B'. Any other value is ignored, so an invalid
 *     identifier can never be stored in `chosenModel` and later submitted.
 */
function selectModel(model) {
    if (model !== 'A' && model !== 'B') {
        console.error("Unknown model option:", model);
        return;
    }

    chosenModel = model;

    // Return both option buttons to their neutral look before marking one.
    $(".btn-option")
        .prop('disabled', false)
        .removeClass("btn-selected")
        .addClass("btn-success");

    // Restrict the text match to the option buttons so no other button that
    // happens to contain the same words gets restyled.
    $(".btn-option:contains('Model " + model + "')")
        .removeClass("btn-success")
        .addClass("btn-selected");
}
|
|
|
|
/**
 * Click handler for "Confirm Selection".
 *
 * Requires a prior choice, locks the option buttons, reveals which model was
 * behind each label, and POSTs the chosen letter to /submit_result.
 * "Next Test" appears only after the server has acknowledged the submission,
 * so a failed or still-pending save cannot be skipped past. On failure the
 * rater is told and the confirm button returns for a retry; the option
 * buttons stay locked because the model names have already been revealed.
 */
function confirmChoice() {
    if (!chosenModel) {
        alert("Please select a model before confirming.");
        return;
    }

    // Freeze the choice.
    $(".btn-option").prop('disabled', true);

    // Reveal the identities and the rater's pick.
    $("#chosen-model").text(chosenModel === 'A' ? modelA : modelB);
    $("#model-a").text(modelA);
    $("#model-b").text(modelB);
    $("#model-comparison").addClass('show').show();

    $("#confirm-choice").hide();

    $.ajax({
        url: '/submit_result',
        type: 'POST',
        contentType: 'application/json',
        data: JSON.stringify({ chosen_model: chosenModel }),
        success: function () {
            $("#next-test").show();
        },
        error: function (xhr, status, error) {
            console.error("Error occurred: ", status, error);
            // NOTE(review): if the server stored the result but the response
            // was lost, a retry may record it twice - confirm that
            // /submit_result tolerates a repeated submission.
            alert("Your choice could not be saved. Please click Confirm Selection again.");
            $("#confirm-choice").show();
        }
    });
}
|
|
</script>
|
|
</body>
|
|
|
|
</html> |