benjamin-paine
committed on
Commit
·
caca220
1
Parent(s):
55d3ab6
Switch to kokoro
Browse files- Dockerfile +4 -3
- www/index.css +11 -11
- www/index.html +7 -7
- www/index.js +15 -64
Dockerfile
CHANGED
@@ -3,7 +3,7 @@ FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
|
|
3 |
# Model choices
|
4 |
ARG TEXT_MODEL=llama-v3-2-3b-instruct-q6-k
|
5 |
ARG TRANSCRIBE_MODEL=distilled-whisper-large-v3
|
6 |
-
ARG SPEECH_MODEL=
|
7 |
|
8 |
# Create user
|
9 |
RUN useradd -m -u 1000 anachrovox
|
@@ -38,6 +38,7 @@ RUN apt-get update && \
|
|
38 |
curl wget procps \
|
39 |
htop vim \
|
40 |
python3-pip python3-dev \
|
|
|
41 |
nginx && \
|
42 |
rm -rf /var/lib/apt/lists/*
|
43 |
|
@@ -55,8 +56,8 @@ RUN taproot install \
|
|
55 |
audio-transcription:${TRANSCRIBE_MODEL} \
|
56 |
text-generation:${TEXT_MODEL} \
|
57 |
speech-synthesis:${SPEECH_MODEL} \
|
58 |
-
--
|
59 |
-
--
|
60 |
|
61 |
# Copy run script
|
62 |
COPY --chown=anachrovox --chmod=755 run.sh /app/run.sh
|
|
|
3 |
# Model choices
|
4 |
ARG TEXT_MODEL=llama-v3-2-3b-instruct-q6-k
|
5 |
ARG TRANSCRIBE_MODEL=distilled-whisper-large-v3
|
6 |
+
ARG SPEECH_MODEL=kokoro
|
7 |
|
8 |
# Create user
|
9 |
RUN useradd -m -u 1000 anachrovox
|
|
|
38 |
curl wget procps \
|
39 |
htop vim \
|
40 |
python3-pip python3-dev \
|
41 |
+
espeak-ng \
|
42 |
nginx && \
|
43 |
rm -rf /var/lib/apt/lists/*
|
44 |
|
|
|
56 |
audio-transcription:${TRANSCRIBE_MODEL} \
|
57 |
text-generation:${TEXT_MODEL} \
|
58 |
speech-synthesis:${SPEECH_MODEL} \
|
59 |
+
--optional \
|
60 |
+
--debug
|
61 |
|
62 |
# Copy run script
|
63 |
COPY --chown=anachrovox --chmod=755 run.sh /app/run.sh
|
www/index.css
CHANGED
@@ -1159,10 +1159,10 @@ div.slider {
|
|
1159 |
|
1160 |
div.slider-element {
|
1161 |
position: absolute;
|
1162 |
-
width:
|
1163 |
-
left: calc(50% -
|
1164 |
-
height:
|
1165 |
-
margin-top: -
|
1166 |
perspective: 32px;
|
1167 |
cursor: grab;
|
1168 |
z-index: 2;
|
@@ -1173,17 +1173,17 @@ div.slider-element::after {
|
|
1173 |
content: "\A";
|
1174 |
display: block;
|
1175 |
position: absolute;
|
1176 |
-
width:
|
1177 |
-
height:
|
1178 |
-
transform-origin: center;
|
1179 |
background-blend-mode: overlay, normal;
|
1180 |
box-shadow: 4px 4px 6px rgba(0,0,0,0.4),
|
1181 |
10px 10px 14px rgba(0,0,0,0.2);
|
1182 |
}
|
1183 |
|
1184 |
div.slider-element::before {
|
1185 |
-
left:
|
1186 |
-
transform: rotateY(-15deg);
|
1187 |
background-image:
|
1188 |
linear-gradient(to right, #CCC, #888),
|
1189 |
url("static/black-plastic.jpg");
|
@@ -1192,8 +1192,8 @@ div.slider-element::before {
|
|
1192 |
}
|
1193 |
|
1194 |
div.slider-element::after {
|
1195 |
-
right:
|
1196 |
-
transform: rotateY(15deg);
|
1197 |
background-image:
|
1198 |
linear-gradient(to right, #444, #333),
|
1199 |
url("static/black-plastic.jpg");
|
|
|
1159 |
|
1160 |
div.slider-element {
|
1161 |
position: absolute;
|
1162 |
+
width: 140px;
|
1163 |
+
left: calc(50% - 70px);
|
1164 |
+
height: 40px;
|
1165 |
+
margin-top: -20px;
|
1166 |
perspective: 32px;
|
1167 |
cursor: grab;
|
1168 |
z-index: 2;
|
|
|
1173 |
content: "\A";
|
1174 |
display: block;
|
1175 |
position: absolute;
|
1176 |
+
width: 50px;
|
1177 |
+
height: 40px;
|
1178 |
+
transform-origin: center center;
|
1179 |
background-blend-mode: overlay, normal;
|
1180 |
box-shadow: 4px 4px 6px rgba(0,0,0,0.4),
|
1181 |
10px 10px 14px rgba(0,0,0,0.2);
|
1182 |
}
|
1183 |
|
1184 |
div.slider-element::before {
|
1185 |
+
left: 21px;
|
1186 |
+
transform: rotateY(-15deg) scaleY(0.6);
|
1187 |
background-image:
|
1188 |
linear-gradient(to right, #CCC, #888),
|
1189 |
url("static/black-plastic.jpg");
|
|
|
1192 |
}
|
1193 |
|
1194 |
div.slider-element::after {
|
1195 |
+
right: 21px;
|
1196 |
+
transform: rotateY(15deg) scaleY(0.6);
|
1197 |
background-image:
|
1198 |
linear-gradient(to right, #444, #333),
|
1199 |
url("static/black-plastic.jpg");
|
www/index.html
CHANGED
@@ -152,7 +152,7 @@
|
|
152 |
<button id="info-button">i</button>
|
153 |
<div id="info-content">
|
154 |
<h1 id="info-logo">Anachrovox</h1>
|
155 |
-
<h2 id="info-version">Alpha Release v0.1.
|
156 |
<h3>Instructions</h3>
|
157 |
<p>To issue a voice command in a hands-free fashion, start your command with one of the supported wake phrases, then issue your command naturally (i.e. you do not need to pause.) There are many supported wake phrases but they all include 'Vox' - for example, 'Hey Vox', 'Hi Vox', or just 'Vox'.</p>
|
158 |
<p>There are numerous ways to shortcut the speech-to-speech workflow:</p>
|
@@ -176,7 +176,7 @@
|
|
176 |
<li>Wake-word detection using <a href="https://github.com/painebenjamin/hey-buddy" target="_blank">Hey Buddy</a></li>
|
177 |
<li>Speech-to-text using <a href="https://huggingface.co/distil-whisper/distil-large-v3" target="_blank">Distil-Whisper Large</a></li>
|
178 |
<li>Text generation using <a href="https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct" target="_blank">Llama 3.2 3B Instruct</a></li>
|
179 |
-
<li>Text-to-speech using <a href="https://
|
180 |
<li>Audio enhancement using <a href="https://github.com/Rikorose/DeepFilterNet" target="_blank">DeepFilterNet</a></li>
|
181 |
</ol>
|
182 |
<p>All backend models are run through Taproot, which is made to be as low-overhead as possible, allowing for real-time operation on consumer hardware. These are just a small selection of the supported model set, but were chosen for their balance of speed, size and capability. Visit <a href="https://github.com/painebenjamin/anachrovox" target="_blank">the Anachrovox GitHub</a> to see how to build with different supported components and/or visit <a href="https://github.com/painebenjamin/taproot" target="_blank">the Taproot GitHub</a> to request a new supported component or learn how to build your own (and hopefully contribute it back to the community!)</p>
|
@@ -206,11 +206,11 @@
|
|
206 |
primaryClass = {cs.AI},
|
207 |
url = {https://arxiv.org/abs/2407.21783}
|
208 |
}</cite>
|
209 |
-
<cite>@misc{
|
210 |
-
title = {
|
211 |
-
author = {
|
212 |
-
year = {
|
213 |
-
url = {https://
|
214 |
}</cite>
|
215 |
<cite>@inproceedings{schroeter2023deepfilternet3,
|
216 |
title = {{DeepFilterNet}: Perceptually Motivated Real-Time Speech Enhancement},
|
|
|
152 |
<button id="info-button">i</button>
|
153 |
<div id="info-content">
|
154 |
<h1 id="info-logo">Anachrovox</h1>
|
155 |
+
<h2 id="info-version">Alpha Release v0.1.3</h2>
|
156 |
<h3>Instructions</h3>
|
157 |
<p>To issue a voice command in a hands-free fashion, start your command with one of the supported wake phrases, then issue your command naturally (i.e. you do not need to pause.) There are many supported wake phrases but they all include 'Vox' - for example, 'Hey Vox', 'Hi Vox', or just 'Vox'.</p>
|
158 |
<p>There are numerous ways to shortcut the speech-to-speech workflow:</p>
|
|
|
176 |
<li>Wake-word detection using <a href="https://github.com/painebenjamin/hey-buddy" target="_blank">Hey Buddy</a></li>
|
177 |
<li>Speech-to-text using <a href="https://huggingface.co/distil-whisper/distil-large-v3" target="_blank">Distil-Whisper Large</a></li>
|
178 |
<li>Text generation using <a href="https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct" target="_blank">Llama 3.2 3B Instruct</a></li>
|
179 |
+
<li>Text-to-speech using <a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank">Kokoro</a></li>
|
180 |
<li>Audio enhancement using <a href="https://github.com/Rikorose/DeepFilterNet" target="_blank">DeepFilterNet</a></li>
|
181 |
</ol>
|
182 |
<p>All backend models are run through Taproot, which is made to be as low-overhead as possible, allowing for real-time operation on consumer hardware. These are just a small selection of the supported model set, but were chosen for their balance of speed, size and capability. Visit <a href="https://github.com/painebenjamin/anachrovox" target="_blank">the Anachrovox GitHub</a> to see how to build with different supported components and/or visit <a href="https://github.com/painebenjamin/taproot" target="_blank">the Taproot GitHub</a> to request a new supported component or learn how to build your own (and hopefully contribute it back to the community!)</p>
|
|
|
206 |
primaryClass = {cs.AI},
|
207 |
url = {https://arxiv.org/abs/2407.21783}
|
208 |
}</cite>
|
209 |
+
<cite>@misc{kokoro82m,
|
210 |
+
title = {Kokoro-82M},
|
211 |
+
author = {@rzvzn},
|
212 |
+
year = {2024},
|
213 |
+
url = {https://huggingface.co/hexgrad/Kokoro-82M}
|
214 |
}</cite>
|
215 |
<cite>@inproceedings{schroeter2023deepfilternet3,
|
216 |
title = {{DeepFilterNet}: Perceptually Motivated Real-Time Speech Enhancement},
|
www/index.js
CHANGED
@@ -16,16 +16,14 @@ const [dpR, dpG, dpB] = [
|
|
16 |
const pollingInterval = 150;
|
17 |
const transcriptionParameters = {};
|
18 |
const languageParameters = {
|
19 |
-
max_tokens: 512,
|
20 |
role: "anachrovox",
|
21 |
stream: true,
|
22 |
use_tools: true,
|
|
|
23 |
return_tool_metadata: true,
|
24 |
};
|
25 |
const speechParameters = {
|
26 |
enhance: true,
|
27 |
-
stream: false,
|
28 |
-
stream_chunk_size: 25,
|
29 |
output_format: "float"
|
30 |
};
|
31 |
const waveformParameters = {
|
@@ -49,6 +47,7 @@ const speakerHoleRings = [ // radius, number of holes
|
|
49 |
const maxTypingSpeed = 200; // characters per second
|
50 |
const minTypingSpeed = 50;
|
51 |
const maxDelay = 0.5; // max length to delay from completion to wait for speech to start generating
|
|
|
52 |
let overseerAddress;
|
53 |
|
54 |
if (window.location.port === "3000") {
|
@@ -140,66 +139,18 @@ const pushText = (text, className) => {
|
|
140 |
};
|
141 |
|
142 |
// Bind voice ID wheel to change voice ID
|
143 |
-
// This is
|
144 |
const voiceMap = {
|
145 |
-
"
|
146 |
-
"
|
147 |
-
"
|
148 |
-
"
|
149 |
-
"
|
150 |
-
"
|
151 |
-
"
|
152 |
-
"
|
153 |
-
"
|
154 |
-
"
|
155 |
-
"Barbara": "Barbora MacLean",
|
156 |
-
"Blake": "Baldur Sanjin",
|
157 |
-
"Brenda": "Brenda Stern",
|
158 |
-
"Brian": "Badr Odhiambo",
|
159 |
-
"Carla": "Camilla Holmström",
|
160 |
-
"Cindy": "Chandra MacFarland",
|
161 |
-
"Clara": "Claribel Dervla",
|
162 |
-
"Clark": "Kumar Dahl",
|
163 |
-
"Craig": "Craig Gutsy",
|
164 |
-
"Daisy": "Daisy Studious",
|
165 |
-
"Damien": "Damien Black",
|
166 |
-
"David": "Dionisio Schuyler",
|
167 |
-
"Dennis": "Damjan Chapman",
|
168 |
-
"Ella": "Uta Obando",
|
169 |
-
"Eugene": "Eugenio Mataracı",
|
170 |
-
"Frank": "Ferran Simen",
|
171 |
-
"Gilbert": "Gilberto Mathias",
|
172 |
-
"Gina": "Gitta Nikolina",
|
173 |
-
"Grace": "Gracie Wise",
|
174 |
-
"Heidi": "Henriette Usha",
|
175 |
-
"Ian": "Ige Behringer",
|
176 |
-
"Ivan": "Ilkin Urbano",
|
177 |
-
"Kevin": "Kazuhiko Atallah",
|
178 |
-
"Lily": "Lilya Stainthorpe",
|
179 |
-
"Louis": "Luis Moray",
|
180 |
-
"Lucas": "Ludvig Milivoj",
|
181 |
-
"Lydia": "Lidiya Szekeres",
|
182 |
-
"Marcus": "Marcos Rudaski",
|
183 |
-
"Maya": "Maja Ruoho",
|
184 |
-
"Nadia": "Narelle Moon",
|
185 |
-
"Nora": "Nova Hogarth",
|
186 |
-
"Philip": "Filip Traverse",
|
187 |
-
"Raymond": "Royston Min",
|
188 |
-
"Rose": "Rosemary Okafor",
|
189 |
-
"Saul": "Suad Qasim",
|
190 |
-
"Sofia": "Sofia Hellen",
|
191 |
-
"Sophie": "Szofi Granger",
|
192 |
-
"Tammy": "Tammie Ema",
|
193 |
-
"Tanya": "Tanja Adelina",
|
194 |
-
"Tara": "Tammy Grit",
|
195 |
-
"Thomas": "Torcull Diarmuid",
|
196 |
-
"Victor": "Viktor Eka",
|
197 |
-
"Victor": "Viktor Menelaos",
|
198 |
-
"Violet": "Vjollca Johnnie",
|
199 |
-
"Warren": "Wulf Carlevaro",
|
200 |
-
"Xavier": "Xavier Hayasaka",
|
201 |
-
"Zachary": "Zacharie Aimilios",
|
202 |
-
"Zoe": "Zofija Kendrick",
|
203 |
};
|
204 |
const voiceNames = Object.keys(voiceMap);
|
205 |
const voiceIds = Object.values(voiceMap);
|
@@ -211,7 +162,7 @@ const setVoiceIndex = (newIndex) => {
|
|
211 |
voiceIndex = newIndex;
|
212 |
}
|
213 |
};
|
214 |
-
setVoiceIndex(
|
215 |
voiceIdWheel.addEventListener("click", () => {
|
216 |
let newVoiceIndex = voiceIndex + parseInt(voiceIdWheel.value);
|
217 |
if (newVoiceIndex < 0) newVoiceIndex = voiceIds.length - 1;
|
@@ -245,7 +196,7 @@ const getSpeechParameters = (overrides = {}) => {
|
|
245 |
return {
|
246 |
...speechParameters,
|
247 |
speed: parseFloat(speed.value),
|
248 |
-
|
249 |
...overrides,
|
250 |
};
|
251 |
};
|
|
|
16 |
const pollingInterval = 150;
|
17 |
const transcriptionParameters = {};
|
18 |
const languageParameters = {
|
|
|
19 |
role: "anachrovox",
|
20 |
stream: true,
|
21 |
use_tools: true,
|
22 |
+
max_tokens: 1024,
|
23 |
return_tool_metadata: true,
|
24 |
};
|
25 |
const speechParameters = {
|
26 |
enhance: true,
|
|
|
|
|
27 |
output_format: "float"
|
28 |
};
|
29 |
const waveformParameters = {
|
|
|
47 |
const maxTypingSpeed = 200; // characters per second
|
48 |
const minTypingSpeed = 50;
|
49 |
const maxDelay = 0.5; // max length to delay from completion to wait for speech to start generating
|
50 |
+
|
51 |
let overseerAddress;
|
52 |
|
53 |
if (window.location.port === "3000") {
|
|
|
139 |
};
|
140 |
|
141 |
// Bind voice ID wheel to change voice ID
|
142 |
+
// This is the list of voices from Kokoro
|
143 |
const voiceMap = {
|
144 |
+
"Adam": "male.en.us.adam",
|
145 |
+
"Bella": "female.en.us.bella",
|
146 |
+
"Emma": "female.en.gb.emma",
|
147 |
+
"George": "male.en.gb.george",
|
148 |
+
"Isabel": "female.en.gb.isabella",
|
149 |
+
"Lewis": "male.en.gb.lewis",
|
150 |
+
"Michael": "male.en.us.michael",
|
151 |
+
"Nicole": "female.en.us.nicole",
|
152 |
+
"Sarah": "female.en.us.sarah",
|
153 |
+
"Skye": "female.en.us.sky",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
};
|
155 |
const voiceNames = Object.keys(voiceMap);
|
156 |
const voiceIds = Object.values(voiceMap);
|
|
|
162 |
voiceIndex = newIndex;
|
163 |
}
|
164 |
};
|
165 |
+
// Start on a randomly chosen voice. Math.floor keeps the index within
// 0..voiceIds.length - 1 and weights every voice equally; Math.round could
// yield voiceIds.length (one past the end) and halved the odds of index 0.
setVoiceIndex(Math.floor(Math.random() * voiceIds.length));
|
166 |
voiceIdWheel.addEventListener("click", () => {
|
167 |
let newVoiceIndex = voiceIndex + parseInt(voiceIdWheel.value);
|
168 |
if (newVoiceIndex < 0) newVoiceIndex = voiceIds.length - 1;
|
|
|
196 |
return {
|
197 |
...speechParameters,
|
198 |
speed: parseFloat(speed.value),
|
199 |
+
voice: voiceMap[voiceId.value],
|
200 |
...overrides,
|
201 |
};
|
202 |
};
|