benjamin-paine committed on
Commit
caca220
·
1 Parent(s): 55d3ab6

Switch to kokoro

Browse files
Files changed (4) hide show
  1. Dockerfile +4 -3
  2. www/index.css +11 -11
  3. www/index.html +7 -7
  4. www/index.js +15 -64
Dockerfile CHANGED
@@ -3,7 +3,7 @@ FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
3
  # Model choices
4
  ARG TEXT_MODEL=llama-v3-2-3b-instruct-q6-k
5
  ARG TRANSCRIBE_MODEL=distilled-whisper-large-v3
6
- ARG SPEECH_MODEL=xtts-v2
7
 
8
  # Create user
9
  RUN useradd -m -u 1000 anachrovox
@@ -38,6 +38,7 @@ RUN apt-get update && \
38
  curl wget procps \
39
  htop vim \
40
  python3-pip python3-dev \
 
41
  nginx && \
42
  rm -rf /var/lib/apt/lists/*
43
 
@@ -55,8 +56,8 @@ RUN taproot install \
55
  audio-transcription:${TRANSCRIBE_MODEL} \
56
  text-generation:${TEXT_MODEL} \
57
  speech-synthesis:${SPEECH_MODEL} \
58
- --debug \
59
- --optional
60
 
61
  # Copy run script
62
  COPY --chown=anachrovox --chmod=755 run.sh /app/run.sh
 
3
  # Model choices
4
  ARG TEXT_MODEL=llama-v3-2-3b-instruct-q6-k
5
  ARG TRANSCRIBE_MODEL=distilled-whisper-large-v3
6
+ ARG SPEECH_MODEL=kokoro
7
 
8
  # Create user
9
  RUN useradd -m -u 1000 anachrovox
 
38
  curl wget procps \
39
  htop vim \
40
  python3-pip python3-dev \
41
+ espeak-ng \
42
  nginx && \
43
  rm -rf /var/lib/apt/lists/*
44
 
 
56
  audio-transcription:${TRANSCRIBE_MODEL} \
57
  text-generation:${TEXT_MODEL} \
58
  speech-synthesis:${SPEECH_MODEL} \
59
+ --optional \
60
+ --debug
61
 
62
  # Copy run script
63
  COPY --chown=anachrovox --chmod=755 run.sh /app/run.sh
www/index.css CHANGED
@@ -1159,10 +1159,10 @@ div.slider {
1159
 
1160
  div.slider-element {
1161
  position: absolute;
1162
- width: 100px;
1163
- left: calc(50% - 50px);
1164
- height: 20px;
1165
- margin-top: -10px;
1166
  perspective: 32px;
1167
  cursor: grab;
1168
  z-index: 2;
@@ -1173,17 +1173,17 @@ div.slider-element::after {
1173
  content: "\A";
1174
  display: block;
1175
  position: absolute;
1176
- width: 50%;
1177
- height: 100%;
1178
- transform-origin: center;
1179
  background-blend-mode: overlay, normal;
1180
  box-shadow: 4px 4px 6px rgba(0,0,0,0.4),
1181
  10px 10px 14px rgba(0,0,0,0.2);
1182
  }
1183
 
1184
  div.slider-element::before {
1185
- left: 1px;
1186
- transform: rotateY(-15deg);
1187
  background-image:
1188
  linear-gradient(to right, #CCC, #888),
1189
  url("static/black-plastic.jpg");
@@ -1192,8 +1192,8 @@ div.slider-element::before {
1192
  }
1193
 
1194
  div.slider-element::after {
1195
- right: 1px;
1196
- transform: rotateY(15deg);
1197
  background-image:
1198
  linear-gradient(to right, #444, #333),
1199
  url("static/black-plastic.jpg");
 
1159
 
1160
  div.slider-element {
1161
  position: absolute;
1162
+ width: 140px;
1163
+ left: calc(50% - 70px);
1164
+ height: 40px;
1165
+ margin-top: -20px;
1166
  perspective: 32px;
1167
  cursor: grab;
1168
  z-index: 2;
 
1173
  content: "\A";
1174
  display: block;
1175
  position: absolute;
1176
+ width: 50px;
1177
+ height: 40px;
1178
+ transform-origin: center center;
1179
  background-blend-mode: overlay, normal;
1180
  box-shadow: 4px 4px 6px rgba(0,0,0,0.4),
1181
  10px 10px 14px rgba(0,0,0,0.2);
1182
  }
1183
 
1184
  div.slider-element::before {
1185
+ left: 21px;
1186
+ transform: rotateY(-15deg) scaleY(0.6);
1187
  background-image:
1188
  linear-gradient(to right, #CCC, #888),
1189
  url("static/black-plastic.jpg");
 
1192
  }
1193
 
1194
  div.slider-element::after {
1195
+ right: 21px;
1196
+ transform: rotateY(15deg) scaleY(0.6);
1197
  background-image:
1198
  linear-gradient(to right, #444, #333),
1199
  url("static/black-plastic.jpg");
www/index.html CHANGED
@@ -152,7 +152,7 @@
152
  <button id="info-button">i</button>
153
  <div id="info-content">
154
  <h1 id="info-logo">Anachrovox</h1>
155
- <h2 id="info-version">Alpha Release v0.1.1</h2>
156
  <h3>Instructions</h3>
157
  <p>To issue a voice command in a hands-free fashion, start your command with one of the supported wake phrases, then issue your command naturally (i.e. you do not need to pause.) There are many supported wake phrases but they all include 'Vox' - for example, 'Hey Vox', 'Hi Vox', or just 'Vox'.</p>
158
  <p>There are numerous ways to shortcut the speech-to-speech workflow:</p>
@@ -176,7 +176,7 @@
176
  <li>Wake-word detection using <a href="https://github.com/painebenjamin/hey-buddy" target="_blank">Hey Buddy</a></li>
177
  <li>Speech-to-text using <a href="https://huggingface.co/distil-whisper/distil-large-v3" target="_blank">Distil-Whisper Large</a></li>
178
  <li>Text generation using <a href="https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct" target="_blank">Llama 3.2 3B Instruct</a></li>
179
- <li>Text-to-speech using <a href="https://coqui.ai/blog/tts/open_xtts" target="_blank">XTTS2</a></li>
180
  <li>Audio enhancement using <a href="https://github.com/Rikorose/DeepFilterNet" target="_blank">DeepFilterNet</a></li>
181
  </ol>
182
  <p>All backend models are run through Taproot, which is made to be as low-overhead as possible, allowing for real-time operation on consumer hardware. These are just a small selection of the supported model set, but were chosen for their balance of speed, size and capability. Visit <a href="https://github.com/painebenjamin/anachrovox" target="_blank">the Anachrovox GitHub</a> to see how to build with different supported components and/or visit <a href="https://github.com/painebenjamin/taproot" target="_blank">the Taproot GitHub</a> to request a new supported component or learn how to build your own (and hopefully contribute it back to the community!)</p>
@@ -206,11 +206,11 @@
206
  primaryClass = {cs.AI},
207
  url = {https://arxiv.org/abs/2407.21783}
208
  }</cite>
209
- <cite>@misc{coqui2023xtts,
210
- title = {XTTS: Open Model Release Announcement}
211
- author = {Coqui AI}
212
- year = {2023}
213
- url = {https://coqui.ai/blog/tts/open_xtts}
214
  }</cite>
215
  <cite>@inproceedings{schroeter2023deepfilternet3,
216
  title = {{DeepFilterNet}: Perceptually Motivated Real-Time Speech Enhancement},
 
152
  <button id="info-button">i</button>
153
  <div id="info-content">
154
  <h1 id="info-logo">Anachrovox</h1>
155
+ <h2 id="info-version">Alpha Release v0.1.3</h2>
156
  <h3>Instructions</h3>
157
  <p>To issue a voice command in a hands-free fashion, start your command with one of the supported wake phrases, then issue your command naturally (i.e. you do not need to pause.) There are many supported wake phrases but they all include 'Vox' - for example, 'Hey Vox', 'Hi Vox', or just 'Vox'.</p>
158
  <p>There are numerous ways to shortcut the speech-to-speech workflow:</p>
 
176
  <li>Wake-word detection using <a href="https://github.com/painebenjamin/hey-buddy" target="_blank">Hey Buddy</a></li>
177
  <li>Speech-to-text using <a href="https://huggingface.co/distil-whisper/distil-large-v3" target="_blank">Distil-Whisper Large</a></li>
178
  <li>Text generation using <a href="https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct" target="_blank">Llama 3.2 3B Instruct</a></li>
179
+ <li>Text-to-speech using <a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank">Kokoro</a></li>
180
  <li>Audio enhancement using <a href="https://github.com/Rikorose/DeepFilterNet" target="_blank">DeepFilterNet</a></li>
181
  </ol>
182
  <p>All backend models are run through Taproot, which is made to be as low-overhead as possible, allowing for real-time operation on consumer hardware. These are just a small selection of the supported model set, but were chosen for their balance of speed, size and capability. Visit <a href="https://github.com/painebenjamin/anachrovox" target="_blank">the Anachrovox GitHub</a> to see how to build with different supported components and/or visit <a href="https://github.com/painebenjamin/taproot" target="_blank">the Taproot GitHub</a> to request a new supported component or learn how to build your own (and hopefully contribute it back to the community!)</p>
 
206
  primaryClass = {cs.AI},
207
  url = {https://arxiv.org/abs/2407.21783}
208
  }</cite>
209
+ <cite>@misc{kokoro82m,
210
+ title = {Kokoro-82M}
211
+ author = {@rzvzn}
212
+ year = {2024}
213
+ url = {https://huggingface.co/hexgrad/Kokoro-82M}
214
  }</cite>
215
  <cite>@inproceedings{schroeter2023deepfilternet3,
216
  title = {{DeepFilterNet}: Perceptually Motivated Real-Time Speech Enhancement},
www/index.js CHANGED
@@ -16,16 +16,14 @@ const [dpR, dpG, dpB] = [
16
  const pollingInterval = 150;
17
  const transcriptionParameters = {};
18
  const languageParameters = {
19
- max_tokens: 512,
20
  role: "anachrovox",
21
  stream: true,
22
  use_tools: true,
 
23
  return_tool_metadata: true,
24
  };
25
  const speechParameters = {
26
  enhance: true,
27
- stream: false,
28
- stream_chunk_size: 25,
29
  output_format: "float"
30
  };
31
  const waveformParameters = {
@@ -49,6 +47,7 @@ const speakerHoleRings = [ // radius, number of holes
49
  const maxTypingSpeed = 200; // characters per second
50
  const minTypingSpeed = 50;
51
  const maxDelay = 0.5; // max length to delay from completion to wait for speech to start generating
 
52
  let overseerAddress;
53
 
54
  if (window.location.port === "3000") {
@@ -140,66 +139,18 @@ const pushText = (text, className) => {
140
  };
141
 
142
  // Bind voice ID wheel to change voice ID
143
- // This is a localized list of voices from xtts2.
144
  const voiceMap = {
145
- "Aaron": "Aaron Dreschner",
146
- "Abraham": "Abrahan Mack",
147
- "Adam": "Adde Michal",
148
- "Alexis": "Alexandra Hisakawa",
149
- "Alexis": "Alma María",
150
- "Alison": "Alison Dietlinde",
151
- "Amy": "Asya Anara",
152
- "Andrew": "Andrew Chipper",
153
- "Anna": "Ana Florence",
154
- "Annie": "Annmarie Nele",
155
- "Barbara": "Barbora MacLean",
156
- "Blake": "Baldur Sanjin",
157
- "Brenda": "Brenda Stern",
158
- "Brian": "Badr Odhiambo",
159
- "Carla": "Camilla Holmström",
160
- "Cindy": "Chandra MacFarland",
161
- "Clara": "Claribel Dervla",
162
- "Clark": "Kumar Dahl",
163
- "Craig": "Craig Gutsy",
164
- "Daisy": "Daisy Studious",
165
- "Damien": "Damien Black",
166
- "David": "Dionisio Schuyler",
167
- "Dennis": "Damjan Chapman",
168
- "Ella": "Uta Obando",
169
- "Eugene": "Eugenio Mataracı",
170
- "Frank": "Ferran Simen",
171
- "Gilbert": "Gilberto Mathias",
172
- "Gina": "Gitta Nikolina",
173
- "Grace": "Gracie Wise",
174
- "Heidi": "Henriette Usha",
175
- "Ian": "Ige Behringer",
176
- "Ivan": "Ilkin Urbano",
177
- "Kevin": "Kazuhiko Atallah",
178
- "Lily": "Lilya Stainthorpe",
179
- "Louis": "Luis Moray",
180
- "Lucas": "Ludvig Milivoj",
181
- "Lydia": "Lidiya Szekeres",
182
- "Marcus": "Marcos Rudaski",
183
- "Maya": "Maja Ruoho",
184
- "Nadia": "Narelle Moon",
185
- "Nora": "Nova Hogarth",
186
- "Philip": "Filip Traverse",
187
- "Raymond": "Royston Min",
188
- "Rose": "Rosemary Okafor",
189
- "Saul": "Suad Qasim",
190
- "Sofia": "Sofia Hellen",
191
- "Sophie": "Szofi Granger",
192
- "Tammy": "Tammie Ema",
193
- "Tanya": "Tanja Adelina",
194
- "Tara": "Tammy Grit",
195
- "Thomas": "Torcull Diarmuid",
196
- "Victor": "Viktor Eka",
197
- "Victor": "Viktor Menelaos",
198
- "Violet": "Vjollca Johnnie",
199
- "Warren": "Wulf Carlevaro",
200
- "Xavier": "Xavier Hayasaka",
201
- "Zachary": "Zacharie Aimilios",
202
- "Zoe": "Zofija Kendrick",
203
  };
204
  const voiceNames = Object.keys(voiceMap);
205
  const voiceIds = Object.values(voiceMap);
@@ -211,7 +162,7 @@ const setVoiceIndex = (newIndex) => {
211
  voiceIndex = newIndex;
212
  }
213
  };
214
- setVoiceIndex(0);
215
  voiceIdWheel.addEventListener("click", () => {
216
  let newVoiceIndex = voiceIndex + parseInt(voiceIdWheel.value);
217
  if (newVoiceIndex < 0) newVoiceIndex = voiceIds.length - 1;
@@ -245,7 +196,7 @@ const getSpeechParameters = (overrides = {}) => {
245
  return {
246
  ...speechParameters,
247
  speed: parseFloat(speed.value),
248
- speaker_id: voiceMap[voiceId.value],
249
  ...overrides,
250
  };
251
  };
 
16
  const pollingInterval = 150;
17
  const transcriptionParameters = {};
18
  const languageParameters = {
 
19
  role: "anachrovox",
20
  stream: true,
21
  use_tools: true,
22
+ max_tokens: 1024,
23
  return_tool_metadata: true,
24
  };
25
  const speechParameters = {
26
  enhance: true,
 
 
27
  output_format: "float"
28
  };
29
  const waveformParameters = {
 
47
  const maxTypingSpeed = 200; // characters per second
48
  const minTypingSpeed = 50;
49
  const maxDelay = 0.5; // max length to delay from completion to wait for speech to start generating
50
+
51
  let overseerAddress;
52
 
53
  if (window.location.port === "3000") {
 
139
  };
140
 
141
  // Bind voice ID wheel to change voice ID
142
+ // This is the list of voices from Kokoro
143
  const voiceMap = {
144
+ "Adam": "male.en.us.adam",
145
+ "Bella": "female.en.us.bella",
146
+ "Emma": "female.en.gb.emma",
147
+ "George": "male.en.gb.george",
148
+ "Isabel": "female.en.gb.isabella",
149
+ "Lewis": "male.en.gb.lewis",
150
+ "Michael": "male.en.us.michael",
151
+ "Nicole": "female.en.us.nicole",
152
+ "Sarah": "female.en.us.sarah",
153
+ "Skye": "female.en.us.sky",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  };
155
  const voiceNames = Object.keys(voiceMap);
156
  const voiceIds = Object.values(voiceMap);
 
162
  voiceIndex = newIndex;
163
  }
164
  };
165
+ setVoiceIndex(Math.round(Math.random() * voiceIds.length));
166
  voiceIdWheel.addEventListener("click", () => {
167
  let newVoiceIndex = voiceIndex + parseInt(voiceIdWheel.value);
168
  if (newVoiceIndex < 0) newVoiceIndex = voiceIds.length - 1;
 
196
  return {
197
  ...speechParameters,
198
  speed: parseFloat(speed.value),
199
+ voice: voiceMap[voiceId.value],
200
  ...overrides,
201
  };
202
  };