benjamin-paine committed on
Commit
caca220
·
1 Parent(s): 55d3ab6

Switch to kokoro

Browse files
Files changed (4) hide show
  1. Dockerfile +4 -3
  2. www/index.css +11 -11
  3. www/index.html +7 -7
  4. www/index.js +15 -64
Dockerfile CHANGED
@@ -3,7 +3,7 @@ FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
3
  # Model choices
4
  ARG TEXT_MODEL=llama-v3-2-3b-instruct-q6-k
5
  ARG TRANSCRIBE_MODEL=distilled-whisper-large-v3
6
- ARG SPEECH_MODEL=xtts-v2
7
 
8
  # Create user
9
  RUN useradd -m -u 1000 anachrovox
@@ -38,6 +38,7 @@ RUN apt-get update && \
38
  curl wget procps \
39
  htop vim \
40
  python3-pip python3-dev \
 
41
  nginx && \
42
  rm -rf /var/lib/apt/lists/*
43
 
@@ -55,8 +56,8 @@ RUN taproot install \
55
  audio-transcription:${TRANSCRIBE_MODEL} \
56
  text-generation:${TEXT_MODEL} \
57
  speech-synthesis:${SPEECH_MODEL} \
58
- --debug \
59
- --optional
60
 
61
  # Copy run script
62
  COPY --chown=anachrovox --chmod=755 run.sh /app/run.sh
 
3
  # Model choices
4
  ARG TEXT_MODEL=llama-v3-2-3b-instruct-q6-k
5
  ARG TRANSCRIBE_MODEL=distilled-whisper-large-v3
6
+ ARG SPEECH_MODEL=kokoro
7
 
8
  # Create user
9
  RUN useradd -m -u 1000 anachrovox
 
38
  curl wget procps \
39
  htop vim \
40
  python3-pip python3-dev \
41
+ espeak-ng \
42
  nginx && \
43
  rm -rf /var/lib/apt/lists/*
44
 
 
56
  audio-transcription:${TRANSCRIBE_MODEL} \
57
  text-generation:${TEXT_MODEL} \
58
  speech-synthesis:${SPEECH_MODEL} \
59
+ --optional \
60
+ --debug
61
 
62
  # Copy run script
63
  COPY --chown=anachrovox --chmod=755 run.sh /app/run.sh
www/index.css CHANGED
@@ -1159,10 +1159,10 @@ div.slider {
1159
 
1160
  div.slider-element {
1161
  position: absolute;
1162
- width: 100px;
1163
- left: calc(50% - 50px);
1164
- height: 20px;
1165
- margin-top: -10px;
1166
  perspective: 32px;
1167
  cursor: grab;
1168
  z-index: 2;
@@ -1173,17 +1173,17 @@ div.slider-element::after {
1173
  content: "\A";
1174
  display: block;
1175
  position: absolute;
1176
- width: 50%;
1177
- height: 100%;
1178
- transform-origin: center;
1179
  background-blend-mode: overlay, normal;
1180
  box-shadow: 4px 4px 6px rgba(0,0,0,0.4),
1181
  10px 10px 14px rgba(0,0,0,0.2);
1182
  }
1183
 
1184
  div.slider-element::before {
1185
- left: 1px;
1186
- transform: rotateY(-15deg);
1187
  background-image:
1188
  linear-gradient(to right, #CCC, #888),
1189
  url("static/black-plastic.jpg");
@@ -1192,8 +1192,8 @@ div.slider-element::before {
1192
  }
1193
 
1194
  div.slider-element::after {
1195
- right: 1px;
1196
- transform: rotateY(15deg);
1197
  background-image:
1198
  linear-gradient(to right, #444, #333),
1199
  url("static/black-plastic.jpg");
 
1159
 
1160
  div.slider-element {
1161
  position: absolute;
1162
+ width: 140px;
1163
+ left: calc(50% - 70px);
1164
+ height: 40px;
1165
+ margin-top: -20px;
1166
  perspective: 32px;
1167
  cursor: grab;
1168
  z-index: 2;
 
1173
  content: "\A";
1174
  display: block;
1175
  position: absolute;
1176
+ width: 50px;
1177
+ height: 40px;
1178
+ transform-origin: center center;
1179
  background-blend-mode: overlay, normal;
1180
  box-shadow: 4px 4px 6px rgba(0,0,0,0.4),
1181
  10px 10px 14px rgba(0,0,0,0.2);
1182
  }
1183
 
1184
  div.slider-element::before {
1185
+ left: 21px;
1186
+ transform: rotateY(-15deg) scaleY(0.6);
1187
  background-image:
1188
  linear-gradient(to right, #CCC, #888),
1189
  url("static/black-plastic.jpg");
 
1192
  }
1193
 
1194
  div.slider-element::after {
1195
+ right: 21px;
1196
+ transform: rotateY(15deg) scaleY(0.6);
1197
  background-image:
1198
  linear-gradient(to right, #444, #333),
1199
  url("static/black-plastic.jpg");
www/index.html CHANGED
@@ -152,7 +152,7 @@
152
  <button id="info-button">i</button>
153
  <div id="info-content">
154
  <h1 id="info-logo">Anachrovox</h1>
155
- <h2 id="info-version">Alpha Release v0.1.1</h2>
156
  <h3>Instructions</h3>
157
  <p>To issue a voice command in a hands-free fashion, start your command with one of the supported wake phrases, then issue your command naturally (i.e. you do not need to pause.) There are many supported wake phrases but they all include 'Vox' - for example, 'Hey Vox', 'Hi Vox', or just 'Vox'.</p>
158
  <p>There are numerous ways to shortcut the speech-to-speech workflow:</p>
@@ -176,7 +176,7 @@
176
  <li>Wake-word detection using <a href="https://github.com/painebenjamin/hey-buddy" target="_blank">Hey Buddy</a></li>
177
  <li>Speech-to-text using <a href="https://huggingface.co/distil-whisper/distil-large-v3" target="_blank">Distil-Whisper Large</a></li>
178
  <li>Text generation using <a href="https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct" target="_blank">Llama 3.2 3B Instruct</a></li>
179
- <li>Text-to-speech using <a href="https://coqui.ai/blog/tts/open_xtts" target="_blank">XTTS2</a></li>
180
  <li>Audio enhancement using <a href="https://github.com/Rikorose/DeepFilterNet" target="_blank">DeepFilterNet</a></li>
181
  </ol>
182
  <p>All backend models are run through Taproot, which is made to be as low-overhead as possible, allowing for real-time operation on consumer hardware. These are just a small selection of the supported model set, but were chosen for their balance of speed, size and capability. Visit <a href="https://github.com/painebenjamin/anachrovox" target="_blank">the Anachrovox GitHub</a> to see how to build with different supported components and/or visit <a href="https://github.com/painebenjamin/taproot" target="_blank">the Taproot GitHub</a> to request a new supported component or learn how to build your own (and hopefully contribute it back to the community!)</p>
@@ -206,11 +206,11 @@
206
  primaryClass = {cs.AI},
207
  url = {https://arxiv.org/abs/2407.21783}
208
  }</cite>
209
- <cite>@misc{coqui2023xtts,
210
- title = {XTTS: Open Model Release Announcement}
211
- author = {Coqui AI}
212
- year = {2023}
213
- url = {https://coqui.ai/blog/tts/open_xtts}
214
  }</cite>
215
  <cite>@inproceedings{schroeter2023deepfilternet3,
216
  title = {{DeepFilterNet}: Perceptually Motivated Real-Time Speech Enhancement},
 
152
  <button id="info-button">i</button>
153
  <div id="info-content">
154
  <h1 id="info-logo">Anachrovox</h1>
155
+ <h2 id="info-version">Alpha Release v0.1.3</h2>
156
  <h3>Instructions</h3>
157
  <p>To issue a voice command in a hands-free fashion, start your command with one of the supported wake phrases, then issue your command naturally (i.e. you do not need to pause.) There are many supported wake phrases but they all include 'Vox' - for example, 'Hey Vox', 'Hi Vox', or just 'Vox'.</p>
158
  <p>There are numerous ways to shortcut the speech-to-speech workflow:</p>
 
176
  <li>Wake-word detection using <a href="https://github.com/painebenjamin/hey-buddy" target="_blank">Hey Buddy</a></li>
177
  <li>Speech-to-text using <a href="https://huggingface.co/distil-whisper/distil-large-v3" target="_blank">Distil-Whisper Large</a></li>
178
  <li>Text generation using <a href="https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct" target="_blank">Llama 3.2 3B Instruct</a></li>
179
+ <li>Text-to-speech using <a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank">Kokoro</a></li>
180
  <li>Audio enhancement using <a href="https://github.com/Rikorose/DeepFilterNet" target="_blank">DeepFilterNet</a></li>
181
  </ol>
182
  <p>All backend models are run through Taproot, which is made to be as low-overhead as possible, allowing for real-time operation on consumer hardware. These are just a small selection of the supported model set, but were chosen for their balance of speed, size and capability. Visit <a href="https://github.com/painebenjamin/anachrovox" target="_blank">the Anachrovox GitHub</a> to see how to build with different supported components and/or visit <a href="https://github.com/painebenjamin/taproot" target="_blank">the Taproot GitHub</a> to request a new supported component or learn how to build your own (and hopefully contribute it back to the community!)</p>
 
206
  primaryClass = {cs.AI},
207
  url = {https://arxiv.org/abs/2407.21783}
208
  }</cite>
209
+ <cite>@misc{kokoro82m,
210
+ title = {Kokoro-82M}
211
+ author = {@rzvzn}
212
+ year = {2024}
213
+ url = {https://huggingface.co/hexgrad/Kokoro-82M}
214
  }</cite>
215
  <cite>@inproceedings{schroeter2023deepfilternet3,
216
  title = {{DeepFilterNet}: Perceptually Motivated Real-Time Speech Enhancement},
www/index.js CHANGED
@@ -16,16 +16,14 @@ const [dpR, dpG, dpB] = [
16
  const pollingInterval = 150;
17
  const transcriptionParameters = {};
18
  const languageParameters = {
19
- max_tokens: 512,
20
  role: "anachrovox",
21
  stream: true,
22
  use_tools: true,
 
23
  return_tool_metadata: true,
24
  };
25
  const speechParameters = {
26
  enhance: true,
27
- stream: false,
28
- stream_chunk_size: 25,
29
  output_format: "float"
30
  };
31
  const waveformParameters = {
@@ -49,6 +47,7 @@ const speakerHoleRings = [ // radius, number of holes
49
  const maxTypingSpeed = 200; // characters per second
50
  const minTypingSpeed = 50;
51
  const maxDelay = 0.5; // max length to delay from completion to wait for speech to start generating
 
52
  let overseerAddress;
53
 
54
  if (window.location.port === "3000") {
@@ -140,66 +139,18 @@ const pushText = (text, className) => {
140
  };
141
 
142
  // Bind voice ID wheel to change voice ID
143
- // This is a localized list of voices from xtts2.
144
  const voiceMap = {
145
- "Aaron": "Aaron Dreschner",
146
- "Abraham": "Abrahan Mack",
147
- "Adam": "Adde Michal",
148
- "Alexis": "Alexandra Hisakawa",
149
- "Alexis": "Alma María",
150
- "Alison": "Alison Dietlinde",
151
- "Amy": "Asya Anara",
152
- "Andrew": "Andrew Chipper",
153
- "Anna": "Ana Florence",
154
- "Annie": "Annmarie Nele",
155
- "Barbara": "Barbora MacLean",
156
- "Blake": "Baldur Sanjin",
157
- "Brenda": "Brenda Stern",
158
- "Brian": "Badr Odhiambo",
159
- "Carla": "Camilla Holmström",
160
- "Cindy": "Chandra MacFarland",
161
- "Clara": "Claribel Dervla",
162
- "Clark": "Kumar Dahl",
163
- "Craig": "Craig Gutsy",
164
- "Daisy": "Daisy Studious",
165
- "Damien": "Damien Black",
166
- "David": "Dionisio Schuyler",
167
- "Dennis": "Damjan Chapman",
168
- "Ella": "Uta Obando",
169
- "Eugene": "Eugenio Mataracı",
170
- "Frank": "Ferran Simen",
171
- "Gilbert": "Gilberto Mathias",
172
- "Gina": "Gitta Nikolina",
173
- "Grace": "Gracie Wise",
174
- "Heidi": "Henriette Usha",
175
- "Ian": "Ige Behringer",
176
- "Ivan": "Ilkin Urbano",
177
- "Kevin": "Kazuhiko Atallah",
178
- "Lily": "Lilya Stainthorpe",
179
- "Louis": "Luis Moray",
180
- "Lucas": "Ludvig Milivoj",
181
- "Lydia": "Lidiya Szekeres",
182
- "Marcus": "Marcos Rudaski",
183
- "Maya": "Maja Ruoho",
184
- "Nadia": "Narelle Moon",
185
- "Nora": "Nova Hogarth",
186
- "Philip": "Filip Traverse",
187
- "Raymond": "Royston Min",
188
- "Rose": "Rosemary Okafor",
189
- "Saul": "Suad Qasim",
190
- "Sofia": "Sofia Hellen",
191
- "Sophie": "Szofi Granger",
192
- "Tammy": "Tammie Ema",
193
- "Tanya": "Tanja Adelina",
194
- "Tara": "Tammy Grit",
195
- "Thomas": "Torcull Diarmuid",
196
- "Victor": "Viktor Eka",
197
- "Victor": "Viktor Menelaos",
198
- "Violet": "Vjollca Johnnie",
199
- "Warren": "Wulf Carlevaro",
200
- "Xavier": "Xavier Hayasaka",
201
- "Zachary": "Zacharie Aimilios",
202
- "Zoe": "Zofija Kendrick",
203
  };
204
  const voiceNames = Object.keys(voiceMap);
205
  const voiceIds = Object.values(voiceMap);
@@ -211,7 +162,7 @@ const setVoiceIndex = (newIndex) => {
211
  voiceIndex = newIndex;
212
  }
213
  };
214
- setVoiceIndex(0);
215
  voiceIdWheel.addEventListener("click", () => {
216
  let newVoiceIndex = voiceIndex + parseInt(voiceIdWheel.value);
217
  if (newVoiceIndex < 0) newVoiceIndex = voiceIds.length - 1;
@@ -245,7 +196,7 @@ const getSpeechParameters = (overrides = {}) => {
245
  return {
246
  ...speechParameters,
247
  speed: parseFloat(speed.value),
248
- speaker_id: voiceMap[voiceId.value],
249
  ...overrides,
250
  };
251
  };
 
16
  const pollingInterval = 150;
17
  const transcriptionParameters = {};
18
  const languageParameters = {
 
19
  role: "anachrovox",
20
  stream: true,
21
  use_tools: true,
22
+ max_tokens: 1024,
23
  return_tool_metadata: true,
24
  };
25
  const speechParameters = {
26
  enhance: true,
 
 
27
  output_format: "float"
28
  };
29
  const waveformParameters = {
 
47
  const maxTypingSpeed = 200; // characters per second
48
  const minTypingSpeed = 50;
49
  const maxDelay = 0.5; // max length to delay from completion to wait for speech to start generating
50
+
51
  let overseerAddress;
52
 
53
  if (window.location.port === "3000") {
 
139
  };
140
 
141
  // Bind voice ID wheel to change voice ID
142
+ // This is the list of voices from Kokoro
143
  const voiceMap = {
144
+ "Adam": "male.en.us.adam",
145
+ "Bella": "female.en.us.bella",
146
+ "Emma": "female.en.gb.emma",
147
+ "George": "male.en.gb.george",
148
+ "Isabel": "female.en.gb.isabella",
149
+ "Lewis": "male.en.gb.lewis",
150
+ "Michael": "male.en.us.michael",
151
+ "Nicole": "female.en.us.nicole",
152
+ "Sarah": "female.en.us.sarah",
153
+ "Skye": "female.en.us.sky",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  };
155
  const voiceNames = Object.keys(voiceMap);
156
  const voiceIds = Object.values(voiceMap);
 
162
  voiceIndex = newIndex;
163
  }
164
  };
165
+ setVoiceIndex(Math.round(Math.random() * voiceIds.length));
166
  voiceIdWheel.addEventListener("click", () => {
167
  let newVoiceIndex = voiceIndex + parseInt(voiceIdWheel.value);
168
  if (newVoiceIndex < 0) newVoiceIndex = voiceIds.length - 1;
 
196
  return {
197
  ...speechParameters,
198
  speed: parseFloat(speed.value),
199
+ voice: voiceMap[voiceId.value],
200
  ...overrides,
201
  };
202
  };