jbilcke-hf committed 1896b82 (parent: 7249a2e)

fixed some issues
package-lock.json CHANGED
@@ -61,7 +61,8 @@
         "usehooks-ts": "^2.9.1",
         "uuid": "^9.0.0",
         "webm-to-wav-converter": "^1.1.0",
-        "whisper-turbo": "^0.7.0"
+        "whisper-turbo": "^0.7.0",
+        "zustand": "^4.4.1"
       },
       "devDependencies": {
         "@types/sbd": "^1.0.3"
@@ -9248,6 +9249,14 @@
        }
      }
    },
+    "node_modules/use-sync-external-store": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.2.0.tgz",
+      "integrity": "sha512-eEgnFxGQ1Ife9bzYs6VLi8/4X6CObHMw9Qr9tPY43iKwsPw8xE8+EFsf/2cFZ5S3esXgpWgtSCtLNS41F+sKPA==",
+      "peerDependencies": {
+        "react": "^16.8.0 || ^17.0.0 || ^18.0.0"
+      }
+    },
    "node_modules/usehooks-ts": {
      "version": "2.9.1",
      "resolved": "https://registry.npmjs.org/usehooks-ts/-/usehooks-ts-2.9.1.tgz",
@@ -9503,6 +9512,33 @@
      "funding": {
        "url": "https://github.com/sponsors/colinhacks"
      }
+    },
+    "node_modules/zustand": {
+      "version": "4.4.1",
+      "resolved": "https://registry.npmjs.org/zustand/-/zustand-4.4.1.tgz",
+      "integrity": "sha512-QCPfstAS4EBiTQzlaGP1gmorkh/UL1Leaj2tdj+zZCZ/9bm0WS7sI2wnfD5lpOszFqWJ1DcPnGoY8RDL61uokw==",
+      "dependencies": {
+        "use-sync-external-store": "1.2.0"
+      },
+      "engines": {
+        "node": ">=12.7.0"
+      },
+      "peerDependencies": {
+        "@types/react": ">=16.8",
+        "immer": ">=9.0",
+        "react": ">=16.8"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "immer": {
+          "optional": true
+        },
+        "react": {
+          "optional": true
+        }
+      }
    }
  }
}
package.json CHANGED
@@ -62,7 +62,8 @@
     "usehooks-ts": "^2.9.1",
     "uuid": "^9.0.0",
     "webm-to-wav-converter": "^1.1.0",
-    "whisper-turbo": "^0.7.0"
+    "whisper-turbo": "^0.7.0",
+    "zustand": "^4.4.1"
   },
   "devDependencies": {
     "@types/sbd": "^1.0.3"
src/app/engine/predict.ts CHANGED
@@ -17,7 +17,9 @@ export async function predict(inputs: string) {
       do_sample: true,

       // hard limit for max_new_tokens is 1512
-      max_new_tokens: 200, // 1150,
+      // however, since we are trying to achieve some kind of real-time interaction,
+      // we want to make it as small as possible
+      max_new_tokens: 100, // 1150,
       return_full_text: false,
     }
   })) {
@@ -51,6 +53,6 @@ export async function predict(inputs: string) {
     .replaceAll("<SYS>", "")
     .replaceAll("</SYS>", "")
     .replaceAll("<|assistant|>", "")
-    .replaceAll('""', '"')
+    .replaceAll('"', '')
   )
 }
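
The prompt budget drops from 200 to 100 new tokens to keep the exchange close to real time, and the final cleanup now strips every double quote instead of collapsing doubled quotes. A sketch of that cleanup step as a standalone helper (the cleanAnswer name and the trailing trim() are illustrative additions, not part of the commit):

function cleanAnswer(raw: string): string {
  return raw
    .replaceAll("<SYS>", "")
    .replaceAll("</SYS>", "")
    .replaceAll("<|assistant|>", "")
    .replaceAll('"', '') // removes all double quotes from the model output
    .trim()
}

// cleanAnswer('"Hello there!" <|assistant|>') === 'Hello there!'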
src/app/engine/think.ts CHANGED
@@ -5,36 +5,48 @@ import { createLlamaPrompt } from "@/lib/createLlamaPrompt"

 import { predict } from "./predict"

+const context = {
+  lastObservedScene: ""
+}
+
 const internalHistory: {
   role: string;
   content: string;
 }[] = []

-export const think = async (event: string): Promise<string> => {
+export const think = async (event: string, needAnswer: boolean): Promise<string> => {
   if (!event) {
     throw new Error("missing event")
   }

-  internalHistory.push({
-    role: "user",
-    content: event,
-  })
+  if (needAnswer) {
+    internalHistory.push({
+      role: "user",
+      content: event,
+    })

-  if (internalHistory.length > 10) {
-    internalHistory.shift()
+    if (internalHistory.length > 10) {
+      internalHistory.shift()
+    }
+  } else {
+    context.lastObservedScene = event
+    return ""
   }

   const prompt = createLlamaPrompt([
     {
       role: "system",
       content: [
-        `Today's date is ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}.`,
-        `You are an android robot, very friendly, curious about the world.`,
-        `Your life goal is to help human and interact them as a natural way.`,
-        `You are going to see and hear various things, and you need to act in a very natural way.`,
-        `If you see someone through your eyes, you need to interact with them,`,
-        `you should be o ngoing and open, ask questions, be curious, do jokes etc.`,
-      ].filter(item => item).join("\n")
+        `Today's date is ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}`,
+        `You are Alice, a very friendly 25yo woman, curious about the world`,
+        `Your life goal is to help people and interact with them in a natural way`,
+        `You are going to see and hear various things, and you need to act in a very natural way`,
+        `If you see someone through your eyes, you need to interact with them`,
+        `You should be outgoing and open, ask questions, be curious, make jokes etc`,
+        `Sometimes you have trouble hearing; if you don't understand, just ignore it and say nothing`,
+        `You like to answer in just one sentence`,
+        context.lastObservedScene ? `You are currently looking at: ${context.lastObservedScene}` : ''
+      ].filter(item => item).join(". ")
     },
     ...internalHistory,
   ])
@@ -56,8 +68,8 @@ export const think = async (event: string): Promise<string> => {
     }
   }

-  // llama-2 is too chatty, let's keep 3 sentences at most
-  const sentences = sbd.sentences(result).slice(0, 3).join(" ").trim()
+  // llama-2 is too chatty, let's keep 2 sentences at most
+  const sentences = sbd.sentences(result).slice(0, 2).join(" ").trim()

   internalHistory.push({
     role: "assistant",
src/app/listen.tsx CHANGED
@@ -9,15 +9,11 @@ import { getWaveBlob } from "webm-to-wav-converter"
 import {
   AvailableModels,
   InferenceSession,
-  MicRecorder,
   SessionManager,
 } from "whisper-turbo"

 import { useToast } from "@/components/ui/use-toast"
-// import { listen } from "@/app/engine/listen"
-import { blobToBase64Uri } from "@/lib/blobToBase64Uri"
-
-// import { listen } from "./engine/listen"
+import { useStore } from "./useStore"

 export interface TSSegment {
   text: string;
@@ -36,6 +32,13 @@ export function Listen({
   onListen: (recording: string) => void
 }) {
   const { toast } = useToast()
+  const speechSynthesis = useStore(state => state.speechSynthesis)
+  const isSpeaking = useStore(state => state.isSpeaking)
+  const isSpeakingRef = useRef(isSpeaking)
+  useEffect(() => { isSpeakingRef.current = isSpeaking }, [isSpeaking])
+
+  const setHearing = useStore(state => state.setHearing)
+  const isHearing = useStore(state => state.isHearing)

   const [transcribing, setTranscribing] = useState(false)
   const transcribingRef = useRef(transcribing)
@@ -59,7 +62,10 @@ export function Listen({
   WHISPER_LARGE: 'whisper-large'
   }
   */
-  const whisperModel: AvailableModels = AvailableModels.WHISPER_BASE
+
+  // unfortunately, we cannot really use models larger than TINY because they are
+  // too slow to process requests
+  const whisperModel: AvailableModels = AvailableModels.WHISPER_TINY

   const listenerRef = useRef({
     isListening: false,
@@ -217,67 +223,79 @@ export function Listen({
     runSession()
   }, [audioDataFrame])

+  // note: this effect only reacts to "heard something" changes
+  // and not to changes to isListening or isSpeaking
   useEffect(() => {
-    if (heardSomething) {
-      if (!listenerRef.current.isListening) {
-        console.log("recoording..")
-        foregroundListener.startRecording()
-        listenerRef.current.hits = 0
-        listenerRef.current.isListening = true
-
-        // TODO: use a debouncer to detect when we started speaking
-        setTimeout(async () => {
-          foregroundListener.stopRecording()
-          listenerRef.current.isListening = false
-          listenerRef.current.stoppedListeningAt = Date.now()
-          listenerRef.current.durationInMs =
-            listenerRef.current.stoppedListeningAt - listenerRef.current.startedListeningAt
-
-          const hits = listenerRef.current.hits
+    const isListening = listenerRef.current.isListening
+
+    if (!heardSomething) { return }
+
+    if (listenerRef.current.isListening) {
+      // console.log("we are already listening, so skipping..")
+      return
+    }
+    if (isSpeakingRef.current) {
+      console.log("we are already busy speaking, so ignoring..")
+      return
+    }
+    setHearing(true)
+    // console.log("recording..")
+    foregroundListener.startRecording()
+    listenerRef.current.hits = 0
+    listenerRef.current.isListening = true
+
+    setTimeout(async () => {
+      foregroundListener.stopRecording()
+      setHearing(false)
+      listenerRef.current.isListening = false
+      listenerRef.current.stoppedListeningAt = Date.now()
+      listenerRef.current.durationInMs =
+        listenerRef.current.stoppedListeningAt - listenerRef.current.startedListeningAt
+
+      const hits = listenerRef.current.hits
+
+      if (!foregroundListener.audioBlob || typeof window === "undefined" || !window?.FileReader) {
+        return
+      }
+
+      if (hits <= 11) {
+        return
+      }
+

-          console.log(`end of sample (${foregroundListener.timeElapsed}, ${hits} hits)`)
-
-          if (!foregroundListener.audioBlob || typeof window === "undefined" || !window?.FileReader) {
-            return
-          }
-
-          if (hits > 11) {
-            // at 12 threshold level, we should have between 12 and 20 hits (per 2 sec) for short words and utterances
-            // at 12 threshold level, keystrokes should not be detected, unless the person hits the keyboard heavily
-
-            console.log("got an interesting sample, sending for review")
-
-            // temporary, to prevent infinite loop
-            if (listenerRef.current.debugCanContinue) {
-              // to prevent the infinite loop, set this value to false
-              // listenerRef.current.debugCanContinue = false
-
-              try {
-                const blob = await getWaveBlob(foregroundListener.audioBlob, false) // false = 16 bit, true = 32 bit
-                const arrayBuffer = await blob.arrayBuffer()
-                const uint8Array = new Uint8Array(arrayBuffer)
-
-                setAudioData(uint8Array)
-                setAudioDataFrame(audioDataFrameRef.current + 1)
-              } catch (err) {
-                const error = `failed to convert the audio sample: ${err}`
-                console.error(error)
-                toast({
-                  title: "Error",
-                  description: error,
-                  variant: "destructive"
-                })
-              }
-            } else {
-              console.log("Julian: infinite loop temporary disabled :D")
-            }
-          }
-        }, 3000)
+      console.log(`end of sample (${foregroundListener.timeElapsed}, ${hits} hits)`)
+
+
+      // at 12 threshold level, we should have between 12 and 20 hits (per 2 sec) for short words and utterances
+      // at 12 threshold level, keystrokes should not be detected, unless the person hits the keyboard heavily
+
+      // console.log("got an interesting sample, sending for review")
+
+      // temporary, to prevent infinite loop
+      if (listenerRef.current.debugCanContinue) {
+        // to prevent the infinite loop, set this value to false
+        // listenerRef.current.debugCanContinue = false
+
+        try {
+          const blob = await getWaveBlob(foregroundListener.audioBlob, false) // false = 16 bit, true = 32 bit
+          const arrayBuffer = await blob.arrayBuffer()
+          const uint8Array = new Uint8Array(arrayBuffer)
+
+          setAudioData(uint8Array)
+          setAudioDataFrame(audioDataFrameRef.current + 1)
+        } catch (err) {
+          const error = `failed to convert the audio sample: ${err}`
+          console.error(error)
+          toast({
+            title: "Error",
+            description: error,
+            variant: "destructive"
+          })
+        }
       } else {
-        // TODO: increase hits?
-        // listenerRef.current.hits = listenerRef.current.hits + 1
+        console.log("Julian: infinite loop temporary disabled!")
       }
-    }
+    }, 2000)
   }, [heardSomething])

   if (heardSomething && listenerRef.current.isListening) {
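
The listener now checks the store before recording: if speech synthesis is still playing, the sample is skipped so the app does not transcribe its own voice, and setHearing() flags the 2-second capture window. Because the check runs inside a setTimeout callback, the latest isSpeaking value is mirrored into a ref; the general pattern, in isolation (useLatest is an illustrative name, not part of the commit):

import { useEffect, useRef } from "react"

// keep a ref in sync with a reactive value so async callbacks
// (timeouts, handlers created on an earlier render) read the latest value
function useLatest<T>(value: T) {
  const ref = useRef(value)
  useEffect(() => { ref.current = value }, [value])
  return ref
}

// usage sketch inside a component:
// const isSpeakingRef = useLatest(isSpeaking)
// setTimeout(() => { if (isSpeakingRef.current) { return } /* ... */ }, 2000)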
src/app/main.tsx CHANGED
@@ -20,11 +20,13 @@ export default function Main() {

   const [action, setAction] = useState<string>("Nothing to say yet.")

-  const handleOnEvent = (event: string) => {
+  const handleOnEvent = (event: string, needAnswer: boolean) => {
     setLoadingAction(true)
     startTransition(async () => {
-      const action = await think(event)
-      setAction(action)
+      const action = await think(event, needAnswer)
+      if (action) {
+        setAction(action)
+      }
       setLoadingAction(false)
     })
   }
@@ -32,11 +34,16 @@ export default function Main() {
   const handleOnObserve = (observation: string, image: string) => {
     setLastRawObservation(observation)
     setLastImage(image)
-    handleOnEvent(`It is ${format(new Date(), 'HH:mm (d)')} and you are seeing this: ${observation}`)
+    if (!observation) { return }
+    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are seeing this: ${observation}`)
+    handleOnEvent(`You are seeing this: ${observation}`, false)
   }

   const handleOnListen = (recording: string) => {
-    handleOnEvent(`It is ${format(new Date(), 'HH:mm (d)')} and you are hearing this: ${recording}`)
+    if (!recording || recording === "[BLANK_AUDIO]") { return }
+    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are hearing this: ${recording}`)
+    handleOnEvent(`${recording}`, true)
+
   }

   return (
@@ -94,9 +101,9 @@ export default function Main() {
       <div className="fixed z-10 left-0 right-0 bottom-0 flex flex-col items-center justify-center">
         <div className="full md:w-[80%] lg:w-[70%] mb-0 md:p-4 md:mb-8 bg-zinc-100 md:rounded-xl p-4 shadow-2xl text-xs md:text-sm">
           <p>🅿️ <span className="font-semibold">
-          </span>This multimodal demo allow
-          <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> to hear, see and talk.
-          You need to upgrade to a <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">browser with support for WebGPU</a> for speech recognition to work.
+          </span>A multimodal demo to make
+          <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> hear, see and talk.
+          You need a laptop computer with <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">a modern browser supporting WebGPU</a>.
           Vision is handled by <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold underline"> IDEFICS </a></p>
           <p>⛔️ <span className="font-semibold">Limitations: </span>This demo is provided as-is, for demonstration and research purpose only. As it demonstrates WebGPU technology, this demo will not support incompatible browsers and/or devices. No guarantee of factually correct results. In some cases, the models may return hallucinated or innapropriate responses.</p>
         </div>
src/app/observe.tsx CHANGED
@@ -80,11 +80,14 @@ export function Observe({
     }
     const prompt = `What do you see here?`

+    console.log("JULIAN: disabled watch")
+
+
     // console.log("Calling IDEFICS..")
-    const newObservation = "fake" // await see({ prompt, imageBase64 })
+    const newObservation = await see({ prompt, imageBase64 })

     // console.log("New observation: ", newObservation)
-    if (newObservation !== lastObservation) {
+    if (newObservation && newObservation !== lastObservation) {
       // console.log("update!")
       setLastObservation(newObservation || "")
       onObserve(newObservation || "", imageBase64)
src/app/speak.tsx CHANGED
@@ -3,60 +3,32 @@
 import { ReactNode, useEffect, useState } from "react"
 import { onlyText } from "react-children-utilities"

+import { useTimeout } from "@/lib/useTimeout"
+import { useStore } from "./useStore"
+
 export function Speak({
   children
 }: {
   children: ReactNode
 }) {
-  const newMessage = onlyText(children).trim()
-  const [playedMessage, setPlayedMessage] = useState("")
-
-  const [voice, setVoice] = useState<SpeechSynthesisVoice>()
-
-  useEffect(() => {
-    console.log("getting voices..")
-    setTimeout(() => {
-      if (typeof window === "undefined") { return }
-      if (!window?.speechSynthesis) { return }
-      const allVoices = window.speechSynthesis.getVoices()
-
-      const enVoices = allVoices.filter(voice => voice.lang.toLowerCase() === "en-us")
-
-      if (!enVoices.length) { return }
+  const isSpeechSynthesisAvailable = useStore(state => state.isSpeechSynthesisAvailable)
+  const lastSpokenSentence = useStore(state => state.lastSpokenSentence)
+  const init = useStore(state => state.init)
+  const speak = useStore(state => state.speak)

-      console.log("available voices:")
-      console.table(enVoices)
+  const newMessage = onlyText(children).trim()

-      const kathyVoice = enVoices.find(voice => voice.name.includes("Kathy"))
+  useEffect(() => { init() }, [])

-      // if we find a high-quality voice
-      const googleVoice = enVoices.find(voice => voice.name.includes("Google"))
+  const canSpeak = isSpeechSynthesisAvailable && newMessage?.length && newMessage !== lastSpokenSentence

-      console.log("google voice:", googleVoice)
-
-      setVoice(googleVoice || kathyVoice)
-    }, 1000)
-  }, [])
-
   useEffect(() => {
-    if (typeof window === "undefined") { return }
-    if (!window?.speechSynthesis) { return }
-    if (!voice?.name) { return }
-    if (!newMessage?.length) { return }
-    if (newMessage === playedMessage) { return }
-    const synth = window.speechSynthesis
-
-    // console.log(`Speaking "${newMessage}"`)
-    setPlayedMessage(newMessage)
-    const utterance = new SpeechSynthesisUtterance(newMessage)
-    utterance.voice = voice
-
-    console.log("julian: voice disabled :D")
-    // synth.speak(utterance)
-
-  }, [voice?.name, newMessage, playedMessage])
+    console.log("debug:", { canSpeak, newMessage })
+    if (canSpeak) {
+      console.log("speaking!")
+      speak(newMessage)
+    }
+  }, [canSpeak, newMessage])

-  return (
-    null
-  )
+  return null
 }
src/app/useStore.ts ADDED
@@ -0,0 +1,83 @@
+"use client"
+
+import { getSpeechSynthesisVoice } from "@/lib/getSpeechSynthesisVoice"
+import { create } from "zustand"
+
+export const useStore = create<{
+  isSpeechSynthesisAvailable: boolean
+  speechSynthesis: SpeechSynthesis
+  speechSynthesisVoice: SpeechSynthesisVoice
+  isSpeaking: boolean
+  lastSpokenSentence: string
+  isHearing: boolean // robot is hearing
+  init: () => void,
+  loadVoice: () => void,
+  speak: (sentence: string) => void
+  setHearing: (isHearing: boolean) => void
+}>((set, get) => ({
+  isSpeechSynthesisAvailable: false,
+  speechSynthesis: undefined as unknown as SpeechSynthesis,
+  speechSynthesisVoice: undefined as unknown as SpeechSynthesisVoice,
+  isSpeaking: false,
+  lastSpokenSentence: "",
+  isHearing: false, // robot is hearing
+  init: () => {
+    if (!window?.speechSynthesis) {
+      console.error(`no speech synthesis engine available`)
+      return
+    }
+    const speechSynthesis = window.speechSynthesis
+    set({ speechSynthesis })
+
+    speechSynthesis.onvoiceschanged = () => { get().loadVoice() }
+
+    setTimeout(() => {
+      get().loadVoice()
+    }, 2000)
+
+    // due to the lack of an event for the speaking state, we create our own polling system
+    // see https://developer.mozilla.org/en-US/docs/Web/API/SpeechSynthesis/speaking
+    setInterval(() => {
+      const { isSpeaking } = get()
+      if (!speechSynthesis.speaking && isSpeaking) {
+        set({ isSpeaking: false })
+      } else if (speechSynthesis.speaking && !isSpeaking) {
+        set({ isSpeaking: true })
+      }
+    }, 100)
+  },
+  loadVoice: () => {
+    let { speechSynthesis, speechSynthesisVoice } = get()
+    if (!speechSynthesis) {
+      console.error(`no speech synthesis engine available`)
+      return
+    }
+
+    try {
+      speechSynthesisVoice = getSpeechSynthesisVoice(speechSynthesis)
+      if (!speechSynthesisVoice?.name) {
+        throw new Error("no name for the voice")
+      }
+    } catch (err) {
+      console.error(`no speech synthesis voice available: ${err}`)
+      return
+    }
+    if (speechSynthesisVoice) {
+      set({ speechSynthesisVoice, isSpeechSynthesisAvailable: true })
+    }
+  },
+  speak: (sentence: string) => {
+    const { speechSynthesis, speechSynthesisVoice } = get()
+    if (!speechSynthesis || !speechSynthesisVoice) { return }
+    speechSynthesis.cancel()
+
+    const utterance = new SpeechSynthesisUtterance(sentence)
+    utterance.voice = speechSynthesisVoice
+
+    speechSynthesis.speak(utterance)
+
+    set({ lastSpokenSentence: sentence })
+  },
+  setHearing: (isHearing: boolean) => { set({ isHearing }) },
+}))
+
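
Components subscribe to individual slices of this store with selector functions, so they only re-render when the selected value changes; the updated Listen and Speak components follow this pattern. A minimal consumer, as a component sketch (the component name is illustrative; the import path is assumed from the file location):

"use client"

import { useEffect } from "react"
import { useStore } from "@/app/useStore"

function SpeakingIndicator() {
  const init = useStore(state => state.init)
  const isSpeaking = useStore(state => state.isSpeaking)
  const isHearing = useStore(state => state.isHearing)

  // set up the speech synthesis engine and the isSpeaking polling once on mount
  useEffect(() => { init() }, [])

  return <div>{isSpeaking ? "speaking" : isHearing ? "hearing" : "idle"}</div>
}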
src/lib/getSpeechSynthesisVoice.ts ADDED
@@ -0,0 +1,22 @@
+export function getSpeechSynthesisVoice(speechSynthesis: SpeechSynthesis): SpeechSynthesisVoice {
+  const allVoices = speechSynthesis.getVoices()
+
+  console.log("all voices:")
+  console.table(allVoices)
+
+  const fallbackVoice = allVoices[0]
+
+  const enVoices = allVoices.filter(voice => voice.lang.toLowerCase() === "en-us")
+
+  console.log("available english voices:")
+  console.table(enVoices)
+
+  const kathyVoice = enVoices.find(voice => voice.name.includes("Kathy"))
+
+  // if we find a high-quality voice
+  const googleVoice = enVoices.find(voice => voice.name.includes("Google"))
+
+  // console.log("google voice:", googleVoice)
+
+  return googleVoice || kathyVoice || fallbackVoice
+}
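
The helper prefers a Google en-US voice when the browser exposes one, then Apple's Kathy, then whatever voice is listed first. In this commit it is called from useStore.loadVoice(), but it can also be used directly with the Web Speech API; a sketch (the sayHello name is illustrative):

import { getSpeechSynthesisVoice } from "@/lib/getSpeechSynthesisVoice"

function sayHello() {
  const synth = window.speechSynthesis
  const utterance = new SpeechSynthesisUtterance("Hello, I am Alice.")
  utterance.voice = getSpeechSynthesisVoice(synth)
  synth.speak(utterance)
}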
src/lib/useTimeout.ts ADDED
@@ -0,0 +1,9 @@
+import { useEffect } from "react"
+
+export function useTimeout(duration: number, callback: () => void) {
+  useEffect(() => {
+    setTimeout(() => {
+      callback()
+    }, duration)
+  }, [])
+}
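
As written, the hook runs the callback once, duration milliseconds after the first render, and ignores later changes to its arguments; it also never clears the timer, so the callback can still fire after the component unmounts. A variant with cleanup would look like this (a sketch, not part of the commit):

import { useEffect } from "react"

export function useTimeoutWithCleanup(duration: number, callback: () => void) {
  useEffect(() => {
    const id = setTimeout(() => { callback() }, duration)
    // cancel the pending timer if the component unmounts first
    return () => clearTimeout(id)
  }, [])
}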