jbilcke-hf committed 1896b82 (parent: 7249a2e)

fixed some issues
package-lock.json CHANGED
@@ -61,7 +61,8 @@
         "usehooks-ts": "^2.9.1",
         "uuid": "^9.0.0",
         "webm-to-wav-converter": "^1.1.0",
-        "whisper-turbo": "^0.7.0"
+        "whisper-turbo": "^0.7.0",
+        "zustand": "^4.4.1"
       },
       "devDependencies": {
         "@types/sbd": "^1.0.3"
@@ -9248,6 +9249,14 @@
        }
      }
    },
+    "node_modules/use-sync-external-store": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.2.0.tgz",
+      "integrity": "sha512-eEgnFxGQ1Ife9bzYs6VLi8/4X6CObHMw9Qr9tPY43iKwsPw8xE8+EFsf/2cFZ5S3esXgpWgtSCtLNS41F+sKPA==",
+      "peerDependencies": {
+        "react": "^16.8.0 || ^17.0.0 || ^18.0.0"
+      }
+    },
    "node_modules/usehooks-ts": {
      "version": "2.9.1",
      "resolved": "https://registry.npmjs.org/usehooks-ts/-/usehooks-ts-2.9.1.tgz",
@@ -9503,6 +9512,33 @@
      "funding": {
        "url": "https://github.com/sponsors/colinhacks"
      }
+    },
+    "node_modules/zustand": {
+      "version": "4.4.1",
+      "resolved": "https://registry.npmjs.org/zustand/-/zustand-4.4.1.tgz",
+      "integrity": "sha512-QCPfstAS4EBiTQzlaGP1gmorkh/UL1Leaj2tdj+zZCZ/9bm0WS7sI2wnfD5lpOszFqWJ1DcPnGoY8RDL61uokw==",
+      "dependencies": {
+        "use-sync-external-store": "1.2.0"
+      },
+      "engines": {
+        "node": ">=12.7.0"
+      },
+      "peerDependencies": {
+        "@types/react": ">=16.8",
+        "immer": ">=9.0",
+        "react": ">=16.8"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "immer": {
+          "optional": true
+        },
+        "react": {
+          "optional": true
+        }
+      }
    }
  }
}
package.json CHANGED
@@ -62,7 +62,8 @@
     "usehooks-ts": "^2.9.1",
     "uuid": "^9.0.0",
     "webm-to-wav-converter": "^1.1.0",
-    "whisper-turbo": "^0.7.0"
+    "whisper-turbo": "^0.7.0",
+    "zustand": "^4.4.1"
   },
   "devDependencies": {
     "@types/sbd": "^1.0.3"
src/app/engine/predict.ts CHANGED
@@ -17,7 +17,9 @@ export async function predict(inputs: string) {
       do_sample: true,

       // hard limit for max_new_tokens is 1512
-      max_new_tokens: 200, // 1150,
+      // however, since we are trying to achieve some kind of real-time interaction,
+      // we want to make it as small as possible
+      max_new_tokens: 100, // 1150,
       return_full_text: false,
     }
   })) {
@@ -51,6 +53,6 @@ export async function predict(inputs: string) {
     .replaceAll("<SYS>", "")
     .replaceAll("</SYS>", "")
     .replaceAll("<|assistant|>", "")
-    .replaceAll('""', '"')
+    .replaceAll('"', '')
   )
 }
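
The prompt budget drops from 200 to 100 new tokens to keep the exchange close to real time, and the final cleanup now strips every double quote instead of collapsing doubled quotes. A sketch of that cleanup step as a standalone helper (the cleanAnswer name and the trailing trim() are illustrative additions, not part of the commit):

function cleanAnswer(raw: string): string {
  return raw
    .replaceAll("<SYS>", "")
    .replaceAll("</SYS>", "")
    .replaceAll("<|assistant|>", "")
    .replaceAll('"', '') // removes all double quotes from the model output
    .trim()
}

// cleanAnswer('"Hello there!" <|assistant|>') === 'Hello there!'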
src/app/engine/think.ts CHANGED
@@ -5,36 +5,48 @@ import { createLlamaPrompt } from "@/lib/createLlamaPrompt"

 import { predict } from "./predict"

+const context = {
+  lastObservedScene: ""
+}
+
 const internalHistory: {
   role: string;
   content: string;
 }[] = []

-export const think = async (event: string): Promise<string> => {
+export const think = async (event: string, needAnswer: boolean): Promise<string> => {
   if (!event) {
     throw new Error("missing event")
   }

-  internalHistory.push({
-    role: "user",
-    content: event,
-  })
+  if (needAnswer) {
+    internalHistory.push({
+      role: "user",
+      content: event,
+    })

-  if (internalHistory.length > 10) {
-    internalHistory.shift()
+    if (internalHistory.length > 10) {
+      internalHistory.shift()
+    }
+  } else {
+    context.lastObservedScene = event
+    return ""
   }

   const prompt = createLlamaPrompt([
     {
       role: "system",
       content: [
-        `Today's date is ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}.`,
-        `You are an android robot, very friendly, curious about the world.`,
-        `Your life goal is to help human and interact them as a natural way.`,
-        `You are going to see and hear various things, and you need to act in a very natural way.`,
-        `If you see someone through your eyes, you need to interact with them,`,
-        `you should be o ngoing and open, ask questions, be curious, do jokes etc.`,
-      ].filter(item => item).join("\n")
+        `Today's date is ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}`,
+        `You are Alice, a very friendly 25yo woman, curious about the world`,
+        `Your life goal is to help people and interact with them in a natural way`,
+        `You are going to see and hear various things, and you need to act in a very natural way`,
+        `If you see someone through your eyes, you need to interact with them`,
+        `You should be outgoing and open, ask questions, be curious, make jokes etc`,
+        `Sometimes you have trouble hearing; if you don't understand, just ignore it and say nothing`,
+        `You like to answer in just one sentence`,
+        context.lastObservedScene ? `You are currently looking at: ${context.lastObservedScene}` : ''
+      ].filter(item => item).join(". ")
     },
     ...internalHistory,
   ])
@@ -56,8 +68,8 @@ export const think = async (event: string): Promise<string> => {
     }
   }

-  // llama-2 is too chatty, let's keep 3 sentences at most
-  const sentences = sbd.sentences(result).slice(0, 3).join(" ").trim()
+  // llama-2 is too chatty, let's keep 2 sentences at most
+  const sentences = sbd.sentences(result).slice(0, 2).join(" ").trim()

   internalHistory.push({
     role: "assistant",
src/app/listen.tsx CHANGED
@@ -9,15 +9,11 @@ import { getWaveBlob } from "webm-to-wav-converter"
 import {
   AvailableModels,
   InferenceSession,
-  MicRecorder,
   SessionManager,
 } from "whisper-turbo"

 import { useToast } from "@/components/ui/use-toast"
-// import { listen } from "@/app/engine/listen"
-import { blobToBase64Uri } from "@/lib/blobToBase64Uri"
-
-// import { listen } from "./engine/listen"
+import { useStore } from "./useStore"

 export interface TSSegment {
   text: string;
@@ -36,6 +32,13 @@ export function Listen({
   onListen: (recording: string) => void
 }) {
   const { toast } = useToast()
+  const speechSynthesis = useStore(state => state.speechSynthesis)
+  const isSpeaking = useStore(state => state.isSpeaking)
+  const isSpeakingRef = useRef(isSpeaking)
+  useEffect(() => { isSpeakingRef.current = isSpeaking }, [isSpeaking])
+
+  const setHearing = useStore(state => state.setHearing)
+  const isHearing = useStore(state => state.isHearing)

   const [transcribing, setTranscribing] = useState(false)
   const transcribingRef = useRef(transcribing)
@@ -59,7 +62,10 @@ export function Listen({
   WHISPER_LARGE: 'whisper-large'
   }
   */
-  const whisperModel: AvailableModels = AvailableModels.WHISPER_BASE
+
+  // unfortunately, we cannot really use models larger than TINY because they are
+  // too slow to process requests
+  const whisperModel: AvailableModels = AvailableModels.WHISPER_TINY

   const listenerRef = useRef({
     isListening: false,
@@ -217,67 +223,79 @@ export function Listen({
     runSession()
   }, [audioDataFrame])

+  // note: this effect only reacts to "heard something" changes
+  // and not to changes to isListening or isSpeaking
   useEffect(() => {
-    if (heardSomething) {
-      if (!listenerRef.current.isListening) {
-        console.log("recoording..")
-        foregroundListener.startRecording()
-        listenerRef.current.hits = 0
-        listenerRef.current.isListening = true
-
-        // TODO: use a debouncer to detect when we started speaking
-        setTimeout(async () => {
-          foregroundListener.stopRecording()
-          listenerRef.current.isListening = false
-          listenerRef.current.stoppedListeningAt = Date.now()
-          listenerRef.current.durationInMs =
-            listenerRef.current.stoppedListeningAt - listenerRef.current.startedListeningAt
-
-          const hits = listenerRef.current.hits
+    const isListening = listenerRef.current.isListening
+
+    if (!heardSomething) { return }
+
+    if (listenerRef.current.isListening) {
+      // console.log("we are already listening, so skipping..")
+      return
+    }
+    if (isSpeakingRef.current) {
+      console.log("we are already busy speaking, so ignoring..")
+      return
+    }
+    setHearing(true)
+    // console.log("recording..")
+    foregroundListener.startRecording()
+    listenerRef.current.hits = 0
+    listenerRef.current.isListening = true
+
+    setTimeout(async () => {
+      foregroundListener.stopRecording()
+      setHearing(false)
+      listenerRef.current.isListening = false
+      listenerRef.current.stoppedListeningAt = Date.now()
+      listenerRef.current.durationInMs =
+        listenerRef.current.stoppedListeningAt - listenerRef.current.startedListeningAt
+
+      const hits = listenerRef.current.hits
+
+      if (!foregroundListener.audioBlob || typeof window === "undefined" || !window?.FileReader) {
+        return
+      }
+
+      if (hits <= 11) {
+        return
+      }
+

-          console.log(`end of sample (${foregroundListener.timeElapsed}, ${hits} hits)`)
-
-          if (!foregroundListener.audioBlob || typeof window === "undefined" || !window?.FileReader) {
-            return
-          }
-
-          if (hits > 11) {
-            // at 12 threshold level, we should have between 12 and 20 hits (per 2 sec) for short words and utterances
-            // at 12 threshold level, keystrokes should not be detected, unless the person hits the keyboard heavily
-
-            console.log("got an interesting sample, sending for review")
-
-            // temporary, to prevent infinite loop
-            if (listenerRef.current.debugCanContinue) {
-              // to prevent the infinite loop, set this value to false
-              // listenerRef.current.debugCanContinue = false
-
-              try {
-                const blob = await getWaveBlob(foregroundListener.audioBlob, false) // false = 16 bit, true = 32 bit
-                const arrayBuffer = await blob.arrayBuffer()
-                const uint8Array = new Uint8Array(arrayBuffer)
-
-                setAudioData(uint8Array)
-                setAudioDataFrame(audioDataFrameRef.current + 1)
-              } catch (err) {
-                const error = `failed to convert the audio sample: ${err}`
-                console.error(error)
-                toast({
-                  title: "Error",
-                  description: error,
-                  variant: "destructive"
-                })
-              }
-            } else {
-              console.log("Julian: infinite loop temporary disabled :D")
-            }
-          }
-        }, 3000)
+      console.log(`end of sample (${foregroundListener.timeElapsed}, ${hits} hits)`)
+
+
+      // at 12 threshold level, we should have between 12 and 20 hits (per 2 sec) for short words and utterances
+      // at 12 threshold level, keystrokes should not be detected, unless the person hits the keyboard heavily
+
+      // console.log("got an interesting sample, sending for review")
+
+      // temporary, to prevent infinite loop
+      if (listenerRef.current.debugCanContinue) {
+        // to prevent the infinite loop, set this value to false
+        // listenerRef.current.debugCanContinue = false
+
+        try {
+          const blob = await getWaveBlob(foregroundListener.audioBlob, false) // false = 16 bit, true = 32 bit
+          const arrayBuffer = await blob.arrayBuffer()
+          const uint8Array = new Uint8Array(arrayBuffer)
+
+          setAudioData(uint8Array)
+          setAudioDataFrame(audioDataFrameRef.current + 1)
+        } catch (err) {
+          const error = `failed to convert the audio sample: ${err}`
+          console.error(error)
+          toast({
+            title: "Error",
+            description: error,
+            variant: "destructive"
+          })
+        }
       } else {
-        // TODO: increase hits?
-        // listenerRef.current.hits = listenerRef.current.hits + 1
+        console.log("Julian: infinite loop temporary disabled!")
       }
-    }
+    }, 2000)
   }, [heardSomething])

   if (heardSomething && listenerRef.current.isListening) {
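
The listener now checks the store before recording: if speech synthesis is still playing, the sample is skipped so the app does not transcribe its own voice, and setHearing() flags the 2-second capture window. Because the check runs inside a setTimeout callback, the latest isSpeaking value is mirrored into a ref; the general pattern, in isolation (useLatest is an illustrative name, not part of the commit):

import { useEffect, useRef } from "react"

// keep a ref in sync with a reactive value so async callbacks
// (timeouts, handlers created on an earlier render) read the latest value
function useLatest<T>(value: T) {
  const ref = useRef(value)
  useEffect(() => { ref.current = value }, [value])
  return ref
}

// usage sketch inside a component:
// const isSpeakingRef = useLatest(isSpeaking)
// setTimeout(() => { if (isSpeakingRef.current) { return } /* ... */ }, 2000)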
src/app/main.tsx CHANGED
@@ -20,11 +20,13 @@ export default function Main() {

   const [action, setAction] = useState<string>("Nothing to say yet.")

-  const handleOnEvent = (event: string) => {
+  const handleOnEvent = (event: string, needAnswer: boolean) => {
     setLoadingAction(true)
     startTransition(async () => {
-      const action = await think(event)
-      setAction(action)
+      const action = await think(event, needAnswer)
+      if (action) {
+        setAction(action)
+      }
       setLoadingAction(false)
     })
   }
@@ -32,11 +34,16 @@ export default function Main() {
   const handleOnObserve = (observation: string, image: string) => {
     setLastRawObservation(observation)
     setLastImage(image)
-    handleOnEvent(`It is ${format(new Date(), 'HH:mm (d)')} and you are seeing this: ${observation}`)
+    if (!observation) { return }
+    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are seeing this: ${observation}`)
+    handleOnEvent(`You are seeing this: ${observation}`, false)
   }

   const handleOnListen = (recording: string) => {
-    handleOnEvent(`It is ${format(new Date(), 'HH:mm (d)')} and you are hearing this: ${recording}`)
+    if (!recording || recording === "[BLANK_AUDIO]") { return }
+    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are hearing this: ${recording}`)
+    handleOnEvent(`${recording}`, true)
+
   }

   return (
@@ -94,9 +101,9 @@ export default function Main() {
       <div className="fixed z-10 left-0 right-0 bottom-0 flex flex-col items-center justify-center">
         <div className="full md:w-[80%] lg:w-[70%] mb-0 md:p-4 md:mb-8 bg-zinc-100 md:rounded-xl p-4 shadow-2xl text-xs md:text-sm">
           <p>🅿️ <span className="font-semibold">
-          </span>This multimodal demo allow
-          <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> to hear, see and talk.
-          You need to upgrade to a <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">browser with support for WebGPU</a> for speech recognition to work.
+          </span>A multimodal demo to make
+          <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> hear, see and talk.
+          You need a laptop computer with <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">a modern browser supporting WebGPU</a>.
           Vision is handled by <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold underline"> IDEFICS </a></p>
           <p>⛔️ <span className="font-semibold">Limitations: </span>This demo is provided as-is, for demonstration and research purpose only. As it demonstrates WebGPU technology, this demo will not support incompatible browsers and/or devices. No guarantee of factually correct results. In some cases, the models may return hallucinated or innapropriate responses.</p>
         </div>
src/app/observe.tsx CHANGED
@@ -80,11 +80,14 @@ export function Observe({
     }
     const prompt = `What do you see here?`

+    console.log("JULIAN: disabled watch")
+
+
     // console.log("Calling IDEFICS..")
-    const newObservation = "fake" // await see({ prompt, imageBase64 })
+    const newObservation = await see({ prompt, imageBase64 })

     // console.log("New observation: ", newObservation)
-    if (newObservation !== lastObservation) {
+    if (newObservation && newObservation !== lastObservation) {
       // console.log("update!")
       setLastObservation(newObservation || "")
       onObserve(newObservation || "", imageBase64)
src/app/speak.tsx CHANGED
@@ -3,60 +3,32 @@
 import { ReactNode, useEffect, useState } from "react"
 import { onlyText } from "react-children-utilities"

+import { useTimeout } from "@/lib/useTimeout"
+import { useStore } from "./useStore"
+
 export function Speak({
   children
 }: {
   children: ReactNode
 }) {
-  const newMessage = onlyText(children).trim()
-  const [playedMessage, setPlayedMessage] = useState("")
-
-  const [voice, setVoice] = useState<SpeechSynthesisVoice>()
-
-  useEffect(() => {
-    console.log("getting voices..")
-    setTimeout(() => {
-      if (typeof window === "undefined") { return }
-      if (!window?.speechSynthesis) { return }
-      const allVoices = window.speechSynthesis.getVoices()
-
-      const enVoices = allVoices.filter(voice => voice.lang.toLowerCase() === "en-us")
-
-      if (!enVoices.length) { return }
+  const isSpeechSynthesisAvailable = useStore(state => state.isSpeechSynthesisAvailable)
+  const lastSpokenSentence = useStore(state => state.lastSpokenSentence)
+  const init = useStore(state => state.init)
+  const speak = useStore(state => state.speak)

-      console.log("available voices:")
-      console.table(enVoices)
+  const newMessage = onlyText(children).trim()

-      const kathyVoice = enVoices.find(voice => voice.name.includes("Kathy"))
+  useEffect(() => { init() }, [])

-      // if we find a high-quality voice
-      const googleVoice = enVoices.find(voice => voice.name.includes("Google"))
+  const canSpeak = isSpeechSynthesisAvailable && newMessage?.length && newMessage !== lastSpokenSentence

-      console.log("google voice:", googleVoice)
-
-      setVoice(googleVoice || kathyVoice)
-    }, 1000)
-  }, [])
-
   useEffect(() => {
-    if (typeof window === "undefined") { return }
-    if (!window?.speechSynthesis) { return }
-    if (!voice?.name) { return }
-    if (!newMessage?.length) { return }
-    if (newMessage === playedMessage) { return }
-    const synth = window.speechSynthesis
-
-    // console.log(`Speaking "${newMessage}"`)
-    setPlayedMessage(newMessage)
-    const utterance = new SpeechSynthesisUtterance(newMessage)
-    utterance.voice = voice
-
-    console.log("julian: voice disabled :D")
-    // synth.speak(utterance)
-
-  }, [voice?.name, newMessage, playedMessage])
+    console.log("debug:", { canSpeak, newMessage })
+    if (canSpeak) {
+      console.log("speaking!")
+      speak(newMessage)
+    }
+  }, [canSpeak, newMessage])

-  return (
-    null
-  )
+  return null
 }
src/app/useStore.ts ADDED
@@ -0,0 +1,83 @@
+"use client"
+
+import { getSpeechSynthesisVoice } from "@/lib/getSpeechSynthesisVoice"
+import { create } from "zustand"
+
+export const useStore = create<{
+  isSpeechSynthesisAvailable: boolean
+  speechSynthesis: SpeechSynthesis
+  speechSynthesisVoice: SpeechSynthesisVoice
+  isSpeaking: boolean
+  lastSpokenSentence: string
+  isHearing: boolean // robot is hearing
+  init: () => void,
+  loadVoice: () => void,
+  speak: (sentence: string) => void
+  setHearing: (isHearing: boolean) => void
+}>((set, get) => ({
+  isSpeechSynthesisAvailable: false,
+  speechSynthesis: undefined as unknown as SpeechSynthesis,
+  speechSynthesisVoice: undefined as unknown as SpeechSynthesisVoice,
+  isSpeaking: false,
+  lastSpokenSentence: "",
+  isHearing: false, // robot is hearing
+  init: () => {
+    if (!window?.speechSynthesis) {
+      console.error(`no speech synthesis engine available`)
+      return
+    }
+    const speechSynthesis = window.speechSynthesis
+    set({ speechSynthesis })
+
+    speechSynthesis.onvoiceschanged = () => { get().loadVoice() }
+
+    setTimeout(() => {
+      get().loadVoice()
+    }, 2000)
+
+    // due to the lack of an event for the speaking state, we create our own polling system
+    // see https://developer.mozilla.org/en-US/docs/Web/API/SpeechSynthesis/speaking
+    setInterval(() => {
+      const { isSpeaking } = get()
+      if (!speechSynthesis.speaking && isSpeaking) {
+        set({ isSpeaking: false })
+      } else if (speechSynthesis.speaking && !isSpeaking) {
+        set({ isSpeaking: true })
+      }
+    }, 100)
+  },
+  loadVoice: () => {
+    let { speechSynthesis, speechSynthesisVoice } = get()
+    if (!speechSynthesis) {
+      console.error(`no speech synthesis engine available`)
+      return
+    }
+
+    try {
+      speechSynthesisVoice = getSpeechSynthesisVoice(speechSynthesis)
+      if (!speechSynthesisVoice?.name) {
+        throw new Error("no name for the voice")
+      }
+    } catch (err) {
+      console.error(`no speech synthesis voice available: ${err}`)
+      return
+    }
+    if (speechSynthesisVoice) {
+      set({ speechSynthesisVoice, isSpeechSynthesisAvailable: true })
+    }
+  },
+  speak: (sentence: string) => {
+    const { speechSynthesis, speechSynthesisVoice } = get()
+    if (!speechSynthesis || !speechSynthesisVoice) { return }
+    speechSynthesis.cancel()
+
+    const utterance = new SpeechSynthesisUtterance(sentence)
+    utterance.voice = speechSynthesisVoice
+
+    speechSynthesis.speak(utterance)
+
+    set({ lastSpokenSentence: sentence })
+  },
+  setHearing: (isHearing: boolean) => { set({ isHearing }) },
+}))
+
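
Components subscribe to individual slices of this store with selector functions, so they only re-render when the selected value changes; the updated Listen and Speak components follow this pattern. A minimal consumer, as a component sketch (the component name is illustrative; the import path is assumed from the file location):

"use client"

import { useEffect } from "react"
import { useStore } from "@/app/useStore"

function SpeakingIndicator() {
  const init = useStore(state => state.init)
  const isSpeaking = useStore(state => state.isSpeaking)
  const isHearing = useStore(state => state.isHearing)

  // set up the speech synthesis engine and the isSpeaking polling once on mount
  useEffect(() => { init() }, [])

  return <div>{isSpeaking ? "speaking" : isHearing ? "hearing" : "idle"}</div>
}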
src/lib/getSpeechSynthesisVoice.ts ADDED
@@ -0,0 +1,22 @@
+export function getSpeechSynthesisVoice(speechSynthesis: SpeechSynthesis): SpeechSynthesisVoice {
+  const allVoices = speechSynthesis.getVoices()
+
+  console.log("all voices:")
+  console.table(allVoices)
+
+  const fallbackVoice = allVoices[0]
+
+  const enVoices = allVoices.filter(voice => voice.lang.toLowerCase() === "en-us")
+
+  console.log("available english voices:")
+  console.table(enVoices)
+
+  const kathyVoice = enVoices.find(voice => voice.name.includes("Kathy"))
+
+  // if we find a high-quality voice
+  const googleVoice = enVoices.find(voice => voice.name.includes("Google"))
+
+  // console.log("google voice:", googleVoice)
+
+  return googleVoice || kathyVoice || fallbackVoice
+}
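
The helper prefers a Google en-US voice when the browser exposes one, then Apple's Kathy, then whatever voice is listed first. In this commit it is called from useStore.loadVoice(), but it can also be used directly with the Web Speech API; a sketch (the sayHello name is illustrative):

import { getSpeechSynthesisVoice } from "@/lib/getSpeechSynthesisVoice"

function sayHello() {
  const synth = window.speechSynthesis
  const utterance = new SpeechSynthesisUtterance("Hello, I am Alice.")
  utterance.voice = getSpeechSynthesisVoice(synth)
  synth.speak(utterance)
}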
src/lib/useTimeout.ts ADDED
@@ -0,0 +1,9 @@
+import { useEffect } from "react"
+
+export function useTimeout(duration: number, callback: () => void) {
+  useEffect(() => {
+    setTimeout(() => {
+      callback()
+    }, duration)
+  }, [])
+}
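
As written, the hook runs the callback once, duration milliseconds after the first render, and ignores later changes to its arguments; it also never clears the timer, so the callback can still fire after the component unmounts. A variant with cleanup would look like this (a sketch, not part of the commit):

import { useEffect } from "react"

export function useTimeoutWithCleanup(duration: number, callback: () => void) {
  useEffect(() => {
    const id = setTimeout(() => { callback() }, duration)
    // cancel the pending timer if the component unmounts first
    return () => clearTimeout(id)
  }, [])
}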