Commit 1896b82
Parent(s): 7249a2e
fixed some issues
Files changed:
- package-lock.json +37 -1
- package.json +2 -1
- src/app/engine/predict.ts +4 -2
- src/app/engine/think.ts +28 -16
- src/app/listen.tsx +81 -63
- src/app/main.tsx +15 -8
- src/app/observe.tsx +5 -2
- src/app/speak.tsx +17 -45
- src/app/useStore.ts +83 -0
- src/lib/getSpeechSynthesisVoice.ts +22 -0
- src/lib/useTimeout.ts +9 -0
package-lock.json
CHANGED
@@ -61,7 +61,8 @@
         "usehooks-ts": "^2.9.1",
         "uuid": "^9.0.0",
         "webm-to-wav-converter": "^1.1.0",
-        "whisper-turbo": "^0.7.0"
+        "whisper-turbo": "^0.7.0",
+        "zustand": "^4.4.1"
       },
       "devDependencies": {
         "@types/sbd": "^1.0.3"
@@ -9248,6 +9249,14 @@
        }
      }
    },
+    "node_modules/use-sync-external-store": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.2.0.tgz",
+      "integrity": "sha512-eEgnFxGQ1Ife9bzYs6VLi8/4X6CObHMw9Qr9tPY43iKwsPw8xE8+EFsf/2cFZ5S3esXgpWgtSCtLNS41F+sKPA==",
+      "peerDependencies": {
+        "react": "^16.8.0 || ^17.0.0 || ^18.0.0"
+      }
+    },
     "node_modules/usehooks-ts": {
       "version": "2.9.1",
       "resolved": "https://registry.npmjs.org/usehooks-ts/-/usehooks-ts-2.9.1.tgz",
@@ -9503,6 +9512,33 @@
       "funding": {
         "url": "https://github.com/sponsors/colinhacks"
       }
+    },
+    "node_modules/zustand": {
+      "version": "4.4.1",
+      "resolved": "https://registry.npmjs.org/zustand/-/zustand-4.4.1.tgz",
+      "integrity": "sha512-QCPfstAS4EBiTQzlaGP1gmorkh/UL1Leaj2tdj+zZCZ/9bm0WS7sI2wnfD5lpOszFqWJ1DcPnGoY8RDL61uokw==",
+      "dependencies": {
+        "use-sync-external-store": "1.2.0"
+      },
+      "engines": {
+        "node": ">=12.7.0"
+      },
+      "peerDependencies": {
+        "@types/react": ">=16.8",
+        "immer": ">=9.0",
+        "react": ">=16.8"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "immer": {
+          "optional": true
+        },
+        "react": {
+          "optional": true
+        }
+      }
     }
   }
 }
package.json
CHANGED
@@ -62,7 +62,8 @@
     "usehooks-ts": "^2.9.1",
     "uuid": "^9.0.0",
     "webm-to-wav-converter": "^1.1.0",
-    "whisper-turbo": "^0.7.0"
+    "whisper-turbo": "^0.7.0",
+    "zustand": "^4.4.1"
   },
   "devDependencies": {
     "@types/sbd": "^1.0.3"
src/app/engine/predict.ts
CHANGED
@@ -17,7 +17,9 @@ export async function predict(inputs: string) {
       do_sample: true,

       // hard limit for max_new_tokens is 1512
+      // however since we are tying to achieve some kind of real time interaction,
+      // we want to make it as small as possible
+      max_new_tokens: 100, // 1150,
       return_full_text: false,
     }
   })) {
@@ -51,6 +53,6 @@ export async function predict(inputs: string) {
     .replaceAll("<SYS>", "")
     .replaceAll("</SYS>", "")
     .replaceAll("<|assistant|>", "")
-    .replaceAll('"
+    .replaceAll('"', '')
     )
 }
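The hunk above caps max_new_tokens at 100 to keep responses fast, and the output clean-up now also strips stray double quotes along with the prompt-format tokens. For reference, a minimal sketch of that clean-up as a standalone helper, using only the tags visible in this diff (hypothetical, not part of the commit):

// Hypothetical helper mirroring the clean-up chain in predict.ts (illustration only)
export function cleanModelOutput(raw: string): string {
  return raw
    .replaceAll("<SYS>", "")
    .replaceAll("</SYS>", "")
    .replaceAll("<|assistant|>", "")
    .replaceAll('"', "") // the commit also removes stray double quotes
    .trim()
}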
src/app/engine/think.ts
CHANGED
@@ -5,36 +5,48 @@ import { createLlamaPrompt } from "@/lib/createLlamaPrompt"
 
 import { predict } from "./predict"
 
+const context = {
+  lastObservedScene: ""
+}
+
 const internalHistory: {
   role: string;
   content: string;
 }[] = []
 
-export const think = async (event: string): Promise<string> => {
+export const think = async (event: string, needAnswer: boolean): Promise<string> => {
   if (!event) {
     throw new Error("missing event")
   }
 
+  if (needAnswer) {
+    internalHistory.push({
+      role: "user",
+      content: event,
+    })
 
+    if (internalHistory.length > 10) {
+      internalHistory.shift()
+    }
+  } else {
+    context.lastObservedScene = event
+    return ""
   }
 
   const prompt = createLlamaPrompt([
     {
       role: "system",
       content: [
-        `Today's date is ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}
-        `You are
-        `Your life goal is to help
-        `You are going to see and hear various things, and you need to act in a very natural way
-        `If you see someone through your eyes, you need to interact with them
-        `
+        `Today's date is ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}`,
+        `You are Alice, a very friendly 25yo woman, curious about the world`,
+        `Your life goal is to help people and interact them as a natural way`,
+        `You are going to see and hear various things, and you need to act in a very natural way`,
+        `If you see someone through your eyes, you need to interact with them`,
+        `You should be ongoing and open, ask questions, be curious, do jokes etc`,
+        `sometimes you have trouble hearing, if you don't understand just ignore and say nothing`,
+        `You like to answer in just one sentence`,
+        context.lastObservedScene ? `You are currently looking at: ${context.lastObservedScene}` : ''
+      ].filter(item => item).join(". ")
     },
     ...internalHistory,
   ])
@@ -56,8 +68,8 @@ export const think = async (event: string): Promise<string> => {
     }
   }
 
-  // llama-2 is too chatty, let's keep
-  const sentences = sbd.sentences(result).slice(0,
+  // llama-2 is too chatty, let's keep 2 sentences at most
+  const sentences = sbd.sentences(result).slice(0, 2).join(" ").trim()
 
   internalHistory.push({
     role: "assistant",
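With this change, think() only queries the model when needAnswer is true; otherwise it just stores the event in context.lastObservedScene (later injected into the system prompt) and returns an empty string. A minimal sketch of the resulting calling pattern, mirroring how main.tsx uses it (illustration only, not part of the commit):

import { think } from "./engine/think"

// A scene description only refreshes the visual context; no reply is generated.
await think("You are seeing this: a person waving at the camera", false)

// A heard utterance becomes a user turn and yields a reply trimmed to two sentences.
const reply = await think("Hi Alice, how are you today?", true)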
src/app/listen.tsx
CHANGED
@@ -9,15 +9,11 @@ import { getWaveBlob } from "webm-to-wav-converter"
 import {
   AvailableModels,
   InferenceSession,
-  MicRecorder,
   SessionManager,
 } from "whisper-turbo"
 
 import { useToast } from "@/components/ui/use-toast"
-
-import { blobToBase64Uri } from "@/lib/blobToBase64Uri"
-
-// import { listen } from "./engine/listen"
+import { useStore } from "./useStore"
 
 export interface TSSegment {
   text: string;
@@ -36,6 +32,13 @@ export function Listen({
   onListen: (recording: string) => void
 }) {
   const { toast } = useToast()
+  const speechSynthesis = useStore(state => state.speechSynthesis)
+  const isSpeaking = useStore(state => state.isSpeaking)
+  const isSpeakingRef = useRef(isSpeaking)
+  useEffect(() => {isSpeakingRef.current = isSpeaking }, [isSpeaking])
+
+  const setHearing = useStore(state => state.setHearing)
+  const isHearing = useStore(state => state.isHearing)
 
   const [transcribing, setTranscribing] = useState(false)
   const transcribingRef = useRef(transcribing)
@@ -59,7 +62,10 @@
     WHISPER_LARGE: 'whisper-large'
   }
   */
+
+  // unfortunately, we cannot really use models larger than TINY because they are
+  // too slow to process requests
+  const whisperModel: AvailableModels = AvailableModels.WHISPER_TINY
 
   const listenerRef = useRef({
     isListening: false,
@@ -217,67 +223,79 @@
     runSession()
   }, [audioDataFrame])
 
+  // note: this effect only reacts to "head something" changes
+  // anod not to changes to isListening or isSpekaing
   useEffect(() => {
+    const isListening = listenerRef.current.isListening
+
+    if (!heardSomething) { return }
+
+    if (listenerRef.current.isListening) {
+      // console.log("we are already listening, so skipping..")
+      return
+    }
+    if (isSpeakingRef.current) {
+      console.log("we are already busy speaking, so ignoring..")
+      return
+    }
+    setHearing(true)
+    // console.log("recording..")
+    foregroundListener.startRecording()
+    listenerRef.current.hits = 0
+    listenerRef.current.isListening = true
+
+    setTimeout(async () => {
+      foregroundListener.stopRecording()
+      setHearing(false)
+      listenerRef.current.isListening = false
+      listenerRef.current.stoppedListeningAt = Date.now()
+      listenerRef.current.durationInMs =
+        listenerRef.current.stoppedListeningAt - listenerRef.current.startedListeningAt
+
+      const hits = listenerRef.current.hits
+
+      if (!foregroundListener.audioBlob || typeof window === "undefined" || !window?.FileReader) {
+        return
+      }
+
+      if (hits <= 11) {
+        return
+      }
+
 
-          description: error,
-          variant: "destructive"
-        })
-      }
-      } else {
-        console.log("Julian: infinite loop temporary disabled :D")
-      }
-    }
-    }, 3000)
+      console.log(`end of sample (${foregroundListener.timeElapsed}, ${hits} hits)`)
+
+
+      // at 12 threshold level, we should have between 12 and 20 hits (per 2 sec) for short words and utterances
+      // at 12 threshold level, keystrokes should not be detected, unless the person hits the keyboard heavily
+
+      // console.log("got an interesting sample, sending for review")
+
+      // temporary, to prevent infinite loop
+      if (listenerRef.current.debugCanContinue) {
+        // to prevent the infinite loop, set this value to false
+        // listenerRef.current.debugCanContinue = false
+
+        try {
+          const blob = await getWaveBlob(foregroundListener.audioBlob, false) // false = 16 bit, true = 32 bit
+          const arrayBuffer = await blob.arrayBuffer()
+          const uint8Array = new Uint8Array(arrayBuffer)
+
+          setAudioData(uint8Array)
+          setAudioDataFrame(audioDataFrameRef.current + 1)
+        } catch (err) {
+          const error = `failed to convert the audio sample: ${err}`
+          console.error(error)
+          toast({
+            title: "Error",
+            description: error,
+            variant: "destructive"
+          })
+        }
      } else {
-      // listenerRef.current.hits = listenerRef.current.hits + 1
+        console.log("Julian: infinite loop temporary disabled!")
      }
-    }
+    }, 2000)
   }, [heardSomething])
 
   if (heardSomething && listenerRef.current.isListening) {
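The rewritten effect gates recording on three conditions (something was heard, we are not already listening, the robot is not currently speaking), records for 2000 ms instead of 3000 ms, and only forwards samples that produced more than 11 hits. A compact restatement of that gate, with hypothetical names (illustration only, not part of the commit):

// Should a new foreground recording be started? (hypothetical distillation of the effect above)
function shouldStartRecording(heardSomething: boolean, isListening: boolean, isSpeaking: boolean): boolean {
  return heardSomething && !isListening && !isSpeaking
}

// After the 2000 ms window, keep the sample only if it cleared the hit threshold.
function isSampleWorthTranscribing(hits: number): boolean {
  return hits > 11
}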
src/app/main.tsx
CHANGED
@@ -20,11 +20,13 @@ export default function Main() {
 
   const [action, setAction] = useState<string>("Nothing to say yet.")
 
-  const handleOnEvent = (event: string) => {
+  const handleOnEvent = (event: string, needAnswer: boolean) => {
     setLoadingAction(true)
     startTransition(async () => {
-      const action = await think(event)
+      const action = await think(event, needAnswer)
+      if (action) {
+        setAction(action)
+      }
       setLoadingAction(false)
     })
   }
@@ -32,11 +34,16 @@
   const handleOnObserve = (observation: string, image: string) => {
     setLastRawObservation(observation)
     setLastImage(image)
+    if (!observation) { return }
+    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are seeing this: ${observation}`)
+    handleOnEvent(`You are seeing this: ${observation}`, false)
   }
 
   const handleOnListen = (recording: string) => {
+    if (!recording || recording === "[BLANK_AUDIO]") { return }
+    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are hearing this: ${recording}`)
+    handleOnEvent(`${recording}`, true)
+
   }
 
   return (
@@ -94,9 +101,9 @@
       <div className="fixed z-10 left-0 right-0 bottom-0 flex flex-col items-center justify-center">
         <div className="full md:w-[80%] lg:w-[70%] mb-0 md:p-4 md:mb-8 bg-zinc-100 md:rounded-xl p-4 shadow-2xl text-xs md:text-sm">
           <p>🅿️ <span className="font-semibold">
-          </span>
-          <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a>
-          You need
+          </span>A multimodal demo to make
+          <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> hear, see and talk.
+          You need a laptop computer with <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">a modern browser supporting WebGPU</a>.
           Vision is handled by <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold underline"> IDEFICS </a></p>
           <p>⛔️ <span className="font-semibold">Limitations: </span>This demo is provided as-is, for demonstration and research purpose only. As it demonstrates WebGPU technology, this demo will not support incompatible browsers and/or devices. No guarantee of factually correct results. In some cases, the models may return hallucinated or innapropriate responses.</p>
         </div>
src/app/observe.tsx
CHANGED
@@ -80,11 +80,14 @@ export function Observe({
     }
     const prompt = `What do you see here?`
 
+    console.log("JULIAN: disabled watch")
+
+
     // console.log("Calling IDEFICS..")
-    const newObservation =
+    const newObservation = await see({ prompt, imageBase64 })
 
     // console.log("New observation: ", newObservation)
-    if (newObservation !== lastObservation) {
+    if (newObservation && newObservation !== lastObservation) {
       // console.log("update!")
       setLastObservation(newObservation || "")
       onObserve(newObservation || "", imageBase64)
src/app/speak.tsx
CHANGED
@@ -3,60 +3,32 @@
 import { ReactNode, useEffect, useState } from "react"
 import { onlyText } from "react-children-utilities"
 
+import { useTimeout } from "@/lib/useTimeout"
+import { useStore } from "./useStore"
+
 export function Speak({
   children
 }: {
   children: ReactNode
 }) {
-  const
-  const
-  const
-
-  useEffect(() => {
-    console.log("getting voices..")
-    setTimeout(() => {
-      if (typeof window === "undefined") { return }
-      if (!window?.speechSynthesis) { return }
-      const allVoices = window.speechSynthesis.getVoices()
-
-      const enVoices = allVoices.filter(voice => voice.lang.toLowerCase() === "en-us")
-
-      if (!enVoices.length) { return }
+  const isSpeechSynthesisAvailable = useStore(state => state.isSpeechSynthesisAvailable)
+  const lastSpokenSentence = useStore(state => state.lastSpokenSentence)
+  const init = useStore(state => state.init)
+  const speak = useStore(state => state.speak)
 
-      console.table(enVoices)
+  const newMessage = onlyText(children).trim()
 
+  useEffect(() => { init() }, [])
 
-      const googleVoice = enVoices.find(voice => voice.name.includes("Google"))
+  const canSpeak = isSpeechSynthesisAvailable && newMessage?.length && newMessage !== lastSpokenSentence
 
-      console.log("google voice:", googleVoice)
-      setVoice(googleVoice || kathyVoice)
-    }, 1000)
-  }, [])
-
   useEffect(() => {
-    if (
-    // console.log(`Speaking "${newMessage}"`)
-    setPlayedMessage(newMessage)
-    const utterance = new SpeechSynthesisUtterance(newMessage)
-    utterance.voice = voice
-
-    console.log("julian: voice disabled :D")
-    // synth.speak(utterance)
-  }, [voice?.name, newMessage, playedMessage])
+    console.log("debug:", { canSpeak, newMessage })
+    if (canSpeak) {
+      console.log("speaking!")
+      speak(newMessage)
+    }
+  }, [canSpeak, newMessage])
 
-  return
-    null
-  )
+  return null
 }
src/app/useStore.ts
ADDED
@@ -0,0 +1,83 @@
+"use client"
+
+import { getSpeechSynthesisVoice } from "@/lib/getSpeechSynthesisVoice"
+import { create } from "zustand"
+
+export const useStore = create<{
+  isSpeechSynthesisAvailable: boolean
+  speechSynthesis: SpeechSynthesis
+  speechSynthesisVoice: SpeechSynthesisVoice
+  isSpeaking: boolean
+  lastSpokenSentence: string
+  isHearing: boolean // robot is hearing
+  init: () => void,
+  loadVoice: () => void,
+  speak: (sentence: string) => void
+  setHearing: (isHearing: boolean) => void
+}>((set, get) => ({
+  isSpeechSynthesisAvailable: false,
+  speechSynthesis: undefined as unknown as SpeechSynthesis,
+  speechSynthesisVoice: undefined as unknown as SpeechSynthesisVoice,
+  isSpeaking: false,
+  lastSpokenSentence: "",
+  isHearing: false, // robot is taking
+  init: () => {
+    if (!window?.speechSynthesis) {
+      console.error(`no speech synthesis engine available`)
+      return
+    }
+    const speechSynthesis = window.speechSynthesis
+    set({ speechSynthesis })
+
+    speechSynthesis.onvoiceschanged = () => { get().loadVoice() }
+
+    setTimeout(() => {
+      get().loadVoice()
+    }, 2000)
+
+    // due to the lack of event for the speaking state, we create our own polling system
+    // see https://developer.mozilla.org/en-US/docs/Web/API/SpeechSynthesis/speaking
+    setInterval(() => {
+      const { isSpeaking } = get()
+      if (!speechSynthesis.speaking && isSpeaking) {
+        set({ isSpeaking: false })
+      } else if (speechSynthesis.speaking && !isSpeaking) {
+        set({ isSpeaking: true })
+      }
+    }, 100)
+  },
+  loadVoice: () => {
+    let { speechSynthesis, speechSynthesisVoice } = get()
+    if (!speechSynthesis) {
+      console.error(`no speech synthesis engine available`)
+      return
+    }
+
+    try {
+      speechSynthesisVoice = getSpeechSynthesisVoice(speechSynthesis)
+      if (!speechSynthesisVoice?.name) {
+        throw new Error("no name for the voice")
+      }
+    } catch (err) {
+      console.error(`no speech synthesis voice available: ${err}`)
+      return
+    }
+    if (speechSynthesisVoice) {
+      set({ speechSynthesisVoice, isSpeechSynthesisAvailable: true })
+    }
+  },
+  speak: (sentence: string) => {
+    const { speechSynthesis, speechSynthesisVoice } = get()
+    if (!speechSynthesis || !speechSynthesisVoice) { return }
+    speechSynthesis.cancel()
+
+    const utterance = new SpeechSynthesisUtterance(sentence)
+    utterance.voice = speechSynthesisVoice
+
+    speechSynthesis.speak(utterance)
+
+    set({ lastSpokenSentence: sentence })
+  },
+  setHearing: (isHearing: boolean) => { set({ isHearing }) },
+}))
+
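Components read this store through selectors, as the updated Speak and Listen components do. A minimal usage sketch for a client component (hypothetical, not part of the commit):

"use client"

import { useEffect } from "react"
import { useStore } from "./useStore"

// Hypothetical component illustrating the selector-based usage pattern
export function Greeter() {
  const init = useStore(state => state.init)
  const speak = useStore(state => state.speak)
  const isSpeechSynthesisAvailable = useStore(state => state.isSpeechSynthesisAvailable)

  // set up the speech synthesis engine and the isSpeaking polling once on mount
  useEffect(() => { init() }, [])

  // say a one-shot greeting once a voice has been selected
  useEffect(() => {
    if (isSpeechSynthesisAvailable) { speak("Hello, I can hear and see you now.") }
  }, [isSpeechSynthesisAvailable])

  return null
}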
src/lib/getSpeechSynthesisVoice.ts
ADDED
@@ -0,0 +1,22 @@
+export function getSpeechSynthesisVoice(speechSynthesis: SpeechSynthesis): SpeechSynthesisVoice {
+  const allVoices = speechSynthesis.getVoices()
+
+  console.log("all voices:")
+  console.table(allVoices)
+
+  const fallbackVoice = allVoices[0]
+
+  const enVoices = allVoices.filter(voice => voice.lang.toLowerCase() === "en-us")
+
+  console.log("available english voices:")
+  console.table(enVoices)
+
+  const kathyVoice = enVoices.find(voice => voice.name.includes("Kathy"))
+
+  // if we find a high-quality voice
+  const googleVoice = enVoices.find(voice => voice.name.includes("Google"))
+
+  // console.log("google voice:", googleVoice)
+
+  return googleVoice || kathyVoice || fallbackVoice
+}
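The helper prefers a Google en-US voice, falls back to Kathy, and otherwise takes the first available voice. Note that speechSynthesis.getVoices() can return an empty list before the voiceschanged event fires, which is why useStore.init() calls loadVoice() both from onvoiceschanged and from a 2-second fallback timer. A standalone usage sketch (hypothetical, not part of the commit):

// Hypothetical standalone usage outside the store; retry once voices are actually loaded
if (typeof window !== "undefined" && window.speechSynthesis) {
  window.speechSynthesis.onvoiceschanged = () => {
    const voice = getSpeechSynthesisVoice(window.speechSynthesis)
    console.log("selected voice:", voice?.name)
  }
}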
src/lib/useTimeout.ts
ADDED
@@ -0,0 +1,9 @@
+import { useEffect } from "react"
+
+export function useTimeout(duration: number, callback: () => void) {
+  useEffect(() => {
+    setTimeout(() => {
+      callback()
+    }, duration)
+  }, [])
+}
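As committed, the hook fires callback once after duration milliseconds but never clears the timer, so the callback can still run after the component has unmounted; it is imported in speak.tsx but not called in the lines shown above. A variant with cleanup, offered as a suggestion rather than as part of the commit:

import { useEffect } from "react"

// Suggested variant: cancel the timer if the component unmounts before it fires
export function useTimeoutWithCleanup(duration: number, callback: () => void) {
  useEffect(() => {
    const id = setTimeout(callback, duration)
    return () => clearTimeout(id)
  }, [])
}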