"use client" import { useCallback, useEffect, useRef, useState, useTransition } from "react" import { useInterval } from "usehooks-ts" // TODO: try this? https://www.npmjs.com/package/react-audio-voice-recorder import { useRecorder } from "react-microphone-recorder" import { getWaveBlob } from "webm-to-wav-converter" import { AvailableModels, InferenceSession, SessionManager, } from "whisper-turbo" import { useToast } from "@/components/ui/use-toast" import { useStore } from "./useStore" export interface TSSegment { text: string; start: number; stop: number; last: boolean; } export interface TSTranscript { segments: Array; } export function Listen({ onListen, }: { onListen: (recording: string) => void }) { const { toast } = useToast() const speechSynthesis = useStore(state => state.speechSynthesis) const isSpeaking = useStore(state => state.isSpeaking) const isSpeakingRef = useRef(isSpeaking) useEffect(() => {isSpeakingRef.current = isSpeaking }, [isSpeaking]) const setHearing = useStore(state => state.setHearing) const isHearing = useStore(state => state.isHearing) const [transcribing, setTranscribing] = useState(false) const transcribingRef = useRef(transcribing) useEffect(() => { transcribingRef.current = transcribing }, [transcribing]) // used to detect changes, signal when we can analyze the audio const [audioDataFrame, setAudioDataFrame] = useState(0) const audioDataFrameRef = useRef(audioDataFrame) useEffect(() => { audioDataFrameRef.current = audioDataFrame }, [audioDataFrame]) const [transcriptBuffer, setTranscriptBuffer] = useState("") useEffect(() => { onListen(transcriptBuffer) }, [transcriptBuffer]) /* Available models: { WHISPER_TINY: 'whisper-tiny', WHISPER_BASE: 'whisper-base', WHISPER_SMALL: 'whisper-small', WHISPER_MEDIUM: 'whisper-medium', WHISPER_LARGE: 'whisper-large' } */ // unfortunately, we cannot really use models larger than TINY because they are // too slow to process requests const whisperModel: AvailableModels = AvailableModels.WHISPER_TINY const listenerRef = useRef({ isListening: false, startedListeningAt: 0, stoppedListeningAt: 0, durationInMs: 0, hits: 0, debugCanContinue: true, // used for debugging }) // the background listener is not a CIA spy device, but a detect of changes in the // background noise volume level. The goal is to detect whenever an interesting event is happening const backgroundListener = useRecorder() // the foreground listener is the actual sound sampler // with out system, it will always lag a bit behind the background listener // however there might be a fix (which I haven't tried yet): // to take the last second of the background listener sample, // and glue it to the beginning of the foreground listener sample // // or, alternatively, we could just try to use a shorter time window for the background listener, // to make it more reactive const foregroundListener = useRecorder() // to detect voice, we use a combination of audio level and frequency sampling const heardSomething = backgroundListener.audioLevel > 12 // 18 const status = heardSomething ? "I hear something!" 
: "background noise" const session = useRef(null) const [audioData, setAudioData] = useState(null) const [audioMetadata, setAudioMetadata] = useState(null) const [loaded, setLoaded] = useState(false) const [progress, setProgress] = useState(0) const isLoadingModel = progress > 0 const hasLoadedModel = progress === 100 const loadModel = async () => { console.log("loadModel") if (session.current) { session.current.destroy() } if (!whisperModel) { console.error("No whisper model loaded") return } try { const manager = new SessionManager() const loadResult = await manager.loadModel( whisperModel, () => { setLoaded(true) }, (p: number) => { console.log("progress:", p) setProgress(p) } ) if (loadResult.isErr) { throw new Error(loadResult.error.message) } else { session.current = loadResult.value } } catch (err) { const error = `failed to load the model: ${err}` console.error(error) toast({ title: "Error", description: error, variant: "destructive" }) } } const runSession = async () => { if (!loaded) { console.log("runSession: aborting (model not loaded yet)") return } if (!session.current) { console.log("runSession: aborting (no model loaded)") toast({ title: "Error", description: "No model loaded", variant: "destructive" }) return } // console.log("debug:", { audioData, audioDataFrame }) if (!audioData) { console.log("runSession: aborting (no audio file loaded)") toast({ title: "Error", description: "No audio file loaded", variant: "destructive" }) return } setTranscribing(transcribingRef.current = true) try { await session.current.transcribe(audioData, (s: any) => { const segment = s as { text: string, start: number, stop: number, last: boolean } const text = segment.text.trim() console.log("text:", text) if (text) { setTranscriptBuffer(text) } if (s.last) { console.log("IS LAST") setTranscribing(transcribingRef.current = false) return } }) } catch (err) { const error = `transcription crashed: ${err}` console.error(error) toast({ title: "Error", description: "No audio file loaded", variant: "destructive" }) } } // let's disable the background recorder for now useInterval(() => { // console.log("let's stop, and start again") backgroundListener.stopRecording() backgroundListener.startRecording() }, 3000) useEffect(() => { const fn = async () => { console.log("load model..") await loadModel() console.log("starting to listen to background noise to detect volume peaks..") backgroundListener.startRecording() } fn() }, []) useEffect(() => { if (!audioData) { console.log("no audio") } // console.log("audioDataFrame changed, need to process audioData!") runSession() }, [audioDataFrame]) // note: this effect only reacts to "head something" changes // anod not to changes to isListening or isSpekaing useEffect(() => { const isListening = listenerRef.current.isListening if (!heardSomething) { return } if (listenerRef.current.isListening) { // console.log("we are already listening, so skipping..") return } if (isSpeakingRef.current) { console.log("we are already busy speaking, so ignoring..") return } setHearing(true) // console.log("recording..") foregroundListener.startRecording() listenerRef.current.hits = 0 listenerRef.current.isListening = true setTimeout(async () => { foregroundListener.stopRecording() setHearing(false) listenerRef.current.isListening = false listenerRef.current.stoppedListeningAt = Date.now() listenerRef.current.durationInMs = listenerRef.current.stoppedListeningAt - listenerRef.current.startedListeningAt const hits = listenerRef.current.hits if (!foregroundListener.audioBlob || 
typeof window === "undefined" || !window?.FileReader) { return } if (hits <= 11) { return } console.log(`end of sample (${foregroundListener.timeElapsed}, ${hits} hits)`) // at 12 threshold level, we should have between 12 and 20 hits (per 2 sec) for short words and utterances // at 12 threshold level, keystrokes should not be detected, unless the person hits the keyboard heavily // console.log("got an interesting sample, sending for review") // temporary, to prevent infinite loop if (listenerRef.current.debugCanContinue) { // to prevent the infinite loop, set this value to false // listenerRef.current.debugCanContinue = false try { const blob = await getWaveBlob(foregroundListener.audioBlob, false) // false = 16 bit, true = 32 bit const arrayBuffer = await blob.arrayBuffer() const uint8Array = new Uint8Array(arrayBuffer) setAudioData(uint8Array) setAudioDataFrame(audioDataFrameRef.current + 1) } catch (err) { const error = `failed to convert the audio sample: ${err}` console.error(error) toast({ title: "Error", description: error, variant: "destructive" }) } } else { console.log("Julian: infinite loop temporary disabled!") } }, 2000) }, [heardSomething]) if (heardSomething && listenerRef.current.isListening) { listenerRef.current.hits = listenerRef.current.hits + 1 } return (
    <>
      {isLoadingModel && !hasLoadedModel ? (
        <p>
          Loading whisper-turbo: {progress}% done
        </p>
      ) : (
        <p>
          {transcriptBuffer || ""}
        </p>
      )}
    </>
  )
}