"use client"
import { useRef, useState, useTransition } from "react"
import { format } from "date-fns"
import { Observe } from "./observe"
import { cn } from "@/lib/utils"
import { think } from "./engine/think"
import { Progress } from "./interface/progress"
import { Listen } from "./listen"
import { Speak } from "./speak"
import { Toaster } from "@/components/ui/toaster"
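
// Main client component of the demo: <Observe> watches the camera,
// <Listen> transcribes the microphone, think() asks the LLM how to react,
// and <Speak> reads the chosen answer out loud.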
export default function Main() {
  const [_isPending, startTransition] = useTransition()
  const [lastImage, setLastImage] = useState<string>("")
  const [lastRawObservation, setLastRawObservation] = useState<string>("")
  const [isLoadingAction, setLoadingAction] = useState(false)
  const [action, setAction] = useState<string>("Nothing to say yet.")
  const lastEvent = useRef("")
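  // remember the most recent event, so that late LLM replies to older events can be detected and dropped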
  const handleOnEvent = (event: string, needAnswer: boolean) => {
    lastEvent.current = event
    setLoadingAction(true)
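    // run the (potentially slow) model call in a transition so the UI stays responsive while we wait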
    startTransition(async () => {
      try {
        const action = await think(event, needAnswer)
        // a more recent event may have arrived while the LLM was still working on this one;
        // when that happens, the best we can do is "interrupt" the LLM (in our case: ignore what it says)
        const canSetAction = action && lastEvent.current === event
        if (canSetAction) {
          setAction(action)
        }
      } catch (err) {
        console.error(err)
      } finally {
        setLoadingAction(false)
      }
    })
  }
  // receive a new observation of what the agent is looking at
  const handleOnObserve = (observation: string, image: string) => {
    setLastRawObservation(observation)
    setLastImage(image)
    if (!observation) { return }
    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are seeing this: ${observation}`)
    handleOnEvent(`(looking at ${observation})`, false)
  }
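  // receive a new transcript of what the agent is hearing ("[BLANK_AUDIO]" marks silence)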
  const handleOnListen = (recording: string) => {
    if (!recording || recording === "[BLANK_AUDIO]") { return }
    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are hearing this: ${recording}`)
    handleOnEvent(recording, true)
  }
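  // UI: a fixed status bar at the top (latest camera frame, raw observation, agent reply),
  // the sensor components, and a fixed info panel at the bottom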
  return (
    <div className="w-screen h-screen bg-zinc-100">
      <div className="fixed z-10 left-0 right-0 flex flex-col items-center justify-center">
        <div className={cn(
          `flex flex-col md:flex-row`,
          `items-center justify-between`,
          `w-full md:w-[90%] lg:w-[80%]`,
          `p-2 mt-0 md:p-4 md:mt-8`,
          `bg-zinc-100 md:rounded-xl`,
          `shadow-2xl text-xs md:text-sm`
        )}>
          <div className="flex flex-row space-x-4 w-full md:w-1/2 p-2 md:p-4">
            <div className="flex w-[112px]">
              {lastImage ?
                <div className="w-28 aspect-video">
                  <img
                    src={lastImage}
                    alt="screenshot"
                    className="rounded-lg shadow-xl border border-zinc-500"
                  />
                </div> : null}
            </div>
            <div className="text-lg flex-grow italic">
              <span className="text-zinc-700 text-lg">
                {lastRawObservation}
              </span>
            </div>
          </div>
          <div className="flex flex-row w-full md:w-1/2 p-2 md:p-4">
            <div className="w-full text-zinc-800 text-lg">
              {action}
            </div>
          </div>
        </div>
      </div>
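      {/* the agent's senses and voice: camera observation, microphone transcription, and text-to-speech */}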
      <Observe onObserve={handleOnObserve} />
      <Listen onListen={handleOnListen} />
      <Speak>{action}</Speak>
      <Toaster />
      <Progress
        isLoading={isLoadingAction}
        resetKey=""
        className="left-6 right-0"
      />
<div className="fixed z-10 left-0 right-0 bottom-0 flex flex-col items-center justify-center">
<div className="full md:w-[80%] lg:w-[70%] mb-0 md:p-4 md:mb-8 bg-zinc-100 md:rounded-xl p-4 shadow-2xl text-xs md:text-sm">
<p>🅿️ <span className="font-semibold">
</span>A multimodal demo to make
<a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> hear, see and talk.
You need a laptop computer with <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">a modern browser supporting WebGPU</a>.
Vision is handled by <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold underline"> IDEFICS </a></p>
<p>⛔️ <span className="font-semibold">Limitations: </span>This demo is provided as-is, for demonstration and research purpose only. As it demonstrates WebGPU technology, this demo will not support incompatible browsers and/or devices. No guarantee of factually correct results. In some cases, the models may return hallucinated or innapropriate responses.</p>
</div>
</div>
</div>
)
} |