"use client"

import { useRef, useState, useTransition } from "react"
import { format } from "date-fns"

import { Observe } from "./observe"
import { cn } from "@/lib/utils"

import { think } from "./engine/think"
import { Progress } from "./interface/progress"
import { Listen } from "./listen"
import { Speak } from "./speak"
import { Toaster } from "@/components/ui/toaster"

// Main wires together the agent's senses (Observe, Listen), its LLM reasoning (think) and its voice (Speak)
export default function Main() {
  const [_isPending, startTransition] = useTransition()
  const [lastImage, setLastImage] = useState<string>("")
  const [lastRawObservation, setLastRawObservation] = useState<string>("")
  const [isLoadingAction, setLoadingAction] = useState(false)
  
  const [action, setAction] = useState<string>("Nothing to say yet.")
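  // remember the most recent event, so responses to outdated events can be discarded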
  const lastEvent = useRef("")
  
  // forward a new event (observation or transcript) to the LLM; needAnswer indicates whether an answer is expected
  const handleOnEvent = (event: string, needAnswer: boolean) => {
    lastEvent.current = event
    setLoadingAction(true)
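    // run the LLM call inside a transition so the UI stays responsive while we wait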
    startTransition(async () => {
      try {
        const action = await think(event, needAnswer)

        // a more recent event may have arrived while the LLM was still working on this one
        // when that happens, the best is to interrupt the LLM (in our case, that means ignoring what it says)
        const canSetAction = action && lastEvent.current === event

        if (canSetAction) {
          setAction(action)
        }
      } catch (err) {
        console.error(err)
      } finally {
        setLoadingAction(false)
      }
    })
  }

  // receive a new observation from what the agent is looking at
  const handleOnObserve = (observation: string, image: string) => {
    setLastRawObservation(observation)
    setLastImage(image)
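    // skip empty observations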
    if (!observation) { return }
    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are seeing this: ${observation}`)
    handleOnEvent(`(looking at ${observation})`, false)
  }

  // receive a new transcript of what the agent is hearing
  const handleOnListen = (recording: string) => {
    // ignore empty or blank-audio recordings
    if (!recording || recording === "[BLANK_AUDIO]") { return }
    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are hearing this: ${recording}`)
    handleOnEvent(`${recording}`, true)
  }
  
  return (
    <div className="w-screen h-screen bg-zinc-100">
      
      <div className="fixed z-10 left-0 right-0 flex flex-col items-center justify-center">
        <div className={cn(
          `flex flex-col md:flex-row`,
          `items-center justify-between`,
          `w-full md:w-[90%] lg:w-[80%]`,
          `p-2 mt-0 md:p-4 md:mt-8`,
          `bg-zinc-100 md:rounded-xl`,
          `shadow-2xl text-xs md:text-sm`
        )}>
          <div className="flex flex-row space-x-4 w-full md:w-1/2 p-2 md:p-4">
            <div className="flex w-[112px]">
              {lastImage ? 
                <div className="w-28 aspect-video">
                  <img
                    src={lastImage}
                    alt="screenshot"
                    className="rounded-lg shadow-xl border border-zinc-500"
                  />
                </div> : null}
            </div>

            <div className="text-lg flex-grow italic">
              <span className="text-zinc-700 text-lg">
                {lastRawObservation}
              </span>
            </div>
          </div>


          <div className="flex flex-row w-full md:w-1/2 p-2 md:p-4">

            <div className="w-full text-zinc-800 text-lg">
              {action}
            </div>
          </div>
        </div>
      </div>

      {/* Observe produces visual observations (plus a captured frame), Listen produces
          transcripts of what is heard, and Speak reads the current action out loud */}
      <Observe onObserve={handleOnObserve} />
      <Listen onListen={handleOnListen} />
      <Speak>{action}</Speak>
      <Toaster />

      <Progress
        isLoading={isLoadingAction}
        resetKey=""
        className="left-6 right-0"
      />

      <div className="fixed z-10 left-0 right-0 bottom-0 flex flex-col items-center justify-center">
        <div className="w-full md:w-[80%] lg:w-[70%] mb-0 md:p-4 md:mb-8 bg-zinc-100 md:rounded-xl p-4 shadow-2xl text-xs md:text-sm">
          <p>🅿️ A multimodal demo to make
           <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> hear, see and talk.
           You need a laptop computer with <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">a modern browser supporting WebGPU</a>.
            Vision is handled by <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold underline"> IDEFICS</a>.</p>
          <p>⛔️ <span className="font-semibold">Limitations: </span>This demo is provided as-is, for demonstration and research purposes only. Because it demonstrates WebGPU technology, it will not work in incompatible browsers or on incompatible devices. There is no guarantee of factually correct results, and in some cases the models may return hallucinated or inappropriate responses.</p>
        </div>
      </div>
    </div>
  )
}