"use client"

import { useRef, useState, useTransition } from "react"
import { format } from "date-fns"

import { Observe } from "./observe"
import { cn } from "@/lib/utils"

import { think } from "./engine/think"
import { Progress } from "./interface/progress"
import { Listen } from "./listen"
import { Speak } from "./speak"
import { Toaster } from "@/components/ui/toaster"

// Main wires together the agent's senses (Observe, Listen), its LLM reasoning (think) and its voice (Speak)
export default function Main() {
  const [_isPending, startTransition] = useTransition()
  const [lastImage, setLastImage] = useState<string>("")
  const [lastRawObservation, setLastRawObservation] = useState<string>("")
  const [isLoadingAction, setLoadingAction] = useState(false)
  
  const [action, setAction] = useState<string>("Nothing to say yet.")
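  // remember the most recent event, so responses to outdated events can be discarded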
  const lastEvent = useRef("")
  
  // forward a new event (observation or transcript) to the LLM; needAnswer indicates whether an answer is expected
  const handleOnEvent = (event: string, needAnswer: boolean) => {
    lastEvent.current = event
    setLoadingAction(true)
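    // run the LLM call inside a transition so the UI stays responsive while we wait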
    startTransition(async () => {
      try {
        const action = await think(event, needAnswer)

        // a more recent event may have arrived while the LLM was still working on this one
        // when that happens, the best is to interrupt the LLM (in our case, that means ignoring what it says)
        const canSetAction = action && lastEvent.current === event

        if (canSetAction) {
          setAction(action)
        }
      } catch (err) {
        console.error(err)
      } finally {
        setLoadingAction(false)
      }
    })
  }

  // receive a new observation from what the agent is looking at
  const handleOnObserve = (observation: string, image: string) => {
    setLastRawObservation(observation)
    setLastImage(image)
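    // skip empty observations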
    if (!observation) { return }
    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are seeing this: ${observation}`)
    handleOnEvent(`(looking at ${observation})`, false)
  }

  // receive a new transcript of what the agent is hearing
  const handleOnListen = (recording: string) => {
    // ignore empty or blank-audio recordings
    if (!recording || recording === "[BLANK_AUDIO]") { return }
    // handleOnEvent(`It is ${format(new Date(), 'HH:mm')} and you are hearing this: ${recording}`)
    handleOnEvent(`${recording}`, true)
  }
  
  return (
    <div className="w-screen h-screen bg-zinc-100">
      
      <div className="fixed z-10 left-0 right-0 flex flex-col items-center justify-center">
        <div className={cn(
          `flex flex-col md:flex-row`,
          `items-center justify-between`,
          `w-full md:w-[90%] lg:w-[80%]`,
          `p-2 mt-0 md:p-4 md:mt-8`,
          `bg-zinc-100 md:rounded-xl`,
          `shadow-2xl text-xs md:text-sm`
        )}>
          <div className="flex flex-row space-x-4 w-full md:w-1/2 p-2 md:p-4">
            <div className="flex w-[112px]">
              {lastImage ? 
                <div className="w-28 aspect-video">
                  <img
                    src={lastImage}
                    alt="screenshot"
                    className="rounded-lg shadow-xl border border-zinc-500"
                  />
                </div> : null}
            </div>

            <div className="text-lg flex-grow italic">
              <span className="text-zinc-700 text-lg">
                {lastRawObservation}
              </span>
            </div>
          </div>


          <div className="flex flex-row w-full md:w-1/2 p-2 md:p-4">

            <div className="w-full text-zinc-800 text-lg">
              {action}
            </div>
          </div>
        </div>
      </div>

      {/* Observe produces visual observations (plus a captured frame), Listen produces
          transcripts of what is heard, and Speak reads the current action out loud */}
      <Observe onObserve={handleOnObserve} />
      <Listen onListen={handleOnListen} />
      <Speak>{action}</Speak>
      <Toaster />

      <Progress
        isLoading={isLoadingAction}
        resetKey=""
        className="left-6 right-0"
      />

      <div className="fixed z-10 left-0 right-0 bottom-0 flex flex-col items-center justify-center">
        <div className="w-full md:w-[80%] lg:w-[70%] mb-0 md:p-4 md:mb-8 bg-zinc-100 md:rounded-xl p-4 shadow-2xl text-xs md:text-sm">
          <p>🅿️ A multimodal demo to make
           <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> hear, see and talk.
           You need a laptop computer with <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">a modern browser supporting WebGPU</a>.
            Vision is handled by <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold underline"> IDEFICS</a>.</p>
          <p>⛔️ <span className="font-semibold">Limitations: </span>This demo is provided as-is, for demonstration and research purposes only. Because it demonstrates WebGPU technology, it will not work in incompatible browsers or on incompatible devices. There is no guarantee of factually correct results, and in some cases the models may return hallucinated or inappropriate responses.</p>
        </div>
      </div>
    </div>
  )
}