File size: 2,749 Bytes
3112b0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os

import gradio as gr
import numpy as np
import torch
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from twilio.rest import Client

# Twilio credentials are optional and read from the environment.
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

# Without both credentials no explicit RTC configuration is passed to WebRTC.
rtc_configuration = None
if account_sid and auth_token:
    # Ask Twilio for a token and use the ICE servers it carries, relay-only.
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = dict(
        iceServers=token.ice_servers,
        iceTransportPolicy="relay",
    )

# Hub id of the instruction-tuned causal LM that writes the HTML.
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# Prefer the GPU, but fall back to the CPU so the module still imports (and
# runs, if slowly) on machines without CUDA instead of crashing in `.to()`.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

# Speech-to-text pipeline used to transcribe the recorded request. The task
# is named explicitly rather than being inferred from the model id.
whisper = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    device=device,
)

# System message placed at the start of every conversation.
system_prompt = (
    "You are an AI coding assistant. "
    "Your task is to write single-file HTML applications based on a user's request. "
    "You may also be asked to edit your original response. "
    "Only return the code needed to fulfill the request."
)

# Per-turn user message template; filled in with the transcribed request
# (`user_message`) and the current editor contents (`code`).
user_prompt = (
    "Please write a single-file HTML application to fulfill the following request. "
    "Only return the necessary code. "
    "Include all necessary imports and styles.\n"
    "The message:{user_message}\n"
    "Current code you have written:{code}"
)



def generate(user_message: tuple[int, np.ndarray],
             history: list[dict],
             code: str):
    """Transcribe a spoken request and have the LLM write or edit the HTML app.

    Args:
        user_message: ``(sampling_rate, samples)`` audio for one utterance.
        history: Chat transcript as a list of ``{"role", "content"}`` dicts.
            It is extended in place with the new user and assistant turns.
        code: Current contents of the code editor; embedded in the prompt so
            the model can edit what it wrote before.

    Yields:
        One ``AdditionalOutputs(history, output)`` carrying the updated
        transcript and the newly generated text.
    """
    sampling_rate, samples = user_message

    # The ASR pipeline only accepts a 1-D waveform, so drop any length-1
    # channel axis. NOTE(review): assumes mono audio - confirm against the
    # WebRTC component's output shape.
    samples = np.asarray(samples)
    if samples.ndim > 1:
        samples = np.squeeze(samples)
    # Integer PCM is rescaled to float32 in [-1, 1); float input is untouched.
    if np.issubdtype(samples.dtype, np.signedinteger):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale

    msg_text = whisper({"array": samples, "sampling_rate": sampling_rate})["text"]
    history.append({"role": "user", "content": user_prompt.format(user_message=msg_text, code=code)})

    # add_generation_prompt appends the assistant header, so the model starts
    # its reply directly instead of having to emit the header itself.
    input_text = tokenizer.apply_chat_template(
        history, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(inputs, max_new_tokens=500, temperature=0.2, top_p=0.9, do_sample=True)

    # Decode only the tokens produced after the prompt, and strip special
    # tokens so markers such as the end-of-turn token never reach the editor
    # or the stored history.
    new_tokens = outputs[0][inputs.shape[-1]:]
    output = tokenizer.decode(new_tokens, skip_special_tokens=True)

    history.append({"role": "assistant", "content": output})
    yield AdditionalOutputs(history, output)


def _forward_additional_outputs(updated_history, generated_code):
    """Return the two values received from the stream handler unchanged."""
    return updated_history, generated_code


with gr.Blocks() as demo:
    # Conversation transcript, seeded with the system prompt.
    history = gr.State(value=[{"role": "system", "content": system_prompt}])

    # First row: HTML code editor next to an initially empty HTML component.
    with gr.Row():
        code = gr.Code(language="html")
        sandbox = gr.HTML(value="")

    # Second row: send-only audio WebRTC component.
    with gr.Row():
        webrtc = WebRTC(
            modality="audio",
            mode="send",
            rtc_configuration=rtc_configuration,
        )

    # Register `generate`, wrapped in ReplyOnPause, as the stream handler; it
    # receives the audio together with the transcript and the editor contents.
    webrtc.stream(
        ReplyOnPause(generate),
        inputs=[webrtc, history, code],
        outputs=[webrtc],
        time_limit=90,
    )

    # Write the handler's additional outputs back into the transcript state
    # and the code editor.
    webrtc.on_additional_outputs(
        _forward_additional_outputs,
        outputs=[history, code],
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()