import os

import numpy as np
import gradio as gr
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from twilio.rest import Client
# Use Twilio's TURN servers when credentials are provided, so the WebRTC
# connection can be relayed through restrictive NATs/firewalls; otherwise
# fall back to the browser defaults (fine for local development).
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    rtc_configuration = None
# SmolLM2 writes the HTML; Whisper turns the user's speech into text.
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

whisper = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v3-turbo", device=device
)
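# Optional (not in the original code): the Space runs on a single T4, so loading
# the model in half precision roughly halves its GPU memory footprint:
#
#     import torch
#     model = AutoModelForCausalLM.from_pretrained(
#         checkpoint, torch_dtype=torch.float16
#     ).to(device)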
system_prompt = (
    "You are an AI coding assistant. Your task is to write single-file HTML "
    "applications based on a user's request. You may also be asked to edit "
    "your original response. Only return the code needed to fulfill the request."
)

user_prompt = (
    "Please write a single-file HTML application to fulfill the following "
    "request. Only return the necessary code. Include all necessary imports "
    "and styles.\nThe message:{user_message}\nCurrent code you have written:{code}"
)
def generate(user_message: tuple[int, np.ndarray],
             history: list[dict],
             code: str):
    # gradio_webrtc hands us (sampling_rate, samples) as 16-bit ints; Whisper's
    # feature extractor expects a 1-D float array scaled to [-1, 1].
    sampling_rate, audio = user_message
    audio = audio.squeeze().astype(np.float32) / 32768.0
    msg_text = whisper({"array": audio, "sampling_rate": sampling_rate})["text"]
    history.append({"role": "user", "content": user_prompt.format(user_message=msg_text, code=code)})

    # add_generation_prompt=True appends the "<|im_start|>assistant\n" header so
    # the model starts a fresh assistant turn.
    input_text = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(inputs, max_new_tokens=500, temperature=0.2, top_p=0.9, do_sample=True)
    response = tokenizer.decode(outputs[0])

    # Everything after the last assistant header is the new completion; strip
    # the trailing end-of-turn token.
    marker = "<|im_start|>assistant\n"
    output = response[response.rindex(marker) + len(marker):].replace("<|im_end|>", "").strip()
    history.append({"role": "assistant", "content": output})
    yield AdditionalOutputs(history, output)
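# A possible refinement (not in the original code): stream partial completions
# so the code panel fills in as tokens arrive. A sketch using transformers'
# TextIteratorStreamer, reusing the tokenizer/model/inputs from `generate`:
#
#     from threading import Thread
#     from transformers import TextIteratorStreamer
#
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     Thread(target=model.generate,
#            kwargs=dict(inputs=inputs, streamer=streamer, max_new_tokens=500)).start()
#     partial = ""
#     for new_text in streamer:
#         partial += new_text
#         yield AdditionalOutputs(history, partial)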
with gr.Blocks() as demo:
    # Conversation state, seeded with the system prompt.
    history = gr.State([{"role": "system", "content": system_prompt}])
    with gr.Row():
        code = gr.Code(language="html")
        sandbox = gr.HTML("")
    with gr.Row():
        webrtc = WebRTC(rtc_configuration=rtc_configuration, mode="send", modality="audio")
        # ReplyOnPause runs `generate` each time the speaker pauses.
        webrtc.stream(ReplyOnPause(generate),
                      inputs=[webrtc, history, code],
                      outputs=[webrtc], time_limit=90)
        # Route the AdditionalOutputs yielded by `generate` into the UI.
        webrtc.on_additional_outputs(lambda history, code: (history, code),
                                     outputs=[history, code])
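    # The sandbox panel is never filled in above; a minimal sketch of the
    # missing wiring (an assumption, not in the original code), rendering the
    # generated page in an inline iframe whenever the code panel changes:
    code.change(
        lambda page: ('<iframe srcdoc="'
                      + page.replace("&", "&amp;").replace('"', "&quot;")
                      + '" width="100%" height="512px"></iframe>'),
        inputs=[code],
        outputs=[sandbox],
    )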
if __name__ == "__main__":
    demo.launch()