Hermes-3-Llama-3.2-3B

Running on Zero

File size: 4,181 Bytes

51a7d9e
 
 
 
22f5f54
51a7d9e
edb9e8a
51a7d9e
 
 
99a7a45
 
 
51a7d9e
99a7a45
51a7d9e
99a7a45
51a7d9e
 
 
 
 
 
 
 
 
 
99a7a45
 
22f5f54
 
 
 
 
99a7a45
51a7d9e
99a7a45
 
 
 
 
 
 
 
f663115
51a7d9e
99a7a45
fd6304d
 
51a7d9e
 
 
 
 
fd6304d
99a7a45
 
 
 
 
 
 
 
22f5f54
030c23d
639e063
edb9e8a
030c23d
 
f663115
 
51a7d9e
22f5f54
51a7d9e
030c23d
0961bc7
f663115
030c23d
 
b4d1f01
 
 
 
8ea3132
99a7a45
51a7d9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
030c23d
51a7d9e
 
030c23d
51a7d9e
 
99a7a45
 
 
 
51a7d9e
 
 
 
 
 
 
 
 
99a7a45
51a7d9e

import torch
from PIL import Image
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import os
from threading import Thread


# Optional Hugging Face access token; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Comma-separated repository ids of the related checkpoints.
MODEL_LIST = ", ".join(
    (
        "THUDM/glm-4-9b-chat",
        "THUDM/glm-4-9b-chat-1m",
        "THUDM/codegeex4-all-9b",
    )
)
# Disabled: choosing the model from a MODELS environment variable.
#MODELS = os.environ.get("MODELS")
#MODEL_NAME = MODELS.split("/")[-1]

# HTML heading shown at the top of the page.
TITLE = "<h1><center>GLM SPACE</center></h1>"

# HTML shown inside the chatbot while the conversation is empty.
PLACEHOLDER = "<h3><center>Feel Free To Test GLM</center></h3>"

# Styling for the "duplicate space" button (matched by its element class).
CSS = """
.duplicate-button {
  margin: auto !important;
  color: white !important;
  background: black !important;
  border-radius: 100vh !important;
}
"""

# CUDA device index that both checkpoints are moved to. The original code
# used `.to(0)` for the chat model and an undefined name `device` for the
# code model, which raised NameError at import time.
_DEVICE = 0


def _load_model(repo_id):
    """Load ``repo_id`` as a bfloat16 causal LM, move it to ``_DEVICE``, set eval mode.

    NOTE: ``trust_remote_code=True`` executes Python code shipped in the
    model repository; only use it with repositories you trust.
    """
    return AutoModelForCausalLM.from_pretrained(
        repo_id,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    ).to(_DEVICE).eval()


# General chat model and its tokenizer.
model_chat = _load_model("THUDM/glm-4-9b-chat")
tokenizer_chat = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)

# Code model and its tokenizer.
model_code = _load_model("THUDM/codegeex4-all-9b")
tokenizer_code = AutoTokenizer.from_pretrained("THUDM/codegeex4-all-9b", trust_remote_code=True)

@spaces.GPU
def stream_chat(message: str, history: list, temperature: float, max_length: int, model: str):
    """Stream a reply to ``message`` from the selected model.

    Args:
        message: The new user message.
        history: Earlier turns as ``(user_prompt, assistant_answer)`` pairs.
        temperature: Sampling temperature; values <= 0 select greedy decoding.
        max_length: Passed to ``generate`` as ``max_length``.
        model: Name of the model to use. ``"glm-4-9b-chat"`` selects the chat
            model; any other value selects the code model.

    Yields:
        The reply text accumulated so far, once per streamed chunk.
    """
    print(f'message is - {message}')
    print(f'history is - {history}')

    # Rebuild the conversation in the role/content format used by the chat template.
    conversation = []
    for prompt, answer in history:
        conversation.append({"role": "user", "content": prompt})
        conversation.append({"role": "assistant", "content": answer})
    conversation.append({"role": "user", "content": message})

    print(f"Conversation is -\n{conversation}")

    # The original tested an undefined name (`mode`) here, which raised
    # NameError on every call. The selector is the `model` parameter; keep it
    # as a string and bind the chosen objects to separate local names.
    if model == "glm-4-9b-chat":
        active_model, tokenizer = model_chat, tokenizer_chat
    else:
        active_model, tokenizer = model_code, tokenizer_code

    inputs = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(active_model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        max_length=max_length,
        streamer=streamer,
        repetition_penalty=1.2,
    )
    if temperature > 0:
        generate_kwargs.update(do_sample=True, top_k=1, temperature=temperature)
    else:
        # Sampling requires a strictly positive temperature; fall back to
        # greedy decoding rather than letting generate() fail in the worker
        # thread (which would only surface as a streamer timeout).
        generate_kwargs.update(do_sample=False)
    gen_kwargs = {**inputs, **generate_kwargs}

    # generate() runs in a worker thread so this generator can consume the
    # streamer concurrently. The original wrapped this in torch.no_grad(),
    # but that context manager is thread-local and does not apply to the
    # worker thread, so it has been dropped.
    thread = Thread(target=active_model.generate, kwargs=gen_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer
 
# Chat display handed to the ChatInterface below.
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

# Page layout: title, duplicate button, then the chat interface. The extra
# inputs are passed to stream_chat in order: temperature, max_length, model.
with gr.Blocks(css=CSS) as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=0.8,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=128,
                maximum=8192,
                step=1,
                value=1024,
                label="Max Length",
                render=False,
            ),
            # The original wrote `choice = gr.Radio(...)` here; an assignment
            # inside a list literal is a SyntaxError. A default value is set
            # so the handler never receives None, and render=False matches
            # the sliders so the radio is rendered with the other inputs.
            gr.Radio(
                ["glm-4-9b-chat", "codegeex4-all-9b"],
                value="glm-4-9b-chat",
                label="Load Model",
                render=False,
            ),
        ],
        examples=[
            ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
            ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
            ["Tell me a random fun fact about the Roman Empire."],
            ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
        ],
        cache_examples=False,
    )


# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()