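# Gradio chat demo for m-a-p/OpenCodeInterpreter-DS-1.3B.
# The model runs on CPU in float32 and replies are streamed to the chat UI as they are generated.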
import gradio as gr
import torch
from threading import Thread
from typing import Iterator
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
MAX_MAX_NEW_TOKENS = 1024       # upper bound for the "Max new tokens" slider
MAX_INPUT_TOKEN_LENGTH = 2048   # prompts longer than this are truncated from the left
base_model_name = "m-a-p/OpenCodeInterpreter-DS-1.3B"
# Load the model on CPU in float32; low_cpu_mem_usage avoids building a second full copy in RAM while loading.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name, torch_dtype=torch.float32, device_map="cpu", low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
def format_prompt(message, history):
    """Build the chat-template message list from the system prompt, prior turns, and the new user message."""
    system_prompt = (
        "You are OpenCodeInterpreter, you are an expert programmer that helps to write code "
        "based on the user request, with concise explanations."
    )
    prompt = [{"role": "system", "content": system_prompt}]
    for user_prompt, bot_response in history:
        prompt.extend([
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": bot_response},
        ])
    prompt.append({"role": "user", "content": message})
    return prompt
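# For example, format_prompt("Explain this error", [("Hi", "Hello!")]) returns (roughly):
#   [{"role": "system", "content": "You are OpenCodeInterpreter, ..."},
#    {"role": "user", "content": "Hi"},
#    {"role": "assistant", "content": "Hello!"},
#    {"role": "user", "content": "Explain this error"}]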
def generate(prompt: str, history: list[tuple[str, str]], max_new_tokens: int = 1024,
             temperature: float = 0.3, top_p: float = 0.9, top_k: int = 50,
             repetition_penalty: float = 1.0) -> Iterator[str]:
    # Keep temperature strictly positive so sampling never divides by zero.
    temperature = max(float(temperature), 1e-2)

    formatted_prompt = format_prompt(prompt, history)
    input_ids = tokenizer.apply_chat_template(formatted_prompt, return_tensors="pt", add_generation_prompt=True)
    # If the conversation is too long, keep only the most recent tokens.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    input_ids = input_ids.to(model.device)

    # Stream tokens as they are produced; generation itself runs in a background thread.
    streamer = TextIteratorStreamer(tokenizer, timeout=15.0, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # sample so the temperature/top-p/top-k settings take effect
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=1,
        repetition_penalty=repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
    )
    t = Thread(target=model.generate, kwargs=generation_kwargs)
    t.start()

    # Yield the growing response as chunks stream in, stripping the model's end-of-turn marker.
    outputs = []
    for chunk in streamer:
        outputs.append(chunk)
        yield "".join(outputs).replace("<|EOT|>", "")
# Chat widget: bubble layout with avatars for the user and the bot.
mychatbot = gr.Chatbot(layout="bubble", avatar_images=["user.png", "botoci.png"], bubble_full_width=False,
                       show_label=False, show_copy_button=True, likeable=True)

# Sliders exposed under "Additional inputs"; their order matches the extra parameters of generate().
additional_inputs = [
    gr.Slider(
        label="Max new tokens",
        minimum=1,
        maximum=MAX_MAX_NEW_TOKENS,
        step=1,
        value=512,
    ),
    gr.Slider(
        label="Temperature",
        minimum=0,
        maximum=1.0,
        step=0.1,
        value=0.3,
    ),
    gr.Slider(
        label="Top-p",
        minimum=0.05,
        maximum=1.0,
        step=0.05,
        value=0.9,
    ),
    gr.Slider(
        label="Top-k",
        minimum=1,
        maximum=1000,
        step=1,
        value=50,
    ),
    gr.Slider(
        label="Repetition penalty",
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        value=1.0,
    ),
]
# ChatInterface wires the streaming generate() function to the chat widget and the sliders above.
iface = gr.ChatInterface(
    fn=generate,
    chatbot=mychatbot,
    additional_inputs=additional_inputs,
    description="Running on CPU. Responses may be slow in CPU environments. 🙏🏻",
    retry_btn=None,
    undo_btn=None,
)
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Tomoniai's Chat with OpenCodeInterpreter</h1></center>")
    iface.render()

# Queue incoming requests (at most 10 waiting) and launch without exposing the API docs page.
demo.queue(max_size=10).launch(show_api=False)