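# app.py -- Gradio chat demo for thesven/Llama3-8B-SFT-code_bagel-bnb-4bit,
# intended to run as a Hugging Face Space (see the @spaces.GPU decorator below).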
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import gradio as gr
import sentencepiece  # noqa: F401 -- not referenced directly below
# Cap the CUDA caching allocator's split size to reduce memory fragmentation.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:120'
model_id = "thesven/Llama3-8B-SFT-code_bagel-bnb-4bit"
DESCRIPTION = """
# thesven/Llama3-8B-SFT-code_bagel-bnb-4bit
"""
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
def format_prompt(user_message, system_message="You are an expert developer in all programming languages. Help me with my code. Answer any questions I have with code examples."):
    # Standard ChatML layout: system turn, user turn, then an open assistant turn.
    prompt = (
        f"<|im_start|>system\n{system_message}<|im_end|>\n"
        f"<|im_start|>user\n{user_message}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    return prompt
@spaces.GPU
def predict(message, system_message, max_new_tokens=600, temperature=3.5, top_p=0.9, top_k=40, do_sample=False):
    formatted_prompt = format_prompt(message, system_message)
    input_ids = tokenizer.encode(formatted_prompt, return_tensors='pt')
    input_ids = input_ids.to(model.device)
    response_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        no_repeat_ngram_size=9,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=do_sample,
    )
    # Decode only the newly generated tokens, skipping the echoed prompt.
    response = tokenizer.decode(response_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    # Trim anything the model produced after its end-of-turn marker.
    truncate_str = "<|im_end|>"
    if truncate_str in response:
        response = response.split(truncate_str)[0]
    # gr.Chatbot expects a list of (user message, bot response) pairs.
    return [(message, response)]
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        system_prompt = gr.Textbox(placeholder='Provide a System Prompt In The First Person', label='System Prompt', lines=2, value="You are an expert developer in all programming languages. Help me with my code. Answer any questions I have with code examples.")
    with gr.Group():
        chatbot = gr.Chatbot(label='thesven/Llama3-8B-SFT-code_bagel-bnb-4bit')
    with gr.Group():
        textbox = gr.Textbox(placeholder='Your Message Here', label='Your Message', lines=2)
        submit_button = gr.Button('Submit', variant='primary')
    with gr.Accordion(label='Advanced options', open=False):
        max_new_tokens = gr.Slider(label='Max New Tokens', minimum=1, maximum=55000, step=1, value=512)
        temperature = gr.Slider(label='Temperature', minimum=0.1, maximum=4.0, step=0.1, value=0.1)
        top_p = gr.Slider(label='Top-P (nucleus sampling)', minimum=0.05, maximum=1.0, step=0.05, value=0.9)
        top_k = gr.Slider(label='Top-K', minimum=1, maximum=1000, step=1, value=40)
        do_sample_checkbox = gr.Checkbox(label='Enable sampling (uncheck for greedy decoding)', value=True)
    submit_button.click(
        fn=predict,
        inputs=[textbox, system_prompt, max_new_tokens, temperature, top_p, top_k, do_sample_checkbox],
        outputs=chatbot,
    )

demo.launch()