File size: 3,203 Bytes
175bb86 709d394 486a2f6 cf5249f 709d394 349f644 51dbac2 175bb86 02596d2 5cf089b 02596d2 51dbac2 515f252 f7f4304 38fedf1 51dbac2 709d394 3f46449 9b3882c 51dbac2 97a347f 3f46449 709d394 51dbac2 f51c2a1 709d394 3ea359d 709d394 dc0acc6 98b2176 709d394 ef2fea2 600a2a9 709d394 29437cc 709d394 1cdad52 4f6966f b5aae38 6e1661f cd0aa02 6e1661f cd0aa02 a13c01c 6e1661f a13c01c b5aae38 acf224c 7fc9307 acf224c 7fc9307 a13c01c 8325138 1cdad52 4f6966f 1cdad52 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# Startup: imports, a torch.jit.script patch, and Hugging Face Hub login.
# The order of these statements matters, so they are left exactly as written.
import torch
# Replace torch.jit.script with an identity function: anything passed to it is
# returned unchanged instead of being scripted. This runs before the imports
# below — presumably so those libraries see the patched attribute; confirm
# which dependency actually needs it.
torch.jit.script = lambda f: f
import spaces
import gradio as gr
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,AwqConfig
# Duplicate of the first import; harmless (the module is already loaded).
import torch
import os
import bitnet
# Hub token is read from the environment variable named "key"; this is None
# when the variable is not set.
key = os.environ.get("key")
from huggingface_hub import login
# NOTE(review): login() receives None if "key" is unset — confirm how
# huggingface_hub.login behaves in that case in this deployment.
login(key)
from bitnet import replace_linears_in_hf
# 4-bit NF4 quantization settings (double quantization, bfloat16 compute).
# Not passed to from_pretrained() below; kept available for optional use.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_id = "Nexusflow/Starling-LM-7B-beta"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Weights are loaded in bfloat16. Options the author left disabled:
#   load_in_8bit=True, quantization_config=nf4_config, device_map="auto"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Going by its name, this swaps the model's linear layers for bitnet's
# replacements. The return value is ignored, so it relies on the helper
# modifying `model` in place.
replace_linears_in_hf(model)

# Move the model to the GPU, then switch it to evaluation mode.
model.to("cuda")
model.eval()
@spaces.GPU
def generate_response(user_input, max_new_tokens, temperature):
    """Generate the model's reply to a single user message.

    The message is wrapped in the tokenizer's chat template, a completion is
    sampled from the module-level ``model``, and only the newly generated
    tokens are decoded and returned.

    Args:
        user_input: Text of the user's message.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Coerced to ``int`` because UI sliders may deliver a float.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded completion, with special tokens removed and leading
        whitespace stripped. The prompt is not included.
    """
    # Diagnostic: print GPU state to the process log before tokenizing.
    os.system("nvidia-smi")

    messages = [{"role": "user", "content": user_input}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    input_ids = input_ids.to(model.device)

    # Diagnostic: print GPU state again once the inputs are on the device.
    os.system("nvidia-smi")

    gen_tokens = model.generate(
        input_ids=input_ids,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=temperature,
    )

    # The generated sequence starts with the prompt tokens. Slice them off by
    # length rather than comparing decoded text against `user_input`: the
    # decoded prompt is the chat-templated form (role markers, generation
    # prompt), which generally does not start with the raw message, so a
    # text prefix check fails to remove it.
    prompt_length = input_ids.shape[-1]
    completion_ids = gen_tokens[0][prompt_length:]

    return tokenizer.decode(completion_ids, skip_special_tokens=True).lstrip()
# Canned prompts with the generation settings to apply when one is loaded.
examples = [
    {
        "message": "What is the weather like today?",
        "max_new_tokens": 250,
        "temperature": 0.5,
    },
    {
        "message": "Tell me a joke.",
        "max_new_tokens": 650,
        "temperature": 0.7,
    },
    {
        "message": "Explain the concept of machine learning.",
        "max_new_tokens": 980,
        "temperature": 0.4,
    },
]

# Display labels, one per entry in `examples`, numbered from 1 in list order.
example_choices = [f"Example {number}" for number, _ in enumerate(examples, start=1)]
def load_example(choice):
    """Return the settings of the example whose label is ``choice``.

    Args:
        choice: One of the labels in ``example_choices``.

    Returns:
        A ``(message, max_new_tokens, temperature)`` tuple taken from the
        matching entry in ``examples``.

    Raises:
        ValueError: If ``choice`` is not one of the known labels.
    """
    # Labels and examples share the same ordering, so the label's position
    # is also the example's position.
    selected = examples[example_choices.index(choice)]
    return tuple(
        selected[field] for field in ("message", "max_new_tokens", "temperature")
    )
# Gradio interface: generation sliders, a message box, a generate button with
# its output box, and a dropdown/button pair for loading a canned example.
# NOTE(review): indentation is not visible in this copy of the file, so which
# components sit inside gr.Row() and which sit directly under gr.Blocks()
# cannot be determined here — confirm against the intended layout.
with gr.Blocks() as demo:
with gr.Row():
# Generation controls; their values are passed to generate_response below.
max_new_tokens_slider = gr.Slider(minimum=100, maximum=4000, value=980, label="Max New Tokens")
temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.3, label="Temperature")
message_box = gr.Textbox(lines=2, label="Your Message")
# NOTE(review): the button and output labels say "Command-R" while model_id
# is "Nexusflow/Starling-LM-7B-beta" — confirm the labels are intended.
generate_button = gr.Button("Try🫡Command-R")
output_box = gr.Textbox(label="🫡Command-R")
# Clicking the button runs generate_response on (message, max tokens,
# temperature) and shows the returned text in output_box.
generate_button.click(
fn=generate_response,
inputs=[message_box, max_new_tokens_slider, temperature_slider],
outputs=output_box
)
example_dropdown = gr.Dropdown(label="🫡Load Example", choices=example_choices)
example_button = gr.Button("🫡Load")
# Clicking "Load" copies the selected example's message and settings into the
# message box and the two sliders. No default value is set on the dropdown,
# so load_example may be called with no selection.
example_button.click(
fn=load_example,
inputs=example_dropdown,
outputs=[message_box, max_new_tokens_slider, temperature_slider]
)
# Start the Gradio app.
demo.launch()
|