# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig
# import gradio as gr
# # Model IDs from Hugging Face Hub
# base_model_id = "HuggingFaceTB/SmolLM2-135M"
# instruct_model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct-01"
# # Load tokenizer
# base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# # Load models with explicit LLaMA architecture
# base_model = LlamaForCausalLM.from_pretrained(base_model_id)
# instruct_model = LlamaForCausalLM.from_pretrained(instruct_model_id)
# def generate_response(model, tokenizer, message, temperature=0.5, max_length=200, system_prompt="", is_instruct=False):
#     # Prepare input based on model type
#     if is_instruct:
#         if system_prompt:
#             full_prompt = f"{system_prompt}\n\nHuman: {message}\nAssistant:"
#         else:
#             full_prompt = f"Human: {message}\nAssistant:"
#     else:
#         # For base model, use simpler prompt format
#         full_prompt = message
#     inputs = tokenizer(full_prompt, return_tensors="pt")
#     with torch.no_grad():
#         outputs = model.generate(
#             inputs.input_ids,
#             max_length=max_length,
#             do_sample=True,
#             temperature=temperature,
#             top_k=50,
#             top_p=0.95,
#             num_return_sequences=1,
#             pad_token_id=tokenizer.eos_token_id  # Add padding token
#         )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     if is_instruct:
#         try:
#             response = response.split("Assistant:")[-1].strip()
#         except:
#             pass
#     else:
#         response = response[len(full_prompt):].strip()
#     return response
# def chat(message, temperature, max_length, system_prompt):
#     # Generate responses from both models
#     base_response = generate_response(
#         base_model,
#         base_tokenizer,
#         message,
#         temperature,
#         max_length,
#         system_prompt,
#         is_instruct=False
#     )
#     instruct_response = generate_response(
#         instruct_model,
#         base_tokenizer,
#         message,
#         temperature,
#         max_length,
#         system_prompt,
#         is_instruct=True
#     )
#     return base_response, instruct_response
# # Create Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# SmolLM2-135M Comparison Demo")
#     gr.Markdown("Compare responses between base and fine-tuned versions of SmolLM2-135M")
#     with gr.Row():
#         with gr.Column():
#             message_input = gr.Textbox(label="Input Message")
#             system_prompt = gr.Textbox(
#                 label="System Prompt (Optional)",
#                 placeholder="Set context or personality for the model",
#                 lines=3
#             )
#         with gr.Column():
#             temperature = gr.Slider(
#                 minimum=0.1,
#                 maximum=2.0,
#                 value=0.5,
#                 label="Temperature"
#             )
#             max_length = gr.Slider(
#                 minimum=50,
#                 maximum=500,
#                 value=200,
#                 step=10,
#                 label="Max Length"
#             )
#     with gr.Row():
#         with gr.Column():
#             gr.Markdown("### Base Model Response")
#             base_output = gr.Textbox(label="Base Model (SmolLM2-135M)", lines=5)
#         with gr.Column():
#             gr.Markdown("### Bootleg Instruct Model Response")
#             instruct_output = gr.Textbox(label="Fine-tuned Model", lines=5)
#     submit_btn = gr.Button("Generate Responses")
#     submit_btn.click(
#         fn=chat,
#         inputs=[message_input, temperature, max_length, system_prompt],
#         outputs=[base_output, instruct_output]
#     )
# if __name__ == "__main__":
#     demo.launch()
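
# Current app: single-model chat with the bootleg-instruct checkpoint.
# The two-model comparison demo above is kept commented out for reference.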
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gradio as gr
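
# Model checkpoint on the Hugging Face Hub; the commented-out ID points at the
# un-tuned SmolLM2-135M base model for quick comparison.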
# model_id = "HuggingFaceTB/SmolLM2-135M"
model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct04"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
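
# Single-turn generation: wrap the user message in the "Human: ... / Assistant:" prompt
# format, sample a continuation, and return only the assistant's portion of the output.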
def generate_response(message, temperature=0.7, max_length=200):
    prompt = f"Human: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # pass the mask explicitly since pad and eos tokens are the same
            max_length=int(max_length),  # Gradio sliders may pass floats
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the final "Assistant:" marker
    return response.split("Assistant:")[-1].strip()

# Gradio interface: message box and sampling controls on the left, model response on the right.
with gr.Blocks() as demo:
    gr.Markdown("# SmolLM2 Bootleg Instruct Chat")
    with gr.Row():
        with gr.Column():
            message = gr.Textbox(label="Message")
            temp = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature")
            max_len = gr.Slider(minimum=50, maximum=500, value=200, label="Max Length")
            submit = gr.Button("Send")
        with gr.Column():
            output = gr.Textbox(label="Response")
    submit.click(
        generate_response,
        inputs=[message, temp, max_len],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()