# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig
# import gradio as gr

# # Model IDs from Hugging Face Hub
# base_model_id = "HuggingFaceTB/SmolLM2-135M"
# instruct_model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct-01"

# # Load tokenizer
# base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# # Load models with explicit LLaMA architecture
# base_model = LlamaForCausalLM.from_pretrained(base_model_id)
# instruct_model = LlamaForCausalLM.from_pretrained(instruct_model_id)


# def generate_response(model, tokenizer, message, temperature=0.5, max_length=200, system_prompt="", is_instruct=False):
#     # Prepare input based on model type
#     if is_instruct:
#         if system_prompt:
#             full_prompt = f"{system_prompt}\n\nHuman: {message}\nAssistant:"
#         else:
#             full_prompt = f"Human: {message}\nAssistant:"
#     else:
#         # For base model, use simpler prompt format
#         full_prompt = message
#
#     inputs = tokenizer(full_prompt, return_tensors="pt")
#
#     with torch.no_grad():
#         outputs = model.generate(
#             inputs.input_ids,
#             max_length=max_length,
#             do_sample=True,
#             temperature=temperature,
#             top_k=50,
#             top_p=0.95,
#             num_return_sequences=1,
#             pad_token_id=tokenizer.eos_token_id  # Add padding token
#         )
#
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#
#     if is_instruct:
#         try:
#             response = response.split("Assistant:")[-1].strip()
#         except:
#             pass
#     else:
#         response = response[len(full_prompt):].strip()
#
#     return response


# def chat(message, temperature, max_length, system_prompt):
#     # Generate responses from both models
#     base_response = generate_response(
#         base_model,
#         base_tokenizer,
#         message,
#         temperature,
#         max_length,
#         system_prompt,
#         is_instruct=False
#     )
#     instruct_response = generate_response(
#         instruct_model,
#         base_tokenizer,
#         message,
#         temperature,
#         max_length,
#         system_prompt,
#         is_instruct=True
#     )
#     return base_response, instruct_response


# # Create Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# SmolLM2-135M Comparison Demo")
#     gr.Markdown("Compare responses between base and fine-tuned versions of SmolLM2-135M")
#
#     with gr.Row():
#         with gr.Column():
#             message_input = gr.Textbox(label="Input Message")
#             system_prompt = gr.Textbox(
#                 label="System Prompt (Optional)",
#                 placeholder="Set context or personality for the model",
#                 lines=3
#             )
#         with gr.Column():
#             temperature = gr.Slider(
#                 minimum=0.1,
#                 maximum=2.0,
#                 value=0.5,
#                 label="Temperature"
#             )
#             max_length = gr.Slider(
#                 minimum=50,
#                 maximum=500,
#                 value=200,
#                 step=10,
#                 label="Max Length"
#             )
#
#     with gr.Row():
#         with gr.Column():
#             gr.Markdown("### Base Model Response")
#             base_output = gr.Textbox(label="Base Model (SmolLM2-135M)", lines=5)
#         with gr.Column():
#             gr.Markdown("### Bootleg Instruct Model Response")
#             instruct_output = gr.Textbox(label="Fine-tuned Model", lines=5)
#
#     submit_btn = gr.Button("Generate Responses")
#     submit_btn.click(
#         fn=chat,
#         inputs=[message_input, temperature, max_length, system_prompt],
#         outputs=[base_output, instruct_output]
#     )

# if __name__ == "__main__":
#     demo.launch()
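# The commented-out block above is an earlier two-model demo that compared the base
# SmolLM2-135M with the bootleg-instruct fine-tune side by side; it is kept for reference.
# The active app below serves only the fine-tuned model.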
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gradio as gr

# model_id = "HuggingFaceTB/SmolLM2-135M"
model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct04"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
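# Optional sketch: move the model to a GPU when one is available (assumes a CUDA-enabled
# torch build; the tokenized inputs in generate_response would then also need .to(device)).
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)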

def generate_response(message, temperature=0.7, max_length=200):
    # Wrap the user message in the Human/Assistant template used during fine-tuning
    prompt = f"Human: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=int(max_length),  # Gradio sliders may pass float values
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Return only the text generated after the final "Assistant:" marker
    return response.split("Assistant:")[-1].strip()
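# Quick sanity check outside the Gradio UI (illustrative only; the prompt text is arbitrary):
# print(generate_response("What is the capital of France?", temperature=0.7, max_length=200))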

with gr.Blocks() as demo:
    gr.Markdown("# SmolLM2 Bootleg Instruct Chat")

    with gr.Row():
        with gr.Column():
            message = gr.Textbox(label="Message")
            temp = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature")
            max_len = gr.Slider(minimum=50, maximum=500, value=200, label="Max Length")
            submit = gr.Button("Send")
        with gr.Column():
            output = gr.Textbox(label="Response")

    submit.click(
        generate_response,
        inputs=[message, temp, max_len],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()