# Import necessary libraries
import os
from threading import Thread
import argparse

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer, AutoModelForCausalLM
from peft import PeftConfig, PeftModel
from huggingface_hub import login

from utils import get_device  # Assuming this helper exists alongside this script

# Authenticate using the Hugging Face API token from the environment
hf_api_token = os.getenv("HF_API_TOKEN")
if hf_api_token is None:
    raise ValueError("Hugging Face API token not found in environment variables. "
                     "Please set it as a secret in Hugging Face Spaces.")
login(token=hf_api_token)

# Parse command-line arguments
parser = argparse.ArgumentParser(description='Check model usage.')
parser.add_argument('--baseonly', action='store_true',
                    help='A boolean switch to run the base model only (no adapter)')
args = parser.parse_args()

# Define model and adapter names and the torch data type
model_name = "microsoft/Phi-3-mini-4k-instruct"
adapters_name = "zurd46/eliAI"
torch_dtype = torch.bfloat16

# Display device and CPU thread information
device = get_device()
print(f"Number of GPUs available: {torch.cuda.device_count()}")
print(f"Running on device: {device}")
print(f"CPU threads: {torch.get_num_threads()}")

# Require a CUDA-capable GPU
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available. Ensure that a GPU is available and properly configured.")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load base model
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype)
model.resize_token_embeddings(len(tokenizer))

# Load the adapter unless --baseonly was passed
usingAdapter = False
if not args.baseonly:
    usingAdapter = True
    model = PeftModel.from_pretrained(model, adapters_name)

model.to(device)
print(f"Model {model_name} loaded successfully on {device}")

# Function to run the text generation process
def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
    template = "<|context|><|user|>\n{}<|end|>\n<|assistant|>"
    model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text,
                             return_tensors="pt")
    model_inputs = model_inputs.to(device)

    # Generate text in a separate thread so tokens can be streamed as they arrive
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=model_inputs['input_ids'],
        attention_mask=model_inputs['attention_mask'],
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=float(temperature),
        top_k=top_k,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Retrieve and yield the generated text as it streams in
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output
    return model_output
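# Illustrative usage sketch (an assumption, not part of the original app): because
# run_generation() is a generator that yields the accumulated text as it streams,
# it can be smoke-tested from the command line before wiring up the UI. The prompt
# and sampling values below are placeholders; uncomment to try it.
#
#   last_text = ""
#   for last_text in run_generation("Hello, who are you?", top_p=0.9, temperature=0.7,
#                                   top_k=50, max_new_tokens=64):
#       pass  # last_text holds everything generated so far
#   print(last_text)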
# Gradio UI setup
with gr.Blocks(css="""
    .form.svelte-sfqy0y {
        background: var(--block-background-fill);
        padding: 20px;
    }
    body {
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
        color: #e0e0e0;
        margin: 0;
        padding: 0;
        box-sizing: border-box;
    }
    .gradio-container {
        max-width: 900px;
        margin: auto;
        padding: 20px;
        border-radius: 8px;
        box-shadow: 0 0 10px rgba(0,0,0,0.5);
    }
    .gr-button {
        color: white;
        border: none;
        border-radius: 4px;
        padding: 10px 24px;
        cursor: pointer;
    }
    .gr-button:hover {
        background-color: #3700b3;
    }
    .gr-slider input[type=range] {
        -webkit-appearance: none;
        width: 100%;
        height: 8px;
        border-radius: 5px;
        outline: none;
        opacity: 0.9;
        -webkit-transition: .2s;
        transition: opacity .2s;
    }
    .gr-slider input[type=range]:hover {
        opacity: 1;
    }
    .gr-textbox {
        color: white;
        border: none;
        border-radius: 4px;
        padding: 10px;
    }
    .chatbox {
        max-height: 400px;
        overflow-y: auto;
        margin-bottom: 20px;
    }
""") as demo:
    gr.Markdown(
        """