import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import logging
import sys
import os
import psutil
import gc

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

def log_system_info():
    """Log system information for debugging."""
    logger.info(f"Python version: {sys.version}")
    logger.info(f"PyTorch version: {torch.__version__}")
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        logger.info(f"CUDA version: {torch.version.cuda}")
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
    logger.info(f"CPU count: {psutil.cpu_count()}")
    logger.info(f"Memory available: {psutil.virtual_memory().available / (1024 * 1024 * 1024):.2f} GB")

def cleanup_memory():
    """Release Python and CUDA memory between generations."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

print("Starting application...")
log_system_info()

try:
    print("Loading model and tokenizer...")

    # Initialize model and tokenizer with error handling
    model_id = "htigenai/finetune_test"  # your model ID

    # Configure 4-bit quantization (NF4, double quantization, fp16 compute)
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load tokenizer with error handling
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True
        )
        # Some base models ship without a pad token; fall back to EOS so that
        # padding during tokenization does not raise an error.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        logger.info("Tokenizer loaded successfully")
    except Exception as e:
        logger.error(f"Error loading tokenizer: {str(e)}")
        raise

    # Load model with error handling
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            quantization_config=quantization_config,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        logger.info("Model loaded successfully")
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise

    def generate_text(prompt):
        """Generate text based on the input prompt."""
        try:
            logger.info(f"Generating text for prompt: {prompt[:50]}...")

            # Clean up memory before generation
            cleanup_memory()

            # Tokenize input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(model.device)

            # Generate
            with torch.inference_mode():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=200,
                    temperature=0.7,
                    top_p=0.95,
                    do_sample=True,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    repetition_penalty=1.1
                )

            # Decode and return
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            logger.info("Text generation completed successfully")

            # Clean up memory after generation
            cleanup_memory()

            return generated_text

        except Exception as e:
            logger.error(f"Error during generation: {str(e)}")
            return f"Error during generation: {str(e)}"

    # Create Gradio interface
    iface = gr.Interface(
        fn=generate_text,
        inputs=gr.Textbox(
            lines=3,
            placeholder="Enter your prompt here...",
            label="Input Prompt"
        ),
        outputs=gr.Textbox(
            label="Generated Response",
            lines=5
        ),
        title="Text Generation Model",
        description="Enter a prompt and get AI-generated text. Please be patient as generation may take a few moments.",
        examples=[
            ["What are your thoughts about cats?"],
            ["Write a short story about a magical forest"],
            ["Explain quantum computing to a 5-year-old"],
        ],
        allow_flagging="never",
        cache_examples=False,
    )

    # Launch the interface
    iface.launch(
        share=False,
        debug=True,
        show_error=True,
        server_name="0.0.0.0"
    )

except Exception as e:
    logger.error(f"Application startup failed: {str(e)}")
    raise
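
# ---------------------------------------------------------------------------
# Usage sketch (assumptions: this script is saved as app.py and run on a
# machine with a CUDA GPU, which bitsandbytes 4-bit loading requires, with the
# usual dependencies installed: gradio, transformers, accelerate, bitsandbytes,
# torch, psutil). Once `python app.py` is serving on Gradio's default port
# 7860, the endpoint can be exercised from a separate process, e.g.:
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("What are your thoughts about cats?", api_name="/predict"))
# ---------------------------------------------------------------------------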