"""Gradio front end for describing floor plan images with a vision-language model."""

import os

import gradio as gr
import torch
from PIL import Image
from transformers import TextStreamer

# Default prompt shown in the UI and paired with every bundled example image.
DEFAULT_INSTRUCTION = (
    "You are an expert in architecture and interior design. "
    "Analyze the floor plan image and describe accurately the key features, "
    "room count, layout, and any other important details you observe."
)


class FloorPlanAnalyzer:
    """Wraps an unsloth ``FastVisionModel`` to describe floor plan images."""

    def __init__(self, model_path, load_in_4bit=True,
                 gradient_checkpointing="unsloth", device="cpu"):
        """Load the model and its tokenizer and switch the model to inference.

        Args:
            model_path (str): Path or identifier passed to
                ``FastVisionModel.from_pretrained``.
            load_in_4bit (bool): Forwarded as ``load_in_4bit``.
            gradient_checkpointing: Forwarded as ``use_gradient_checkpointing``.
            device (str): Device the prepared inputs are moved to.
        """
        # Imported lazily so the module can be imported without unsloth
        # unless an analyzer is actually constructed.
        from unsloth import FastVisionModel

        # NOTE(review): only the *inputs* are moved to this device. If the
        # model ends up on a GPU (likely with 4-bit loading), callers
        # presumably need to pass the matching device -- confirm.
        self.device = device
        self.model, self.tokenizer = FastVisionModel.from_pretrained(
            model_path,
            load_in_4bit=load_in_4bit,
            use_gradient_checkpointing=gradient_checkpointing,
        )
        FastVisionModel.for_inference(self.model)

    def prepare_input(self, image_path, instruction):
        """Build model inputs from an image and an instruction.

        Args:
            image_path: Path to the floor plan image, or an already loaded
                ``PIL.Image.Image`` (which is what the Gradio image component
                supplies when configured with ``type="pil"``).
            instruction (str): Instruction text to guide the analysis.

        Returns:
            The tokenizer output (tensors requested with
            ``return_tensors="pt"``) moved to ``self.device``.
        """
        if isinstance(image_path, Image.Image):
            # Already decoded (e.g. uploaded through the UI); just normalise
            # the mode.
            image = image_path.convert("RGB")
        else:
            # ``convert`` returns an independent image, so the file handle
            # can be released right away.
            with Image.open(image_path) as source:
                image = source.convert("RGB")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": instruction},
                ],
            }
        ]
        input_text = self.tokenizer.apply_chat_template(
            messages, add_generation_prompt=True
        )

        # The chat template already inserted the special tokens, hence
        # add_special_tokens=False.
        inputs = self.tokenizer(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(self.device)
        return inputs

    def analyze(self, image_path, instruction, max_new_tokens=512,
                temperature=1.5, min_p=0.1):
        """Generate a textual analysis of a floor plan.

        The generated text is also streamed to stdout through a
        ``TextStreamer`` while it is being produced.

        Args:
            image_path: Path to the floor plan image or a loaded PIL image.
            instruction (str): Instruction guiding the analysis.
            max_new_tokens (int): Maximum number of tokens to generate.
            temperature (float): Sampling temperature forwarded to
                ``generate``.
            min_p (float): ``min_p`` value forwarded to ``generate``.

        Returns:
            str: The decoded text generated by the model, without the prompt.
        """
        inputs = self.prepare_input(image_path, instruction)
        text_streamer = TextStreamer(self.tokenizer, skip_prompt=True)

        output = self.model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            temperature=temperature,
            min_p=min_p,
        )

        # ``generate`` is assumed to return prompt + completion token ids
        # (decoder-only behaviour); drop the prompt so only the answer is
        # decoded.
        prompt_length = inputs["input_ids"].shape[1]
        generated_ids = output[0][prompt_length:]
        return self.tokenizer.decode(
            generated_ids, skip_special_tokens=True
        ).strip()


# Instantiate the FloorPlanAnalyzer
model_path = "./model/"
analyzer = FloorPlanAnalyzer(model_path=model_path)

# Sample images (each paired with the default instruction) for Gradio examples
sample_images = [
    ["./samples/10_2.jpg", DEFAULT_INSTRUCTION],
    ["./samples/10_10.jpg", DEFAULT_INSTRUCTION],
    ["./samples/0_10.jpg", DEFAULT_INSTRUCTION],
    ["./samples/2_12.jpg", DEFAULT_INSTRUCTION],
]

# Ensure samples directory exists
os.makedirs("samples", exist_ok=True)

# Save grey placeholder images for any missing sample so the examples panel
# always has files to load (replace these with actual floor plans).
for i, sample in enumerate(sample_images):
    image_path = sample[0]
    if not os.path.exists(image_path):
        img = Image.new("RGB", (224, 224), color=(i * 50, i * 50, i * 50))
        img.save(image_path)


def predict_image(image, instruction):
    """Run the uploaded image and instruction through the analyzer.

    Args:
        image (PIL.Image.Image): The uploaded floor plan image.
        instruction (str): The user-provided instruction.

    Returns:
        str: The generated description.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please upload a floor plan image.")
    return analyzer.analyze(image, instruction)


gr_interface = gr.Interface(
    fn=predict_image,
    inputs=[
        gr.Image(type="pil", label="Upload Floor Plan Image"),
        gr.Textbox(label="Instruction Text", value=DEFAULT_INSTRUCTION),
    ],
    outputs=gr.Textbox(label="Analysis Result"),
    title="Floor Plan Analyzer",
    description="Upload a floor plan image and provide instructions to analyze it. Get detailed insights into the layout and design.",
    examples=sample_images,
)

# Launch the Gradio interface if run as main
if __name__ == "__main__":
    gr_interface.launch()