import json

import cv2
import gradio as gr
import numpy as np
import pytesseract
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Device selection: use the GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the object-detection pipeline (DETR).
# device=0 targets the first GPU; -1 keeps the pipeline on CPU.
obj_detect = pipeline(
    "object-detection",
    model="facebook/detr-resnet-50",
    device=0 if device == "cuda" else -1,
)

# Load Qwen2.5-Coder for React code generation.
MODEL_NAME = "Qwen/Qwen2.5-Coder-3B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype,
    device_map="auto",
)


def process_image(img):
    """Turn a UI screenshot (PIL image) into JSON metadata and React code."""
    # Gradio hands us an RGB PIL image; OpenCV expects BGR.
    opencv_image = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

    # Run object detection on the original PIL image.
    detections = obj_detect(img)

    # Run OCR to capture any visible text.
    text_data = pytesseract.image_to_string(opencv_image)

    ui_json = {
        "id": "generated-ui",
        "name": "Generated UI",
        "components": [],
        "ocr_text": text_data.strip(),
    }

    for det in detections:
        ui_json["components"].append({
            "id": f"{det['label']}-{len(ui_json['components']) + 1}",
            "name": det["label"].capitalize(),
            "confidence": round(det["score"], 2),
        })

    metadata_str = json.dumps(ui_json, indent=2)

    # Generate React code from the metadata.
    prompt = f"Generate a React component from this metadata:\n{metadata_str}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # max_new_tokens bounds the generated code; max_length would also
        # count the prompt and can truncate generation for large metadata.
        output = model.generate(**inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens, not the echoed prompt.
    code_response = tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    return metadata_str, code_response


# Gradio interface: one image in, two text panes out.
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=["text", "text"],
    title="Screenshot → Metadata & React Code",
    description="Upload a UI screenshot and get structured metadata + React code.",
)

# Bind to 0.0.0.0 so the app is reachable from outside the Docker container.
interface.launch(server_name="0.0.0.0", server_port=7860)
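
# Optional: a quick smoke test of process_image() without the web UI.
# This is a minimal sketch; "screenshot.png" is a hypothetical local file,
# not part of the app. It is left commented out because interface.launch()
# blocks, so code placed here only runs after the server shuts down. To use
# it, uncomment these lines and move them above the launch() call.
#
# from PIL import Image
# metadata, code = process_image(Image.open("screenshot.png").convert("RGB"))
# print(metadata)
# print(code)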