File size: 2,170 Bytes
3ac452f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import gradio as gr
import torch
import cv2
import numpy as np
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pytesseract

# Object detector: DETR with a ResNet-50 backbone; device=-1 selects the CPU.
obj_detect = pipeline(
    task="object-detection",
    model="facebook/detr-resnet-50",
    device=-1,
)

# Code-generation model: half precision on CUDA, full precision on CPU.
MODEL_NAME = "Qwen/Qwen2.5-Coder-3B"
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=dtype,
)

def process_image(img):
    """Turn a UI screenshot into JSON metadata and generated React code.

    Runs object detection and OCR on the screenshot, packs the results into a
    JSON document, and prompts the code model with that document.

    Args:
        img: The screenshot as delivered by the Gradio image input (a PIL
            image in this app), or ``None`` when nothing was uploaded.

    Returns:
        A ``(metadata_str, code_response)`` tuple: the pretty-printed JSON
        metadata and the text the model generated for it (prompt excluded).

    Raises:
        gr.Error: If no image was provided.
    """
    if img is None:
        # Gradio passes None when the form is submitted without an upload.
        raise gr.Error("Please upload a screenshot first.")

    # Run Object Detection
    detections = obj_detect(img)

    # Run OCR. pytesseract assumes RGB channel order, so it receives the
    # image as uploaded rather than an OpenCV-style BGR copy.
    text_data = pytesseract.image_to_string(img)

    components = [
        {
            "id": f"{det['label']}-{index}",
            "name": det["label"].capitalize(),
            "confidence": round(det["score"], 2),
        }
        for index, det in enumerate(detections, start=1)
    ]
    ui_json = {
        "id": "generated-ui",
        "name": "Generated UI",
        "components": components,
        "ocr_text": text_data.strip(),
    }
    metadata_str = json.dumps(ui_json, indent=2)

    # Generate React Code
    prompt = f"Generate a React component from this metadata:\n{metadata_str}"
    # The model is loaded with device_map="auto", so follow the device it
    # actually landed on instead of a separately computed device string.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    with torch.no_grad():
        # max_new_tokens budgets only the generated text; max_length would
        # also count the prompt, whose size grows with the OCR text.
        output = model.generate(**inputs, max_new_tokens=1024)

    # The returned sequence starts with the prompt tokens; decode only the
    # continuation so the result holds the generated code, not the echo.
    generated_tokens = output[0][prompt_length:]
    code_response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return metadata_str, code_response

# Gradio UI: one image input (handed to process_image as a PIL image) and two
# text outputs, matching the (metadata, code) pair that process_image returns.
interface = gr.Interface(
    process_image,
    gr.Image(type="pil"),
    ["text", "text"],
    description="Upload a UI screenshot and get structured metadata + React code.",
    title="Screenshot → Metadata & React Code",
)

# Listen on 0.0.0.0:7860 so the app is reachable from outside the host,
# e.g. when it runs inside a Docker container.
interface.launch(server_port=7860, server_name="0.0.0.0")