theia / app.py
bmay's picture
Update app.py
ce0a6c0 verified
raw
history blame
2.93 kB
import gradio as gr
import spaces
import torch
import torchvision.transforms
import numpy as np
from transformers import AutoModel
from theia.decoding import load_feature_stats, prepare_depth_decoder, prepare_mask_generator, decode_everything
def load_description(fp):
    """Return the full text of the file at ``fp``, decoded as UTF-8.

    Args:
        fp: Path of the file to read.

    Returns:
        The entire file contents as one string.
    """
    # Text mode is the default for ``open``; only the encoding is pinned.
    with open(fp, encoding="utf-8") as handle:
        return handle.read()
@spaces.GPU(duration=30)
def run_theia(image):
    """Decode Theia-predicted features for one image alongside the teacher outputs.

    Loads the Theia model, the per-teacher feature statistics, and the SAM
    and Depth-Anything decoders, resizes ``image`` to 224x224, and hands
    everything to ``decode_everything`` with ``gt=True``.

    Args:
        image: The input image. The UI in this file supplies a PIL image,
            or ``None`` when Submit is clicked without an upload.

    Returns:
        A list of two ``(uint8 array, caption)`` tuples, captioned
        "Theia Results" and "Ground Truth".

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    # Fail fast with a readable message instead of an opaque error from
    # Resize after all the models have already been loaded.
    if image is None:
        raise gr.Error("Please upload an image before clicking Submit.")

    device = "cuda"
    depth_anything_model_name = "LiheYoung/depth-anything-large-hf"

    # NOTE(review): every model below is re-loaded on each call; whether it
    # can be cached safely depends on how the GPU decorator runs this
    # function -- confirm before hoisting.
    theia_model = AutoModel.from_pretrained(
        "theaiinstitute/theia-tiny-patch16-224-cddsv", trust_remote_code=True
    )
    theia_model = theia_model.to(device)

    target_model_names = [
        "google/vit-huge-patch14-224-in21k",
        "facebook/dinov2-large",
        "openai/clip-vit-large-patch14",
        "facebook/sam-vit-huge",
        depth_anything_model_name,
    ]
    feature_means, feature_vars = load_feature_stats(
        target_model_names, stat_file_root="feature_stats"
    )

    mask_generator, sam_model = prepare_mask_generator(device)
    depth_anything_decoder, _ = prepare_depth_decoder(depth_anything_model_name, device)

    # decode_everything takes a list of images; this demo sends exactly one.
    image = torchvision.transforms.Resize(size=(224, 224))(image)
    images = [image]

    theia_decode_results, gt_decode_results = decode_everything(
        theia_model=theia_model,
        feature_means=feature_means,
        feature_vars=feature_vars,
        images=images,
        mask_generator=mask_generator,
        sam_model=sam_model,
        depth_anything_decoder=depth_anything_decoder,
        pred_iou_thresh=0.5,
        stability_score_thresh=0.7,
        gt=True,
        device=device,
    )

    # Index 0 is the entry for the single input image. Presumably the results
    # are float arrays in [0, 1], hence the scale to 8-bit -- TODO confirm.
    theia_decode_results = (255.0 * theia_decode_results[0]).astype(np.uint8)
    gt_decode_results = (255.0 * gt_decode_results[0]).astype(np.uint8)
    return [(theia_decode_results, "Theia Results"), (gt_decode_results, "Ground Truth")]
# Build the demo page: title and description on top, then an input column
# (image + submit button) beside an output gallery.
with gr.Blocks() as demo:
    gr.HTML(load_description("gradio_title.md"))
    gr.Markdown(
        "This space demonstrates decoding Theia-predicted VFM representations "
        "to their original teacher model outputs. "
        "For DINOv2 we apply the PCA visualization, "
        "for SAM we use its decoder to generate segmentation masks "
        "(but with SAM's pipeline of prompting), "
        "and for Depth-Anything we use its decoder head to do depth prediction."
    )

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Input Image")
            submit_button = gr.Button("Submit")

        with gr.Column():
            output_gallery = gr.Gallery(
                label="Input, DINOv2, SAM, Depth Anything",
                type="numpy",
            )

    # Clicking Submit runs the decoder on the uploaded image and fills the gallery.
    submit_button.click(
        fn=run_theia,
        inputs=input_image,
        outputs=output_gallery,
    )

demo.launch()