# This file is adapted from https://huggingface.co/spaces/diffusers/controlnet-canny/blob/main/app.py
# The original license file is LICENSE.ControlNet in this repo.
from diffusers import FlaxStableDiffusionControlNetPipeline, FlaxControlNetModel, FlaxDPMSolverMultistepScheduler
from transformers import CLIPTokenizer, FlaxCLIPTextModel, set_seed
from flax.training.common_utils import shard
from flax.jax_utils import replicate
from diffusers.utils import load_image
import jax.numpy as jnp
import jax
import cv2
from PIL import Image
import numpy as np
import gradio as gr
import os

if gr.__version__ != "3.28.3":  # doesn't work...
    os.system("pip uninstall -y gradio")
    os.system("pip install gradio==3.28.3")

title_description = """
# UCDR-Net

## Unlimited Controlled Domain Randomization Network for Bridging the Sim2Real Gap in Robotics
"""

description = """
While existing ControlNet and public diffusion models are predominantly geared towards high-resolution images (512x512 or above) and intricate artistic detail generation, there is untapped potential for these models in Automatic Data Augmentation (ADA). By harnessing the inherent variance of prompt-conditioned generated images, we can significantly boost the visual diversity of training samples for computer vision pipelines. This is particularly relevant in robotics, where deep learning plays an increasingly pivotal role in training policies for robotic manipulation from images.

In this HuggingFace sprint, we present UCDR-Net (Unlimited Controlled Domain Randomization Network), a novel Canny-edge mini-ControlNet trained on Stable Diffusion 1.5 with mixed datasets. Our model generates photorealistic and varied renderings from simplistic robotic simulation images, enabling real-time data augmentation for robotic vision training.

We specifically designed UCDR-Net to be fast and composition preserving, with an emphasis on lower-resolution images (128x128) for online data augmentation in typical preprocessing pipelines. Our choice of the Canny-edge version of ControlNet ensures shape and structure preservation in the image, which is crucial for visuomotor policy learning.

We trained ControlNet from scratch using only 128x128 images, preprocessing the training datasets and extracting Canny edge maps. We then trained four ControlNets with different mixtures of two datasets (Coyo-700M and Bridge Data) and showcase the results.

* [Coyo-700M](https://github.com/kakaobrain/coyo-dataset)
* [Bridge](https://sites.google.com/view/bridgedata)

Model description and training process: please refer to the readme file attached to the model repository.

Model repository: [ControlNet repo](https://huggingface.co/Baptlem/baptlem-controlnet)
"""

traj_description = """
To demonstrate UCDR-Net's capabilities, we generated a trajectory of our simulated robotic environment and present the resulting videos for each model. We batched the frames of each video and performed independent inference for each frame, which explains the "wobbling" effect.

Prompt used for every video: "A robotic arm with a gripper and a small cube on a table, super realistic, industrial background"
"""
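
# The helper below is an illustrative sketch, not called anywhere in this Space, of
# how a trajectory video like the ones shown in the demo could be produced: read each
# frame, augment it independently with pipe_inference (defined further down in this
# file), and write the result back out. Because every frame is sampled independently,
# there is no temporal consistency, which is what causes the "wobbling" effect
# mentioned above. The file paths and the MJPG codec are placeholder assumptions,
# not the exact settings used to render the videos in this repository.
def process_trajectory_video(in_path="trajectory.avi",
                             out_path="trajectory_processed.avi",
                             prompt="A robotic arm with a gripper and a small cube on a table, "
                                    "super realistic, industrial background",
                             model="coyo-500k",
                             resolution=128):
    cap = cv2.VideoCapture(in_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 10.0
    writer = None
    while True:
        ok, frame_bgr = cap.read()
        if not ok:
            break
        # OpenCV decodes frames as BGR; the pipeline expects RGB.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        # One sample per device so the batch shards evenly across GPUs; keep the first.
        generated = pipe_inference(frame_rgb, prompt,
                                   num_samples=jax.device_count(),
                                   resolution=resolution,
                                   model=model)[0]
        out_bgr = cv2.cvtColor(np.array(generated), cv2.COLOR_RGB2BGR)
        if writer is None:
            h, w = out_bgr.shape[:2]
            writer = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"MJPG"), fps, (w, h))
        writer.write(out_bgr)
    cap.release()
    if writer is not None:
        writer.release()
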
perfo_description = """
Our model has been benchmarked on a node of 4 Titan RTX 24GB GPUs, achieving an impressive 13 FPS image generation rate!

The table on the right shows the performance of our models running on different nodes. To run the benchmark, we loaded one of our models on every GPU of the node. We then retrieved an episode of our simulation. For every frame of the episode, we preprocess the image (resize, Canny, ...) and process the Canny image on the GPUs. We repeated this procedure for different batch sizes (BS). We can see that the greater the BS, the higher the FPS: by increasing the BS, we take advantage of the parallelization across the GPUs.
"""

conclusion_description = """
UCDR-Net stands as a natural development in bridging the Sim2Real gap in robotics by providing real-time data augmentation for training visual policies. We are excited to share our work with the HuggingFace community and to contribute to the advancement of robotic vision training techniques.
"""


def create_key(seed=0):
    return jax.random.PRNGKey(seed)


def load_controlnet(controlnet_version):
    controlnet, controlnet_params = FlaxControlNetModel.from_pretrained(
        "Baptlem/baptlem-controlnet",
        subfolder=controlnet_version,
        from_flax=True,
        dtype=jnp.float32,
    )
    return controlnet, controlnet_params


def load_sb_pipe(controlnet_version, sb_path="runwayml/stable-diffusion-v1-5"):
    controlnet, controlnet_params = load_controlnet(controlnet_version)

    scheduler, scheduler_params = FlaxDPMSolverMultistepScheduler.from_pretrained(
        sb_path, subfolder="scheduler"
    )

    pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
        sb_path,
        controlnet=controlnet,
        revision="flax",
        dtype=jnp.bfloat16
    )

    pipe.scheduler = scheduler
    params["controlnet"] = controlnet_params
    params["scheduler"] = scheduler_params
    return pipe, params


controlnet_path = "Baptlem/baptlem-controlnet"
controlnet_version = "coyo-500k"

# Constants
low_threshold = 100
high_threshold = 200

print(os.path.abspath('.'))
print(os.listdir("."))
print("Gradio version:", gr.__version__)

# pipe.enable_xformers_memory_efficient_attention()
# pipe.enable_model_cpu_offload()
# pipe.enable_attention_slicing()

print("Loaded models...")


def pipe_inference(
    image,
    prompt,
    is_canny=False,
    num_samples=4,
    resolution=128,
    num_inference_steps=50,
    guidance_scale=7.5,
    model="coyo-500k",
    seed=0,
    negative_prompt="",
):
    print("Loading pipe")
    pipe, params = load_sb_pipe(model)

    if not isinstance(image, np.ndarray):
        image = np.array(image)

    processed_image = resize_image(image, resolution)  # -> PIL
    if not is_canny:
        resized_image, processed_image = preprocess_canny(processed_image, resolution)

    rng = create_key(seed)
    rng = jax.random.split(rng, jax.device_count())

    prompt_ids = pipe.prepare_text_inputs([prompt] * num_samples)
    negative_prompt_ids = pipe.prepare_text_inputs([negative_prompt] * num_samples)
    processed_image = pipe.prepare_image_inputs([processed_image] * num_samples)

    p_params = replicate(params)
    prompt_ids = shard(prompt_ids)
    negative_prompt_ids = shard(negative_prompt_ids)
    processed_image = shard(processed_image)

    print("Inference...")
    output = pipe(
        prompt_ids=prompt_ids,
        image=processed_image,
        params=p_params,
        prng_seed=rng,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        neg_prompt_ids=negative_prompt_ids,
        jit=True,
    ).images
    print("Finished inference...")

    # all_outputs = []
    # all_outputs.append(image)
    # if not is_canny:
    #     all_outputs.append(resized_image)
    # for image in output.images:
    #     all_outputs.append(image)
    all_outputs = pipe.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
    return all_outputs
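

# Illustrative sketch, not called by this Space, of the FPS benchmark described in
# perfo_description: run the pipeline over the frames of a pre-recorded episode for
# several batch sizes and report generated images per second. The `frames` argument
# (a list of RGB numpy arrays), the list of batch sizes and the warm-up handling are
# assumptions made for this example; this is not the exact script used to produce
# the numbers reported in the performance table.
def benchmark_fps(frames, prompt, model="coyo-500k", batch_sizes=(4, 8, 16), num_inference_steps=20):
    import time

    results = {}
    for bs in batch_sizes:
        # Batch sizes should be divisible by jax.device_count() so shard() can
        # split them evenly across the GPUs of the node.
        # Warm-up call so JIT compilation time is not counted in the timing.
        pipe_inference(frames[0], prompt, num_samples=bs, model=model,
                       num_inference_steps=num_inference_steps)
        start = time.perf_counter()
        for frame in frames:
            pipe_inference(frame, prompt, num_samples=bs, model=model,
                           num_inference_steps=num_inference_steps)
        elapsed = time.perf_counter() - start
        # FPS counts generated images: every frame yields `bs` samples.
        # NOTE: pipe_inference reloads the pipeline on every call; a real benchmark
        # would load the model once per GPU outside the timed loop.
        results[bs] = len(frames) * bs / elapsed
    return results
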
def resize_image(image, resolution):
    if not isinstance(image, np.ndarray):
        image = np.array(image)
    h, w = image.shape[:2]
    ratio = w / h
    if ratio > 1:
        resized_image = cv2.resize(image, (int(resolution * ratio), resolution), interpolation=cv2.INTER_NEAREST)
    elif ratio < 1:
        resized_image = cv2.resize(image, (resolution, int(resolution / ratio)), interpolation=cv2.INTER_NEAREST)
    else:
        resized_image = cv2.resize(image, (resolution, resolution), interpolation=cv2.INTER_NEAREST)
    return Image.fromarray(resized_image)


def preprocess_canny(image, resolution=128):
    if not isinstance(image, np.ndarray):
        image = np.array(image)
    processed_image = cv2.Canny(image, low_threshold, high_threshold)
    processed_image = processed_image[:, :, None]
    processed_image = np.concatenate([processed_image, processed_image, processed_image], axis=2)
    resized_image = Image.fromarray(image)
    processed_image = Image.fromarray(processed_image)
    return resized_image, processed_image


def create_demo(process, max_images=12, default_num_images=4):
    with gr.Blocks() as demo:
        with gr.Row():
            gr.Markdown(title_description)
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(source='upload', type='numpy')
                prompt = gr.Textbox(label='Prompt')
                run_button = gr.Button(label='Run')
                with gr.Accordion('Advanced options', open=False):
                    is_canny = gr.Checkbox(label='Is canny', value=False)
                    num_samples = gr.Slider(label='Images',
                                            minimum=1,
                                            maximum=max_images,
                                            value=default_num_images,
                                            step=1)
                    """
                    canny_low_threshold = gr.Slider(
                        label='Canny low threshold',
                        minimum=1, maximum=255, value=100, step=1)
                    canny_high_threshold = gr.Slider(
                        label='Canny high threshold',
                        minimum=1, maximum=255, value=200, step=1)
                    """
                    resolution = gr.Slider(label='Resolution',
                                           minimum=128, maximum=128, value=128, step=1)
                    num_steps = gr.Slider(label='Steps',
                                          minimum=1, maximum=100, value=20, step=1)
                    guidance_scale = gr.Slider(label='Guidance Scale',
                                               minimum=0.1, maximum=30.0, value=7.5, step=0.1)
                    model = gr.Dropdown(choices=["coyo-500k", "bridge-2M", "coyo1M-bridge2M", "coyo2M-bridge325k"],
                                        value="coyo-500k",
                                        label="Model used for inference",
                                        info="Find all the models at https://huggingface.co/Baptlem/baptlem-controlnet")
                    seed = gr.Slider(label='Seed',
                                     minimum=-1, maximum=2147483647, step=1, randomize=True)
                    n_prompt = gr.Textbox(
                        label='Negative Prompt',
                        value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
                    )
            with gr.Column():
                result = gr.Gallery(label='Output',
                                    show_label=False,
                                    elem_id='gallery').style(grid=2, height='auto')

        with gr.Row():
            gr.Video("./trajectory_hf/trajectory_coyo2M-bridge325k_64.avi", format="avi", interactive=False)

        with gr.Row():
            gr.Markdown(description)

        with gr.Row():
            with gr.Column():
                gr.Markdown(traj_description)
            with gr.Column():
                gr.Video("./trajectory_hf/trajectory.avi", format="avi", interactive=False)

        with gr.Row():
            with gr.Column():
                gr.Markdown("Trajectory processed with the coyo-500k model:")
            with gr.Column():
                gr.Video("./trajectory_hf/trajectory_coyo-500k.avi", format="avi", interactive=False)

        with gr.Row():
            with gr.Column():
                gr.Markdown("Trajectory processed with the bridge-2M model:")
            with gr.Column():
                gr.Video("./trajectory_hf/trajectory_bridge-2M.avi", format="avi", interactive=False)

        with gr.Row():
            with gr.Column():
                gr.Markdown("Trajectory processed with the coyo1M-bridge2M model:")
            with gr.Column():
                gr.Video("./trajectory_hf/trajectory_coyo1M-bridge2M.avi", format="avi", interactive=False)

        with gr.Row():
            with gr.Column():
                gr.Markdown("Trajectory processed with the coyo2M-bridge325k model:")
            with gr.Column():
                gr.Video("./trajectory_hf/trajectory_coyo2M-bridge325k.avi", format="avi", interactive=False)

        with gr.Row():
            with gr.Column():
                gr.Markdown(perfo_description)
            with gr.Column():
                gr.Image("./perfo_rtx.png", interactive=False)

        with gr.Row():
            gr.Markdown(conclusion_description)

        inputs = [
            input_image,
            prompt,
            is_canny,
            num_samples,
            resolution,
            # canny_low_threshold,
            # canny_high_threshold,
            num_steps,
            guidance_scale,
            model,
            seed,
            n_prompt,
        ]
        prompt.submit(fn=process, inputs=inputs, outputs=result)
        run_button.click(fn=process, inputs=inputs, outputs=result, api_name='canny')

    return demo


if __name__ == '__main__':
    demo = create_demo(pipe_inference)
    demo.queue().launch()
    # gr.Interface(create_demo).launch()
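
# ---------------------------------------------------------------------------
# Usage sketch (kept commented out so it never runs inside the Space): calling
# pipe_inference directly, without the Gradio UI, to augment a single simulation
# frame. The input path and output filename below are placeholders, not files
# shipped with this repository.
#
#     sim_image = load_image("path/to/simulation_frame.png")
#     images = pipe_inference(
#         sim_image,
#         "A robotic arm with a gripper and a small cube on a table, "
#         "super realistic, industrial background",
#         num_samples=4,
#         resolution=128,
#         num_inference_steps=20,
#         model="coyo-500k",
#         seed=0,
#     )
#     images[0].save("augmented_frame.png")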