Spaces:
Running
on
Zero
Running
on
Zero
import gradio as gr | |
import torch | |
import math | |
import cv2 | |
import os | |
import sys | |
import FFV1MT_MS | |
import flow_tools | |
import spaces | |
# for dynamic GPU resource | |
def process_images(videos, x, y):
    """Run the motion model on the first 11 frames of a video.

    Relies on the module-level names ``model`` and ``device`` that the
    ``__main__`` section of this file creates before the interface is launched.

    Args:
        videos: Video file location handed over by the ``gr.Video`` input;
            passed straight to ``cv2.VideoCapture``.
        x: Value of the "X location of attention visualizer" slider,
            forwarded unchanged to ``model.forward_viz``.
        y: Value of the "Y location of attention visualizer" slider,
            forwarded unchanged to ``model.forward_viz``.

    Returns:
        ``[flow, activation, attention]`` taken from the dictionary returned
        by ``model.forward_viz``, in the order of the interface outputs.

    Raises:
        gr.Error: If no video was supplied or fewer than 11 frames could be
            decoded. Raising (instead of returning a bare ``None``) gives the
            user a readable message; ``None`` does not supply the three
            values the interface outputs expect.
    """
    num_frames = 11  # the model is fed exactly this many consecutive frames
    max_side = 768   # longer image side is scaled down to this many pixels

    if videos is None:
        raise gr.Error('Please upload a video or select one of the examples.')

    # Decode only as many frames as are needed and always free the capture
    # handle, even if decoding raises.
    cap = cv2.VideoCapture(videos)
    frames = []
    try:
        while len(frames) < num_frames:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        cap.release()

    if len(frames) < num_frames:
        print('video is too short')
        raise gr.Error(
            'video is too short: at least {} frames are required'.format(num_frames))

    # HxWxC frame arrays -> 1xCxHxW float tensors scaled by 1/255 on `device`.
    # NOTE(review): OpenCV decodes frames in BGR order by default; confirm
    # that this is the channel order forward_viz expects.
    images = [
        torch.from_numpy(frame).permute(2, 0, 1).float().to(device).unsqueeze(0) / 255.0
        for frame in frames
    ]

    # If the longer side exceeds 768 px, shrink every frame so that side
    # becomes 768 px while keeping the aspect ratio.
    longest_side = max(images[0].shape[2], images[0].shape[3])
    if longest_side > max_side:
        ratio = max_side / longest_side
        images = [
            torch.nn.functional.interpolate(
                img, scale_factor=ratio, mode='bicubic', align_corners=True)
            for img in images
        ]

    # NOTE(review): this call runs with autograd enabled; confirm whether
    # forward_viz needs gradients before wrapping it in torch.no_grad().
    result = model.forward_viz(images, layer=7, x=x, y=y)
    return [result['flow'], result['activation'], result['attention']]
# Page heading and Markdown introduction rendered above the demo
# (passed to gr.Interface as `title` and `description`).
# NOTE(review): the sequences "π€", "π₯", "π‘", "π" and "πΉ" in these strings
# look like emoji that were mis-decoded upstream. The original code points
# cannot be recovered with certainty from what is left, so they are kept
# as-is here; restore them from the original file.
title = "Modelling Human Visual Motion Processing with Trainable Motion Energy Sensing and a Self-attention Network π€ "
description = (
    "## Introduction π₯π₯π₯\n"
    " The intersection of cognitive neuroscience and computer vision offers exciting advancements in "
    "how machines perceive motion. Our research bridges the gap between these fields by proposing a novel "
    "image-computable model that aligns with human motion perception mechanisms. By integrating trainable"
    " motion energy sensing with recurrent self-attention networks, we can simulate the complex motion "
    "processing of the human visual cortex, particularly the V1-MT pathway. Our model not only parallels"
    " physiological responses in V1 and MT neurons but also replicates human psychophysical responses "
    "to dynamic stimuli. \n\n\n"
    "![](https://drive.google.com/uc?id=10PcKzQ9X1nsXKUi8OPR0jN_ZsjlCAV47) \n"
    "## Environment Configuration π‘ \n"
    "To run our model, the basic environment configuration is required:\n"
    # Each bullet needs its own newline, otherwise Markdown fuses the items.
    '- gradio == 4.7.1\n'
    '- Python 3.8 or higher \n'
    '- PyTorch 2.0 \n'
    '- CUDA Toolkit 11.x (for GPU acceleration)\n'
    '- opencv-python \n'
    '- Imageio \n'
    '- Matplotlib \n\n'
    "## Preprint Paper π \n"
    "The paper is available at [arXiv](https://arxiv.org/abs/2305.09156) \n"
    "## Video Presentation πΉ \n"
    "The video presentation is available at [Video Record](https://recorder-v3.slideslive.com/?share=85662&s=6afe157c-e764-4e3c-9302-2c6dd6887db1). \n"
    "## Conference Website \n"
    "The project is presented at [NeurIPS 2023](https://neurips.cc/virtual/2023/poster/70202). \n"
    "## Below is the interactive demo of our model. You can select the video examples below or upload your own videos. "
    "The model outputs the motion flow field, the activation of the first stage, and the attention map of the second stage. "
    "We also provide two sliders to adjust the location of the attention visualizer. \n"
    " **Note**: The demo is running on CPU, so it may take a while to process the video. \n"
)
# Preset demo inputs offered below the interface. Each row lines up with the
# interface inputs: [video file, X slider value, Y slider value].
examples = [
    ["example_1.mp4", 62, 56],
    ["example_2.mp4", 59, 55],
    ["example_3.mp4", 50, 50],
    ["example_4.mp4", 50, 50],
    ["example_5.mp4", 39, 72],
]
# Markdown footer shown below the demo (passed to gr.Interface as `article`):
# citation entry, author contact, license and lab address.
md = (
    "## Citation \n"
    "If you do think this work helps your research, please cite our work as:\n"
    "```\n"
    "@inproceedings{ \n"
    "sun2023modeling,\n"
    "title={Modeling Human Visual Motion Processing with Trainable Motion Energy Sensing and a Self-attention Network},\n"
    "author={Zitang Sun and Yen-Ju Chen and Yung-Hao Yang and Shin'ya Nishida},\n"
    "booktitle={Thirty-seventh Conference on Neural Information Processing Systems},\n"
    "year={2023},\n"
    "url={https://openreview.net/forum?id=tRKimbAk5D}\n"
    "}\n"
    "```\n"
    "## Author \n"
    "This project page is developed by Zitang Sun π§ (zitangsun96 @ gmail.com)\n"
    "## LICENSE \n"
    "This project is licensed under the terms of the MIT license. \n"
    "## Address π‘ \n"
    "[Cognitive Informatics Lab](http://www.cog.ist.i.kyoto-u.ac.jp/en/index.html), Graduate School of Informatics, Kyoto University, Japan \n"
)
if __name__ == '__main__':
    # Script entry point: build the network, load its weights and serve the
    # Gradio demo. `model` and `device` are deliberately left as module-level
    # names because process_images reads them as globals.
    model = FFV1MT_MS.FFV1DNN()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f'Number of parameters: {model.num_parameters()}')
    model.to(device)

    # Load the checkpoint weights; strict=True makes any key mismatch between
    # the checkpoint and the model fail loudly.
    # NOTE(review): torch.load unpickles the file, so the checkpoint must come
    # from a trusted source.
    model_dict = torch.load('Model_example.pth.tar', map_location="cpu")['state_dict']
    model.load_state_dict(model_dict, strict=True)
    model.eval()

    iface = gr.Interface(
        fn=process_images,
        # One video plus the two slider positions forwarded to process_images.
        inputs=[
            gr.Video(label="Upload video or use the example images below"),
            gr.Slider(0, 100, label='X location of attention visualizer'),
            gr.Slider(0, 100, label='Y location of attention visualizer'),
        ],
        # Three images, in the order process_images returns them.
        outputs=[
            gr.Image(type="numpy", label="Motion flow field"),
            gr.Image(type="numpy", label="Activation of Stage I"),
            gr.Image(type="numpy", label="Attention map of Stage II"),
        ],
        title=title,
        description=description,
        article=md,
        examples=examples,
    )
    iface.launch(debug=True)