# Hugging Face Spaces demo app (page status when captured: "Running on Zero").
import math
import os
import sys

import cv2
import gradio as gr
import spaces  # for dynamic GPU resource
import torch

import FFV1MT_MS
import flow_tools
def process_images(videos, x, y):
    """Run the model's visualisation pass on the first 11 frames of a video.

    Relies on the module-level globals ``model`` and ``device`` that are set
    up in the ``__main__`` section of this file.

    Args:
        videos: Video source handed straight to ``cv2.VideoCapture``
            (the Gradio ``Video`` input of the interface).
        x: X location of the attention visualizer, forwarded unchanged to
            ``model.forward_viz``.
        y: Y location of the attention visualizer, forwarded unchanged to
            ``model.forward_viz``.

    Returns:
        ``[flow, activation, attention]`` taken from the dict returned by
        ``model.forward_viz``, or ``None`` (after printing a message) when
        fewer than 11 frames could be decoded.
    """
    num_frames = 11  # only the first 11 frames are used
    max_side = 768   # longest image side allowed before downscaling

    # Decode just the frames that are needed and always release the capture
    # handle, even if decoding raises.
    cap = cv2.VideoCapture(videos)
    images = []
    try:
        while len(images) < num_frames:
            ret, frame = cap.read()
            if not ret:
                break
            images.append(frame)
    finally:
        cap.release()

    if len(images) < num_frames:
        print('video is too short')
        return None

    # Convert every frame to a float tensor scaled by 1/255 with a leading
    # batch axis. permute(2, 0, 1) assumes each decoded frame is a 3-D
    # (H, W, C) array and moves the last axis to the front.
    images = [
        torch.from_numpy(img).permute(2, 0, 1).float().to(device).unsqueeze(0) / 255.0
        for img in images
    ]

    # If the longest spatial side exceeds 768, downscale all frames by the
    # same factor so that side becomes 768 (aspect ratio preserved).
    max_size = max(images[0].shape[2], images[0].shape[3])
    if max_size > max_side:
        ratio = max_side / max_size
        images = [
            torch.nn.functional.interpolate(
                img, scale_factor=ratio, mode='bicubic', align_corners=True
            )
            for img in images
        ]

    result = model.forward_viz(images, layer=7, x=x, y=y)
    flow = result['flow']
    attention = result['attention']
    activation = result['activation']
    # Order matches the three output components of the Gradio interface:
    # flow field, stage-I activation, stage-II attention map.
    return [flow, activation, attention]
# Heading shown at the top of the Gradio page.
title = "Modelling Human Visual Motion Processing with Trainable Motion Energy Sensing and a Self-attention Network 🤗 "
# Markdown shown above the demo widgets: introduction (English and Japanese),
# environment requirements, links to the paper/presentation, and usage notes.
description = (
    "## Introduction 🔥🔥🔥\n"
    " The intersection of cognitive neuroscience and computer vision offers exciting advancements in "
    "how machines perceive motion like humans. Our research bridges the gap between these fields by proposing a novel "
    "image-computable model that aligns with human motion perception mechanisms. By integrating trainable"
    " motion energy sensing with recurrent self-attention networks, we can simulate the complex motion "
    "processing of the human visual cortex, particularly the V1-MT pathway. Our model not only parallels"
    " physiological responses in V1 and MT neurons but also replicates human psychophysical responses "
    "to dynamic stimuli. \n\n\n"
    "認知神経科学とコンピュータビジョンの交差点は、コンピューターが人間のように動きを認識する方法において、興味深い進歩を提供します。"
    "私たちの研究は、これらの分野間のギャップを埋めるために、人間の動作知覚メカニズムに合致する新しい画像計算可能なモデルを提案しています。"
    "訓練可能な動きエネルギー感知をリカレントSelf-attentionネットワークと統合することにより、"
    "特にV1-MT経路における人間の視覚皮質の複雑な動き処理をシミュレートすることができます。"
    "私たちのモデルは、V1およびMTニューロンでの生理的反応と並行して、動的刺激に対する人間の心理物理学的反応も再現します。"
    "![](https://drive.google.com/uc?id=10PcKzQ9X1nsXKUi8OPR0jN_ZsjlCAV47) \n"
    "## Environment Configuration 🐡 \n"
    "To run our model, the basic environment configuration is required:\n"
    '- gradio == 4.7.1 \n'
    '- Python 3.8 or higher \n'
    '- PyTorch 2.0 \n'
    '- CUDA Toolkit 11.x (for GPU acceleration)\n'
    '- opencv-python \n'
    '- Imageio \n'
    '- Matplotlib \n\n'
    "## Preprint Paper 📝 \n"
    "The paper is available at [arXiv](https://arxiv.org/abs/2305.09156) \n"
    "## Video Presentation 📹 \n"
    "The video presentation is available at [Video Record](https://recorder-v3.slideslive.com/?share=85662&s=6afe157c-e764-4e3c-9302-2c6dd6887db1). \n"
    "## Conference Website \n"
    "The project is presented at [NeurIPS 2023](https://neurips.cc/virtual/2023/poster/70202). \n"
    "## Below is the interactive demo of our model. You can select the video examples below or upload your own videos. "
    # Trailing space added so this sentence no longer runs into the next one.
    "The model outputs the motion flow field, the activation of the first stage, and the attention map of the second stage. "
    "We also provide two sliders to adjust the location of the attention visualizer. \n"
    " **Note**: The demo is running on CPU, so it may take a while to process the video. \n"
)
# Example rows for the interface; each row is [video file, x slider, y slider],
# matching the three inputs of process_images().
examples = [
    ["example_1.mp4", 62, 56],
    ["example_2.mp4", 59, 55],
    ["example_3.mp4", 50, 50],
    ["example_4.mp4", 50, 50],
    ["example_5.mp4", 39, 72],
]
# Markdown shown below the demo (passed as the interface's `article`):
# citation entry, author contact, license and lab address.
md = (
    "## Citation \n"
    "If you do think this work helps your research, please cite our work as:\n"
    "```\n"
    "@inproceedings{ \n"
    "sun2023modeling,\n"
    "title={Modeling Human Visual Motion Processing with Trainable Motion Energy Sensing and a Self-attention Network},\n"
    "author={Zitang Sun and Yen-Ju Chen and Yung-Hao Yang and Shin'ya Nishida},\n"
    "booktitle={Thirty-seventh Conference on Neural Information Processing Systems},\n"
    "year={2023},\n"
    "url={https://openreview.net/forum?id=tRKimbAk5D}\n"
    "}\n"
    "```\n"
    "## Author \n"
    "This project page is developed by Zitang Sun 📧 (zitangsun96 @ gmail.com)\n"
    "## LICENSE \n"
    "This project is licensed under the terms of the MIT license. \n"
    "## Address 🏡 \n"
    "[Cognitive Informatics Lab](http://www.cog.ist.i.kyoto-u.ac.jp/en/index.html), Graduate School of Informatics, Kyoto University, Japan \n"
)
if __name__ == '__main__':
    # Optional CUDA diagnostics, left disabled:
    # torch.cuda.init()
    # print(f"Is CUDA available: {torch.cuda.is_available()}")
    # # True
    # print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    # # Tesla T4

    # `model` and `device` become module-level globals; process_images()
    # reads both of them.
    model = FFV1MT_MS.FFV1DNN()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print('Number of parameters: {}'.format(model.num_parameters()))
    model.to(device)

    # Load the example checkpoint. It is deserialised onto the CPU first;
    # strict=True makes load_state_dict fail on missing/unexpected keys.
    model_dict = torch.load('Model_example.pth.tar', map_location="cpu")['state_dict']
    model.load_state_dict(model_dict, strict=True)
    model.eval()

    iface = gr.Interface(
        fn=process_images,
        inputs=[
            gr.Video(label="Upload video or use the example images below"),
            gr.Slider(0, 100, label='X location of attention visualizer'),
            gr.Slider(0, 100, label='Y location of attention visualizer'),
        ],
        # The output is three images, in the order process_images() returns them.
        outputs=[
            gr.Image(type="numpy", label="Motion flow field"),
            gr.Image(type="numpy", label="Neural Activation of Stage I"),
            gr.Image(type="numpy", label="Attention map of Stage II"),
        ],
        title=title,
        description=description,
        article=md,
        examples=examples,
    )
    iface.launch(debug=True)