gradio tweaks
README.md
CHANGED
@@ -7,6 +7,284 @@ sdk: gradio
sdk_version: 4.38.1
app_file: webgui.py
pinned: false
suggested_hardware: a10g-large
---
<h1 align='center'>EchoMimic: Lifelike Audio-Driven Portrait Animations through Editable Landmark Conditioning</h1>

<div align='center'>
<a href='https://github.com/yuange250' target='_blank'>Zhiyuan Chen</a><sup>*</sup>
<a href='https://github.com/JoeFannie' target='_blank'>Jiajiong Cao</a><sup>*</sup>
<a href='https://github.com/octavianChen' target='_blank'>Zhiquan Chen</a>
<a href='https://github.com/lymhust' target='_blank'>Yuming Li</a>
<a href='https://github.com/' target='_blank'>Chenguang Ma</a>
</div>
<div align='center'>
*Equal Contribution.
</div>

<div align='center'>
Terminal Technology Department, Alipay, Ant Group.
</div>
<br>
<div align='center'>
<a href='https://badtobest.github.io/echomimic.html'><img src='https://img.shields.io/badge/Project-Page-blue'></a>
<a href='https://huggingface.co/BadToBest/EchoMimic'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a>
<a href='https://www.modelscope.cn/models/BadToBest/EchoMimic'><img src='https://img.shields.io/badge/ModelScope-Model-purple'></a>
<a href='https://arxiv.org/abs/2407.08136'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
<a href='assets/echomimic.png'><img src='https://badges.aleen42.com/src/wechat.svg'></a>
</div>
## 📣 📣 Updates
* [2024.07.17] 🔥🔥🔥 Accelerated models and pipeline are released. Inference is up to **10x** faster (from ~7 min/240 frames to ~50 s/240 frames on a V100 GPU).
* [2024.07.14] 🔥 [ComfyUI](https://github.com/smthemex/ComfyUI_EchoMimic) is now available. Thanks @smthemex for the contribution.
* [2024.07.13] 🔥 Thanks [NewGenAI](https://www.youtube.com/@StableAIHub) for the [video installation tutorial](https://www.youtube.com/watch?v=8R0lTIY7tfI).
* [2024.07.13] 🔥 We release our pose- and audio-driven code and models.
* [2024.07.12] 🔥 WebUI and GradioUI versions are released. We thank @greengerong, @Robin021 and @O-O1024 for their contributions.
* [2024.07.12] 🔥 Our [paper](https://arxiv.org/abs/2407.08136) is now public on arXiv.
* [2024.07.09] 🔥 We release our audio-driven code and models.
## Gallery

### Audio Driven (Sing)

<table class="center">
<tr>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/d014d921-9f94-4640-97ad-035b00effbfe" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/877603a5-a4f9-4486-a19f-8888422daf78" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/e0cb5afb-40a6-4365-84f8-cb2834c4cfe7" muted="false"></video>
</td>
</tr>
</table>

### Audio Driven (English)

<table class="center">
<tr>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/386982cd-3ff8-470d-a6d9-b621e112f8a5" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/5c60bb91-1776-434e-a720-8857a00b1501" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/1f15adc5-0f33-4afa-b96a-2011886a4a06" muted="false"></video>
</td>
</tr>
</table>

### Audio Driven (Chinese)

<table class="center">
<tr>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/a8092f9a-a5dc-4cd6-95be-1831afaccf00" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/c8b5c59f-0483-42ef-b3ee-4cffae6c7a52" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/532a3e60-2bac-4039-a06c-ff6bf06cb4a4" muted="false"></video>
</td>
</tr>
</table>

### Landmark Driven

<table class="center">
<tr>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/1da6c46f-4532-4375-a0dc-0a4d6fd30a39" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/d4f4d5c1-e228-463a-b383-27fb90ed6172" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/18bd2c93-319e-4d1c-8255-3f02ba717475" muted="false"></video>
</td>
</tr>
</table>

### Audio + Selected Landmark Driven

<table class="center">
<tr>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/4a29d735-ec1b-474d-b843-3ff0bdf85f55" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/b994c8f5-8dae-4dd8-870f-962b50dc091f" muted="false"></video>
</td>
<td width=30% style="border: none">
<video controls loop src="https://github.com/BadToBest/EchoMimic/assets/11451501/955c1d51-07b2-494d-ab93-895b9c43b896" muted="false"></video>
</td>
</tr>
</table>

**(Some demo images above are sourced from image websites. If there is any infringement, we will immediately remove them and apologize.)**
## Installation

### Download the Code

```bash
git clone https://github.com/BadToBest/EchoMimic
cd EchoMimic
```
### Python Environment Setup

- Tested system environments: CentOS 7.2 / Ubuntu 22.04, CUDA >= 11.7
- Tested GPUs: A100 (80G) / RTX 4090D (24G) / V100 (16G)
- Tested Python versions: 3.8 / 3.10 / 3.11

Create a conda environment (recommended):

```bash
conda create -n echomimic python=3.8
conda activate echomimic
```

Install packages with `pip`:

```bash
pip install -r requirements.txt
```
### Download ffmpeg-static

Download and decompress [ffmpeg-static](https://www.johnvansickle.com/ffmpeg/old-releases/ffmpeg-4.4-amd64-static.tar.xz), then point `FFMPEG_PATH` at the extracted directory:

```bash
export FFMPEG_PATH=/path/to/ffmpeg-4.4-amd64-static
```
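As a quick sanity check before running inference, you can confirm that the binary under `FFMPEG_PATH` actually executes. This is a minimal sketch, not part of the repository; the path comes from the export above:

```python
import os
import subprocess

# Locate the static ffmpeg binary via the FFMPEG_PATH set above.
ffmpeg_dir = os.environ.get("FFMPEG_PATH", "")
ffmpeg_bin = os.path.join(ffmpeg_dir, "ffmpeg")

if not os.path.isfile(ffmpeg_bin):
    raise FileNotFoundError(f"ffmpeg not found at {ffmpeg_bin!r}; check FFMPEG_PATH")

# `ffmpeg -version` exits 0 and prints the build banner if the binary works.
subprocess.run([ffmpeg_bin, "-version"], check=True)
```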
### Download pretrained weights

```shell
git lfs install
git clone https://huggingface.co/BadToBest/EchoMimic pretrained_weights
```

The **pretrained_weights** directory is organized as follows:

```
./pretrained_weights/
├── denoising_unet.pth
├── reference_unet.pth
├── motion_module.pth
├── face_locator.pth
├── sd-vae-ft-mse
│   └── ...
├── sd-image-variations-diffusers
│   └── ...
└── audio_processor
    └── whisper_tiny.pt
```

**denoising_unet.pth**, **reference_unet.pth**, **motion_module.pth** and **face_locator.pth** are the main EchoMimic checkpoints. The other models can also be downloaded from their original hubs, thanks to their authors' brilliant work:
- [sd-vae-ft-mse](https://huggingface.co/stabilityai/sd-vae-ft-mse)
- [sd-image-variations-diffusers](https://huggingface.co/lambdalabs/sd-image-variations-diffusers)
- [audio_processor (whisper)](https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt)
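An incomplete `git lfs` download is a common failure mode, so it is worth checking that the main checkpoints exist before starting a run. A minimal sketch; the file list simply mirrors the tree above:

```python
from pathlib import Path

# The main EchoMimic checkpoints, mirroring the tree above.
REQUIRED = [
    "denoising_unet.pth",
    "reference_unet.pth",
    "motion_module.pth",
    "face_locator.pth",
    "audio_processor/whisper_tiny.pt",
]

root = Path("./pretrained_weights")
missing = [name for name in REQUIRED if not (root / name).is_file()]
if missing:
    raise SystemExit(f"Missing pretrained weights: {missing}")
print("All main EchoMimic checkpoints found.")
```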
### Audio-Driven Algo Inference
Run the python inference scripts:

```bash
python -u infer_audio2vid.py
python -u infer_audio2vid_pose.py
```
### Audio-Driven Algo Inference on Your Own Cases

Edit the inference config file **./configs/prompts/animation.yaml** and add your own case:

```yaml
test_cases:
  "path/to/your/image":
    - "path/to/your/audio"
```

Then run the python inference script:
```bash
python -u infer_audio2vid.py
```
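For orientation, each key under `test_cases` is a reference image and its value is a list of driving audio files, so one image can be animated by several clips. A minimal sketch of how such a config can be read (the loop body is a placeholder, not the repository's actual inference call):

```python
import yaml  # provided by the PyYAML package

with open("./configs/prompts/animation.yaml") as f:
    config = yaml.safe_load(f)

# Each key is a reference image; its value lists the driving audio clips.
for image_path, audio_paths in config["test_cases"].items():
    for audio_path in audio_paths:
        print(f"would animate {image_path} with {audio_path}")
```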
### Motion Alignment between Ref. Img. and Driven Vid.

(First download the checkpoints with the '_pose.pth' postfix from Hugging Face.)

Edit `driver_video` and `ref_image` to your own paths in demo_motion_sync.py, then run:
```bash
python -u demo_motion_sync.py
```
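The edit amounts to pointing two variables at your own files; the variable names come from the instruction above, and the paths are placeholders:

```python
# In demo_motion_sync.py -- replace the placeholder paths with your own files.
driver_video = "path/to/your/driving_video.mp4"
ref_image = "path/to/your/reference_image.png"
```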
### Audio&Pose-Driven Algo Inference
Edit ./configs/prompts/animation_pose.yaml, then run:
```bash
python -u infer_audio2vid_pose.py
```

### Pose-Driven Algo Inference
Set `draw_mouse=True` in line 135 of infer_audio2vid_pose.py. Edit ./configs/prompts/animation_pose.yaml, then run:
```bash
python -u infer_audio2vid_pose.py
```
### Run the Gradio UI

Thanks to the contribution from @Robin021:

```bash
python -u webgui.py --server_port=3000
```
## Release Plans

| Status | Milestone | ETA |
|:------:|:---------------------------------------------------------------------|:---------------:|
| ✅ | The inference source code of the audio-driven algo released on GitHub | 9th July, 2024 |
| ✅ | Pretrained models trained on English and Mandarin Chinese released | 9th July, 2024 |
| ✅ | The inference source code of the pose-driven algo released on GitHub | 13th July, 2024 |
| ✅ | Pretrained models with better pose control released | 13th July, 2024 |
| ✅ | Accelerated models released | 17th July, 2024 |
| 🚀 | Pretrained models with better singing performance | TBD |
| 🚀 | Large-scale and high-resolution Chinese-based talking head dataset | TBD |

## Acknowledgements

We would like to thank the contributors to the [AnimateDiff](https://github.com/guoyww/AnimateDiff), [Moore-AnimateAnyone](https://github.com/MooreThreads/Moore-AnimateAnyone) and [MuseTalk](https://github.com/TMElyralab/MuseTalk) repositories for their open research and exploration.

We are also grateful to [V-Express](https://github.com/tencent-ailab/V-Express) and [hallo](https://github.com/fudan-generative-vision/hallo) for their outstanding work on diffusion-based talking heads.

If we have missed any open-source projects or related articles, we will add the acknowledgement immediately.

## Citation

If you find our work useful for your research, please consider citing the paper:

```
@misc{chen2024echomimic,
      title={EchoMimic: Lifelike Audio-Driven Portrait Animations through Editable Landmark Conditioning},
      author={Zhiyuan Chen and Jiajiong Cao and Zhiquan Chen and Yuming Li and Chenguang Ma},
      year={2024},
      eprint={2407.08136},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```

## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=BadToBest/EchoMimic&type=Date)](https://star-history.com/#BadToBest/EchoMimic&Date)
webgui.py
CHANGED
@@ -211,30 +211,67 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac
```python
with gr.Blocks() as demo:
    gr.Markdown('# EchoMimic')
    gr.Markdown('## Lifelike Audio-Driven Portrait Animations through Editable Landmark Conditioning')
    gr.Markdown('Inference time: from ~7mins/240frames to ~50s/240frames on V100 GPU')
    with gr.Row():
        with gr.Column():
            uploaded_img = gr.Image(type="filepath", label="Reference Image")
            uploaded_audio = gr.Audio(type="filepath", label="Input Audio")
            with gr.Accordion("Advanced Configuration", open=False):
                with gr.Row():
                    width = gr.Slider(label="Width", minimum=128, maximum=1024, value=default_values["width"])
                    height = gr.Slider(label="Height", minimum=128, maximum=1024, value=default_values["height"])
                with gr.Row():
                    length = gr.Slider(label="Length", minimum=100, maximum=5000, value=default_values["length"])
                    seed = gr.Slider(label="Seed", minimum=0, maximum=10000, value=default_values["seed"])
                with gr.Row():
                    facemask_dilation_ratio = gr.Slider(label="Facemask Dilation Ratio", minimum=0.0, maximum=1.0, step=0.01, value=default_values["facemask_dilation_ratio"])
                    facecrop_dilation_ratio = gr.Slider(label="Facecrop Dilation Ratio", minimum=0.0, maximum=1.0, step=0.01, value=default_values["facecrop_dilation_ratio"])
                with gr.Row():
                    context_frames = gr.Slider(label="Context Frames", minimum=0, maximum=50, step=1, value=default_values["context_frames"])
                    context_overlap = gr.Slider(label="Context Overlap", minimum=0, maximum=10, step=1, value=default_values["context_overlap"])
                with gr.Row():
                    cfg = gr.Slider(label="CFG", minimum=0.0, maximum=10.0, step=0.1, value=default_values["cfg"])
                    steps = gr.Slider(label="Steps", minimum=1, maximum=100, step=1, value=default_values["steps"])
                with gr.Row():
                    sample_rate = gr.Slider(label="Sample Rate", minimum=8000, maximum=48000, step=1000, value=default_values["sample_rate"])
                    fps = gr.Slider(label="FPS", minimum=1, maximum=60, step=1, value=default_values["fps"])
                device = gr.Radio(label="Device", choices=["cuda", "cpu"], value=default_values["device"])
            generate_button = gr.Button("Generate Video")
        with gr.Column():
            output_video = gr.Video()
            gr.Examples(
                label="Portrait examples",
                examples=[
                    ['assets/test_imgs/a.png'],
                    ['assets/test_imgs/b.png'],
                    ['assets/test_imgs/c.png'],
                    ['assets/test_imgs/d.png'],
                    ['assets/test_imgs/e.png'],
                ],
                inputs=[uploaded_img],
            )
            gr.Examples(
                label="Audio examples",
                examples=[
                    ['assets/test_audios/chunnuanhuakai.wav'],
                    ['assets/test_audios/chunwang.wav'],
                    ['assets/test_audios/echomimic_en_girl.wav'],
                    ['assets/test_audios/echomimic_en.wav'],
                    ['assets/test_audios/echomimic_girl.wav'],
                    ['assets/test_audios/echomimic.wav'],
                    ['assets/test_audios/jane.wav'],
                    ['assets/test_audios/mei.wav'],
                    ['assets/test_audios/walden.wav'],
                    ['assets/test_audios/yun.wav'],
                ],
                inputs=[uploaded_audio],
            )
            gr.HTML("""
                <a href="https://huggingface.co/spaces/fffiloni/EchoMimic?duplicate=true">
                    <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-xl.svg" alt="Duplicate this Space">
                </a>
            """)

    def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
```
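The sliders read their initial values from a `default_values` dict defined earlier in webgui.py, outside this hunk. A hypothetical sketch of its shape, with placeholder values chosen only to fall inside each widget's range above:

```python
# Hypothetical shape of default_values; the real dict lives earlier in
# webgui.py and these numbers are placeholders, not the shipped defaults.
default_values = {
    "width": 512, "height": 512,
    "length": 1200, "seed": 420,
    "facemask_dilation_ratio": 0.1, "facecrop_dilation_ratio": 0.5,
    "context_frames": 12, "context_overlap": 3,
    "cfg": 2.5, "steps": 30,
    "sample_rate": 16000, "fps": 24,
    "device": "cuda",
}
```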
@@ -263,7 +300,8 @@ with gr.Blocks() as demo:

```python
        fps,
        device
    ],
    outputs=output_video,
    show_api=False
)
parser = argparse.ArgumentParser(description='EchoMimic')
parser.add_argument('--server_name', type=str, default='0.0.0.0', help='Server name')
```
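The fragment above is the tail of the Generate button wiring; the full call presumably looks like the sketch below, with the `inputs` list inferred from the generate_video signature rather than copied from the file:

```python
# Inferred wiring for the Generate button; `inputs` mirrors the
# generate_video signature shown earlier in the diff.
generate_button.click(
    fn=generate_video,
    inputs=[
        uploaded_img, uploaded_audio,
        width, height, length, seed,
        facemask_dilation_ratio, facecrop_dilation_ratio,
        context_frames, context_overlap,
        cfg, steps, sample_rate, fps, device,
    ],
    outputs=output_video,
    show_api=False,  # keep this endpoint off the public API page
)
```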
@@ -273,5 +311,5 @@ args = parser.parse_args()

```python
# demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)

if __name__ == '__main__':
    demo.queue(max_size=3).launch(show_api=False, show_error=True)
    # demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
```