Files changed (1) hide show
  1. app.py +39 -31
app.py CHANGED
@@ -1,24 +1,30 @@
import os
import random
import time

# Fetch DiffSynth-Studio, copy its `diffsynth` package next to this file and
# install its requirements. This has to run before the third-party imports
# below, because `diffsynth` is imported from the copied directory.
os.system("git clone https://github.com/modelscope/DiffSynth-Studio.git")
os.system("cp -r DiffSynth-Studio/diffsynth ./")
os.system("pip install -r DiffSynth-Studio/requirements.txt")

# One import per line; the duplicate `ModelManager` and `os` imports are gone.
from diffsynth import save_video, ModelManager, SVDVideoPipeline  # noqa: E402
import torch  # noqa: E402
import gradio as gr  # noqa: E402
import numpy as np  # noqa: E402
from PIL import Image  # noqa: E402
import spaces  # noqa: E402
12
 
 
 
 
 
13
 
 
def get_i2v_pipeline():
    """Create the image-to-video pipeline used by this demo.

    A ``ModelManager`` is configured for float16 weights on the ``cuda``
    device with the ``stable-video-diffusion-img2vid-xt`` and
    ``ExVideo-SVD-128f-v1`` model ids, and is then handed to
    ``SVDVideoPipeline.from_model_manager``.

    Returns:
        The object returned by ``SVDVideoPipeline.from_model_manager``.
    """
    model_ids = ["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"]
    manager = ModelManager(
        torch_dtype=torch.float16,
        device="cuda",
        model_id_list=model_ids,
        # Presumably the preferred download source - confirm against DiffSynth.
        downloading_priority=["HuggingFace"],
    )
    return SVDVideoPipeline.from_model_manager(manager)
20
 
21
-
22
  @spaces.GPU(duration=280)
23
  def sample(image, seed, randomize_seed, motion_bucket_id, num_inference_steps):
24
  if randomize_seed:
@@ -36,45 +42,47 @@ def sample(image, seed, randomize_seed, motion_bucket_id, num_inference_steps):
36
  save_video(video, file_path, fps=30, quality=7)
37
  return file_path, seed
38
 
39
-
def crop_and_resize(image):
    """Center-crop ``image`` to the target aspect ratio and resize to 512x512.

    Args:
        image: Any object ``np.array`` can convert to an ``(H, W)`` or
            ``(H, W, C)`` array, for example a PIL image.

    Returns:
        A 512x512 ``PIL.Image.Image`` in RGB mode.
    """
    height = 512
    width = 512
    pixels = np.array(image)
    # Read only the first two axes: a single-channel image yields a 2-D array,
    # which the previous three-way unpacking could not handle.
    image_height, image_width = pixels.shape[:2]
    if image_height / image_width < height / width:
        # Relatively wider than the target: trim equal margins left and right.
        cropped_width = int(image_height / height * width)
        left = (image_width - cropped_width) // 2
        pixels = pixels[:, left: left + cropped_width]
    else:
        # Relatively taller (or equal): trim equal margins top and bottom.
        cropped_height = int(image_width / width * height)
        top = (image_height - cropped_height) // 2
        pixels = pixels[top: top + cropped_height, :]
    # Both branches end with the same conversion, so it is done once here.
    return Image.fromarray(pixels).convert("RGB").resize((width, height))
56
 
57
-
# Module-level pipeline instance, created once when the module is loaded.
pipe = get_i2v_pipeline()


def process_examples(image):
    """Run ``sample`` on an example image with fixed demo settings.

    Args:
        image: The example image, passed straight through to ``sample``.

    Returns:
        The ``(file_path, seed)`` pair produced by ``sample``.
    """
    video_path, used_seed = sample(
        image,
        seed=0,
        randomize_seed=True,
        motion_bucket_id=100,
        num_inference_steps=25,
    )
    return video_path, used_seed
63
-
 
64
  with gr.Blocks() as demo:
65
  gr.Markdown('''
66
- # ExVideo
67
 
68
- ExVideo is a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to achieve the generation of long videos up to 128 frames.
69
 
70
- This is the first model we have made public. Due to limitations in computational resources, this model was trained on about 40,000 videos using 8x A100 GPUs for approximately one week. Therefore, the model may sometimes generate content that does not conform to real-world principles. Please look forward to the release of our subsequent models.
71
 
72
- To use this model, please refer to [DiffSynth](https://github.com/modelscope/DiffSynth-Studio).
73
 
74
- * [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
75
- * [Source Code](https://github.com/modelscope/DiffSynth-Studio)
76
- * [Technical report](https://arxiv.org/abs/2406.14130)
77
- ''')
78
  with gr.Row():
79
  with gr.Column():
80
  image = gr.Image(label="Upload your image", type="pil")
@@ -85,7 +93,7 @@ To use this model, please refer to [DiffSynth](https://github.com/modelscope/Dif
85
  randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
86
  motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to synthesize", value=100, minimum=0, maximum=127)
87
  num_inference_steps = gr.Slider(label="Inference steps", value=25, minimum=1, maximum=50)
88
-
89
  image.upload(fn=crop_and_resize, inputs=image, outputs=image, queue=False)
90
  generate_btn.click(fn=sample, inputs=[image, seed, randomize_seed, motion_bucket_id, num_inference_steps], outputs=[video, seed], api_name="video")
91
  gr.Examples(
@@ -103,4 +111,4 @@ To use this model, please refer to [DiffSynth](https://github.com/modelscope/Dif
103
  )
104
 
105
  if __name__ == "__main__":
106
- demo.launch()
 
import os
import random
import time

# Clone the repository and install dependencies.
# NOTE: this must run BEFORE the third-party imports below. The `diffsynth`
# package only becomes importable once it has been copied next to this file,
# so importing it first fails on a fresh checkout (unless it happens to be
# installed in the environment already).
os.system("git clone https://github.com/modelscope/DiffSynth-Studio.git")
os.system("cp -r DiffSynth-Studio/diffsynth ./")
os.system("pip install -r DiffSynth-Studio/requirements.txt")

import torch  # noqa: E402 - deliberately placed after the setup commands
import numpy as np  # noqa: E402
from PIL import Image  # noqa: E402
import gradio as gr  # noqa: E402
from diffsynth import save_video, ModelManager, SVDVideoPipeline  # noqa: E402
import spaces  # noqa: E402
15
 
def get_i2v_pipeline():
    """Initialize and return the image-to-video model pipeline.

    Builds a ``ModelManager`` (float16, ``cuda`` device) for the
    ``stable-video-diffusion-img2vid-xt`` and ``ExVideo-SVD-128f-v1`` model
    ids and wraps it with ``SVDVideoPipeline.from_model_manager``.

    Returns:
        The pipeline object created from the model manager.
    """
    manager_options = dict(
        torch_dtype=torch.float16,
        device="cuda",
        model_id_list=["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"],
        downloading_priority=["HuggingFace"],
    )
    model_manager = ModelManager(**manager_options)
    return SVDVideoPipeline.from_model_manager(model_manager)
26
 
27
+ # Function to sample and generate video
28
  @spaces.GPU(duration=280)
29
  def sample(image, seed, randomize_seed, motion_bucket_id, num_inference_steps):
30
  if randomize_seed:
 
42
  save_video(video, file_path, fps=30, quality=7)
43
  return file_path, seed
44
 
def crop_and_resize(image):
    """Crop the image around its center and resize it to 512x512 RGB.

    The longer side (relative to the square target) is trimmed symmetrically
    so the aspect ratio matches before resizing.

    Args:
        image: Input convertible by ``np.array`` to an ``(H, W)`` or
            ``(H, W, C)`` array, for example a PIL image.

    Returns:
        A ``PIL.Image.Image`` of size 512x512 in RGB mode.
    """
    height = 512
    width = 512
    array = np.array(image)
    # shape[:2] also works for 2-D (single-channel) arrays, where unpacking
    # three values from ``shape`` would raise ValueError.
    image_height, image_width = array.shape[:2]
    if image_height / image_width < height / width:
        cropped_width = int(image_height / height * width)
        left = (image_width - cropped_width) // 2
        array = array[:, left: left + cropped_width]
    else:
        cropped_height = int(image_width / width * height)
        top = (image_height - cropped_height) // 2
        array = array[top: top + cropped_height, :]
    # The conversion was identical in both branches, so it is shared here.
    return Image.fromarray(array).convert("RGB").resize((width, height))
62
 
# Initialize the model pipeline once, at module load.
pipe = get_i2v_pipeline()


def process_examples(image):
    """Generate a video for an example image using fixed default settings.

    Calls ``sample`` with ``seed=0``, ``randomize_seed=True``,
    ``motion_bucket_id=100`` and ``num_inference_steps=25``.

    Args:
        image: The example image forwarded to ``sample``.

    Returns:
        A ``(file_path, seed)`` tuple as returned by ``sample``.
    """
    example_settings = {
        "seed": 0,
        "randomize_seed": True,
        "motion_bucket_id": 100,
        "num_inference_steps": 25,
    }
    output_path, final_seed = sample(image, **example_settings)
    return output_path, final_seed
70
+
71
+ # Create the Gradio interface
72
  with gr.Blocks() as demo:
73
  gr.Markdown('''
74
+ # ExVideo
75
 
76
+ ExVideo is a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to achieve the generation of long videos up to 128 frames.
77
 
78
+ This is the first model we have made public. Due to limitations in computational resources, this model was trained on about 40,000 videos using 8x A100 GPUs for approximately one week. Therefore, the model may sometimes generate content that does not conform to real-world principles. Please look forward to the release of our subsequent models.
79
 
80
+ To use this model, please refer to [DiffSynth](https://github.com/modelscope/DiffSynth-Studio).
81
 
82
+ * [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
83
+ * [Source Code](https://github.com/modelscope/DiffSynth-Studio)
84
+ * [Technical report](https://arxiv.org/abs/2406.14130)
85
+ ''')
86
  with gr.Row():
87
  with gr.Column():
88
  image = gr.Image(label="Upload your image", type="pil")
 
93
  randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
94
  motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to synthesize", value=100, minimum=0, maximum=127)
95
  num_inference_steps = gr.Slider(label="Inference steps", value=25, minimum=1, maximum=50)
96
+
97
  image.upload(fn=crop_and_resize, inputs=image, outputs=image, queue=False)
98
  generate_btn.click(fn=sample, inputs=[image, seed, randomize_seed, motion_bucket_id, num_inference_steps], outputs=[video, seed], api_name="video")
99
  gr.Examples(
 
111
  )
112
 
113
  if __name__ == "__main__":
114
+ demo.launch()