Update app.py
app.py
CHANGED
@@ -73,7 +73,7 @@ import torch
 from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
 from diffusers.utils import export_to_gif
 import re
-def text2vid(input_text):
+def text2vid(input_text, desc="4k, high resolution"):
     # Split the input text with a regular expression and extract the sentences
     sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
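The change to text2vid is the new desc keyword, which lets callers append their own style tags to every prompt instead of relying on a hardcoded suffix. A minimal usage sketch (the storyboard string is invented; the regex expects numbered lines of the form "[n] ..."):

storyboard = "[1] A sailboat drifts across a calm bay at dawn\n[2] Gulls circle the mast"

frames = text2vid(storyboard)                                 # default "4k, high resolution" styling
frames = text2vid(storyboard, desc="watercolor, soft light")  # per-call style override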
@@ -95,11 +95,10 @@ def text2vid(input_text):
 
     all_frames = []  # stores every frame from every sentence
 
-    # Loop over each sentence, generate the animation, and export it as a GIF
     for index, sentence in enumerate(sentences):
         output = pipe(
-            #prompt=sentence + ", 4k, high resolution",
-            prompt=sentence + ", 4k, high resolution",
+            #prompt=sentence + ", " + desc,
+            prompt=sentence + ", " + desc,
             negative_prompt="bad quality, worse quality, low resolution",
             num_frames=24,
             guidance_scale=2.0,
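For reference, the sentence-splitting regex captures the text after each [n] marker, one entry per line. A standalone check (the sample text is invented for illustration):

import re

sample = "[1] A cat naps on a windowsill\n[2] Rain streaks the glass"
print(re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', sample))
# -> ['A cat naps on a windowsill', 'Rain streaks the glass']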
@@ -111,7 +110,39 @@ def text2vid(input_text):
 
     return all_frames
 
+def text2vid_pro(input_text, desc="4k, high resolution"):
+    # Split the input text with a regular expression and extract the sentences
+    sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
+
+    # Load the motion adapter and the AnimateDiff pipeline
+    adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM", config_file="wangfuyun/AnimateLCM/config.json", torch_dtype=torch.float16)
+    pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
+    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+
+    # Load the LoRA weights
+    pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+
+    # Set the adapter and enable the memory optimizations
+    try:
+        pipe.set_adapters(["lcm-lora"], [0.8])
+    except ValueError as e:
+        print("Ignoring the error:", str(e))
+    pipe.enable_vae_slicing()
+    pipe.enable_model_cpu_offload()
 
+    # Loop over each sentence, generate the animation, and export it as a GIF
+    for index, sentence in enumerate(sentences):
+        output = pipe(
+            #prompt=sentence + "," + desc,
+            prompt=sentence + ", cartoon",
+            negative_prompt="bad quality, worse quality, low resolution",
+            num_frames=24,
+            guidance_scale=2.0,
+            num_inference_steps=6,
+            generator=torch.Generator("cpu").manual_seed(0)
+        )
+        frames = output.frames[0]
+        export_to_gif(frames, f"{index+1}.gif")
 
 
 def text2text_A(user_input):
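Two details of text2vid_pro are worth noting: it returns nothing, so its only output is the side effect of writing 1.gif, 2.gif, ... into the working directory (which video_generate_pro, added further down, later collects with a glob), and the desc argument is currently ignored because the line that concatenates it is commented out in favor of a hardcoded ", cartoon" suffix. A usage sketch under those assumptions (the storyboard string is invented):

storyboard = "[1] A rocket lifts off\n[2] The booster separates above the clouds"
text2vid_pro(storyboard)  # writes 1.gif, 2.gif, ... as a side effect; desc has no effect yet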
@@ -156,6 +187,42 @@ def text2audio(text_input, duration_seconds):
     print(duration_seconds)
     return audio_values[0, 0].numpy(), model.config.audio_encoder.sampling_rate
 
+
+def text2audio_pro(text_input, duration_seconds):
+    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+    inputs = processor(text=[text_input], padding=True, return_tensors="pt")
+
+    # Calculate max_new_tokens based on the desired duration
+    max_new_tokens = int((duration_seconds / 5) * 256)
+
+    audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)
+
+    # Save the audio file
+    scipy.io.wavfile.write("bgm.wav", rate=model.config.audio_encoder.sampling_rate, data=audio_values[0, 0].numpy())
+
+
+from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
+import glob
+from transformers import AutoProcessor, MusicgenForConditionalGeneration
+import scipy.io.wavfile
+
+def video_generate_pro(img2text_input=" "):
+    # Set the video frame rate
+    frame_rate = 24  # change this value for a different frame rate
+
+    # Collect all GIF files, assumed to be in the same folder and sorted by name
+    gif_files = sorted(glob.glob('./*.gif'))
+
+    # Build the clip list, one VideoFileClip per GIF file
+    clips = [VideoFileClip(gif) for gif in gif_files]
+
+    # Concatenate the video clips
+    final_clip = concatenate_videoclips(clips, method="compose")
+
+    # Write the output video file
+    final_clip.write_videofile('output_video.mp4', codec='libx264')
+
 # Define the function that generates the final video
 def result_generate(video_clip, audio_clip):
     video = video_clip.set_audio(audio_clip)
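The token budget in text2audio_pro follows the rule of thumb that MusicGen emits roughly 256 tokens per 5 seconds of audio (its EnCodec tokenizer runs at about 50 steps per second, so the estimate slightly overshoots the requested duration). A worked example:

duration_seconds = 15
max_new_tokens = int((duration_seconds / 5) * 256)  # 3 * 256 = 768 tokens for roughly 15 s

Also note that frame_rate in video_generate_pro is assigned but never used; for it to take effect, write_videofile would need an explicit fps=frame_rate argument.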
@@ -164,13 +231,27 @@ def result_generate(video_clip, audio_clip):
     video_buffer.seek(0)
     return video_buffer
 
-def generate_video(image):
+from moviepy.editor import VideoFileClip, AudioFileClip
+def result_generate_pro():
+    # Load the video file
+    video = VideoFileClip("output_video.mp4")
+
+    # Load the audio file
+    audio = AudioFileClip("bgm.wav")
+
+    # Set the audio as the video's audio track
+    video = video.set_audio(audio)
+
+    # Export the new video file
+    video.write_videofile("result.mp4", codec="libx264", audio_codec="aac")
+
+def generate_video_basic(image, desc):
     # Get the image description
     text = img2text(image)
     # Generate a detailed text description of the scene
     sentences = text2text(text)
     # Generate the video frames
-    video_frames = text2vid(sentences)
+    video_frames = text2vid(sentences, desc)
 
     # Convert the video frames to numpy arrays
     video_frames = [np.array(frame) for frame in video_frames]
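result_generate_pro communicates entirely through fixed filenames: it expects output_video.mp4 and bgm.wav on disk and writes result.mp4. The same muxing step with explicit paths, as a hypothetical helper that is not part of this commit:

from moviepy.editor import VideoFileClip, AudioFileClip

def mux_audio(video_path, audio_path, out_path):
    # Attach an audio track to a video and re-encode it (hypothetical helper).
    clip = VideoFileClip(video_path).set_audio(AudioFileClip(audio_path))
    clip.write_videofile(out_path, codec="libx264", audio_codec="aac")

mux_audio("output_video.mp4", "bgm.wav", "result.mp4")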
@@ -199,7 +280,22 @@ def generate_video(image):
 
     # The function now returns the path of the video file; reading and deleting the data is no longer needed
     return video_file_path
-
+
+def generate_video_pro(image, desc):
+    # Get the image description
+    text = img2text(image)
+    sentences = text2text(text)  # generate structured sentences from the text
+    text2vid_pro(sentences, desc)  # create the video sequence from the sentences
+    video_generate_pro()  # create the video file
+    video = VideoFileClip("output_video.mp4")
+    duration = video.duration
+    print(duration)
+    audio_text = text2text_A(text)
+    text2audio_pro(audio_text, duration)
+    result_generate_pro()
+    return "result.mp4"
+
+
 import traceback
 
 def safe_generate_video(image):
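As written, generate_video_pro returns a single path, while the Pro tab below declares two outputs (a video and an error box); generate_video_basic has the same mismatch. A hypothetical wrapper in the style of the file's existing safe_generate_video that feeds both outputs:

def generate_video_pro_safe(image, desc):
    # Return a (video, error) pair so both declared Gradio outputs are filled (hypothetical).
    try:
        return generate_video_pro(image, desc), ""
    except Exception as e:
        return None, str(e)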
@@ -214,18 +310,38 @@ def safe_generate_video(image):
     return None, error_msg
 
 # Define the Gradio interface
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+with gr.Blocks() as demo:
+    gr.Markdown("Upload an image and provide a description to generate a video.")
+    with gr.Tab("Basic Version"):
+        with gr.Row():
+            image_input = gr.Image(type="pil")
+            description_input = gr.Textbox(label="Description", placeholder="Enter description here", lines=2)
+        with gr.Row():
+            video_output = gr.Video(label="Generated Video")
+            error_output = gr.Textbox(label="Error Messages", placeholder="No errors", lines=5)
+        gr.Interface(
+            fn=generate_video_basic,
+            inputs=[image_input, description_input],
+            outputs=[video_output, error_output],
+            title="Basic Version Video Generation",
+            description="Upload an image and some descriptions to generate a video in the basic version. Any errors will be displayed below.",
+            theme="soft"
+        )
+    with gr.Tab("Pro Version"):
+        with gr.Row():
+            image_input = gr.Image(type="pil")
+            description_input = gr.Textbox(label="Description", placeholder="Enter description here", lines=2)
+        with gr.Row():
+            video_output = gr.Video(label="Generated Video")
+            error_output = gr.Textbox(label="Error Messages", placeholder="No errors", lines=5)
+        gr.Interface(
+            fn=generate_video_pro,
+            inputs=[image_input, description_input],
+            outputs=[video_output, error_output],
+            title="Pro Version Video Generation",
+            description="Upload an image and some descriptions to generate a video in the pro version. Any errors will be displayed below.",
+            theme="soft"
+        )
+
+demo.launch()
 
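Instantiating a full gr.Interface inside a gr.Blocks tab, on top of components that were already laid out manually, mixes Gradio's two APIs in an unusual way; the more common Blocks idiom is to wire the components to a button. A sketch of the Basic tab in that style (same components as above, with an assumed gr.Button added):

with gr.Tab("Basic Version"):
    with gr.Row():
        image_input = gr.Image(type="pil")
        description_input = gr.Textbox(label="Description", lines=2)
    with gr.Row():
        video_output = gr.Video(label="Generated Video")
        error_output = gr.Textbox(label="Error Messages", lines=5)
    generate_btn = gr.Button("Generate")
    generate_btn.click(
        fn=generate_video_basic,
        inputs=[image_input, description_input],
        outputs=[video_output, error_output],
    )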