Update app.py
app.py
CHANGED
@@ -4,19 +4,24 @@ from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import torch
 from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
-from moviepy.editor import …
+from moviepy.editor import concatenate_videoclips, AudioFileClip
+from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
 from transformers import AutoProcessor, MusicgenForConditionalGeneration
 import scipy.io.wavfile
 import re
-…
+import numpy as np
+import os
+import io
+import tempfile
 
 # Image-to-text function
 def img2text(image):
-    processor = BlipProcessor.from_pretrained("…
-    model = BlipForConditionalGeneration.from_pretrained("…
+    processor = BlipProcessor.from_pretrained("blip-image-captioning-large")
+    model = BlipForConditionalGeneration.from_pretrained("blip-image-captioning-large")
     inputs = processor(image, return_tensors="pt")
     out = model.generate(**inputs)
     caption = processor.decode(out[0], skip_special_tokens=True)
+    print(caption)
     return caption
 
 # Text-generation function
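
The new `img2text` loads its checkpoint from a bare directory name, so it only resolves if a local `blip-image-captioning-large` folder sits next to `app.py`; the corresponding Hub checkpoint is `Salesforce/blip-image-captioning-large`. For reference, a minimal self-contained sketch of the captioning step under that Hub-checkpoint assumption (the input filename is hypothetical):

from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Hub id assumed here; the commit itself points at a local directory.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

image = Image.open("example.jpg").convert("RGB")  # hypothetical input image
inputs = processor(image, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(out[0], skip_special_tokens=True))
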
@@ -58,20 +63,84 @@ def text2text(user_input):
     response = requests.post(f"{base_url}/chat/completions", headers=headers, json=data)
     response.raise_for_status()
     completion = response.json()
+    print(completion['choices'][0]['message']['content'])
     return completion['choices'][0]['message']['content']
 
+
+
+
+import torch
+from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
+from diffusers.utils import export_to_gif
+import re
+def text2vid(input_text):
+    # Split the input text with a regex and extract the sentences
+    sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
+
+    # Load the motion adapter and the AnimateDiff pipeline
+    adapter = MotionAdapter.from_pretrained("/home/u2022211776/jupyterlab/AnimateLCM", config_file="/home/u2022211776/jupyterlab/AnimateLCM/config.json", torch_dtype=torch.float16)
+    pipe = AnimateDiffPipeline.from_pretrained("/home/u2022211776/jupyterlab/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
+    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+
+    # Load the LoRA weights
+    pipe.load_lora_weights("/home/u2022211776/jupyterlab/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+
+    # Set the adapter and enable memory optimizations
+    try:
+        pipe.set_adapters(["lcm-lora"], [0.8])
+    except ValueError as e:
+        print("Ignoring the error:", str(e))
+    pipe.enable_vae_slicing()
+    pipe.enable_model_cpu_offload()
+
+    all_frames = []  # Holds the frames of every sentence
+
+    # Loop over each sentence and generate its animation
+    for index, sentence in enumerate(sentences):
+        output = pipe(
+            #prompt=sentence + ", 4k, high resolution",
+            prompt=sentence + ", cartoon",
+            negative_prompt="bad quality, worse quality, low resolution",
+            num_frames=24,
+            guidance_scale=2.0,
+            num_inference_steps=6,
+            generator=torch.Generator("cpu").manual_seed(0)
+        )
+        frames = output.frames[0]
+        all_frames.extend(frames)  # Append this sentence's frames to all_frames
+
+    return all_frames
+
+
+
+
 # Text-to-video function
 def text2vid(input_text):
     sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
-    adapter = MotionAdapter.from_pretrained("…
-    pipe = AnimateDiffPipeline.from_pretrained("…
-    …
+    adapter = MotionAdapter.from_pretrained("AnimateLCM", config_file="AnimateLCM/config.json", torch_dtype=torch.float16)
+    pipe = AnimateDiffPipeline.from_pretrained("epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
+    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+    pipe.load_lora_weights("AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+    try:
+        pipe.set_adapters(["lcm-lora"], [0.8])
+    except ValueError as e:
+        print("Ignoring the error:", str(e))
+    pipe.enable_vae_slicing()
+    pipe.enable_model_cpu_offload()
+
+    video_frames = []
     for sentence in sentences:
-        …
-        …
-        …
-        …
-        …
+        output = pipe(
+            prompt=sentence + ", 4k, high resolution",
+            negative_prompt="bad quality, worse quality, low resolution",
+            num_frames=24,
+            guidance_scale=2.0,
+            num_inference_steps=6,
+            generator=torch.Generator("cpu").manual_seed(0)
+        )
+        video_frames.extend(output.frames[0])
+
+    return video_frames
 
 def text2text_A(user_input):
     # Set the API key and base URL
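
Two things are worth flagging in this hunk. First, the new file contains two `def text2vid` definitions; Python binds the name to whichever definition runs last, so the second one (Hub-relative paths, ", 4k, high resolution" prompt suffix) shadows the first (absolute /home/u2022211776/... paths, ", cartoon" suffix), and the first one's `export_to_gif` import goes unused. Second, the sampler settings (`guidance_scale=2.0`, `num_inference_steps=6`) are in the range typically recommended for AnimateLCM with the LCM scheduler. Both variants rely on the same regex to pull numbered sentences out of the LLM reply; a small sketch of the format it expects (the sample text is made up):

import re

# The pattern expects the LLM to answer with numbered lines like "[1] ...".
sample = "[1] A corgi runs across a sunny meadow.\n[2] The corgi splashes into a creek."
sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', sample)
print(sentences)
# ['A corgi runs across a sunny meadow.', 'The corgi splashes into a creek.']
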
@@ -107,13 +176,13 @@ def text2text_A(user_input):
 
 # Text-to-audio function
 def text2audio(text_input, duration_seconds):
-    processor = AutoProcessor.from_pretrained("…
-    model = MusicgenForConditionalGeneration.from_pretrained("…
+    processor = AutoProcessor.from_pretrained("musicgen-small")
+    model = MusicgenForConditionalGeneration.from_pretrained("musicgen-small")
     inputs = processor(text=[text_input], padding=True, return_tensors="pt")
     max_new_tokens = int((duration_seconds / 5) * 256)
     audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)
-    …
-    return …
+    print(duration_seconds)
+    return audio_values[0, 0].numpy(), model.config.audio_encoder.sampling_rate
 
 # Function that assembles the final video
 def result_generate(video_clip, audio_clip):
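
The `max_new_tokens` arithmetic encodes MusicGen's generation rate: 256 new tokens correspond to roughly five seconds of audio (about 50 tokens per second), so `(duration_seconds / 5) * 256` sizes the token budget to the requested duration. As with BLIP, `musicgen-small` is a bare local directory name whose public Hub id is `facebook/musicgen-small`. A standalone sketch of this step under that Hub-checkpoint assumption (prompt and output filename are made up):

import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# Hub id assumed; the commit loads a local "musicgen-small" directory instead.
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

inputs = processor(text=["calm lo-fi piano over soft rain"], padding=True, return_tensors="pt")
duration_seconds = 10
max_new_tokens = int((duration_seconds / 5) * 256)  # ~256 tokens per 5 s of audio
audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)

# MusicGen reports its output sampling rate on the audio-encoder config.
sampling_rate = model.config.audio_encoder.sampling_rate
scipy.io.wavfile.write("music.wav", rate=sampling_rate, data=audio_values[0, 0].numpy())
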
@@ -123,18 +192,50 @@ def result_generate(video_clip, audio_clip):
     video_buffer.seek(0)
     return video_buffer
 
-# Tie all the steps together in the main function
 def generate_video(image):
+    # Get the image caption
     text = img2text(image)
+    # Generate detailed scene descriptions
     sentences = text2text(text)
-    …
-    …
-    …
+    # Generate the video frames
+    video_frames = text2vid(sentences)
+
+    # Convert the video frames to numpy arrays
+    video_frames = [np.array(frame) for frame in video_frames]
+
+    # Build the video clip
+    video_clip = ImageSequenceClip(video_frames, fps=24)
+    video_duration = video_clip.duration
+
+    # Generate the audio data
     audio_text = text2text_A(text)
-    …
-    …
-    …
+    audio_data, audio_rate = text2audio(audio_text, video_duration)
+
+    # Write the audio data to a temporary file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
+        scipy.io.wavfile.write(tmpfile, audio_rate, audio_data)
+        tmpfile_path = tmpfile.name
+
+    # Create an AudioFileClip object
+    audio_clip = AudioFileClip(tmpfile_path)
 
+    # Attach the audio to the video
+    video_clip = video_clip.set_audio(audio_clip)
+    print("audio_done")
+
+    # Write the video to a temporary file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmpfile:
+        video_clip.write_videofile(tmpfile.name, codec="libx264", audio_codec="aac")
+        video_file_path = tmpfile.name
+
+    # Read back the temporary file's data, then delete it
+    with open(video_file_path, 'rb') as f:
+        video_data = f.read()
+    os.remove(video_file_path)
+    os.remove(tmpfile_path)
+    print("video_done")
+    return video_data
+
 # Define the Gradio interface
 interface = gr.Interface(
     fn=lambda img: generate_video(img),
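
In the rewritten `generate_video`, each sentence contributes 24 frames and the clip is assembled at `fps=24`, so every sentence maps to roughly one second of video, and `text2audio` is asked for exactly `video_clip.duration` seconds of music. (The `fn=lambda img: generate_video(img)` wrapper in the Gradio interface is equivalent to passing `generate_video` directly.) Below is an isolated sketch of just the moviepy muxing step, with synthetic stand-ins for the model outputs; the random frames, sine tone, and output filename are made up:

import os
import tempfile

import numpy as np
import scipy.io.wavfile
from moviepy.editor import AudioFileClip
from moviepy.video.io.ImageSequenceClip import ImageSequenceClip

# 48 synthetic RGB frames at 24 fps -> a 2-second clip.
frames = [np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8) for _ in range(48)]
video_clip = ImageSequenceClip(frames, fps=24)

# A 440 Hz sine tone lasting exactly video_clip.duration seconds.
rate = 32000
t = np.linspace(0, video_clip.duration, int(rate * video_clip.duration), endpoint=False)
audio = (0.2 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
    scipy.io.wavfile.write(f, rate, audio)
    wav_path = f.name

# set_audio is the moviepy 1.x API used by the commit.
video_clip = video_clip.set_audio(AudioFileClip(wav_path))
video_clip.write_videofile("out.mp4", codec="libx264", audio_codec="aac")
os.remove(wav_path)
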