HongcanGuo committed
Commit 02556a5 · verified
1 Parent(s): 7e348d8

Update app.py

Files changed (1)
  1. app.py +124 -23
app.py CHANGED
@@ -4,19 +4,24 @@ from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
- from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
+ from moviepy.editor import concatenate_videoclips, AudioFileClip
+ from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import scipy.io.wavfile
import re
- from io import BytesIO
+ import numpy as np
+ import os
+ import io
+ import tempfile

# Define the image-to-text function
def img2text(image):
-     processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-     model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+     processor = BlipProcessor.from_pretrained("blip-image-captioning-large")
+     model = BlipForConditionalGeneration.from_pretrained("blip-image-captioning-large")
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
+     print(caption)
    return caption

# Define the text generation function
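Note on the captioning change above: dropping the "Salesforce/" prefix means from_pretrained now appears to expect a local "blip-image-captioning-large" checkout next to app.py rather than the Hub repo. A minimal sketch of the equivalent Hub-based load (the id is the one the pre-commit code used), illustrative only and not part of the commit:

    # Sketch: load BLIP from the Hugging Face Hub instead of a local folder.
    from transformers import BlipProcessor, BlipForConditionalGeneration

    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")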
@@ -58,20 +63,84 @@ def text2text(user_input):
    response = requests.post(f"{base_url}/chat/completions", headers=headers, json=data)
    response.raise_for_status()
    completion = response.json()
+     print(completion['choices'][0]['message']['content'])
    return completion['choices'][0]['message']['content']

+
+
+
+ import torch
+ from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
+ from diffusers.utils import export_to_gif
+ import re
+ def text2vid(input_text):
+     # Split the input text with a regular expression and extract the sentences
+     sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
+
+     # Load the motion adapter and the AnimateDiff pipeline
+     adapter = MotionAdapter.from_pretrained("/home/u2022211776/jupyterlab/AnimateLCM", config_file="/home/u2022211776/jupyterlab/AnimateLCM/config.json", torch_dtype=torch.float16)
+     pipe = AnimateDiffPipeline.from_pretrained("/home/u2022211776/jupyterlab/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
+     pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+
+     # Load the LoRA weights
+     pipe.load_lora_weights("/home/u2022211776/jupyterlab/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+
+     # Set the adapter and enable memory optimizations
+     try:
+         pipe.set_adapters(["lcm-lora"], [0.8])
+     except ValueError as e:
+         print("Ignoring the error:", str(e))
+     pipe.enable_vae_slicing()
+     pipe.enable_model_cpu_offload()
+
+     all_frames = []  # Store the frames of all sentences
+
+     # Loop over each sentence, generate its animation and collect the frames
+     for index, sentence in enumerate(sentences):
+         output = pipe(
+             #prompt=sentence + ", 4k, high resolution",
+             prompt=sentence + ", cartoon",
+             negative_prompt="bad quality, worse quality, low resolution",
+             num_frames=24,
+             guidance_scale=2.0,
+             num_inference_steps=6,
+             generator=torch.Generator("cpu").manual_seed(0)
+         )
+         frames = output.frames[0]
+         all_frames.extend(frames)  # Add each sentence's frames to all_frames
+
+     return all_frames
+
+
+
+
# Define the text-to-video function
def text2vid(input_text):
    sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
-     adapter = MotionAdapter.from_pretrained("your-motion-adapter")
-     pipe = AnimateDiffPipeline.from_pretrained("your-diffusion-model", motion_adapter=adapter)
-     video_clips = []
+     adapter = MotionAdapter.from_pretrained("AnimateLCM", config_file="AnimateLCM/config.json", torch_dtype=torch.float16)
+     pipe = AnimateDiffPipeline.from_pretrained("epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
+     pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+     pipe.load_lora_weights("AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+     try:
+         pipe.set_adapters(["lcm-lora"], [0.8])
+     except ValueError as e:
+         print("Ignoring the error:", str(e))
+     pipe.enable_vae_slicing()
+     pipe.enable_model_cpu_offload()
+
+     video_frames = []
    for sentence in sentences:
-         frames = pipe(sentence, num_inference_steps=50, guidance_scale=7.5)
-         video_clip = frames_to_video_clip(frames)  # Assume this function converts frames to a video clip
-         video_clips.append(video_clip)
-     final_clip = concatenate_videoclips(video_clips, method="compose")
-     return final_clip
+         output = pipe(
+             prompt=sentence + ", 4k, high resolution",
+             negative_prompt="bad quality, worse quality, low resolution",
+             num_frames=24,
+             guidance_scale=2.0,
+             num_inference_steps=6,
+             generator=torch.Generator("cpu").manual_seed(0)
+         )
+         video_frames.extend(output.frames[0])
+
+     return video_frames

def text2text_A(user_input):
    # Set the API key and base URL
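Two things stand out in the hunk above. The file now contains two def text2vid definitions, and Python keeps only the second one (the variant loading "AnimateLCM" and "epiCRealism" from relative paths), so the earlier variant with the absolute /home/u2022211776/... paths is shadowed dead code. Also, export_to_gif is imported but never called; since the pipeline returns PIL frames, it could be used to preview a run on its own. A minimal sketch under those assumptions, with an illustrative prompt list and output path:

    # Sketch: preview the frames returned by text2vid as a GIF (prompt and path are illustrative).
    from diffusers.utils import export_to_gif

    frames = text2vid("[1] A dog runs along the beach.\n[2] The dog shakes off the water.\n")
    export_to_gif(frames, "preview.gif")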
@@ -107,13 +176,13 @@ def text2text_A(user_input):

# Define the text-to-audio function
def text2audio(text_input, duration_seconds):
-     processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
-     model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+     processor = AutoProcessor.from_pretrained("musicgen-small")
+     model = MusicgenForConditionalGeneration.from_pretrained("musicgen-small")
    inputs = processor(text=[text_input], padding=True, return_tensors="pt")
    max_new_tokens = int((duration_seconds / 5) * 256)
    audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)
-     audio_clip = numpy_array_to_audio_clip(audio_values.numpy(), rate=model.config.audio_encoder.sampling_rate)  # Assume this function converts numpy array to audio clip
-     return audio_clip
+     print(duration_seconds)
+     return audio_values[0, 0].numpy(), model.config.audio_encoder.sampling_rate

# Define the result-video generation function
def result_generate(video_clip, audio_clip):
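The max_new_tokens formula in text2audio above follows MusicGen's rough budget of 256 new tokens per ~5 seconds of generated audio (about 51 tokens per second), so the duration measured from the video clip maps directly onto the token count. A worked example with an illustrative duration:

    # Sketch: the token-budget arithmetic used in text2audio.
    duration_seconds = 12          # illustrative value, normally taken from the video clip
    max_new_tokens = int((duration_seconds / 5) * 256)
    print(max_new_tokens)          # 614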
@@ -123,18 +192,50 @@ def result_generate(video_clip, audio_clip):
    video_buffer.seek(0)
    return video_buffer

- # Combine all steps in the main function
def generate_video(image):
+     # Get the image caption
    text = img2text(image)
+     # Generate a detailed text description of the scenes
    sentences = text2text(text)
-     final_video_clip = text2vid(sentences)
-     video = VideoFileClip(final_video_clip)  # Assumes final_video_clip is a path or BytesIO object
-     duration = video.duration
+     # Generate the video frames
+     video_frames = text2vid(sentences)
+
+     # Convert the video frames to numpy arrays
+     video_frames = [np.array(frame) for frame in video_frames]
+
+     # Create the video clip
+     video_clip = ImageSequenceClip(video_frames, fps=24)
+     video_duration = video_clip.duration
+
+     # Generate the audio data
    audio_text = text2text_A(text)
-     audio_clip = text2audio(audio_text, duration)
-     result_video = result_generate(final_video_clip, audio_clip)
-     return result_video
+     audio_data, audio_rate = text2audio(audio_text, video_duration)
+
+     # Write the audio data to a temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
+         scipy.io.wavfile.write(tmpfile, audio_rate, audio_data)
+         tmpfile_path = tmpfile.name
+
+     # Create an AudioFileClip object
+     audio_clip = AudioFileClip(tmpfile_path)

+     # Add the audio to the video
+     video_clip = video_clip.set_audio(audio_clip)
+     print("audio_done")
+
+     # Write the video to a temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmpfile:
+         video_clip.write_videofile(tmpfile.name, codec="libx264", audio_codec="aac")
+         video_file_path = tmpfile.name
+
+     # Read the temporary file's data and delete the temporary files
+     with open(video_file_path, 'rb') as f:
+         video_data = f.read()
+     os.remove(video_file_path)
+     os.remove(tmpfile_path)
+     print("video_done")
+     return video_data
+
# Define the Gradio interface
interface = gr.Interface(
    fn=lambda img: generate_video(img),
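Since generate_video now returns the raw bytes of the rendered MP4 rather than a moviepy clip, callers outside the Gradio interface have to write those bytes to disk themselves. A minimal sketch that runs the whole pipeline on a local image; the file names are illustrative:

    # Sketch: run the pipeline directly and save the returned MP4 bytes.
    from PIL import Image

    video_bytes = generate_video(Image.open("input.jpg"))
    with open("output.mp4", "wb") as f:
        f.write(video_bytes)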