HongcanGuo committed
Commit 02556a5 · verified
1 Parent(s): 7e348d8

Update app.py

Files changed (1)
  1. app.py +124 -23
app.py CHANGED
@@ -4,19 +4,24 @@ from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
- from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
+ from moviepy.editor import concatenate_videoclips, AudioFileClip
+ from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import scipy.io.wavfile
import re
- from io import BytesIO
+ import numpy as np
+ import os
+ import io
+ import tempfile

# Define the image-to-text function
def img2text(image):
-     processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-     model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+     processor = BlipProcessor.from_pretrained("blip-image-captioning-large")
+     model = BlipForConditionalGeneration.from_pretrained("blip-image-captioning-large")
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
+     print(caption)
    return caption

# Define the text generation function
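Note on the captioning change above: dropping the "Salesforce/" prefix means from_pretrained now appears to expect a local "blip-image-captioning-large" checkout next to app.py rather than the Hub repo. A minimal sketch of the equivalent Hub-based load (the id is the one the pre-commit code used), illustrative only and not part of the commit:

    # Sketch: load BLIP from the Hugging Face Hub instead of a local folder.
    from transformers import BlipProcessor, BlipForConditionalGeneration

    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")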
@@ -58,20 +63,84 @@ def text2text(user_input):
    response = requests.post(f"{base_url}/chat/completions", headers=headers, json=data)
    response.raise_for_status()
    completion = response.json()
+     print(completion['choices'][0]['message']['content'])
    return completion['choices'][0]['message']['content']

+
+
+
+ import torch
+ from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
+ from diffusers.utils import export_to_gif
+ import re
+ def text2vid(input_text):
+     # Split the input text with a regular expression and extract the sentences
+     sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
+
+     # Load the motion adapter and the AnimateDiff pipeline
+     adapter = MotionAdapter.from_pretrained("/home/u2022211776/jupyterlab/AnimateLCM", config_file="/home/u2022211776/jupyterlab/AnimateLCM/config.json", torch_dtype=torch.float16)
+     pipe = AnimateDiffPipeline.from_pretrained("/home/u2022211776/jupyterlab/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
+     pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+
+     # Load the LoRA weights
+     pipe.load_lora_weights("/home/u2022211776/jupyterlab/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+
+     # Set the adapter and enable memory optimizations
+     try:
+         pipe.set_adapters(["lcm-lora"], [0.8])
+     except ValueError as e:
+         print("Ignoring the error:", str(e))
+     pipe.enable_vae_slicing()
+     pipe.enable_model_cpu_offload()
+
+     all_frames = []  # Store the frames of all sentences
+
+     # Loop over each sentence, generate its animation and collect the frames
+     for index, sentence in enumerate(sentences):
+         output = pipe(
+             #prompt=sentence + ", 4k, high resolution",
+             prompt=sentence + ", cartoon",
+             negative_prompt="bad quality, worse quality, low resolution",
+             num_frames=24,
+             guidance_scale=2.0,
+             num_inference_steps=6,
+             generator=torch.Generator("cpu").manual_seed(0)
+         )
+         frames = output.frames[0]
+         all_frames.extend(frames)  # Add each sentence's frames to all_frames
+
+     return all_frames
+
+
+
+
# Define the text-to-video function
def text2vid(input_text):
    sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
-     adapter = MotionAdapter.from_pretrained("your-motion-adapter")
-     pipe = AnimateDiffPipeline.from_pretrained("your-diffusion-model", motion_adapter=adapter)
-     video_clips = []
+     adapter = MotionAdapter.from_pretrained("AnimateLCM", config_file="AnimateLCM/config.json", torch_dtype=torch.float16)
+     pipe = AnimateDiffPipeline.from_pretrained("epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
+     pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+     pipe.load_lora_weights("AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+     try:
+         pipe.set_adapters(["lcm-lora"], [0.8])
+     except ValueError as e:
+         print("Ignoring the error:", str(e))
+     pipe.enable_vae_slicing()
+     pipe.enable_model_cpu_offload()
+
+     video_frames = []
    for sentence in sentences:
-         frames = pipe(sentence, num_inference_steps=50, guidance_scale=7.5)
-         video_clip = frames_to_video_clip(frames)  # Assume this function converts frames to a video clip
-         video_clips.append(video_clip)
-     final_clip = concatenate_videoclips(video_clips, method="compose")
-     return final_clip
+         output = pipe(
+             prompt=sentence + ", 4k, high resolution",
+             negative_prompt="bad quality, worse quality, low resolution",
+             num_frames=24,
+             guidance_scale=2.0,
+             num_inference_steps=6,
+             generator=torch.Generator("cpu").manual_seed(0)
+         )
+         video_frames.extend(output.frames[0])
+
+     return video_frames

def text2text_A(user_input):
    # Set the API key and base URL
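Two things stand out in the hunk above. The file now contains two def text2vid definitions, and Python keeps only the second one (the variant loading "AnimateLCM" and "epiCRealism" from relative paths), so the earlier variant with the absolute /home/u2022211776/... paths is shadowed dead code. Also, export_to_gif is imported but never called; since the pipeline returns PIL frames, it could be used to preview a run on its own. A minimal sketch under those assumptions, with an illustrative prompt list and output path:

    # Sketch: preview the frames returned by text2vid as a GIF (prompt and path are illustrative).
    from diffusers.utils import export_to_gif

    frames = text2vid("[1] A dog runs along the beach.\n[2] The dog shakes off the water.\n")
    export_to_gif(frames, "preview.gif")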
@@ -107,13 +176,13 @@ def text2text_A(user_input):

# Define the text-to-audio function
def text2audio(text_input, duration_seconds):
-     processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
-     model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+     processor = AutoProcessor.from_pretrained("musicgen-small")
+     model = MusicgenForConditionalGeneration.from_pretrained("musicgen-small")
    inputs = processor(text=[text_input], padding=True, return_tensors="pt")
    max_new_tokens = int((duration_seconds / 5) * 256)
    audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)
-     audio_clip = numpy_array_to_audio_clip(audio_values.numpy(), rate=model.config.audio_encoder.sampling_rate)  # Assume this function converts numpy array to audio clip
-     return audio_clip
+     print(duration_seconds)
+     return audio_values[0, 0].numpy(), model.config.audio_encoder.sampling_rate

# Define the result-video generation function
def result_generate(video_clip, audio_clip):
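The max_new_tokens formula in text2audio above follows MusicGen's rough budget of 256 new tokens per ~5 seconds of generated audio (about 51 tokens per second), so the duration measured from the video clip maps directly onto the token count. A worked example with an illustrative duration:

    # Sketch: the token-budget arithmetic used in text2audio.
    duration_seconds = 12          # illustrative value, normally taken from the video clip
    max_new_tokens = int((duration_seconds / 5) * 256)
    print(max_new_tokens)          # 614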
@@ -123,18 +192,50 @@ def result_generate(video_clip, audio_clip):
    video_buffer.seek(0)
    return video_buffer

- # Combine all steps in the main function
def generate_video(image):
+     # Get the image caption
    text = img2text(image)
+     # Generate a detailed text description of the scenes
    sentences = text2text(text)
-     final_video_clip = text2vid(sentences)
-     video = VideoFileClip(final_video_clip)  # Assumes final_video_clip is a path or BytesIO object
-     duration = video.duration
+     # Generate the video frames
+     video_frames = text2vid(sentences)
+
+     # Convert the video frames to numpy arrays
+     video_frames = [np.array(frame) for frame in video_frames]
+
+     # Create the video clip
+     video_clip = ImageSequenceClip(video_frames, fps=24)
+     video_duration = video_clip.duration
+
+     # Generate the audio data
    audio_text = text2text_A(text)
-     audio_clip = text2audio(audio_text, duration)
-     result_video = result_generate(final_video_clip, audio_clip)
-     return result_video
+     audio_data, audio_rate = text2audio(audio_text, video_duration)
+
+     # Write the audio data to a temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
+         scipy.io.wavfile.write(tmpfile, audio_rate, audio_data)
+         tmpfile_path = tmpfile.name
+
+     # Create an AudioFileClip object
+     audio_clip = AudioFileClip(tmpfile_path)

+     # Add the audio to the video
+     video_clip = video_clip.set_audio(audio_clip)
+     print("audio_done")
+
+     # Write the video to a temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmpfile:
+         video_clip.write_videofile(tmpfile.name, codec="libx264", audio_codec="aac")
+         video_file_path = tmpfile.name
+
+     # Read the temporary file's data and delete the temporary files
+     with open(video_file_path, 'rb') as f:
+         video_data = f.read()
+     os.remove(video_file_path)
+     os.remove(tmpfile_path)
+     print("video_done")
+     return video_data
+
# Define the Gradio interface
interface = gr.Interface(
    fn=lambda img: generate_video(img),
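Since generate_video now returns the raw bytes of the rendered MP4 rather than a moviepy clip, callers outside the Gradio interface have to write those bytes to disk themselves. A minimal sketch that runs the whole pipeline on a local image; the file names are illustrative:

    # Sketch: run the pipeline directly and save the returned MP4 bytes.
    from PIL import Image

    video_bytes = generate_video(Image.open("input.jpg"))
    with open("output.mp4", "wb") as f:
        f.write(video_bytes)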