File size: 992 Bytes
c43b0d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
062627a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
import tempfile

import gradio as gr
import imagebind
import soundfile as sf
import torch
from diffusers import StableUnCLIPImg2ImgPipeline

# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Stable unCLIP img2img pipeline, loaded in half precision.
# The keyword that selects the fp16 checkpoint files is `variant`; the
# previous spelling `variation` is not a recognised `from_pretrained` argument,
# so the fp16 weight files were never selected.
# NOTE(review): float16 is requested even on the CPU fallback -- confirm the
# target torch build supports half-precision inference on CPU.
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe = pipe.to(device)

# Pretrained ImageBind encoder, switched to eval mode and moved to the same
# device as the pipeline.
model = imagebind.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

@torch.no_grad()
def anything2img(prompt, audio):
    """Generate an image from a text prompt and an audio clip.

    The audio is written to a temporary WAV file, embedded with the ImageBind
    model, and the resulting audio embedding is passed to the Stable unCLIP
    pipeline as ``image_embeds`` together with ``prompt``.

    Args:
        prompt: Text prompt forwarded to the diffusion pipeline.
        audio: A ``(sample_rate, waveform)`` pair (presumably the value
            produced by the Gradio ``"audio"`` input component).

    Returns:
        The first image produced by the pipeline.

    Raises:
        gr.Error: If no audio was supplied.
    """
    if audio is None:
        # Without this guard the tuple unpacking below fails with an opaque
        # TypeError when the user submits the form without audio.
        raise gr.Error("Please provide an audio clip.")

    sample_rate, waveform = audio

    # Use a per-call temporary directory instead of a fixed 'tmp.wav' so that
    # concurrent requests cannot overwrite each other's audio, and so the file
    # is removed once it has been loaded.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = os.path.join(tmp_dir, "input.wav")
        sf.write(audio_path, waveform, sample_rate)
        audio_inputs = imagebind.load_and_transform_audio_data([audio_path], device)

    embeddings = model({imagebind.ModalityType.AUDIO: audio_inputs})
    audio_embeds = embeddings[imagebind.ModalityType.AUDIO]

    # The pipeline is loaded in float16, so cast the embedding to match.
    images = pipe(prompt=prompt, image_embeds=audio_embeds.half()).images
    return images[0]
    

# Gradio UI: a text box and an audio input feed `anything2img`, whose result
# is shown as an image.
demo = gr.Interface(
    fn=anything2img,
    inputs=["text", "audio"],
    outputs="image",
)

# Start the web server for the interface.
demo.launch()