|
import requests |
|
from PIL import Image |
|
import scipy |
|
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoProcessor, MusicgenForConditionalGeneration |
|
import streamlit as st |
|
|
|
def _caption_image(raw_image):
    """Return a text caption for *raw_image* produced by the BLIP captioner."""
    checkpoint = "Salesforce/blip-image-captioning-large"
    img_processor = BlipProcessor.from_pretrained(checkpoint)
    img_model = BlipForConditionalGeneration.from_pretrained(checkpoint)

    inputs = img_processor(raw_image, return_tensors="pt")
    out = img_model.generate(**inputs)
    return img_processor.decode(out[0], skip_special_tokens=True)


def _synthesize_music(prompt, max_new_tokens):
    """Generate audio for the text *prompt*; return ``(samples, sampling_rate)``."""
    checkpoint = "facebook/musicgen-small"
    audio_processor = AutoProcessor.from_pretrained(checkpoint)
    audio_model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)

    inputs = audio_processor(
        text=[prompt],
        padding=True,
        return_tensors="pt",
    )
    audio_values = audio_model.generate(**inputs, max_new_tokens=max_new_tokens)
    sampling_rate = audio_model.config.audio_encoder.sampling_rate

    # A single prompt was submitted, so take the first entry along the two
    # leading axes (presumably batch and channel -- confirm against MusicGen).
    return audio_values[0, 0].numpy(), sampling_rate


def image_to_music(raw_image, *, output_path="music.wav", max_new_tokens=256):
    """Caption an image, generate music from the caption, and save it as WAV.

    The image is first described in text by a BLIP captioning model; that
    caption is then used as the prompt for a MusicGen model, and the
    generated waveform is written to *output_path*.

    Args:
        raw_image: Image to describe; it is passed directly to the BLIP
            processor (the caller in this file supplies an RGB PIL image).
        output_path: Destination of the WAV file. Defaults to ``"music.wav"``.
        max_new_tokens: Generation budget forwarded to the MusicGen
            ``generate`` call. Defaults to 256.

    Returns:
        The path the WAV file was written to (``output_path``).
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.io.wavfile`` is reachable as an attribute on
    # every SciPy release.
    from scipy.io import wavfile

    # NOTE(review): both checkpoints are re-loaded on every call. If this is
    # invoked repeatedly (e.g. from a UI), consider caching the loaded models.
    # Running the two stages in separate helpers at least lets the captioning
    # model go out of scope before the audio model is loaded.
    caption = _caption_image(raw_image)
    samples, sampling_rate = _synthesize_music(caption, max_new_tokens)

    wavfile.write(output_path, rate=sampling_rate, data=samples)
    return output_path
|
|
|
# --- Streamlit page: upload an image, then turn it into music on demand ---

st.header("VisTune: an AI Image-to-Music generator")

# Only PNG/JPEG uploads are offered by the file picker.
uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

# Show a preview as soon as a file has been provided.
if uploaded_image:
    st.image(uploaded_image, caption="Uploaded Image.", use_column_width=True)

# The button is always rendered; generation runs only when it was clicked
# and an image is available.
generate_clicked = st.button("Generate Music")
if generate_clicked and uploaded_image:
    # Normalise to RGB before handing the image to the captioning pipeline.
    raw_image = Image.open(uploaded_image).convert("RGB")
    image_to_music(raw_image)
    # image_to_music writes its result to "music.wav"; play that file back.
    st.audio("music.wav")
|
|