"""Streamlit app that turns a text description into music with MusicGen.

The user's description is first rewritten by Gemini into a more detailed
prompt, the prompt is passed to the ``facebook/musicgen-small`` model, and the
generated audio is saved as a WAV file, played back and offered for download.
"""
import base64
import os

import numpy as np
import streamlit as st
import torch
import torchaudio
from audiocraft.models import MusicGen
from dotenv import load_dotenv
import google.generativeai as genai

# The API key is read from the environment (optionally populated from .env).
load_dotenv()
genai.configure(api_key=os.getenv("API_KEY"))
llm = genai.GenerativeModel("gemini-pro")

# Sample rate used when writing the generated audio to disk.
SAMPLE_RATE = 32000
# Directory the generated WAV files are written to.
SAVE_DIR = "saved_audio/"


@st.cache_resource
def load_model():
    """Return the pretrained ``facebook/musicgen-small`` model.

    Decorated with ``st.cache_resource`` so the model object is reused
    instead of being loaded again on every call.
    """
    return MusicGen.get_pretrained("facebook/musicgen-small")


def generate_music_tensors(description, duration: int):
    """Generate audio for a single text description.

    Args:
        description: Text prompt handed to the model.
        duration: Requested clip length, forwarded to
            ``set_generation_params(duration=...)``.

    Returns:
        The first element of what ``model.generate(..., return_tokens=True)``
        returns -- presumably the waveform tensor for the batch (the other
        element being the tokens); verify against the audiocraft API.
    """
    print(f"Description: {description}")
    print(f"Duration: {duration}")

    model = load_model()
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=duration,
    )
    output = model.generate(
        descriptions=[description],
        progress=True,
        return_tokens=True,
    )
    return output[0]


def save_audio(samples: torch.Tensor) -> list:
    """Write each clip in ``samples`` to ``saved_audio/audio_<idx>.wav``.

    Args:
        samples: A 2-D tensor (one clip) or a 3-D tensor (a batch of clips).
            A 2-D tensor is treated as a batch containing one clip.

    Returns:
        The paths of the files written, in batch order.

    Raises:
        ValueError: If ``samples`` is neither 2-D nor 3-D.
    """
    if samples.dim() not in (2, 3):
        raise ValueError(
            f"samples must be a 2-D or 3-D tensor, got {samples.dim()} dimensions"
        )

    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(SAVE_DIR, exist_ok=True)

    samples = samples.detach().cpu()
    if samples.dim() == 2:
        samples = samples[None, ...]

    saved_paths = []
    for idx, audio in enumerate(samples):
        audio_path = os.path.join(SAVE_DIR, f"audio_{idx}.wav")
        torchaudio.save(audio_path, audio, SAMPLE_RATE)
        saved_paths.append(audio_path)
    return saved_paths


def download_music(bin_file, file_label="File"):
    """Build an HTML download link that embeds ``bin_file`` as a data URI.

    Args:
        bin_file: Path of the file to embed.
        file_label: Text shown after "Download" in the link.

    Returns:
        An ``<a>`` element as a string; the file content is base64-encoded
        into the ``href`` and the file's base name is used as the download
        name. Intended for ``st.markdown(..., unsafe_allow_html=True)``.
    """
    with open(bin_file, "rb") as f:
        data = f.read()
    bin_str = base64.b64encode(data).decode()
    file_name = os.path.basename(bin_file)
    return (
        f'<a href="data:application/octet-stream;base64,{bin_str}" '
        f'download="{file_name}">Download {file_label}</a>'
    )


def _build_enhancement_request(description):
    """Return the instruction text sent to Gemini for ``description``."""
    return (
        "Given the basic description of a prompt for a text-to-music generator below, "
        "enhance that prompt by using specific, direct, accurate and relevant vocabulary. "
        "This enhanced prompt must clearly assert and describe the kind of music user wants to generate, "
        "with the help of appropriate musical terminology or taxonomy. "
        "Craft a creative prompt that clearly explains the text-to-music model what music the user desires. "
        "DO NOT respond with anything other than the output prompt. "
        "You can be as creative as you like with the descriptions, "
        "but DO NOT make up details that the original prompt did not ask for. "
        "Also, make sure the description is not too lengthy, keep it concise. "
        "Your prompt must explain the flow of the music from start through the middle towards the finish, "
        "explicitly mentioning the way instruments are played and what they should sound like. "
        f"ORIGINAL PROMPT : {description} "
        "YOUR OUTPUT PROMPT : "
    )


def _enhance_prompt(description):
    """Ask Gemini to rewrite ``description``; fall back to it on empty output."""
    llm_result = llm.generate_content(_build_enhancement_request(description))
    try:
        return llm_result.text
    except ValueError:
        # NOTE(review): the google-generativeai docs describe ``.text`` as
        # raising ValueError when the response carries no usable text (for
        # example a blocked response). Generating from the user's own words
        # is more useful than crashing the page.
        st.warning("Could not enhance the description; using it as entered.")
        return description


st.set_page_config(
    page_icon=":musical_note:",
    page_title="MusicGen",
)


def main():
    """Render the page and, once a description is entered, generate music."""
    st.title("Text to Music Generation")

    with st.expander("View Details..."):
        st.write(
            "This was built by https://github.com/ishan-kshirsagar0-7 using Meta's Audiocraft library. "
            "Enter the description of the music you want to generate, "
            "and set the duration with the slider given below. \n"
            "The longer the duration slider, the longer it will take to generate the music."
        )

    text_area = st.text_area("Enter your description...")
    time_slider = st.slider("Select time duration (in seconds)", 2, 20, 5)

    # Streamlit reruns this function on every interaction; do not spend an
    # LLM call (or a generation) until there is something to work with.
    if not (text_area and time_slider):
        return

    prompt = _enhance_prompt(text_area)
    st.json(
        {
            "Description": prompt,
            "Duration": time_slider,
        }
    )

    st.subheader("Generated Music")
    music_tensors = generate_music_tensors(prompt, time_slider)
    print(f"Music Tensors: {music_tensors}")

    # Only one description is generated, so the first saved file is the clip.
    audio_filepath = save_audio(music_tensors)[0]
    with open(audio_filepath, "rb") as audio_file:
        audio_bytes = audio_file.read()

    st.audio(audio_bytes)
    st.markdown(download_music(audio_filepath, "Audio"), unsafe_allow_html=True)


if __name__ == "__main__":
    main()