import gradio as gr
import spaces
import librosa
import soundfile as sf
import wavio
import os
import subprocess
import pickle
import torch
import torch.nn as nn
from transformers import T5Tokenizer
from transformer_model import Transformer


def save_wav(filepath):
    # Extract the directory and the stem (filename without extension)
    directory = os.path.dirname(filepath)
    stem = os.path.splitext(os.path.basename(filepath))[0]

    # Construct the full paths for the MIDI and WAV files
    midi_filepath = os.path.join(directory, f"{stem}.mid")
    wav_filepath = os.path.join(directory, f"{stem}.wav")

    # Run fluidsynth to render the MIDI file to a 16 kHz WAV file
    process = subprocess.Popen(
        f"fluidsynth -r 16000 soundfont.sf -g 1.0 --quiet --no-shell {midi_filepath} -T wav -F {wav_filepath} > /dev/null",
        shell=True
    )
    process.wait()

    return wav_filepath


def generate_midi(caption, temperature=0.9, max_len=3000):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    artifact_folder = 'artifacts'

    # Load the REMI tokenizer dictionary
    tokenizer_filepath = os.path.join(artifact_folder, "vocab_remi.pkl")
    with open(tokenizer_filepath, "rb") as f:
        r_tokenizer = pickle.load(f)

    # Get the vocab size
    vocab_size = len(r_tokenizer)
    print("Vocab size: ", vocab_size)

    # Build the model and load the pretrained weights from the artifact folder
    model = Transformer(vocab_size, 768, 8, 5000, 18, 1024, False, 8, device=device)
    model.load_state_dict(
        torch.load(os.path.join(artifact_folder, 'pytorch_model_95.bin'), map_location=device)
    )
    model.eval()

    # Encode the caption with the FLAN-T5 tokenizer.
    # Example caption: "A cinematic electronic soundtrack that evokes an epic and dark
    # atmosphere, featuring cello, contrabass, and drums. The song is set in A minor
    # with a moderate tempo and a 4/4 time signature, creating an emotional and
    # action-packed ambiance suitable for film."
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    inputs = tokenizer(caption, return_tensors='pt', padding=True, truncation=True)
    input_ids = nn.utils.rnn.pad_sequence(inputs.input_ids, batch_first=True, padding_value=0)
    input_ids = input_ids.to(device)
    attention_mask = nn.utils.rnn.pad_sequence(inputs.attention_mask, batch_first=True, padding_value=0)
    attention_mask = attention_mask.to(device)

    # Autoregressively generate a REMI token sequence and decode it to a MIDI file
    output = model.generate(input_ids, attention_mask, max_len=max_len, temperature=temperature)
    output_list = output[0].tolist()
    generated_midi = r_tokenizer.decode(output_list)
    generated_midi.dump_midi("output.mid")


# @spaces.GPU(duration=120)
def gradio_generate(prompt, temperature):
    # Generate MIDI from the text prompt
    generate_midi(prompt, temperature)

    # Convert the MIDI file to WAV
    filename = "output.mid"
    save_wav(filename)
    filename = filename.replace(".mid", ".wav")

    # Read the generated WAV file and re-write it at 16 kHz for the Gradio audio output
    output_wave, samplerate = sf.read(filename, dtype='float32')
    output_filename = "temp.wav"
    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)

    return output_filename


title = "Text2midi: Generating Symbolic Music from Captions"

description_text = """
For faster inference without waiting in the queue, you may duplicate this space and upgrade to a GPU in the settings.
Generate MIDI music with Text2midi by providing a text prompt.
This is the demo for Text2midi, a model for controllable text-to-MIDI generation. Read our paper.