Spaces:

erl-j
/

soundfont-generator

Running on Zero

erl-j

some css changes

1056399 3 days ago

5.87 kB

	import torch
	import einops
	import gradio as gr
	import datetime
	import numpy as np
	import spaces
	import soundfile
	import os
	import sys
	import zipfile
	from pathlib import Path
	from huggingface_hub import hf_hub_download

	# Put the local "sf-creator-fork" directory on the import path. This has to
	# happen before the ``main`` import below, which is presumably resolved from
	# that directory.
	sys.path.append("sf-creator-fork")
	from main import sfz, decentsampler

	# Fetch the decoder and generator checkpoints from the Hugging Face Hub
	# (hf_hub_download returns the local path of the downloaded file).
	decoder_path = hf_hub_download(
	    repo_id="erl-j/soundfont-generator-assets",
	    filename="decoder.pt",
	)
	model_path = hf_hub_download(
	    repo_id="erl-j/soundfont-generator-assets",
	    filename="synth_lfm_modern_full.ckpt",
	)

	# Load both networks exactly once, at import time, and switch them to eval mode.
	# NOTE(review): torch.load unpickles whole objects here, so the checkpoints
	# must come from a trusted repository.
	device = "cuda"
	decoder = torch.load(decoder_path, map_location=device)
	decoder = decoder.eval()
	model = torch.load(model_path, map_location=device)
	model = model.eval()


	# Characters replaced with "_" when an instrument name is used as a file name:
	# path separators, characters reserved on common filesystems, and ASCII
	# control codes.
	_UNSAFE_FILENAME_CHARS = str.maketrans(
	    {ch: "_" for ch in '\\/:*?"<>|' + "".join(map(chr, range(32)))}
	)

	# Sample rate handed to soundfile.write and returned with the preview audio.
	_SAMPLE_RATE = 44100


	def _safe_instrument_name(name, fallback="instrument"):
	    """Return *name* reduced to a single safe file-name component."""
	    # Stripping spaces and dots also neutralises "." / ".." and hidden files.
	    cleaned = name.translate(_UNSAFE_FILENAME_CHARS).strip(" .")
	    return cleaned or fallback


	def _peak_normalize(samples):
	    """Scale *samples* to a peak of 1.0; silent input is returned unchanged."""
	    peak = np.max(np.abs(samples))
	    # Guard against 0/0 -> NaN when the decoder produced pure silence.
	    if peak > 0:
	        return samples / peak
	    return samples


	@spaces.GPU
	def generate_and_export_soundfont(text, steps=20, instrument_name=None):
	    """Generate an instrument from a text prompt and package it as an SFZ zip.

	    The function samples from the module-level ``model``, decodes the result
	    with the module-level ``decoder``, writes one WAV file per decoded item
	    into the ``output`` directory (named after the entries of the pitch list),
	    asks ``sfz`` to write the instrument definition, and zips the SFZ file
	    together with the WAV files.

	    Args:
	        text: Prompt passed to ``model.sample``. Also used to derive the
	            instrument name when ``instrument_name`` is not given.
	        steps: Number of sampling steps passed to ``model.sample``.
	        instrument_name: Optional base name for the ``.sfz`` file and the zip
	            archive. When falsy, the first 20 characters of ``text`` (spaces
	            replaced by underscores) are used. In both cases characters that
	            are unsafe in file names are replaced by ``_`` and an empty
	            result falls back to ``"instrument"``.

	    Returns:
	        A 4-tuple ``(preview, status, zip_path, wav_files)`` where ``preview``
	        is ``(44100, samples)`` with all decoded items concatenated along
	        time and peak-normalised, ``status`` is a human-readable message,
	        ``zip_path`` is the path of the created archive and ``wav_files`` is
	        the list of written WAV paths.

	    Raises:
	        ValueError: If the decoder returns more items than there are pitch
	            names to label them with.
	    """
	    sample_start = datetime.datetime.now()

	    # Sample latents and fold them into the layout the decoder is called with.
	    z = model.sample(1, text=[text], steps=steps)
	    z_reshaped = einops.rearrange(z, "b t c d -> (b c) d t")

	    with torch.no_grad():
	        audio = decoder.decode(z_reshaped)

	    # Preview: every decoded item laid end to end, normalised to full scale.
	    audio_output = einops.rearrange(audio, "b c t -> c (b t)").cpu().numpy()
	    audio_output = _peak_normalize(audio_output)

	    # The exported files keep the decoder's own levels (no normalisation).
	    export_audio = audio.cpu().numpy().astype(np.float32)
	    output_dir = "output"
	    os.makedirs(output_dir, exist_ok=True)

	    # The name ends up inside file paths, so it is sanitised in either case.
	    if not instrument_name:
	        instrument_name = text.replace(" ", "_")[:20]
	    instrument_name = _safe_instrument_name(instrument_name)

	    # One pitch label per decoded item, in order.
	    pitches = [
	        "C1",
	        "F#1",
	        "C2",
	        "F#2",
	        "C3",
	        "F#3",
	        "C4",
	        "F#4",
	        "C5",
	        "F#5",
	        "C6",
	        "F#6",
	        "C7",
	        "F#7",
	        "C8",
	    ]
	    if export_audio.shape[0] > len(pitches):
	        raise ValueError(
	            f"decoder returned {export_audio.shape[0]} items but only "
	            f"{len(pitches)} pitch names are defined"
	        )

	    # Write each decoded item to its own WAV file.
	    wav_files = []
	    for pitch, samples in zip(pitches, export_audio):
	        wav_path = f"{output_dir}/{pitch}.wav"
	        soundfile.write(wav_path, samples.T, _SAMPLE_RATE)
	        wav_files.append(wav_path)

	    # Let sf-creator write the SFZ definition for the files in output_dir.
	    sfz(
	        directory=output_dir,
	        lowkey="21",
	        highkey="108",
	        instrument=instrument_name,
	        loopmode="no_loop",
	        polyphony=None,
	    )

	    # Bundle the SFZ file and the WAV files, all at the root of the archive.
	    zip_path = f"{output_dir}/{instrument_name}_package.zip"
	    with zipfile.ZipFile(zip_path, "w") as zipf:
	        sfz_file = f"{output_dir}/{instrument_name}.sfz"
	        zipf.write(sfz_file, os.path.basename(sfz_file))
	        for wav_file in wav_files:
	            if os.path.exists(wav_file):
	                zipf.write(wav_file, os.path.basename(wav_file))

	    total_time = (datetime.datetime.now() - sample_start).total_seconds()

	    return (
	        (_SAMPLE_RATE, audio_output.T),
	        f"Generation took {total_time:.2f}s\nFiles saved in {output_dir}",
	        zip_path,
	        wav_files,
	    )


	# Static front-end assets, read relative to the working directory.
	# Path.read_text closes the file for us (the bare open().read() form leaked
	# the handle) and the encoding is pinned instead of following the locale.
	custom_js = Path("custom.js").read_text(encoding="utf-8")
	custom_css = Path("custom.css").read_text(encoding="utf-8")

	demo = gr.Blocks(
	    title="Erl-j's Soundfont Generator",
	    theme=gr.themes.Default(
	        primary_hue="green",
	        font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"],
	    ),
	    js=custom_js,
	    css=custom_css,
	)

	with demo:
	    gr.Markdown(Path("intro.md").read_text(encoding="utf-8"))

	    # --- Inputs -------------------------------------------------------------
	    with gr.Row():
	        steps = gr.Slider(
	            minimum=1, maximum=50, value=20, step=1, label="Generation steps"
	        )

	    with gr.Row():
	        text_input = gr.Textbox(
	            label="Prompt",
	            placeholder="Enter text description (e.g. 'hard bass', 'sparkly bells')",
	            lines=2,
	        )

	    with gr.Row():
	        generate_btn = gr.Button("Generate Soundfont", variant="primary")

	    # --- Outputs ------------------------------------------------------------
	    # The preview, status and per-note files are created hidden
	    # (visible=False); only the packaged soundfont download is shown.
	    with gr.Row():
	        audio_output = gr.Audio(label="Generated Audio Preview", visible=False)
	        status_output = gr.Textbox(label="Status", lines=2, visible=False)

	    with gr.Row():
	        wav_files = gr.File(
	            label="Individual WAV Files",
	            file_count="multiple",
	            visible=False,
	            elem_id="individual-wav-files",
	        )

	    gr.Markdown("## Download Soundfont Package here:")
	    with gr.Row():
	        sf = gr.File(
	            label="Download SFZ Soundfont Package",
	            type="filepath",
	            visible=True,
	            elem_id="sfz",
	        )

	    # Empty mount point; presumably populated by custom.js.
	    html = """
	<div id="keyboard-container"></div>
	"""

	    gr.HTML(html)

	    gr.Markdown("""
	# About
	The model is a modified version of [stable audio open](https://huggingface.co/stabilityai/stable-audio-open-1.0).

	Unlike the original model, this version uses latent flow matching rather than latent diffusion.
	Secondly, the pitches are stacked in a channel dimension rather than concatenated in the time dimension.
	This allows for faster generation.

	Soundfont export code is based on the [sf-creator](https://github.com/paulwellnerbou/sf-creator) project.

	Similar work by Nercessian and Imort: [InstrumentGen](https://instrumentgen.netlify.app/).

	Thank you @carlthome for coming up with the name.

	To cite this work, please use the following BibTeX entry:
	```bibtex
	@misc{erl-j-soundfont-generator,
	title={Erl-j's Soundfont Generator},
	author={Nicolas Jonason},
	year={2024},
	publisher={Huggingface},
	}
	```
	""")

	    # --- Event wiring -------------------------------------------------------
	    # The button click and pressing Enter in the prompt box run the same
	    # generation call with the same inputs and outputs.
	    _generation_event = dict(
	        fn=generate_and_export_soundfont,
	        inputs=[text_input, steps],
	        outputs=[audio_output, status_output, sf, wav_files],
	    )

	    generate_btn.click(**_generation_event).success(
	        js="() => console.log('Success')"
	    )

	    text_input.submit(**_generation_event)

	# Start the app only when the file is executed as a script.
	if __name__ == "__main__":
	    print("Starting demo...")
	    demo.launch()