Spaces:

snnithya
/

GaMaDHaNi

Sleeping

GaMaDHaNi / app.py

Nithya

updated the models

3d6b478 4 months ago

17.3 kB

	import sys
	import os
	# Debug mode is requested with the --debug CLI flag or with DEBUG=True in
	# the environment.
	debug_mode = any((
	    '--debug' in sys.argv,
	    os.environ.get('DEBUG') == 'True',
	))

	if not debug_mode:
	    print("Running in normal mode. Using package from site-packages.")
	else:
	    # Put the local development checkout ahead of any installed copy of the
	    # package on the import path.
	    local_package_path = "../../GaMaDHaNi-dev"
	    sys.path.insert(0, local_package_path)
	    print(f"Running in debug mode. Using package from: {local_package_path}")
	    # The profiler is imported only when it is going to be used.
	    import pyprofilers as pp

	import spaces
	import gradio as gr
	import numpy as np
	import torch
	import librosa
	import matplotlib.pyplot as plt
	import pandas as pd
	from functools import partial
	import gin
	import torchaudio
	from absl import app
	from torch.nn.functional import interpolate
	import logging
	import crepe
	from hmmlearn import hmm
	import soundfile as sf
	import pdb
	from gamadhani.utils.generate_utils import load_pitch_fns, load_audio_fns
	import gamadhani.utils.pitch_to_audio_utils as p2a
	from gamadhani.utils.utils import get_device

	import copy

	# Timestamped INFO-level logging; force=True replaces handlers that were
	# configured on the root logger before this point.
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    force=True,
	)

	# UI label -> (model type, checkpoint directory) for each pitch generator.
	pitch_paths = {
	    'Diffusion Pitch Generator': (
	        'diffusion',
	        'models/diffusion_pitch/',
	    ),
	    'Autoregressive Pitch Generator': (
	        'transformer',
	        'models/transformer_pitch/',
	    ),
	}
	# Directory holding the pitch-to-audio checkpoint, config and quantile transformer.
	audio_path = 'models/pitch_to_audio/'
	# UI label of the pitch model currently held in memory (None until the first load).
	model_loaded = None
	device = get_device()

	def debug_profile(func):
	    """Decorate ``func`` with the pyprofilers profiler when debug mode is on.

	    Outside debug mode ``func`` is returned unchanged, so the decorator is
	    free of overhead (and of the ``pp`` import) in normal runs.
	    """
	    if not debug_mode:
	        return func
	    profiled = pp.profile(sort_by='cumulative', out_lines=10)
	    return profiled(func)

	def predict_voicing(confidence):
	    """
	    Decode a voiced/unvoiced label for every frame with a two-state HMM.

	    Adapted from https://github.com/marl/crepe/pull/26. All HMM parameters
	    are set by hand; the model is never fitted.

	    Parameters
	    ----------
	    confidence : np.ndarray [shape=(N,)]
	        voicing confidence array, i.e. the confidence in the presence of
	        a pitch

	    Returns
	    -------
	    voicing_states : np.ndarray [shape=(N,)]
	        HMM predictions for each frame's state, 0 if unvoiced, 1 if
	        voiced
	    """
	    decoder = hmm.GaussianHMM(n_components=2)

	    # uniform prior over the two voicing states
	    decoder.startprob_ = np.array([0.5, 0.5])
	    # emission variance of the unvoiced and voiced states
	    decoder.covars_ = np.array([[0.25], [0.25]])
	    # strongly self-transitioning states keep the voicing decision continuous
	    decoder.transmat_ = np.array([[0.99, 0.01], [0.01, 0.99]])
	    # emission mean of the unvoiced (0.0) and voiced (1.0) states
	    decoder.means_ = np.array([[0.0], [1.0]])
	    decoder.n_features = 1

	    # a single observation sequence covering every frame
	    observations = confidence.reshape(-1, 1)
	    viterbi_path = decoder.predict(observations, [len(confidence)])

	    return np.array(viterbi_path)

	def extract_pitch(audio, unvoice=True, sr=16000, frame_shift_ms=10, log=True):
	    """Run CREPE on ``audio`` and return ``(time, f0, confidence)``.

	    Args:
	        audio: audio samples handed to ``crepe.predict``.
	        unvoice: when true, frames that ``predict_voicing`` labels unvoiced
	            get an f0 of 0.
	        sr: sampling rate of ``audio``.
	        frame_shift_ms: CREPE step size in milliseconds.
	        log: when true, CREPE is run with ``verbose=1``, otherwise ``verbose=0``.

	    Returns:
	        The CREPE time stamps, the (optionally unvoiced-masked) frequency
	        track, and the CREPE confidence track.
	    """
	    time, frequency, confidence, _ = crepe.predict(
	        audio,
	        sr=sr,
	        viterbi=True,
	        step_size=frame_shift_ms,
	        verbose=1 if log else 0,
	    )
	    if not unvoice:
	        return time, frequency, confidence

	    # multiplying by the 0/1 voicing states zeroes out the unvoiced frames
	    voiced_mask = predict_voicing(confidence)
	    return time, frequency * voiced_mask, confidence

	def generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps, noise_std=0.4, t0=0.5):
	    '''Generate pitch values for the melodic reinterpretation task.

	    The last 1200 frames of ``pitch`` are moved to the pitch model's device
	    and passed to ``pitch_model.sample_sdedit`` together with ``num_samples``,
	    ``num_steps`` and ``t0``.

	    ``noise_std`` is accepted but currently unused: the code that added noise
	    by hand before sampling has been disabled.

	    Returns a tuple ``(samples, inverted_pitches)``: the raw model output and
	    the flattened result of ``invert_pitch_fn`` applied to the first sample
	    (pitch values in Hz).
	    '''
	    context = pitch[:, :, -1200:]
	    context = context.to(pitch_model.device)
	    samples = pitch_model.sample_sdedit(context, num_samples, num_steps, t0=t0)

	    first_sample = samples.detach().cpu().numpy()[0]
	    inverted_pitches = invert_pitch_fn(f0=first_sample).flatten()  # pitch values in Hz

	    return samples, inverted_pitches

	def generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps, model_type='diffusion'):
	    '''Generate pitch values for the call and response task.

	    Only the last 400 frames (4 s) of ``pitch`` are used as the prime. For
	    ``model_type == 'diffusion'`` the model is sampled with ``num_samples``
	    and ``num_steps``; for any other model type it is sampled with
	    ``batch_size=num_samples`` and a fixed ``seq_len`` of 800.

	    Returns a tuple ``(samples, inverted_pitches)``: the raw model output and
	    the flattened result of ``invert_pitch_fn`` applied to the first sample
	    (pitch values in Hz).
	    '''
	    prime = pitch[:, :, -400:]  # consider only the last 4 s of the pitch contour

	    if model_type != 'diffusion':
	        samples = pitch_model.sample_fn(batch_size=num_samples, seq_len=800, prime=prime)
	    else:
	        samples = pitch_model.sample_fn(num_samples, num_steps, prime=prime)

	    first_sample = samples.clone().detach().cpu().numpy()[0]
	    inverted_pitches = invert_pitch_fn(f0=first_sample).flatten()  # pitch values in Hz

	    return samples, inverted_pitches

	def generate_audio(audio_model, f0s, invert_audio_fn, singers=None, num_steps=100):
	    '''Generate audio given pitch values.

	    Args:
	        audio_model: model exposing ``device`` and ``sample_cfg``.
	        f0s: batch of pitch contours; the first dimension is the batch size.
	        invert_audio_fn: callable that turns the model samples into audio.
	        singers: singer ids, each repeated once per batch item; defaults to
	            ``[3]`` when omitted.
	        num_steps: number of sampling steps forwarded to ``sample_cfg``.

	    Returns:
	        Whatever ``invert_audio_fn`` returns for the generated samples.
	    '''
	    # A None sentinel avoids sharing one mutable default list between calls.
	    if singers is None:
	        singers = [3]

	    batch_size = f0s.shape[0]
	    singer_tensor = torch.tensor(np.repeat(singers, repeats=batch_size)).to(audio_model.device)
	    # Only the samples are needed; the other two return values are discarded.
	    samples, _, _ = audio_model.sample_cfg(batch_size, f0=f0s, num_steps=num_steps, singer=singer_tensor, strength=3)

	    return invert_audio_fn(samples)

	@spaces.GPU(duration=30)
	def generate(pitch, num_samples=1, num_steps=100, singers=None, outfolder='temp', audio_seq_len=750, pitch_qt=None, type='response', invert_pitch_fn=None, t0=0.5, model_type='diffusion'):
	    '''Generate a pitch contour from the input pitch and synthesise audio from it.

	    Uses the module-level ``pitch_model`` and ``audio_model`` and moves both
	    to ``device`` before sampling.

	    Args:
	        pitch: processed input pitch sequence; it is converted to a float
	            tensor and given two leading singleton dimensions.
	        num_samples: number of pitch samples requested from the pitch model.
	        num_steps: number of sampling steps used for both the pitch model and
	            the audio model.
	        singers: singer ids forwarded to ``generate_audio``; defaults to
	            ``[3]`` when omitted.
	        outfolder: unused; kept so existing callers keep working.
	        audio_seq_len: target length passed to ``p2a.interpolate_pitch``.
	        pitch_qt: optional quantile transformer. When given, it is wrapped in
	            ``p2a.GPUQuantileTransformer`` and used to map the generated
	            pitch back before audio synthesis.
	        type: ``'response'`` (call and response) or ``'reinterp'`` (melodic
	            reinterpretation).
	        invert_pitch_fn: callable that converts model output to Hz; its
	            result is used for the plot.
	        t0: forwarded to ``generate_pitch_reinterp``; unused for ``'response'``.
	        model_type: forwarded to ``generate_pitch_response``.

	    Returns:
	        ``((16000, audio), figure)``: the sample rate with the first generated
	        audio clip, and a matplotlib figure of the generated pitch.

	    Raises:
	        ValueError: if ``type`` is neither ``'response'`` nor ``'reinterp'``.
	    '''
	    global pitch_model, audio_model
	    # A None sentinel avoids sharing one mutable default list between calls.
	    if singers is None:
	        singers = [3]
	    # move the models to device
	    pitch_model = pitch_model.to(device)
	    audio_model = audio_model.to(device)
	    logging.log(logging.INFO, 'Generate function')
	    # load pitch values onto GPU
	    pitch = torch.tensor(pitch).float().unsqueeze(0).unsqueeze(0).to(device)
	    if pitch_qt is not None:
	        pitch_qt = p2a.GPUQuantileTransformer(pitch_qt, device=device)
	    logging.log(logging.INFO, 'Generating pitch')
	    # num_steps is forwarded as given instead of a hard-coded 100, so the
	    # parameter actually takes effect.
	    if type == 'response':
	        pitch, inverted_pitch = generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=num_steps, model_type=model_type)
	    elif type == 'reinterp':
	        pitch, inverted_pitch = generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=num_steps, t0=t0)
	    else:
	        raise ValueError(f'Invalid type: {type}')

	    if pitch_qt is not None:
	        # a quantile transformer is in use, so undo its transformation on
	        # the generated pitch before handing it to the audio model
	        def undo_qt(x, min_clip=200):
	            restored = pitch_qt.inverse_transform(x).squeeze(0)  # qt transform expects shape (bs, seq_len, 1)
	            restored = torch.round(restored)  # round to nearest integer, done in preprocessing of pitch contour fed into model
	            restored[restored < min_clip] = np.nan  # NaNs are replaced with the silent token below
	            return restored.unsqueeze(0)
	        pitch = undo_qt(pitch)
	    interpolated_pitch = p2a.interpolate_pitch(pitch=pitch, audio_seq_len=audio_seq_len).squeeze(0)  # interpolate pitch values to match the audio model's input size
	    interpolated_pitch = torch.nan_to_num(interpolated_pitch, nan=196)  # replace nan values with silent token
	    interpolated_pitch = interpolated_pitch.squeeze(1)  # to match input size by removing the extra dimension
	    logging.log(logging.INFO, 'Generating audio')
	    audio = generate_audio(audio_model, interpolated_pitch, invert_audio_fn, singers=singers, num_steps=num_steps)
	    audio = audio.detach().cpu().numpy()
	    pitch = pitch.detach().cpu().numpy()
	    # generate plot of model output to display on interface
	    model_output_plot = plt.figure()
	    # zeros become NaN so they show up as gaps in the plot
	    inverted_pitch = np.where(inverted_pitch == 0, np.nan, inverted_pitch)
	    plt.plot(inverted_pitch, figure=model_output_plot, label='Model Output')
	    # close the figure in pyplot's registry; the Figure object is still returned
	    plt.close(model_output_plot)
	    return (16000, audio[0]), model_output_plot  # return audio and plot

	# The pitch model is picked by the user at request time, so its handles
	# start out empty.
	pitch_model = pitch_qt = pitch_task_fn = invert_pitch_fn = None

	# The pitch-to-audio model is loaded once at start-up, on the CPU.
	audio_model, audio_qt, audio_seq_len, invert_audio_fn = load_audio_fns(
	    os.path.join(audio_path, 'last.ckpt'),
	    qt_path=os.path.join(audio_path, 'qt.joblib'),
	    config_path=os.path.join(audio_path, 'config.gin'),
	    device='cpu',
	)


	def load_pitch_model(model_selection):
	    """Load the pitch generator registered under ``model_selection``.

	    ``model_selection`` must be a key of ``pitch_paths``. The checkpoint and
	    gin config are read from the registered directory, and the model is
	    loaded on the CPU. A quantile-transformer path is supplied only for the
	    ``'diffusion'`` model type.

	    Returns:
	        ``(pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn)``; the
	        fifth value returned by ``load_pitch_fns`` is dropped.
	    """
	    model_type, pitch_path = pitch_paths[model_selection]

	    qt_path = None
	    if model_type == 'diffusion':
	        qt_path = os.path.join(pitch_path, 'qt.joblib')

	    loaded = load_pitch_fns(
	        os.path.join(pitch_path, 'model.ckpt'),
	        model_type=model_type,
	        config_path=os.path.join(pitch_path, 'config.gin'),
	        qt_path=qt_path,
	        device='cpu',
	    )
	    pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, _ = loaded
	    return pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn

	@debug_profile
	def container_generate(model_selection, task_selection, audio, singer_id, t0):
	    """Handle one 'Run' click: extract pitch from the input and generate a response.

	    Args:
	        model_selection: UI label of the pitch model; a key of ``pitch_paths``.
	        task_selection: ``'Call and Response'`` or ``'Melodic Reinterpretation'``;
	            any other value is treated as melodic reinterpretation.
	        audio: ``(sample_rate, samples)`` tuple from the audio input, or
	            ``None`` when nothing was provided.
	        singer_id: ``'Singer 1'`` or ``'Singer 2'``; anything else falls back
	            to Singer 1.
	        t0: forwarded to ``generate`` (used by melodic reinterpretation).

	    Returns:
	        ``(generated_audio, user_input_plot, output_plot)``, or
	        ``(None, None, None)`` when no audio was provided.
	    """
	    global pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, model_loaded
	    if audio is None:
	        # one value per wired output component
	        return None, None, None
	    # load pitch model
	    if model_loaded is None or model_loaded != model_selection:
	        pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn = load_pitch_model(model_selection)
	        model_loaded = model_selection
	    else:
	        logging.log(logging.INFO, f'using existing model: {model_selection}')
	    # generate() dispatches on the model type ('diffusion' / 'transformer'),
	    # not on the UI label, so translate the label here.
	    model_type = pitch_paths[model_selection][0]

	    # extract pitch from input
	    sr, audio = audio
	    if audio.ndim > 1:
	        # Gradio delivers multi-channel audio as (samples, channels); mix it
	        # down so the 1-D padding and pitch extraction below apply.
	        audio = audio.mean(axis=1)
	    if len(audio) < 12*sr and task_selection == 'Melodic Reinterpretation':
	        # make sure the audio is at least 12 s long (pad at the end)
	        audio = np.pad(audio, (0, 12*sr - len(audio)), mode='constant')
	    if len(audio) < 4*sr and task_selection == 'Call and Response':
	        # make sure the audio is at least 4 s long (pad at the start, so the
	        # recording stays at the end where the prime is taken from)
	        audio = np.pad(audio, (4*sr - len(audio), 0), mode='constant')
	    audio = audio.astype(np.float32)
	    peak = np.max(np.abs(audio))
	    if peak > 0:
	        # peak-normalise; skipped for all-zero input to avoid dividing by zero
	        audio /= peak
	    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)  # extract_pitch expects 16 kHz
	    audio = audio[-12*16000:]  # consider only last 12 s
	    _, f0, _ = extract_pitch(audio)
	    mic_f0 = f0.copy()  # save the user input pitch values
	    logging.log(logging.INFO, 'Pitch extracted')
	    f0 = pitch_task_fn(**{
	        'inputs': {
	            'pitch': {
	                'data': torch.Tensor(f0),  # task function expects a tensor
	                'sampling_rate': 100
	            }
	        },
	        'qt_transform': pitch_qt,
	        'time_downsample': 1,  # pitch will be extracted at 100 Hz, thus no downsampling
	        'seq_len': None,
	    })['sampled_sequence']
	    logging.log(logging.INFO, 'Calling generate function')
	    # zeros (unvoiced frames) become NaN so they show up as gaps in the plot
	    mic_f0 = np.where(mic_f0 == 0, np.nan, mic_f0)
	    # plot user input
	    user_input_plot = plt.figure()
	    plt.plot(np.arange(0, len(mic_f0)), mic_f0, label='User Input', figure=user_input_plot)
	    plt.close(user_input_plot)

	    singer_ids = {'Singer 1': [3], 'Singer 2': [27]}
	    singer = singer_ids.get(singer_id)
	    if singer is None:
	        # The dropdown has no preselected value, so nothing may be chosen;
	        # fall back to Singer 1 instead of failing on an unbound variable.
	        logging.log(logging.WARNING, f'unknown singer {singer_id!r}, using Singer 1')
	        singer = singer_ids['Singer 1']

	    task_type = 'response' if task_selection == 'Call and Response' else 'reinterp'
	    partial_generate = partial(generate, num_samples=1, num_steps=100, singers=singer, outfolder=None, pitch_qt=pitch_qt, type=task_type, invert_pitch_fn=invert_pitch_fn, t0=t0, model_type=model_type)
	    audio, output_plot = partial_generate(f0)
	    return audio, user_input_plot, output_plot

	# Custom CSS passed to gr.Blocks; the two classes are referenced through
	# ``elem_classes`` on the Markdown components of the page.
	css = """
	.center-text {
	text-align: center;
	}
	.justify-text {
	text-align: justify;
	}
	"""

	def toggle_visibility(selection):
	    """Return a Gradio update that shows the target component only when
	    ``selection`` is "Melodic Reinterpretation" and hides it otherwise."""
	    show = selection == "Melodic Reinterpretation"
	    return gr.update(visible=show)

	def toggle_options(selection, options=('Call and Response', 'Melodic Reinterpretation')):
	    """Return a Gradio update with the tasks available for the selected model.

	    ``selection`` is the value of the model dropdown. The diffusion model is
	    offered every entry of ``options``; any other selection gets all entries
	    except the last one ('Melodic Reinterpretation' by default).

	    NOTE(review): melodic reinterpretation is generated through
	    ``sample_sdedit``, which presumably only the diffusion model provides -
	    confirm against the model classes.
	    """
	    # Look the label up in pitch_paths so this stays in sync with the model
	    # registry; an unknown or empty selection counts as "not diffusion".
	    model_type = pitch_paths.get(selection, (None,))[0]
	    if model_type == 'diffusion':
	        return gr.update(choices=list(options))
	    return gr.update(choices=list(options[:-1]))


	# NOTE(review): the nesting of the layout containers below was reconstructed
	# from the component order - confirm against the intended page layout.
	with gr.Blocks(css=css) as demo:
	    gr.Markdown("# GaMaDHaNi: Hierarchical Generative Modeling of Melodic Vocal Contours in Hindustani Classical Music", elem_classes="center-text")
	    gr.Markdown("### Abstract", elem_classes="center-text")
	    gr.Markdown("""
	Hindustani music is a performance-driven oral tradition that exhibits the rendition of rich melodic patterns. In this paper, we focus on generative modeling of singers' vocal melodies extracted from audio recordings, as the voice is musically prominent within the tradition. Prior generative work in Hindustani music models melodies as coarse discrete symbols which fails to capture the rich expressive melodic intricacies of singing. Thus, we propose to use a finely quantized pitch contour, as an intermediate representation for hierarchical audio modeling. We propose GaMaDHaNi, a modular two-level hierarchy, consisting of a generative model on pitch contours, and a pitch contour to audio synthesis model. We compare our approach to non-hierarchical audio models and hierarchical models that use a self-supervised intermediate representation, through a listening test and qualitative analysis. We also evaluate audio model's ability to faithfully represent the pitch contour input using Pearson correlation coefficient. By using pitch contours as an intermediate representation, we show that our model may be better equipped to listen and respond to musicians in a human-AI collaborative setting by highlighting two potential interaction use cases (1) primed generation, and (2) coarse pitch conditioning.
	""", elem_classes="justify-text")
	    gr.Markdown("""
	📖 Read more about the project [here](https://arxiv.org/pdf/2408.12658) <br>
	🎧 Listen to the samples [here](https://snnithya.github.io/gamadhani-samples) <br>
	""", elem_classes="center-text")
	    with gr.Column():
	        gr.Markdown("""
	## Instructions
	In this demo you can interact with the model in two ways:
	1. [Call and response](https://snnithya.github.io/gamadhani-samples/5primed_generation/): The model will try to continue the idea that you input. This is similar to 'primed generation' discussed in the paper. The last 4 s of the audio will be considered as a 'prime' for the model to continue. <br><br>
	2. [Melodic reinterpretation](https://snnithya.github.io/gamadhani-samples/6coarsepitch/): Akin to the idea of 'coarse pitch conditioning' presented in the paper, you can input a pitch contour and the model will generate audio that is similar to but not exactly the same. <br><br>
	### Upload an audio file or record your voice to get started!
	""")
	        gr.Markdown("""
	This is still a work in progress, so please feel free to share any weird or interesting examples, we would love to hear them! Contact us at snnithya[at]mit[dot]edu.
	""")
	        gr.Markdown("""
	Note: If you see an error message on the screen after clicking 'Run', please wait for five seconds and click 'Run' again.
	""")
	        gr.Markdown("""
	Another note: The model may take around 20-30s to generate an output. Hang tight! But if you're left hanging for too long, let me know!
	""")
	        gr.Markdown("""
	Last note, I promise: There are some example audio samples at the bottom of the page. You can start with those if you'd like!
	""")
	        model_dropdown = gr.Dropdown(["Diffusion Pitch Generator", "Autoregressive Pitch Generator"], label="Select a model type")
	        task_dropdown = gr.Dropdown(label="Select a task", choices=["Call and Response", "Melodic Reinterpretation"])
	        # The handler needs the selected model, so the dropdown is also its input.
	        model_dropdown.change(toggle_options, inputs=model_dropdown, outputs=task_dropdown)
	        t0 = gr.Slider(label="Faithfulness to the input (For melodic reinterpretation task only)", minimum=0.0, maximum=1.0, step=0.01, value=0.3, visible=False)
	        # The slider is only shown while the melodic reinterpretation task is selected.
	        task_dropdown.change(toggle_visibility, inputs=task_dropdown, outputs=t0)
	        singer_dropdown = gr.Dropdown(label="Select a singer", choices=["Singer 1", "Singer 2"])
	        with gr.Row(equal_height=True):
	            with gr.Column():
	                audio = gr.Audio(label="Input", show_download_button=True)
	                examples = gr.Examples(
	                    examples=[
	                        ["examples/ex1.wav"],
	                        ["examples/ex2.wav"],
	                        ["examples/ex3.wav"],
	                        ["examples/ex4.wav"],
	                        ["examples/ex5.wav"]
	                    ],
	                    inputs=audio
	                )
	            with gr.Column():
	                generated_audio = gr.Audio(label="Generated Audio", elem_id="audio")
	        with gr.Row():
	            with gr.Column():
	                with gr.Accordion("View Pitch Plot"):
	                    user_input = gr.Plot(label="User Input")
	            with gr.Column():
	                with gr.Accordion("View Pitch Plot"):
	                    generated_pitch = gr.Plot(label="Generated Pitch")
	        sbmt = gr.Button()
	        # container_generate returns one value per component listed in outputs.
	        sbmt.click(container_generate, inputs=[model_dropdown, task_dropdown, audio, singer_dropdown, t0], outputs=[generated_audio, user_input, generated_pitch])

	def main(argv):
	    """Launch the Gradio demo.

	    ``argv`` is accepted for the entry-point signature but is not used here.
	    """
	    demo.launch()


	if __name__ == '__main__':
	    main(sys.argv)