import os
import gradio as gr
import numpy as np
import soundfile as sf
from semanticodec import SemantiCodec
import spaces
import torch
import uuid
import pickle
import time
# Initialize the model and ensure it's on the correct device
def load_model():
    model = SemantiCodec(token_rate=100, semantic_vocab_size=32768)  # ~1.5 kbps; see the note below
    if torch.cuda.is_available():
        # Move the model to CUDA and force CUDA initialization with a dummy forward pass
        model = model.to("cuda:0")
        dummy_input = torch.zeros(1, 1, 1, dtype=torch.long).cuda()
        try:
            with torch.no_grad():
                _ = model.decoder(dummy_input)
        except Exception:
            print("Dummy forward pass failed, but CUDA initialization attempted")
    return model
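# Back-of-envelope numbers for the config above (a sketch, assuming the usual
# token_rate × bits-per-token accounting, not an official SemantiCodec figure):
# 100 tokens/s × log2(32768) = 15 bits/token is roughly 1.5 kbps, and a clip
# of N tokens covers about N / 100 seconds of audio.
TOKEN_RATE = 100  # tokens per second, matching load_model()
APPROX_KBPS = TOKEN_RATE * 15 / 1000  # ~1.5 kbps for the 32768-entry vocab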
# Initialize model
semanticodec = load_model()
# Record the device the model lives on
model_device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Model initialized on device: {model_device}")
# Define the sample rate as a constant; SemantiCodec reconstructs audio at
# 16 kHz, and labeling it 32000 made playback run at double speed
SAMPLE_RATE = 16000
@spaces.GPU(duration=20)
def encode_audio(audio_path):
    """Encode an audio file to semantic tokens and return them as a file"""
    try:
        print(f"Encoding audio on device: {model_device}")
        # Ensure model is on the right device
        semanticodec.to(model_device)
        tokens = semanticodec.encode(audio_path)
        print(f"Tokens device after encode: {tokens.device if isinstance(tokens, torch.Tensor) else 'numpy'}")
        # Move tokens to CPU before converting to numpy
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy()
        # Ensure tokens are in the expected [batch, seq_len, features] shape for later decoding
        if tokens.ndim == 1:
            tokens = tokens.reshape(1, -1, 1)
        # Save tokens in a way that preserves shape information
        token_data = {
            'tokens': tokens,
            'shape': tokens.shape,
            'device': str(model_device)  # Store intended device information
        }
        # Create a temporary file in /tmp, which is writable in Spaces
        temp_dir = "/tmp"
        os.makedirs(temp_dir, exist_ok=True)
        temp_file_path = os.path.join(temp_dir, f"tokens_{uuid.uuid4()}.oterin")
        # Pickle (rather than np.save) so the shape/device metadata travels with the tokens
        with open(temp_file_path, "wb") as f:
            pickle.dump(token_data, f)
        # Verify the file exists and has content
        if not os.path.exists(temp_file_path) or os.path.getsize(temp_file_path) == 0:
            raise RuntimeError("Failed to create token file")
        return temp_file_path, f"Encoded to {tokens.shape[1]} tokens"
    except Exception as e:
        print(f"Encoding error: {str(e)}")
        return None, f"Error encoding audio: {str(e)}"
@spaces.GPU(duration=160)
def decode_tokens(token_file):
    """Decode tokens to audio"""
    # Ensure the file exists and has content
    if not token_file or not os.path.exists(token_file):
        return None, "Error: Empty or missing token file"
    try:
        # Load the pickled token dict written by encode_audio
        with open(token_file, "rb") as f:
            token_data = pickle.load(f)
        tokens = token_data['tokens']
        intended_device = token_data.get('device', model_device)
        print(f"Loaded tokens with shape {tokens.shape}, intended device: {intended_device}")
        # Ensure model is on the right device first
        semanticodec.to(model_device)
        print(f"Model device before tensor creation: {next(semanticodec.parameters()).device}")
        # Convert to a Long tensor (required for embedding lookups) on the model's device
        tokens_tensor = torch.tensor(tokens, dtype=torch.long).to(model_device)
        print(f"Tokens tensor on device: {tokens_tensor.device} with dtype: {tokens_tensor.dtype}")
        # Decode the tokens
        waveform = semanticodec.decode(tokens_tensor)
        print(f"Waveform device after decode: {waveform.device if isinstance(waveform, torch.Tensor) else 'numpy'}")
        # Move waveform to CPU for audio processing
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.cpu().numpy()
        # Extract the mono waveform; decode returns shape [batch, channel, time]
        audio_data = waveform[0, 0]
        print(f"Audio data shape: {audio_data.shape}, dtype: {audio_data.dtype}")
        # Return in Gradio Audio-compatible format: (sample_rate, audio_data)
        return (SAMPLE_RATE, audio_data), f"Decoded {tokens.shape[1]} tokens to audio"
    except Exception as e:
        print(f"Decoding error: {str(e)}")
        return None, f"Error decoding tokens: {str(e)}"
@spaces.GPU(duration=250)
def process_both(audio_path):
    """Encode and then decode the audio without saving intermediate files"""
    try:
        print(f"Processing both on device: {model_device}")
        # Ensure model is on the right device
        semanticodec.to(model_device)
        # Encode
        tokens = semanticodec.encode(audio_path)
        print(f"Tokens device after encode: {tokens.device if isinstance(tokens, torch.Tensor) else 'numpy'}")
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy()
        # Ensure tokens are in the expected [batch, seq_len, features] shape for decoding
        if tokens.ndim == 1:
            tokens = tokens.reshape(1, -1, 1)
        # Convert back to a Long tensor (required for embedding) on the model's device
        tokens_tensor = torch.tensor(tokens, dtype=torch.long).to(model_device)
        print(f"Tokens tensor on device: {tokens_tensor.device} with dtype: {tokens_tensor.dtype}")
        print(f"Model device before decode: {next(semanticodec.parameters()).device}")
        # Decode
        waveform = semanticodec.decode(tokens_tensor)
        print(f"Waveform device after decode: {waveform.device if isinstance(waveform, torch.Tensor) else 'numpy'}")
        # Move waveform to CPU for audio processing
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.cpu().numpy()
        # Extract the mono waveform; decode returns shape [batch, channel, time]
        audio_data = waveform[0, 0]
        print(f"Audio data shape: {audio_data.shape}, dtype: {audio_data.dtype}")
        # Return in Gradio Audio-compatible format: (sample_rate, audio_data)
        return (SAMPLE_RATE, audio_data), f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
    except Exception as e:
        print(f"Processing error: {str(e)}")
        return None, f"Error processing audio: {str(e)}"
@spaces.GPU(duration=250)
def stream_both(audio_path):
    """Encode and then stream-decode the audio"""
    try:
        print(f"Processing both (streaming) on device: {model_device}")
        # Ensure model is on the right device
        semanticodec.to(model_device)
        # First encode the audio
        tokens = semanticodec.encode(audio_path)
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy()
        # Ensure tokens are in the expected [batch, seq_len, features] shape for decoding
        if tokens.ndim == 1:
            tokens = tokens.reshape(1, -1, 1)
        print(f"Encoded audio to {tokens.shape[1]} tokens, now streaming the decode...")
        yield None, f"Encoded to {tokens.shape[1]} tokens, starting decoding..."
        # For short clips, decode everything at once
        # (1500 tokens is about 15 seconds at 100 tokens/sec)
        if tokens.shape[1] < 1500:
            tokens_tensor = torch.tensor(tokens, dtype=torch.long).to(model_device)
            waveform = semanticodec.decode(tokens_tensor)
            if isinstance(waveform, torch.Tensor):
                waveform = waveform.cpu().numpy()
            audio_data = waveform[0, 0]
            yield (SAMPLE_RATE, audio_data), f"Encoded to {tokens.shape[1]} tokens and decoded to audio"
            return
        # Split tokens into chunks for streaming
        chunk_size = 1500  # ~15 seconds at 100 tokens/sec
        num_chunks = (tokens.shape[1] + chunk_size - 1) // chunk_size  # Ceiling division
        all_audio_chunks = []
        for i in range(num_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, tokens.shape[1])
            print(f"Decoding chunk {i+1}/{num_chunks}, tokens {start_idx} to {end_idx}")
            # Extract and decode this chunk of tokens
            token_chunk = tokens[:, start_idx:end_idx, :]
            tokens_tensor = torch.tensor(token_chunk, dtype=torch.long).to(model_device)
            waveform = semanticodec.decode(tokens_tensor)
            if isinstance(waveform, torch.Tensor):
                waveform = waveform.cpu().numpy()
            audio_chunk = waveform[0, 0]
            all_audio_chunks.append(audio_chunk)
            # Yield the audio decoded so far for streaming playback
            combined_audio = np.concatenate(all_audio_chunks)
            yield (SAMPLE_RATE, combined_audio), f"Encoded to {tokens.shape[1]} tokens\nDecoded chunk {i+1}/{num_chunks} ({end_idx}/{tokens.shape[1]} tokens)"
            # Small delay to let Gradio update the UI
            time.sleep(0.1)
        # Final complete audio
        combined_audio = np.concatenate(all_audio_chunks)
        yield (SAMPLE_RATE, combined_audio), f"Completed: Encoded to {tokens.shape[1]} tokens and fully decoded"
    except Exception as e:
        print(f"Streaming process error: {str(e)}")
        yield None, f"Error processing audio: {str(e)}"
@spaces.GPU(duration=250)
def stream_decode_tokens(token_file):
    """Decode tokens to audio in streaming chunks"""
    # Ensure the file exists and has content
    if not token_file or not os.path.exists(token_file):
        yield None, "Error: Empty or missing token file"
        return
    try:
        # Load the pickled token dict written by encode_audio
        with open(token_file, "rb") as f:
            token_data = pickle.load(f)
        tokens = token_data['tokens']
        intended_device = token_data.get('device', model_device)
        print(f"Loaded tokens with shape {tokens.shape}, intended device: {intended_device}")
        # Ensure model is on the right device
        semanticodec.to(model_device)
        # For short clips, decode everything at once
        # (1500 tokens is about 15 seconds at 100 tokens/sec)
        if tokens.shape[1] < 1500:
            tokens_tensor = torch.tensor(tokens, dtype=torch.long).to(model_device)
            waveform = semanticodec.decode(tokens_tensor)
            if isinstance(waveform, torch.Tensor):
                waveform = waveform.cpu().numpy()
            audio_data = waveform[0, 0]
            yield (SAMPLE_RATE, audio_data), f"Decoded {tokens.shape[1]} tokens to audio"
            return
        # Split tokens into chunks for streaming
        chunk_size = 1500  # ~15 seconds at 100 tokens/sec
        num_chunks = (tokens.shape[1] + chunk_size - 1) // chunk_size  # Ceiling division
        # First status update
        yield None, f"Starting decode of {tokens.shape[1]} tokens in {num_chunks} chunks..."
        all_audio_chunks = []
        for i in range(num_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, tokens.shape[1])
            print(f"Decoding chunk {i+1}/{num_chunks}, tokens {start_idx} to {end_idx}")
            # Extract and decode this chunk of tokens
            token_chunk = tokens[:, start_idx:end_idx, :]
            tokens_tensor = torch.tensor(token_chunk, dtype=torch.long).to(model_device)
            waveform = semanticodec.decode(tokens_tensor)
            if isinstance(waveform, torch.Tensor):
                waveform = waveform.cpu().numpy()
            audio_chunk = waveform[0, 0]
            all_audio_chunks.append(audio_chunk)
            # Yield the audio decoded so far for streaming playback
            combined_audio = np.concatenate(all_audio_chunks)
            yield (SAMPLE_RATE, combined_audio), f"Decoded chunk {i+1}/{num_chunks} ({end_idx}/{tokens.shape[1]} tokens)"
            # Small delay to let Gradio update the UI
            time.sleep(0.1)
        # Final complete audio
        combined_audio = np.concatenate(all_audio_chunks)
        yield (SAMPLE_RATE, combined_audio), f"Completed decoding all {tokens.shape[1]} tokens"
    except Exception as e:
        print(f"Streaming decode error: {str(e)}")
        yield None, f"Error decoding tokens: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Oterin Audio Codec") as demo:
    gr.Markdown("# Oterin Audio Codec")
    gr.Markdown("Upload an audio file to encode it to semantic tokens, decode tokens back to audio, or do both.")
    with gr.Tab("Encode Audio"):
        with gr.Row():
            encode_input = gr.Audio(type="filepath", label="Input Audio")
            encode_output = gr.File(label="Encoded Tokens (.oterin)", file_types=[".oterin"])
        encode_status = gr.Textbox(label="Status")
        encode_btn = gr.Button("Encode")
        encode_btn.click(encode_audio, inputs=encode_input, outputs=[encode_output, encode_status])
    with gr.Tab("Decode Tokens"):
        with gr.Row():
            decode_input = gr.File(label="Token File (.oterin)", file_types=[".oterin"])
            decode_output = gr.Audio(label="Decoded Audio")
        decode_status = gr.Textbox(label="Status")
        decode_btn = gr.Button("Decode")
        decode_btn.click(decode_tokens, inputs=decode_input, outputs=[decode_output, decode_status])
    with gr.Tab("Stream Decode (Listen while decoding)"):
        with gr.Row():
            stream_decode_input = gr.File(label="Token File (.oterin)", file_types=[".oterin"])
            stream_decode_output = gr.Audio(label="Streaming Audio Output")
        stream_decode_status = gr.Textbox(label="Status")
        stream_decode_btn = gr.Button("Start Streaming Decode")
        stream_decode_btn.click(
            stream_decode_tokens,
            inputs=stream_decode_input,
            outputs=[stream_decode_output, stream_decode_status],
            show_progress=True
        )
    with gr.Tab("Both (Encode & Decode)"):
        with gr.Row():
            both_input = gr.Audio(type="filepath", label="Input Audio")
            both_output = gr.Audio(label="Reconstructed Audio")
        both_status = gr.Textbox(label="Status")
        both_btn = gr.Button("Process")
        both_btn.click(process_both, inputs=both_input, outputs=[both_output, both_status])
    with gr.Tab("Both Streaming (Encode & Stream Decode)"):
        with gr.Row():
            stream_both_input = gr.Audio(type="filepath", label="Input Audio")
            stream_both_output = gr.Audio(label="Streaming Reconstructed Audio")
        stream_both_status = gr.Textbox(label="Status")
        stream_both_btn = gr.Button("Encode & Stream Decode")
        stream_both_btn.click(
            stream_both,
            inputs=stream_both_input,
            outputs=[stream_both_output, stream_both_status],
            show_progress=True
        )
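# Note: stream_decode_tokens and stream_both are generators. Gradio 4.x queues
# events by default, but on Gradio 3.x generator outputs need the queue enabled
# explicitly, e.g. demo.queue().launch(share=True).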
if __name__ == "__main__":
    demo.launch(share=True)