import os
import gradio as gr
import numpy as np
import soundfile as sf
from semanticodec import SemantiCodec
from huggingface_hub import HfApi
import spaces
import torch
import tempfile
import io
import uuid
import pickle
import time
from pathlib import Path

# Initialize the model and ensure it's on the correct device
def load_model():
    model = SemantiCodec(token_rate=100, semantic_vocab_size=32768)  # 0.35 kbps
    if torch.cuda.is_available():
        # Move the model to CUDA and ensure it's fully initialized on CUDA
        model = model.to("cuda:0")
        # Force CUDA initialization
        dummy_input = torch.zeros(1, 1, 1, dtype=torch.long).cuda()
        try:
            with torch.no_grad():
                _ = model.decoder(dummy_input)
        except:
            print("Dummy forward pass failed, but CUDA initialization attempted")
    return model

# Initialize model
semanticodec = load_model()
# Get the device of the model
model_device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Model initialized on device: {model_device}")

# Define sample rate as a constant
# Changed from 32000 to 16000 to fix playback speed
SAMPLE_RATE = 16000

@spaces.GPU(duration=20)
def encode_audio(audio_path):
    """Encode audio file to tokens and return them as a file"""
    try:
        print(f"Encoding audio on device: {model_device}")
        # Ensure model is on the right device
        semanticodec.to(model_device)
        tokens = semanticodec.encode(audio_path)
        print(f"Tokens device after encode: {tokens.device if isinstance(tokens, torch.Tensor) else 'numpy'}")
        
        # Move tokens to CPU before converting to numpy
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy()
        
        # Ensure tokens are in the right shape for later decoding
        if tokens.ndim == 1:
            # Reshape to match expected format [batch, seq_len, features]
            tokens = tokens.reshape(1, -1, 1)
        
        # Save tokens in a way that preserves shape information
        token_data = {
            'tokens': tokens,
            'shape': tokens.shape,
            'device': str(model_device)  # Store intended device information
        }
        
        # Create a temporary file in /tmp which is writable in Spaces
        temp_dir = "/tmp"
        os.makedirs(temp_dir, exist_ok=True)
        temp_file_path = os.path.join(temp_dir, f"tokens_{uuid.uuid4()}.oterin")
        
        # Write using pickle instead of numpy save
        with open(temp_file_path, "wb") as f:
            pickle.dump(token_data, f)
        
        # Verify the file exists and has content
        if not os.path.exists(temp_file_path) or os.path.getsize(temp_file_path) == 0:
            raise Exception("Failed to create token file")
        
        return temp_file_path, f"Encoded to {tokens.shape[1]} tokens"
    except Exception as e:
        print(f"Encoding error: {str(e)}")
        return None, f"Error encoding audio: {str(e)}"

@spaces.GPU(duration=160)
def decode_tokens(token_file):
    """Decode tokens to audio"""
    # Ensure the file exists and has content
    if not token_file or not os.path.exists(token_file):
        return None, "Error: Empty or missing token file"
    
    try:
        # Load tokens using pickle instead of numpy load
        with open(token_file, "rb") as f:
            token_data = pickle.load(f)
        
        tokens = token_data['tokens']
        intended_device = token_data.get('device', model_device)
        print(f"Loaded tokens with shape {tokens.shape}, intended device: {intended_device}")
        
        # Ensure model is on the right device first
        semanticodec.to(model_device)
        print(f"Model device before tensor creation: {next(semanticodec.parameters()).device}")
        
        # Convert to torch tensor with Long dtype for embedding
        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        print(f"Tokens tensor created on device: {tokens_tensor.device} with dtype: {tokens_tensor.dtype}")
        
        # Explicitly move tokens to the model's device
        tokens_tensor = tokens_tensor.to(model_device)
        print(f"Tokens moved to device: {tokens_tensor.device}")
        
        # Decode the tokens
        waveform = semanticodec.decode(tokens_tensor)
        print(f"Waveform device after decode: {waveform.device if isinstance(waveform, torch.Tensor) else 'numpy'}")
        
        # Move waveform to CPU for audio processing
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.cpu().numpy()
        
        # Extract audio data - this should be a numpy array
        audio_data = waveform[0, 0]  # Shape should be [time]
        
        print(f"Audio data shape: {audio_data.shape}, dtype: {audio_data.dtype}")
        
        # Return in Gradio Audio compatible format: (sample_rate, audio_data)
        return (SAMPLE_RATE, audio_data), f"Decoded {tokens.shape[1]} tokens to audio"
    except Exception as e:
        print(f"Decoding error: {str(e)}")
        return None, f"Error decoding tokens: {str(e)}"

@spaces.GPU(duration=250)
def process_both(audio_path):
    """Encode and then decode the audio without saving intermediate files"""
    try:
        print(f"Processing both on device: {model_device}")
        # Ensure model is on the right device
        semanticodec.to(model_device)
        # Encode
        tokens = semanticodec.encode(audio_path)
        print(f"Tokens device after encode: {tokens.device if isinstance(tokens, torch.Tensor) else 'numpy'}")
        
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy()
        
        # Ensure tokens are in the right shape for decoding
        if tokens.ndim == 1:
            # Reshape to match expected format [batch, seq_len, features]
            tokens = tokens.reshape(1, -1, 1)
        
        # Convert back to torch tensor with Long dtype for embedding
        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        print(f"Tokens tensor created on device: {tokens_tensor.device} with dtype: {tokens_tensor.dtype}")
        
        # Explicitly move tokens to the model's device
        tokens_tensor = tokens_tensor.to(model_device)
        print(f"Tokens moved to device: {tokens_tensor.device}")
        
        # Ensure model is on the right device again before decoding
        semanticodec.to(model_device)
        print(f"Model device before decode: {next(semanticodec.parameters()).device}")
        
        # Decode
        waveform = semanticodec.decode(tokens_tensor)
        print(f"Waveform device after decode: {waveform.device if isinstance(waveform, torch.Tensor) else 'numpy'}")
        
        # Move waveform to CPU for audio processing
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.cpu().numpy()
        
        # Extract audio data - this should be a numpy array
        audio_data = waveform[0, 0]  # Shape should be [time]
        
        print(f"Audio data shape: {audio_data.shape}, dtype: {audio_data.dtype}")
        
        # Return in Gradio Audio compatible format: (sample_rate, audio_data)
        return (SAMPLE_RATE, audio_data), f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
    except Exception as e:
        print(f"Processing error: {str(e)}")
        return None, f"Error processing audio: {str(e)}"

@spaces.GPU(duration=250)
def stream_both(audio_path):
    """Encode and then stream decode the audio"""
    try:
        print(f"Processing both (streaming) on device: {model_device}")
        # Ensure model is on the right device
        semanticodec.to(model_device)
        
        # First encode the audio
        tokens = semanticodec.encode(audio_path)
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy()
        
        # Ensure tokens are in the right shape for decoding
        if tokens.ndim == 1:
            tokens = tokens.reshape(1, -1, 1)
        
        print(f"Encoded audio to {tokens.shape[1]} tokens, now streaming decoding...")
        yield None, f"Encoded to {tokens.shape[1]} tokens, starting decoding..."
        
        # If tokens are too small, decode all at once
        if tokens.shape[1] < 1500:  # Changed from 500 to 1500 (15 seconds at 100 tokens/sec)
            # Convert to torch tensor with Long dtype for embedding
            tokens_tensor = torch.tensor(tokens, dtype=torch.long).to(model_device)
            
            # Decode the tokens
            semanticodec.to(model_device)
            waveform = semanticodec.decode(tokens_tensor)
            if isinstance(waveform, torch.Tensor):
                waveform = waveform.cpu().numpy()
            
            audio_data = waveform[0, 0]
            yield (SAMPLE_RATE, audio_data), f"Encoded to {tokens.shape[1]} tokens and decoded to audio"
            return
        
        # Split tokens into chunks for streaming
        chunk_size = 1500  # Changed from 500 to 1500 (15 seconds at 100 tokens/sec)
        num_chunks = (tokens.shape[1] + chunk_size - 1) // chunk_size  # Ceiling division
        
        all_audio_chunks = []
        
        for i in range(num_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, tokens.shape[1])
            
            print(f"Decoding chunk {i+1}/{num_chunks}, tokens {start_idx} to {end_idx}")
            
            # Extract chunk of tokens
            token_chunk = tokens[:, start_idx:end_idx, :]
            
            # Convert to torch tensor with Long dtype
            tokens_tensor = torch.tensor(token_chunk, dtype=torch.long).to(model_device)
            
            # Ensure model is on the expected device
            semanticodec.to(model_device)
            
            # Decode the tokens
            waveform = semanticodec.decode(tokens_tensor)
            if isinstance(waveform, torch.Tensor):
                waveform = waveform.cpu().numpy()
            
            # Extract audio data
            audio_chunk = waveform[0, 0]
            all_audio_chunks.append(audio_chunk)
            
            # Combine all chunks we have so far
            combined_audio = np.concatenate(all_audio_chunks)
            
            # Yield the combined audio for streaming playback
            yield (SAMPLE_RATE, combined_audio), f"Encoded to {tokens.shape[1]} tokens\nDecoded chunk {i+1}/{num_chunks} ({end_idx}/{tokens.shape[1]} tokens)"
            
            # Small delay to allow Gradio to update UI
            time.sleep(0.1)
        
        # Final complete audio
        combined_audio = np.concatenate(all_audio_chunks)
        yield (SAMPLE_RATE, combined_audio), f"Completed: Encoded to {tokens.shape[1]} tokens and fully decoded"
        
    except Exception as e:
        print(f"Streaming process error: {str(e)}")
        yield None, f"Error processing audio: {str(e)}"

@spaces.GPU(duration=250)
def stream_decode_tokens(token_file):
    """Decode tokens to audio in streaming chunks"""
    # Ensure the file exists and has content
    if not token_file or not os.path.exists(token_file):
        yield None, "Error: Empty or missing token file"
        return
    
    try:
        # Load tokens using pickle instead of numpy load
        with open(token_file, "rb") as f:
            token_data = pickle.load(f)
        
        tokens = token_data['tokens']
        intended_device = token_data.get('device', model_device)
        print(f"Loaded tokens with shape {tokens.shape}, intended device: {intended_device}")
        
        # Ensure model is on the right device
        semanticodec.to(model_device)
        
        # If tokens are too small, decode all at once
        if tokens.shape[1] < 1500:  # Changed from 500 to 1500 (15 seconds at 100 tokens/sec)
            # Convert to torch tensor with Long dtype for embedding
            tokens_tensor = torch.tensor(tokens, dtype=torch.long)
            tokens_tensor = tokens_tensor.to(model_device)
            
            # Decode the tokens
            waveform = semanticodec.decode(tokens_tensor)
            if isinstance(waveform, torch.Tensor):
                waveform = waveform.cpu().numpy()
            
            audio_data = waveform[0, 0]
            yield (SAMPLE_RATE, audio_data), f"Decoded {tokens.shape[1]} tokens to audio"
            return
        
        # Split tokens into chunks for streaming
        chunk_size = 1500  # Changed from 500 to 1500 (15 seconds at 100 tokens/sec)
        num_chunks = (tokens.shape[1] + chunk_size - 1) // chunk_size  # Ceiling division
        
        # First status update
        yield None, f"Starting decoding of {tokens.shape[1]} tokens in {num_chunks} chunks..."
        
        all_audio_chunks = []
        
        for i in range(num_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, tokens.shape[1])
            
            print(f"Decoding chunk {i+1}/{num_chunks}, tokens {start_idx} to {end_idx}")
            
            # Extract chunk of tokens
            token_chunk = tokens[:, start_idx:end_idx, :]
            
            # Convert to torch tensor with Long dtype
            tokens_tensor = torch.tensor(token_chunk, dtype=torch.long)
            tokens_tensor = tokens_tensor.to(model_device)
            
            # Ensure model is on the expected device
            semanticodec.to(model_device)
            
            # Decode the tokens
            waveform = semanticodec.decode(tokens_tensor)
            if isinstance(waveform, torch.Tensor):
                waveform = waveform.cpu().numpy()
            
            # Extract audio data
            audio_chunk = waveform[0, 0]
            all_audio_chunks.append(audio_chunk)
            
            # Combine all chunks we have so far
            combined_audio = np.concatenate(all_audio_chunks)
            
            # Yield the combined audio for streaming playback
            yield (SAMPLE_RATE, combined_audio), f"Decoded chunk {i+1}/{num_chunks} ({end_idx}/{tokens.shape[1]} tokens)"
            
            # Small delay to allow Gradio to update UI
            time.sleep(0.1)
        
        # Final complete audio
        combined_audio = np.concatenate(all_audio_chunks)
        yield (SAMPLE_RATE, combined_audio), f"Completed decoding all {tokens.shape[1]} tokens"
        
    except Exception as e:
        print(f"Streaming decode error: {str(e)}")
        yield None, f"Error decoding tokens: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Oterin Audio Codec") as demo:
    gr.Markdown("# Oterin Audio Codec")
    gr.Markdown("Upload an audio file to encode it to semantic tokens, decode tokens back to audio, or do both.")
    
    with gr.Tab("Encode Audio"):
        with gr.Row():
            encode_input = gr.Audio(type="filepath", label="Input Audio")
            encode_output = gr.File(label="Encoded Tokens (.oterin)", file_types=[".oterin"])
        encode_status = gr.Textbox(label="Status")
        encode_btn = gr.Button("Encode")
        encode_btn.click(encode_audio, inputs=encode_input, outputs=[encode_output, encode_status])
    
    with gr.Tab("Decode Tokens"):
        with gr.Row():
            decode_input = gr.File(label="Token File (.oterin)", file_types=[".oterin"])
            decode_output = gr.Audio(label="Decoded Audio")
        decode_status = gr.Textbox(label="Status")
        decode_btn = gr.Button("Decode")
        decode_btn.click(decode_tokens, inputs=decode_input, outputs=[decode_output, decode_status])
    
    with gr.Tab("Stream Decode (Listen while decoding)"):
        with gr.Row():
            stream_decode_input = gr.File(label="Token File (.oterin)", file_types=[".oterin"])
            stream_decode_output = gr.Audio(label="Streaming Audio Output")
        stream_decode_status = gr.Textbox(label="Status")
        stream_decode_btn = gr.Button("Start Streaming Decode")
        stream_decode_btn.click(
            stream_decode_tokens, 
            inputs=stream_decode_input, 
            outputs=[stream_decode_output, stream_decode_status],
            show_progress=True
        )
    
    with gr.Tab("Both (Encode & Decode)"):
        with gr.Row():
            both_input = gr.Audio(type="filepath", label="Input Audio")
            both_output = gr.Audio(label="Reconstructed Audio")
        both_status = gr.Textbox(label="Status")
        both_btn = gr.Button("Process")
        both_btn.click(process_both, inputs=both_input, outputs=[both_output, both_status])
    
    with gr.Tab("Both Streaming (Encode & Stream Decode)"):
        with gr.Row():
            stream_both_input = gr.Audio(type="filepath", label="Input Audio")
            stream_both_output = gr.Audio(label="Streaming Reconstructed Audio")
        stream_both_status = gr.Textbox(label="Status")
        stream_both_btn = gr.Button("Encode & Stream Decode")
        stream_both_btn.click(
            stream_both, 
            inputs=stream_both_input, 
            outputs=[stream_both_output, stream_both_status],
            show_progress=True
        )

if __name__ == "__main__":
    demo.launch(share=True)