Update app.py
app.py CHANGED
@@ -15,10 +15,17 @@ from pathlib import Path
 
 # Initialize the model and ensure it's on the correct device
 def load_model():
-    model = SemantiCodec(token_rate=
+    model = SemantiCodec(token_rate=25, semantic_vocab_size=32768)  # 0.35 kbps
     if torch.cuda.is_available():
-        # Move the model to CUDA
-        model.to("cuda:0")
+        # Move the model to CUDA and ensure it's fully initialized on CUDA
+        model = model.to("cuda:0")
+        # Force CUDA initialization
+        dummy_input = torch.zeros(1, 1, 1, dtype=torch.long).cuda()
+        try:
+            with torch.no_grad():
+                _ = model.decoder(dummy_input)
+        except:
+            print("Dummy forward pass failed, but CUDA initialization attempted")
     return model
 
 # Initialize model
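Note: the added dummy forward pass exists only to force PyTorch's lazy CUDA initialization (context creation, kernel loading) at startup instead of on the first user request. A minimal, self-contained sketch of the same warm-up pattern; the `warm_up` helper and the `nn.Embedding` stand-in are illustrative, not part of SemantiCodec:

```python
import torch
import torch.nn as nn

def warm_up(model: nn.Module) -> nn.Module:
    """Move a model to the GPU (when available) and run one throwaway
    forward pass so one-time CUDA costs are paid at startup."""
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    # Long dtype to match token-index inputs, as in the diff
    dummy = torch.zeros(1, 1, dtype=torch.long, device=device)
    try:
        with torch.no_grad():
            _ = model(dummy)
    except Exception as e:
        print(f"Warm-up forward failed ({e}); device transfer still done")
    return model

# Hypothetical usage: an embedding layer expects Long indices,
# roughly like a codec decoder's token input
decoder = warm_up(nn.Embedding(num_embeddings=32768, embedding_dim=8))
```

A bare `except:` as in the committed code also swallows `KeyboardInterrupt` and `SystemExit`; catching `Exception` is the safer default.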
@@ -28,13 +35,16 @@ model_device = "cuda:0" if torch.cuda.is_available() else "cpu"
 print(f"Model initialized on device: {model_device}")
 
 # Define sample rate as a constant
-SAMPLE_RATE = 32000
+# Changed from 32000 to 16000 to fix playback speed
+SAMPLE_RATE = 16000
 
 @spaces.GPU(duration=20)
 def encode_audio(audio_path):
     """Encode audio file to tokens and return them as a file"""
     try:
         print(f"Encoding audio on device: {model_device}")
+        # Ensure model is on the right device
+        semanticodec.to(model_device)
         tokens = semanticodec.encode(audio_path)
         print(f"Tokens device after encode: {tokens.device if isinstance(tokens, torch.Tensor) else 'numpy'}")
 
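Note: the playback-speed fix is plain sample-rate arithmetic: duration equals sample count divided by the declared rate, so audio reconstructed at 16 kHz but labeled 32 kHz plays in half the time at double pitch. A worked example (assuming, as the new comment implies, that the codec outputs 16 kHz audio):

```python
import numpy as np

samples = np.zeros(16000 * 2)   # 2 seconds of audio decoded at 16 kHz
print(len(samples) / 16000)     # 2.0 -> labeled 16000 Hz: correct speed
print(len(samples) / 32000)     # 1.0 -> labeled 32000 Hz: plays in half
                                #        the time, an octave too high
```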
@@ -88,6 +98,10 @@ def decode_tokens(token_file):
         intended_device = token_data.get('device', model_device)
         print(f"Loaded tokens with shape {tokens.shape}, intended device: {intended_device}")
 
+        # Ensure model is on the right device first
+        semanticodec.to(model_device)
+        print(f"Model device before tensor creation: {next(semanticodec.parameters()).device}")
+
         # Convert to torch tensor with Long dtype for embedding
         tokens_tensor = torch.tensor(tokens, dtype=torch.long)
         print(f"Tokens tensor created on device: {tokens_tensor.device} with dtype: {tokens_tensor.dtype}")
@@ -96,10 +110,6 @@ def decode_tokens(token_file):
         tokens_tensor = tokens_tensor.to(model_device)
         print(f"Tokens moved to device: {tokens_tensor.device}")
 
-        # Also ensure model is on the expected device
-        semanticodec.to(model_device)
-        print(f"Model device before decode: {next(semanticodec.parameters()).device}")
-
         # Decode the tokens
         waveform = semanticodec.decode(tokens_tensor)
         print(f"Waveform device after decode: {waveform.device if isinstance(waveform, torch.Tensor) else 'numpy'}")
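Note: dropping the second `semanticodec.to(model_device)` before decode is safe because the same call is now made earlier in the function, and `nn.Module.to()` moves parameters in place, so repeating it on an already-moved model is a no-op. `Tensor.to()`, by contrast, returns a new tensor and must be reassigned, which is why `tokens_tensor = tokens_tensor.to(model_device)` keeps the assignment. A small sketch of the distinction:

```python
import torch
import torch.nn as nn

layer = nn.Linear(4, 4)
vec = torch.zeros(4)

layer.to("cpu")      # nn.Module.to() modifies the module in place (and
                     # returns it), so a repeated semanticodec.to(model_device)
                     # on an already-moved model changes nothing
vec = vec.to("cpu")  # Tensor.to() returns a new tensor, so the result
                     # must be reassigned, as with tokens_tensor above
print(next(layer.parameters()).device, vec.device)
```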
@@ -124,6 +134,8 @@ def process_both(audio_path):
     """Encode and then decode the audio without saving intermediate files"""
     try:
         print(f"Processing both on device: {model_device}")
+        # Ensure model is on the right device
+        semanticodec.to(model_device)
         # Encode
         tokens = semanticodec.encode(audio_path)
         print(f"Tokens device after encode: {tokens.device if isinstance(tokens, torch.Tensor) else 'numpy'}")
@@ -144,7 +156,7 @@ def process_both(audio_path):
         tokens_tensor = tokens_tensor.to(model_device)
         print(f"Tokens moved to device: {tokens_tensor.device}")
 
-        #
+        # Ensure model is on the right device again before decoding
         semanticodec.to(model_device)
         print(f"Model device before decode: {next(semanticodec.parameters()).device}")
 
@@ -167,6 +179,88 @@ def process_both(audio_path):
         print(f"Processing error: {str(e)}")
         return None, f"Error processing audio: {str(e)}"
 
+@spaces.GPU(duration=360)
+def stream_both(audio_path):
+    """Encode and then stream decode the audio"""
+    try:
+        print(f"Processing both (streaming) on device: {model_device}")
+        # Ensure model is on the right device
+        semanticodec.to(model_device)
+
+        # First encode the audio
+        tokens = semanticodec.encode(audio_path)
+        if isinstance(tokens, torch.Tensor):
+            tokens = tokens.cpu().numpy()
+
+        # Ensure tokens are in the right shape for decoding
+        if tokens.ndim == 1:
+            tokens = tokens.reshape(1, -1, 1)
+
+        print(f"Encoded audio to {tokens.shape[1]} tokens, now streaming decoding...")
+        yield None, f"Encoded to {tokens.shape[1]} tokens, starting decoding..."
+
+        # If tokens are too small, decode all at once
+        if tokens.shape[1] < 500:
+            # Convert to torch tensor with Long dtype for embedding
+            tokens_tensor = torch.tensor(tokens, dtype=torch.long).to(model_device)
+
+            # Decode the tokens
+            semanticodec.to(model_device)
+            waveform = semanticodec.decode(tokens_tensor)
+            if isinstance(waveform, torch.Tensor):
+                waveform = waveform.cpu().numpy()
+
+            audio_data = waveform[0, 0]
+            yield (SAMPLE_RATE, audio_data), f"Encoded to {tokens.shape[1]} tokens and decoded to audio"
+            return
+
+        # Split tokens into chunks for streaming
+        chunk_size = 500  # Number of tokens per chunk
+        num_chunks = (tokens.shape[1] + chunk_size - 1) // chunk_size  # Ceiling division
+
+        all_audio_chunks = []
+
+        for i in range(num_chunks):
+            start_idx = i * chunk_size
+            end_idx = min((i + 1) * chunk_size, tokens.shape[1])
+
+            print(f"Decoding chunk {i+1}/{num_chunks}, tokens {start_idx} to {end_idx}")
+
+            # Extract chunk of tokens
+            token_chunk = tokens[:, start_idx:end_idx, :]
+
+            # Convert to torch tensor with Long dtype
+            tokens_tensor = torch.tensor(token_chunk, dtype=torch.long).to(model_device)
+
+            # Ensure model is on the expected device
+            semanticodec.to(model_device)
+
+            # Decode the tokens
+            waveform = semanticodec.decode(tokens_tensor)
+            if isinstance(waveform, torch.Tensor):
+                waveform = waveform.cpu().numpy()
+
+            # Extract audio data
+            audio_chunk = waveform[0, 0]
+            all_audio_chunks.append(audio_chunk)
+
+            # Combine all chunks we have so far
+            combined_audio = np.concatenate(all_audio_chunks)
+
+            # Yield the combined audio for streaming playback
+            yield (SAMPLE_RATE, combined_audio), f"Encoded to {tokens.shape[1]} tokens\nDecoded chunk {i+1}/{num_chunks} ({end_idx}/{tokens.shape[1]} tokens)"
+
+            # Small delay to allow Gradio to update the UI
+            time.sleep(0.1)
+
+        # Final complete audio
+        combined_audio = np.concatenate(all_audio_chunks)
+        yield (SAMPLE_RATE, combined_audio), f"Completed: Encoded to {tokens.shape[1]} tokens and fully decoded"
+
+    except Exception as e:
+        print(f"Streaming process error: {str(e)}")
+        yield None, f"Error processing audio: {str(e)}"
+
 @spaces.GPU(duration=360)
 def stream_decode_tokens(token_file):
     """Decode tokens to audio in streaming chunks"""
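Note: `stream_both` boils down to ceiling-division chunking plus a generator that yields the cumulative waveform so far. A model-free sketch of that skeleton; `decode_chunk` is a hypothetical stand-in for `semanticodec.decode`, and the 320 samples-per-token figure is arbitrary:

```python
import numpy as np

def decode_chunk(tokens: np.ndarray) -> np.ndarray:
    """Hypothetical stand-in for semanticodec.decode:
    pretend every token becomes 320 audio samples."""
    return np.zeros(tokens.shape[1] * 320, dtype=np.float32)

def stream_decode(tokens: np.ndarray, chunk_size: int = 500):
    num_chunks = (tokens.shape[1] + chunk_size - 1) // chunk_size  # ceiling division
    decoded = []
    for i in range(num_chunks):
        start = i * chunk_size
        end = min(start + chunk_size, tokens.shape[1])
        decoded.append(decode_chunk(tokens[:, start:end, :]))
        # Yield everything decoded so far, so playback can begin early
        yield np.concatenate(decoded), f"chunk {i + 1}/{num_chunks}"

tokens = np.zeros((1, 1234, 1), dtype=np.int64)
for audio, status in stream_decode(tokens):
    print(status, audio.shape)  # chunks of 500, 500, 234 tokens
```

One caveat: decoding fixed windows independently may introduce audible seams at chunk boundaries if the decoder relies on context beyond the window.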
@@ -184,6 +278,9 @@ def stream_decode_tokens(token_file):
         intended_device = token_data.get('device', model_device)
         print(f"Loaded tokens with shape {tokens.shape}, intended device: {intended_device}")
 
+        # Ensure model is on the right device
+        semanticodec.to(model_device)
+
         # If tokens are too small, decode all at once
         if tokens.shape[1] < 500:
             # Convert to torch tensor with Long dtype for embedding
@@ -291,6 +388,19 @@ with gr.Blocks(title="Oterin Audio Codec") as demo:
             both_status = gr.Textbox(label="Status")
             both_btn = gr.Button("Process")
             both_btn.click(process_both, inputs=both_input, outputs=[both_output, both_status])
+
+    with gr.Tab("Both Streaming (Encode & Stream Decode)"):
+        with gr.Row():
+            stream_both_input = gr.Audio(type="filepath", label="Input Audio")
+            stream_both_output = gr.Audio(label="Streaming Reconstructed Audio")
+        stream_both_status = gr.Textbox(label="Status")
+        stream_both_btn = gr.Button("Encode & Stream Decode")
+        stream_both_btn.click(
+            stream_both,
+            inputs=stream_both_input,
+            outputs=[stream_both_output, stream_both_status],
+            show_progress=True
+        )
 
 if __name__ == "__main__":
     demo.launch(share=True)
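Note: the new tab streams because Gradio treats a generator event handler as a stream: each `yield` pushes fresh values to the outputs, which is why `stream_both` yields `(SAMPLE_RATE, audio)` tuples instead of returning once. A minimal, app-independent sketch of the same wiring:

```python
import time
import gradio as gr

def count_up(n):
    # A generator handler: every yield pushes a fresh value to the outputs
    for i in range(int(n)):
        time.sleep(0.5)
        yield f"step {i + 1} of {int(n)}"

with gr.Blocks() as demo:
    steps = gr.Number(value=5, label="Steps")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(count_up, inputs=steps, outputs=status)

if __name__ == "__main__":
    demo.launch()
```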