Spaces:

owiedotch
/

oac

Sleeping

App Files Files Community

owiedotch committed on Feb 27

Commit

24e47df

verified ·

1 Parent(s): e5f91c1

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -56

app.py CHANGED Viewed

@@ -9,19 +9,31 @@ import torch
 import tempfile
 import io
 import uuid
 from pathlib import Path
-# Initialize the model
 def load_model():
-    return SemantiCodec(token_rate=100, semantic_vocab_size=32768)  # 1.40 kbps
 semanticodec = load_model()
 @spaces.GPU(duration=20)
 def encode_audio(audio_path):
     """Encode audio file to tokens and return them as a file"""
     try:
         tokens = semanticodec.encode(audio_path)
         # Move tokens to CPU before converting to numpy
         if isinstance(tokens, torch.Tensor):
             tokens = tokens.cpu().numpy()
@@ -31,23 +43,21 @@ def encode_audio(audio_path):
             # Reshape to match expected format [batch, seq_len, features]
             tokens = tokens.reshape(1, -1, 1)
-        # Save to a BytesIO buffer first
-        buffer = io.BytesIO()
-        np.save(buffer, tokens)
-        buffer.seek(0)
-        # Verify the buffer has content
-        if buffer.getbuffer().nbytes == 0:
-            raise Exception("Failed to create token buffer")
         # Create a temporary file in /tmp which is writable in Spaces
         temp_dir = "/tmp"
         os.makedirs(temp_dir, exist_ok=True)
         temp_file_path = os.path.join(temp_dir, f"tokens_{uuid.uuid4()}.oterin")
-        # Write buffer to the temporary file
         with open(temp_file_path, "wb") as f:
-            f.write(buffer.getvalue())
         # Verify the file exists and has content
         if not os.path.exists(temp_file_path) or os.path.getsize(temp_file_path) == 0:
@@ -55,9 +65,10 @@ def encode_audio(audio_path):
         return temp_file_path, f"Encoded to {tokens.shape[1]} tokens"
     except Exception as e:
         return None, f"Error encoding audio: {str(e)}"
-@spaces.GPU(duration=60)
 def decode_tokens(token_file):
     """Decode tokens to audio"""
     # Ensure the file exists and has content
@@ -65,25 +76,29 @@ def decode_tokens(token_file):
         return None, "Error: Empty or missing token file"
     try:
-        # Load tokens from file
-        tokens = np.load(token_file, allow_pickle=True)
-        # Convert to torch tensor with proper dimensions
-        if isinstance(tokens, np.ndarray):
-            # Ensure tokens are in the right shape
-            if tokens.ndim == 1:
-                # Reshape to match expected format [batch, seq_len, features]
-                tokens = tokens.reshape(1, -1, 1)
-            # Convert to torch tensor (on CPU first)
-            tokens = torch.tensor(tokens)
-        # Explicitly move tokens to CUDA
-        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-        tokens = tokens.to(device)
         # Decode the tokens
-        waveform = semanticodec.decode(tokens)
         # Move waveform to CPU for audio processing
         if isinstance(waveform, torch.Tensor):
@@ -100,14 +115,18 @@ def decode_tokens(token_file):
         return output_buffer, f"Decoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
         return None, f"Error decoding tokens: {str(e)}"
-@spaces.GPU(duration=80)
 def process_both(audio_path):
     """Encode and then decode the audio without saving intermediate files"""
     try:
         # Encode
         tokens = semanticodec.encode(audio_path)
         if isinstance(tokens, torch.Tensor):
             tokens = tokens.cpu().numpy()
@@ -117,14 +136,20 @@ def process_both(audio_path):
             tokens = tokens.reshape(1, -1, 1)
         # Convert back to torch tensor (on CPU first)
-        tokens_tensor = torch.tensor(tokens)
-        # Explicitly move tokens to CUDA
-        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-        tokens_tensor = tokens_tensor.to(device)
         # Decode
         waveform = semanticodec.decode(tokens_tensor)
         # Move waveform to CPU for audio processing
         if isinstance(waveform, torch.Tensor):
@@ -141,31 +166,15 @@ def process_both(audio_path):
         return output_buffer, f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
         return None, f"Error processing audio: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Oterin Audio Codec") as demo:
     gr.Markdown("# Oterin Audio Codec")
-    gr.Markdown("Upload an audio file to encode it to semantic tokens and decode back to audio.")
-    # Make "Both" the primary default tab
-    with gr.Tab("Encode & Decode"):
-        with gr.Row():
-            both_input = gr.Audio(type="filepath", label="Input Audio")
-            both_output = gr.Audio(label="Reconstructed Audio")
-        both_status = gr.Textbox(label="Status")
-        both_btn = gr.Button("Process", variant="primary")
-        both_btn.click(process_both, inputs=both_input, outputs=[both_output, both_status])
-        gr.Markdown("""
-        ## How it works
-        This option encodes your audio to semantic tokens and immediately decodes it back to audio.
-        It's the recommended way to use the codec as it handles all device management internally.
-        """)
-    # Keep separate functions as secondary options with warning
-    with gr.Tab("Advanced (Encode Only)"):
-        gr.Markdown("⚠️ **DEPRECATED**: Using separate encode/decode can lead to device mismatch errors. The combined Encode & Decode tab is recommended.")
         with gr.Row():
             encode_input = gr.Audio(type="filepath", label="Input Audio")
             encode_output = gr.File(label="Encoded Tokens (.oterin)", file_types=[".oterin"])
@@ -173,14 +182,21 @@ with gr.Blocks(title="Oterin Audio Codec") as demo:
         encode_btn = gr.Button("Encode")
         encode_btn.click(encode_audio, inputs=encode_input, outputs=[encode_output, encode_status])
-    with gr.Tab("Advanced (Decode Only)"):
-        gr.Markdown("⚠️ **DEPRECATED**: Using separate encode/decode can lead to device mismatch errors. The combined Encode & Decode tab is recommended.")
         with gr.Row():
             decode_input = gr.File(label="Token File (.oterin)", file_types=[".oterin"])
             decode_output = gr.Audio(label="Decoded Audio")
         decode_status = gr.Textbox(label="Status")
         decode_btn = gr.Button("Decode")
         decode_btn.click(decode_tokens, inputs=decode_input, outputs=[decode_output, decode_status])
 if __name__ == "__main__":
     demo.launch(share=True)

 import tempfile
 import io
 import uuid
+import pickle
 from pathlib import Path
+# Initialize the model and ensure it's on the correct device
 def load_model():
     """Create the SemantiCodec model and place it on CUDA when a GPU is visible.

     Returns:
         The constructed SemantiCodec instance (token_rate=100,
         semantic_vocab_size=32768). When ``torch.cuda.is_available()`` is
         true, ``.to("cuda:0")`` has been called on it before returning;
         otherwise it is returned as constructed.
     """
     codec = SemantiCodec(token_rate=100, semantic_vocab_size=32768)  # 1.40 kbps
     # Guard clause: nothing to move when no CUDA device is available.
     if not torch.cuda.is_available():
         return codec
     # The return value of .to() is deliberately not rebound, matching the
     # original call, which relies on the model being moved in place.
     codec.to("cuda:0")
     return codec
+# Initialize model
 semanticodec = load_model()
+# Get the device of the model
+model_device = "cuda:0" if torch.cuda.is_available() else "cpu"
+print(f"Model initialized on device: {model_device}")
 @spaces.GPU(duration=20)
 def encode_audio(audio_path):
     """Encode audio file to tokens and return them as a file"""
     try:
+        print(f"Encoding audio on device: {model_device}")
         tokens = semanticodec.encode(audio_path)
+        print(f"Tokens device after encode: {tokens.device if isinstance(tokens, torch.Tensor) else 'numpy'}")
         # Move tokens to CPU before converting to numpy
         if isinstance(tokens, torch.Tensor):
             tokens = tokens.cpu().numpy()
             # Reshape to match expected format [batch, seq_len, features]
             tokens = tokens.reshape(1, -1, 1)
+        # Save tokens in a way that preserves shape information
+        token_data = {
+            'tokens': tokens,
+            'shape': tokens.shape,
+            'device': str(model_device)  # Store intended device information
+        }
         # Create a temporary file in /tmp which is writable in Spaces
         temp_dir = "/tmp"
         os.makedirs(temp_dir, exist_ok=True)
         temp_file_path = os.path.join(temp_dir, f"tokens_{uuid.uuid4()}.oterin")
+        # Write using pickle instead of numpy save
         with open(temp_file_path, "wb") as f:
+            pickle.dump(token_data, f)
         # Verify the file exists and has content
         if not os.path.exists(temp_file_path) or os.path.getsize(temp_file_path) == 0:
         return temp_file_path, f"Encoded to {tokens.shape[1]} tokens"
     except Exception as e:
+        print(f"Encoding error: {str(e)}")
         return None, f"Error encoding audio: {str(e)}"
+@spaces.GPU(duration=340)
 def decode_tokens(token_file):
     """Decode tokens to audio"""
     # Ensure the file exists and has content
         return None, "Error: Empty or missing token file"
     try:
+        # Load tokens using pickle instead of numpy load
+        with open(token_file, "rb") as f:
+            token_data = pickle.load(f)
+        tokens = token_data['tokens']
+        intended_device = token_data.get('device', model_device)
+        print(f"Loaded tokens with shape {tokens.shape}, intended device: {intended_device}")
+        # Convert to torch tensor
+        tokens_tensor = torch.tensor(tokens, dtype=torch.float32)
+        print(f"Tokens tensor created on device: {tokens_tensor.device}")
+        # Explicitly move tokens to the model's device
+        tokens_tensor = tokens_tensor.to(model_device)
+        print(f"Tokens moved to device: {tokens_tensor.device}")
+        # Also ensure model is on the expected device
+        semanticodec.to(model_device)
+        print(f"Model device before decode: {next(semanticodec.parameters()).device}")
         # Decode the tokens
+        waveform = semanticodec.decode(tokens_tensor)
+        print(f"Waveform device after decode: {waveform.device if isinstance(waveform, torch.Tensor) else 'numpy'}")
         # Move waveform to CPU for audio processing
         if isinstance(waveform, torch.Tensor):
         return output_buffer, f"Decoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
+        print(f"Decoding error: {str(e)}")
         return None, f"Error decoding tokens: {str(e)}"
+@spaces.GPU(duration=360)
 def process_both(audio_path):
     """Encode and then decode the audio without saving intermediate files"""
     try:
+        print(f"Processing both on device: {model_device}")
         # Encode
         tokens = semanticodec.encode(audio_path)
+        print(f"Tokens device after encode: {tokens.device if isinstance(tokens, torch.Tensor) else 'numpy'}")
         if isinstance(tokens, torch.Tensor):
             tokens = tokens.cpu().numpy()
             tokens = tokens.reshape(1, -1, 1)
         # Convert back to torch tensor (on CPU first)
+        tokens_tensor = torch.tensor(tokens, dtype=torch.float32)
+        print(f"Tokens tensor created on device: {tokens_tensor.device}")
+        # Explicitly move tokens to the model's device
+        tokens_tensor = tokens_tensor.to(model_device)
+        print(f"Tokens moved to device: {tokens_tensor.device}")
+        # Also ensure model is on the expected device
+        semanticodec.to(model_device)
+        print(f"Model device before decode: {next(semanticodec.parameters()).device}")
         # Decode
         waveform = semanticodec.decode(tokens_tensor)
+        print(f"Waveform device after decode: {waveform.device if isinstance(waveform, torch.Tensor) else 'numpy'}")
         # Move waveform to CPU for audio processing
         if isinstance(waveform, torch.Tensor):
         return output_buffer, f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
     except Exception as e:
+        print(f"Processing error: {str(e)}")
         return None, f"Error processing audio: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Oterin Audio Codec") as demo:
     gr.Markdown("# Oterin Audio Codec")
+    gr.Markdown("Upload an audio file to encode it to semantic tokens, decode tokens back to audio, or do both.")
+    with gr.Tab("Encode Audio"):
         with gr.Row():
             encode_input = gr.Audio(type="filepath", label="Input Audio")
             encode_output = gr.File(label="Encoded Tokens (.oterin)", file_types=[".oterin"])
         encode_btn = gr.Button("Encode")
         encode_btn.click(encode_audio, inputs=encode_input, outputs=[encode_output, encode_status])
+    with gr.Tab("Decode Tokens"):
         with gr.Row():
             decode_input = gr.File(label="Token File (.oterin)", file_types=[".oterin"])
             decode_output = gr.Audio(label="Decoded Audio")
         decode_status = gr.Textbox(label="Status")
         decode_btn = gr.Button("Decode")
         decode_btn.click(decode_tokens, inputs=decode_input, outputs=[decode_output, decode_status])
+    with gr.Tab("Both (Encode & Decode)"):
+        with gr.Row():
+            both_input = gr.Audio(type="filepath", label="Input Audio")
+            both_output = gr.Audio(label="Reconstructed Audio")
+        both_status = gr.Textbox(label="Status")
+        both_btn = gr.Button("Process")
+        both_btn.click(process_both, inputs=both_input, outputs=[both_output, both_status])
 if __name__ == "__main__":
     demo.launch(share=True)