File size: 6,227 Bytes
fc8b181 f18f98b bd40662 a32055a fc8b181 5c44be0 e173c02 544ae95 bd40662 fc8b181 e173c02 fc8b181 544ae95 4ad7b57 fc8b181 544ae95 fc8b181 544ae95 85dc4b0 4ad7b57 fc8b181 e173c02 fc8b181 e173c02 4ad7b57 fc8b181 85dc4b0 fc8b181 85dc4b0 fc8b181 e173c02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import os
import gradio as gr
import numpy as np
import soundfile as sf
from semanticodec import SemantiCodec
from huggingface_hub import HfApi
import spaces
import torch
import tempfile
import io
import uuid
from pathlib import Path
# Initialize the model
def load_model(token_rate=100, semantic_vocab_size=32768):
    """Build the SemantiCodec model used by this app.

    Args:
        token_rate: Value forwarded to ``SemantiCodec`` as ``token_rate``.
        semantic_vocab_size: Value forwarded to ``SemantiCodec`` as
            ``semantic_vocab_size``.

    Returns:
        A newly constructed ``SemantiCodec`` instance. The defaults
        (100 / 32768) are the configuration this app has always used,
        annotated in the original code as 1.40 kbps.
    """
    return SemantiCodec(
        token_rate=token_rate,
        semantic_vocab_size=semantic_vocab_size,
    )
# Module-level codec instance, created once at import time and used by the
# encode/decode handlers defined in this file.
semanticodec = load_model()
@spaces.GPU(duration=20)
def encode_audio(audio_path):
    """Encode an audio file to tokens and write them to a ``.oterin`` file.

    Args:
        audio_path: Path of the audio file to encode.

    Returns:
        A ``(token_file_path, status_message)`` tuple. When encoding fails,
        the path is ``None`` and the message describes the error.
    """
    if not audio_path:
        return None, "Error encoding audio: no audio file provided"

    try:
        tokens = semanticodec.encode(audio_path)

        # Move tokens to CPU (and drop any autograd state) before converting
        # to numpy.
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.detach().cpu().numpy()

        # Ensure tokens are in the right shape for later decoding
        if tokens.ndim == 1:
            # Reshape to match expected format [batch, seq_len, features]
            tokens = tokens.reshape(1, -1, 1)

        # Create the output file in /tmp, which is writable in Spaces
        temp_dir = "/tmp"
        os.makedirs(temp_dir, exist_ok=True)
        temp_file_path = os.path.join(temp_dir, f"tokens_{uuid.uuid4()}.oterin")

        # Save through an open file handle: when np.save is given a *path*
        # that does not end in ".npy" it silently appends that suffix, so the
        # data would land in "<name>.oterin.npy" and the path returned below
        # would not exist.
        with open(temp_file_path, "wb") as token_fh:
            # allow_pickle=False guarantees the file can be loaded back
            # without enabling pickle.
            np.save(token_fh, tokens, allow_pickle=False)

        # Verify the file exists and has content
        if not os.path.exists(temp_file_path) or os.path.getsize(temp_file_path) == 0:
            raise Exception("Failed to create token file")

        return temp_file_path, f"Encoded to {tokens.shape[1]} tokens"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error encoding audio: {str(e)}"
@spaces.GPU(duration=60)
def decode_tokens(token_file):
    """Decode a saved token file back to audio.

    Args:
        token_file: Path of a token file in numpy ``.npy`` format.

    Returns:
        A ``(wav_file_path, status_message)`` tuple. When decoding fails,
        the path is ``None`` and the message describes the error.
    """
    # Ensure the file exists and has content
    if not token_file or not os.path.exists(token_file):
        return None, "Error: Empty or missing token file"

    try:
        # The token file is user-supplied, so pickle support stays disabled:
        # loading a pickled payload could execute arbitrary code.
        tokens = np.load(token_file, allow_pickle=False)

        # Convert to torch tensor with proper dimensions
        if isinstance(tokens, np.ndarray):
            if tokens.ndim == 1:
                # Reshape to match expected format [batch, seq_len, features]
                tokens = tokens.reshape(1, -1, 1)
            tokens = torch.tensor(tokens)

        # Ensure tokens are on the right device
        if torch.cuda.is_available():
            tokens = tokens.cuda()

        # Decode the tokens
        waveform = semanticodec.decode(tokens)

        # Move waveform to CPU if it's a tensor
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.detach().cpu().numpy()

        # Write the audio to a real file and return its path: the Gradio
        # Audio output accepts a file path, bytes, or a (sample_rate, data)
        # tuple, but not an in-memory file object such as io.BytesIO.
        output_dir = "/tmp"
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"decoded_{uuid.uuid4()}.wav")

        # NOTE(review): the sample rate is hard-coded to 32000 Hz - confirm
        # that it matches the codec's actual output rate.
        sf.write(output_path, waveform[0, 0], 32000, format='WAV')

        # Verify the file has content
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return None, "Error: Failed to generate audio"

        return output_path, f"Decoded {tokens.shape[1]} tokens to audio"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error decoding tokens: {str(e)}"
@spaces.GPU(duration=80)
def process_both(audio_path):
    """Encode and then decode the audio without saving an intermediate token file.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        A ``(wav_file_path, status_message)`` tuple. When processing fails,
        the path is ``None`` and the message describes the error.
    """
    if not audio_path:
        return None, "Error processing audio: no audio file provided"

    try:
        # Encode
        tokens = semanticodec.encode(audio_path)
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.detach().cpu().numpy()

        # Ensure tokens are in the right shape for decoding
        if tokens.ndim == 1:
            # Reshape to match expected format [batch, seq_len, features]
            tokens = tokens.reshape(1, -1, 1)

        # Convert back to tensor for decoding
        tokens_tensor = torch.tensor(tokens)

        # Ensure tokens are on the right device
        if torch.cuda.is_available():
            tokens_tensor = tokens_tensor.cuda()

        # Decode
        waveform = semanticodec.decode(tokens_tensor)
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.detach().cpu().numpy()

        # Write the result to a real file and return its path: the Gradio
        # Audio output accepts a file path, bytes, or a (sample_rate, data)
        # tuple, but not an in-memory file object such as io.BytesIO.
        output_dir = "/tmp"
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"reconstructed_{uuid.uuid4()}.wav")

        # NOTE(review): the sample rate is hard-coded to 32000 Hz - confirm
        # that it matches the codec's actual output rate.
        sf.write(output_path, waveform[0, 0], 32000, format='WAV')

        # Verify the file has content
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return None, "Error: Failed to generate audio"

        return output_path, f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error processing audio: {str(e)}"
# Build the Gradio UI: one tab per operation, each wired to its handler.
# NOTE(review): the nesting of the status box and button (inside the tab,
# below the row) is reconstructed - confirm against the deployed layout.
with gr.Blocks(title="Oterin Audio Codec") as demo:
    gr.Markdown("# Oterin Audio Codec")
    gr.Markdown("Upload an audio file to encode it to semantic tokens, decode tokens back to audio, or do both.")

    # Tab 1: audio file in, token file out.
    with gr.Tab("Encode Audio"):
        with gr.Row():
            encode_input = gr.Audio(type="filepath", label="Input Audio")
            encode_output = gr.File(label="Encoded Tokens (.oterin)", file_types=[".oterin"])
        encode_status = gr.Textbox(label="Status")
        encode_btn = gr.Button("Encode")
        encode_btn.click(
            fn=encode_audio,
            inputs=encode_input,
            outputs=[encode_output, encode_status],
        )

    # Tab 2: token file in, audio out.
    with gr.Tab("Decode Tokens"):
        with gr.Row():
            decode_input = gr.File(label="Token File (.oterin)", file_types=[".oterin"])
            decode_output = gr.Audio(label="Decoded Audio")
        decode_status = gr.Textbox(label="Status")
        decode_btn = gr.Button("Decode")
        decode_btn.click(
            fn=decode_tokens,
            inputs=decode_input,
            outputs=[decode_output, decode_status],
        )

    # Tab 3: audio in, reconstructed audio out (encode followed by decode).
    with gr.Tab("Both (Encode & Decode)"):
        with gr.Row():
            both_input = gr.Audio(type="filepath", label="Input Audio")
            both_output = gr.Audio(label="Reconstructed Audio")
        both_status = gr.Textbox(label="Status")
        both_btn = gr.Button("Process")
        both_btn.click(
            fn=process_both,
            inputs=both_input,
            outputs=[both_output, both_status],
        )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(share=True)