File size: 6,227 Bytes
fc8b181
f18f98b
bd40662
a32055a
fc8b181
 
 
5c44be0
e173c02
 
544ae95
 
bd40662
fc8b181
 
 
 
 
 
e173c02
fc8b181
544ae95
4ad7b57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc8b181
 
544ae95
fc8b181
544ae95
 
 
85dc4b0
4ad7b57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc8b181
e173c02
fc8b181
e173c02
4ad7b57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc8b181
 
 
 
 
 
 
 
 
85dc4b0
fc8b181
 
 
 
 
 
85dc4b0
fc8b181
 
 
 
 
 
 
 
 
 
 
 
 
 
e173c02
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import os
import gradio as gr
import numpy as np
import soundfile as sf
from semanticodec import SemantiCodec
from huggingface_hub import HfApi
import spaces
import torch
import tempfile
import io
import uuid
from pathlib import Path

# Initialize the model
def load_model(token_rate=100, semantic_vocab_size=32768):
    """Build and return a SemantiCodec instance.

    Args:
        token_rate: Number of semantic tokens per second of audio.
        semantic_vocab_size: Size of the semantic codebook.
            The defaults (100, 32768) correspond to ~1.40 kbps.

    Returns:
        A SemantiCodec model ready for encode()/decode().
    """
    return SemantiCodec(token_rate=token_rate, semantic_vocab_size=semantic_vocab_size)

# Module-level singleton shared by all Gradio handlers below.
semanticodec = load_model()

@spaces.GPU(duration=20)
def encode_audio(audio_path):
    """Encode an audio file to semantic tokens and save them to a .oterin file.

    Args:
        audio_path: Filesystem path to the input audio file.

    Returns:
        Tuple of (token_file_path, status_message). token_file_path is None
        on failure, with the error described in status_message.
    """
    try:
        tokens = semanticodec.encode(audio_path)
        # Move tokens to CPU before converting to numpy
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy()

        # Ensure tokens match the [batch, seq_len, features] layout that
        # decode_tokens expects when it reloads this file.
        if tokens.ndim == 1:
            tokens = tokens.reshape(1, -1, 1)

        # Create a temporary file in /tmp which is writable in Spaces
        temp_dir = "/tmp"
        os.makedirs(temp_dir, exist_ok=True)
        temp_file_path = os.path.join(temp_dir, f"tokens_{uuid.uuid4()}.oterin")

        # BUG FIX: np.save(path, ...) silently appends ".npy" when the path
        # does not already end in ".npy", so the data was written to
        # "<path>.oterin.npy" while the existence check and the returned path
        # pointed at "<path>.oterin". Saving through an open file object
        # preserves the exact filename.
        with open(temp_file_path, "wb") as f:
            np.save(f, tokens)

        # Verify the file exists and has content
        if not os.path.exists(temp_file_path) or os.path.getsize(temp_file_path) == 0:
            raise Exception("Failed to create token file")

        return temp_file_path, f"Encoded to {tokens.shape[1]} tokens"
    except Exception as e:
        return None, f"Error encoding audio: {str(e)}"

@spaces.GPU(duration=60)
def decode_tokens(token_file):
    """Decode a saved .oterin token file back into audio.

    Args:
        token_file: Path to a token file produced by encode_audio.

    Returns:
        Tuple of ((sample_rate, waveform), status_message) on success,
        or (None, error_message) on failure.
    """
    # Ensure the file exists before attempting to load it
    if not token_file or not os.path.exists(token_file):
        return None, "Error: Empty or missing token file"

    try:
        # Load tokens from file
        tokens = np.load(token_file, allow_pickle=True)

        # Convert to torch tensor with proper dimensions
        if isinstance(tokens, np.ndarray):
            # Reshape to the expected [batch, seq_len, features] format
            if tokens.ndim == 1:
                tokens = tokens.reshape(1, -1, 1)
            tokens = torch.tensor(tokens)

        # Ensure tokens are on the right device
        if torch.cuda.is_available():
            tokens = tokens.cuda()

        # Decode the tokens
        waveform = semanticodec.decode(tokens)

        # Move waveform to CPU if it's a tensor
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.cpu().numpy()

        # assumes waveform is [batch, channels, samples] — TODO confirm
        # against SemantiCodec.decode
        audio = waveform[0, 0]
        if audio.size == 0:
            return None, "Error: Failed to generate audio"

        # BUG FIX: gr.Audio cannot render an io.BytesIO; its supported output
        # formats are a filepath or a (sample_rate, np.ndarray) tuple. Return
        # the tuple directly instead of an in-memory WAV buffer.
        return (32000, audio), f"Decoded {tokens.shape[1]} tokens to audio"
    except Exception as e:
        return None, f"Error decoding tokens: {str(e)}"

@spaces.GPU(duration=80)
def process_both(audio_path):
    """Encode and then decode the audio without saving intermediate files.

    Args:
        audio_path: Filesystem path to the input audio file.

    Returns:
        Tuple of ((sample_rate, waveform), status_message) on success,
        or (None, error_message) on failure.
    """
    try:
        # Encode
        tokens = semanticodec.encode(audio_path)
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy()

        # Reshape to the expected [batch, seq_len, features] format
        if tokens.ndim == 1:
            tokens = tokens.reshape(1, -1, 1)

        # Convert back to tensor for decoding
        tokens_tensor = torch.tensor(tokens)

        # Ensure tokens are on the right device
        if torch.cuda.is_available():
            tokens_tensor = tokens_tensor.cuda()

        # Decode
        waveform = semanticodec.decode(tokens_tensor)
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.cpu().numpy()

        # assumes waveform is [batch, channels, samples] — TODO confirm
        # against SemantiCodec.decode
        audio = waveform[0, 0]
        if audio.size == 0:
            return None, "Error: Failed to generate audio"

        # BUG FIX: gr.Audio cannot render an io.BytesIO; its supported output
        # formats are a filepath or a (sample_rate, np.ndarray) tuple. Return
        # the tuple directly, consistent with decode_tokens.
        return (32000, audio), f"Encoded to {tokens.shape[1]} tokens\nDecoded {tokens.shape[1]} tokens to audio"
    except Exception as e:
        return None, f"Error processing audio: {str(e)}"

# Build the Gradio interface: three tabs sharing the module-level codec.
with gr.Blocks(title="Oterin Audio Codec") as demo:
    gr.Markdown("# Oterin Audio Codec")
    gr.Markdown("Upload an audio file to encode it to semantic tokens, decode tokens back to audio, or do both.")

    # Tab 1: audio file in, token file out.
    with gr.Tab("Encode Audio"):
        with gr.Row():
            enc_audio_in = gr.Audio(type="filepath", label="Input Audio")
            enc_token_out = gr.File(label="Encoded Tokens (.oterin)", file_types=[".oterin"])
        enc_status = gr.Textbox(label="Status")
        gr.Button("Encode").click(
            encode_audio, inputs=enc_audio_in, outputs=[enc_token_out, enc_status]
        )

    # Tab 2: token file in, reconstructed audio out.
    with gr.Tab("Decode Tokens"):
        with gr.Row():
            dec_token_in = gr.File(label="Token File (.oterin)", file_types=[".oterin"])
            dec_audio_out = gr.Audio(label="Decoded Audio")
        dec_status = gr.Textbox(label="Status")
        gr.Button("Decode").click(
            decode_tokens, inputs=dec_token_in, outputs=[dec_audio_out, dec_status]
        )

    # Tab 3: round-trip encode + decode in one step.
    with gr.Tab("Both (Encode & Decode)"):
        with gr.Row():
            roundtrip_in = gr.Audio(type="filepath", label="Input Audio")
            roundtrip_out = gr.Audio(label="Reconstructed Audio")
        roundtrip_status = gr.Textbox(label="Status")
        gr.Button("Process").click(
            process_both, inputs=roundtrip_in, outputs=[roundtrip_out, roundtrip_status]
        )

if __name__ == "__main__":
    demo.launch(share=True)