Spaces:

kotoba-tech
/

kotoba-speech

Running on T4

File size: 3,953 Bytes

565faca

import os
import pathlib
import uuid
from abc import ABC, abstractmethod
from typing import Callable, Optional, Union

import julius
import torch
from audiocraft.data.audio import audio_read, audio_write
from audiocraft.models import MultiBandDiffusion  # type: ignore

from IPython import embed

class Decoder(ABC):
    """Abstract interface for objects that decode a sequence of tokens."""

    @abstractmethod
    def decode(
        self,
        tokens: list[int],
        ref_audio_path: Optional[str] = None,
        causal: Optional[bool] = None,
    ):
        """Decode ``tokens``; every concrete subclass must override this.

        Args:
            tokens: Token ids to decode.
            ref_audio_path: Optional path to a reference audio file.
            causal: Optional flag selecting the decoding mode; its meaning is
                defined by the concrete subclass.

        Raises:
            NotImplementedError: Always, when the base implementation is called.

        NOTE: ``EncodecDecoder.decode`` in this file declares ``causal`` *before*
        ``ref_audio_path``, so pass both arguments by keyword to stay safe.
        """
        raise NotImplementedError()


class EncodecDecoder(Decoder):
    """Decoder that converts audio token ids to a waveform with MultiBandDiffusion.

    The instance loads the 24 kHz MultiBandDiffusion model once in ``__init__``
    and writes synthesised audio files into ``output_dir``.

    NOTE(review): tensors are moved to the hard-coded ``"cuda"`` device, so a
    CUDA device is required by ``get_tokens`` and ``decode``.
    """

    def __init__(
        self,
        tokeniser_decode_fn: Callable[[list[int]], str],
        data_adapter_fn: Callable[[list[list[int]]], tuple[list[int], list[list[int]]]],
        output_dir: str,
    ):
        """Load the diffusion model and prepare the output directory.

        Args:
            tokeniser_decode_fn: Maps text token ids back to a string.
            data_adapter_fn: Splits the raw model output into
                ``(text_ids, audio_ids)``.
            output_dir: Directory where audio files are written; it is resolved
                to an absolute path and created if missing.
        """
        self._mbd_bandwidth = 6  # 1.5 (presumably an alternative bandwidth that was tried)
        self._mbd_sample_rate = 24_000
        self._end_of_audio_token = 1024
        self._num_codebooks = 8
        self.mbd = MultiBandDiffusion.get_mbd_24khz(bw=self._mbd_bandwidth)

        self.tokeniser_decode_fn = tokeniser_decode_fn
        self._data_adapter_fn = data_adapter_fn

        self.output_dir = pathlib.Path(output_dir).resolve()
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _save_audio(self, name: Union[str, pathlib.Path], wav: torch.Tensor):
        """Write ``wav`` under the stem ``name`` at the model sample rate.

        The leading dimension of ``wav`` is squeezed away and the tensor is moved
        to the CPU before it is handed to ``audio_write`` with loudness
        normalisation and the loudness compressor enabled.
        """
        audio_write(
            name,
            wav.squeeze(0).cpu(),
            self._mbd_sample_rate,
            strategy="loudness",
            loudness_compressor=True,
        )

    @staticmethod
    def _filename_fragment(text: str, max_chars: int = 25) -> str:
        """Return at most ``max_chars`` characters of ``text`` usable in one file name.

        Spaces, path separators and non-printable characters (newlines, control
        characters, ...) become ``_`` so that decoded text can neither create
        nested directories nor escape the output directory. All other
        characters are kept as they are.
        """
        separators = {"/", "\\"}
        return "".join(
            "_" if (ch == " " or ch in separators or not ch.isprintable()) else ch
            for ch in text[:max_chars]
        )

    def get_tokens(self, audio_path: str) -> list[list[int]]:
        """
        Utility method to get tokens from audio. Useful when you want to test reconstruction in some form (e.g.
        limited codebook reconstruction or sampling from second stage model only).

        The audio is resampled to the model sample rate when necessary, encoded
        with the codec model of ``self.mbd`` on the ``"cuda"`` device, and the
        ``[0][0]`` element of the encoder output is returned as nested lists.
        """
        wav, sr = audio_read(audio_path)
        if sr != self._mbd_sample_rate:
            wav = julius.resample_frac(wav, sr, self._mbd_sample_rate)
        if wav.ndim == 2:
            # Insert an extra axis at position 1 before encoding
            # (presumably (channels, time) -> (batch, 1, time); TODO confirm).
            wav = wav.unsqueeze(1)
        wav = wav.to("cuda")
        encoded = self.mbd.codec_model.encode(wav)
        return encoded[0][0].tolist()

    def decode(
        self, tokens: list[list[int]], causal: bool = True, ref_audio_path: Optional[str] = None
    ) -> Union[str, pathlib.Path, torch.Tensor]:
        """Decode model output into audio codes or into a saved audio file.

        TODO: this has strange behaviour -- if ``causal`` is True it returns the
        token tensor; if ``causal`` is False it SAVES the audio file and returns
        the path stem.

        NOTE: the parameter order differs from ``Decoder.decode`` (which puts
        ``ref_audio_path`` before ``causal``); prefer keyword arguments.

        Args:
            tokens: Raw model output, passed to the data adapter to split it into
                text ids and audio ids.
            causal: When True, return the (padded) audio code tensor without
                synthesising audio.
            ref_audio_path: Currently unused by this implementation.

        Returns:
            The audio code tensor when ``causal`` is True; otherwise the
            ``pathlib.Path`` stem (without the ``.wav`` suffix) of the saved file.

        Raises:
            RuntimeError: If the synthesised waveform has fewer than 9600 samples.
        """
        text_ids, extracted_audio_ids = self._data_adapter_fn(tokens)
        text = self.tokeniser_decode_fn(text_ids)
        print(f"Text: {text}")

        codes = torch.tensor(extracted_audio_ids, device="cuda").unsqueeze(0)

        # Pad missing codebooks (dim 1) with zero-valued codes so the diffusion
        # model always receives ``self._num_codebooks`` of them.
        missing = self._num_codebooks - codes.shape[1]
        if missing > 0:
            filler = torch.zeros_like(codes[0:1, 0:1])
            codes = torch.cat([codes, *([filler] * missing)], dim=1)

        if causal:
            return codes

        with torch.amp.autocast(device_type="cuda", dtype=torch.float32):
            wav = self.mbd.tokens_to_wav(codes)
        # NOTE: we couldn't just return wav here as it goes through loudness compression etc :)

        if wav.shape[-1] < 9600:
            # 9600 samples == 400 ms at 24 kHz. Shorter outputs cause problems when
            # saving, and are also odd: first happened for tokens (1, 8, 28)
            # -> wav (1, 1, 8960) (~320x factor in time dimension!)
            raise RuntimeError("wav predicted is shorter than 400ms!")

        wav_file_name = self.output_dir / f"synth_{self._filename_fragment(text)}_{uuid.uuid4()}"
        try:
            self._save_audio(wav_file_name, wav)
        except Exception as e:
            # Best effort: retry once with a name that does not depend on the text.
            print(f"Failed to save audio! Reason: {e}")
            wav_file_name = self.output_dir / f"synth_{uuid.uuid4()}"
            self._save_audio(wav_file_name, wav)
        print(f"\nSaved audio to {wav_file_name}.wav")
        return wav_file_name