Spaces:

kotoba-tech
/

kotoba-speech

Running on T4

File size: 2,946 Bytes

565faca

import os
from abc import ABC
from typing import Literal, Optional

from df.enhance import enhance, init_df, load_audio, save_audio
from pydub import AudioSegment


def convert_to_wav(input_file: str, output_file: str):
    """Convert an audio file to WAV format

    Args:
        input_file (str): path to input audio file
        output_file (str): path to output WAV file

    """
    # Detect the format of the input file
    format = input_file.split(".")[-1].lower()

    # Read the audio file
    audio = AudioSegment.from_file(input_file, format=format)

    # Export as WAV
    audio.export(output_file, format="wav")


def make_output_file_path(audio_file: str, tag: str, ext: Optional[str] = None) -> str:
    """Generate the output file path

    Args:
        audio_file (str): path to input audio file
        tag (str): tag to append to the output file name
        ext (str, optional): extension of the output file. Defaults to None.

    Returns:
        str: path to output file
    """

    directory = "./enhanced"
    # Get the name of the input file
    filename = os.path.basename(audio_file)

    # Get the name of the input file without the extension
    filename_without_extension = os.path.splitext(filename)[0]

    # Get the extension of the input file
    extension = ext or os.path.splitext(filename)[1]

    # Generate the output file path
    output_file = os.path.join(directory, filename_without_extension + tag + extension)

    return output_file


class BaseEnhancer(ABC):
    """Base class for audio enhancers"""

    def __init__(self, *args, **kwargs):
        raise NotImplementedError

    def __call__(self, audio_file: str, output_file: Optional[str] = None) -> str:
        raise NotImplementedError

    def get_output_file(self, audio_file: str, tag: str, ext: Optional[str] = None) -> str:
        output_file = make_output_file_path(audio_file, tag, ext=ext)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        return output_file


class DFEnhancer(BaseEnhancer):
    def __init__(self, *args, **kwargs):
        self.model, self.df_state, _ = init_df()

    def __call__(self, audio_file: str, output_file: Optional[str] = None) -> str:
        output_file = output_file or self.get_output_file(audio_file, "_df")

        audio, _ = load_audio(audio_file, sr=self.df_state.sr())

        enhanced = enhance(self.model, self.df_state, audio)

        save_audio(output_file, enhanced, self.df_state.sr())

        return output_file


def get_enhancer(enhancer_name: Literal["df"]) -> BaseEnhancer:
    """Get an audio enhancer

    Args:
        enhancer_name (Literal["df"]): name of the audio enhancer

    Raises:
        ValueError: if the enhancer name is not recognised

    Returns:
        BaseEnhancer: audio enhancer
    """

    if enhancer_name == "df":
        return DFEnhancer()
    else:
        raise ValueError(f"Unknown enhancer name: {enhancer_name}")