Spaces:

djkesu
/

tortoise5c

Running

File size: 11,739 Bytes

import os
import shutil
from pathlib import Path

import streamlit as st
from random import randint

from tortoise.api import MODELS_DIR
from tortoise.inference import (
    infer_on_texts,
    run_and_save_tts,
    split_and_recombine_text,
)
from tortoise.api import TextToSpeech
from tortoise.utils.diffusion import SAMPLERS
from app_utils.filepicker import st_file_selector
from app_utils.conf import TortoiseConfig

from app_utils.funcs import (
    timeit,
    load_model,
    list_voices,
    load_voice_conditionings,
)

LATENT_MODES = [
    "Tortoise original (bad)",
    "average per 4.27s (broken on small files)",
    "average per voice file (broken on small files)",
]

def main():
    conf = TortoiseConfig()
    voice_samples, conditioning_latents = None, None
    with st.expander("Create New Voice", expanded=True):
        if "file_uploader_key" not in st.session_state:
            st.session_state["file_uploader_key"] = str(randint(1000, 100000000))
            st.session_state["text_input_key"] = str(randint(1000, 100000000))

        uploaded_files = st.file_uploader(
            "Upload Audio Samples for a New Voice",
            accept_multiple_files=True,
            type=["wav"],
            key=st.session_state["file_uploader_key"]
        )

        voice_name = st.text_input(
            "New Voice Name",
            help="Enter a name for your new voice.",
            value="",
            key=st.session_state["text_input_key"]
        )

        create_voice_button = st.button(
            "Create Voice",
            disabled = ((voice_name.strip() == "") | (len(uploaded_files) == 0))
        )
        if create_voice_button:
            st.write(st.session_state)
            with st.spinner(f"Creating new voice: {voice_name}"):
                new_voice_name = voice_name.strip().replace(" ", "_")

                voices_dir = f'./tortoise/voices/{new_voice_name}/'
                if os.path.exists(voices_dir):
                    shutil.rmtree(voices_dir)
                os.makedirs(voices_dir)
                voice_samples = []
                for index, uploaded_file in enumerate(uploaded_files):
                    bytes_data = uploaded_file.read()
                    with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
                        wav_file.write(bytes_data)
                    voice_samples.append(f"{voices_dir}voice_sample{index}.wav")

                # # Generate conditioning latents and samples here
                # voice_samples, conditioning_latents = generate_conditioning(voices_dir)

                # # Save the conditioning latents and samples
                # save_conditioning(voices_dir, voice_samples, conditioning_latents)

                conditioning_latents = TextToSpeech.get_conditioning_latents(new_voice_name, voice_samples=voice_samples)
                print(voice_samples, conditioning_latents)

                st.session_state["text_input_key"] = str(randint(1000, 100000000))
                st.session_state["file_uploader_key"] = str(randint(1000, 100000000))
                st.experimental_rerun()

    text = st.text_area(
        "Text",
        help="Text to speak.",
        value="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.",
    )

    voices = [v for v in os.listdir("tortoise/voices") if v != "cond_latent_example"]

    voice = st.selectbox(
        "Voice",
        voices,
        help="Selects the voice to use for generation. See options in voices/ directory (and add your own!) "
        "Use the & character to join two voices together. Use a comma to perform inference on multiple voices.",
        index=0,
    )
    preset = st.selectbox(
        "Preset",
        (
            "single_sample",
            "ultra_fast",
            "very_fast",
            "ultra_fast_old",
            "fast",
            "standard",
            "high_quality",
        ),
        help="Which voice preset to use.",
        index=1,
    )
    
    with st.expander("Advanced"):
        col1, col2 = st.columns(2)
        with col1:
            """#### Model parameters"""
            candidates = st.number_input(
                "Candidates",
                help="How many output candidates to produce per-voice.",
                value=1,
            )
            latent_averaging_mode = st.radio(
                "Latent averaging mode",
                LATENT_MODES,
                help="How voice samples should be averaged together.",
                index=0,
            )
            sampler = st.radio(
                "Sampler",
                ["dpm++2m", "p", "ddim"],
                help="Diffusion sampler. Note that dpm++2m is experimental and typically requires more steps.",
                index=1,
            )
            steps = st.number_input(
                "Steps",
                help="Override the steps used for diffusion (default depends on preset)",
                value=10,
            )
            seed = st.number_input(
                "Seed",
                help="Random seed which can be used to reproduce results.",
                value=-1,
            )
            if seed == -1:
                seed = None
            voice_fixer = st.checkbox(
                "Voice fixer",
                help="Use `voicefixer` to improve audio quality. This is a post-processing step which can be applied to any output.",
                value=True,
            )
            """#### Directories"""
            output_path = st.text_input(
                "Output Path", help="Where to store outputs.", value="results/"
            )

        with col2:
            """#### Optimizations"""
            high_vram = not st.checkbox(
                "Low VRAM",
                help="Re-enable default offloading behaviour of tortoise",
                value=True,
            )
            half = st.checkbox(
                "Half-Precision",
                help="Enable autocast to half precision for autoregressive model",
                value=False,
            )
            kv_cache = st.checkbox(
                "Key-Value Cache",
                help="Enable kv_cache usage, leading to drastic speedups but worse memory usage",
                value=True,
            )
            cond_free = st.checkbox(
                "Conditioning Free",
                help="Force conditioning free diffusion",
                value=True,
            )
            no_cond_free = st.checkbox(
                "Force Not Conditioning Free",
                help="Force disable conditioning free diffusion",
                value=False,
            )

            """#### Text Splitting"""
            min_chars_to_split = st.number_input(
                "Min Chars to Split",
                help="Minimum number of characters to split text on",
                min_value=50,
                value=200,
                step=1,
            )

            """#### Debug"""
            produce_debug_state = st.checkbox(
                "Produce Debug State",
                help="Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.",
                value=True,
            )

    ar_checkpoint = "."
    diff_checkpoint = "." 
    if st.button("Update Basic Settings"):
        conf.update(
            EXTRA_VOICES_DIR=extra_voices_dir,
            LOW_VRAM=not high_vram,
            AR_CHECKPOINT=ar_checkpoint,
            DIFF_CHECKPOINT=diff_checkpoint,
        )

    ar_checkpoint = None
    diff_checkpoint = None
    tts = load_model(MODELS_DIR, high_vram, kv_cache, ar_checkpoint, diff_checkpoint)

    if st.button("Start"):
        assert latent_averaging_mode
        assert preset
        assert voice

        def show_generation(fp, filename: str):
            """
            audio_buffer = BytesIO()
            save_gen_with_voicefix(g, audio_buffer, squeeze=False)
            torchaudio.save(audio_buffer, g, 24000, format='wav')
            """
            st.audio(str(fp), format="audio/wav")
            st.download_button(
                "Download sample",
                str(fp),
                file_name=filename,  # this doesn't actually seem to work lol
            )

        with st.spinner(
            f"Generating {candidates} candidates for voice {voice} (seed={seed}). You can see progress in the terminal"
        ):
            os.makedirs(output_path, exist_ok=True)

            selected_voices = voice.split(",")
            for k, selected_voice in enumerate(selected_voices):
                if "&" in selected_voice:
                    voice_sel = selected_voice.split("&")
                else:
                    voice_sel = [selected_voice]
                voice_samples, conditioning_latents = load_voice_conditionings(
                    voice_sel, []
                )

                voice_path = Path(os.path.join(output_path, selected_voice))

                with timeit(
                    f"Generating {candidates} candidates for voice {selected_voice} (seed={seed})"
                ):
                    nullable_kwargs = {
                        k: v
                        for k, v in zip(
                            ["sampler", "diffusion_iterations", "cond_free"],
                            [sampler, steps, cond_free],
                        )
                        if v is not None
                    }

                    def call_tts(text: str):
                        return tts.tts_with_preset(
                            text,
                            k=candidates,
                            voice_samples=voice_samples,
                            conditioning_latents=conditioning_latents,
                            preset=preset,
                            use_deterministic_seed=seed,
                            return_deterministic_state=True,
                            cvvp_amount=0.0,
                            half=half,
                            latent_averaging_mode=LATENT_MODES.index(
                                latent_averaging_mode
                            ),
                            **nullable_kwargs,
                        )

                    if len(text) < min_chars_to_split:
                        filepaths = run_and_save_tts(
                            call_tts,
                            text,
                            voice_path,
                            return_deterministic_state=True,
                            return_filepaths=True,
                            voicefixer=voice_fixer,
                        )
                        for i, fp in enumerate(filepaths):
                            show_generation(fp, f"{selected_voice}-text-{i}.wav")
                    else:
                        desired_length = int(min_chars_to_split)
                        texts = split_and_recombine_text(
                            text, desired_length, desired_length + 100
                        )
                        filepaths = infer_on_texts(
                            call_tts,
                            texts,
                            voice_path,
                            return_deterministic_state=True,
                            return_filepaths=True,
                            lines_to_regen=set(range(len(texts))),
                            voicefixer=voice_fixer,
                        )
                        for i, fp in enumerate(filepaths):
                            show_generation(fp, f"{selected_voice}-text-{i}.wav")
        if produce_debug_state:
            """Debug states can be found in the output directory"""


if __name__ == "__main__":
    main()