# TTS-Spaces-Arena

import os
from gradio_client import handle_file

# Models shown on the leaderboard. List only models that users can vote on.
# Disabled entries stay commented out (where noted, with the reason).
AVAILABLE_MODELS = {
    # 'XTTSv2': 'xtts',
    # 'WhisperSpeech': 'whisperspeech',
    # 'ElevenLabs': 'eleven',
    # 'OpenVoice': 'openvoice',
    # 'OpenVoice V2': 'openvoicev2',
    # 'Play.HT 2.0': 'playht',
    # 'Play.HT 3.0 Mini': 'playht3',
    # 'MetaVoice': 'metavoice',
    # 'MeloTTS': 'melo',
    # 'StyleTTS 2': 'styletts2',
    # 'GPT-SoVITS': 'sovits',
    # 'Vokan TTS': 'vokan',
    # 'VoiceCraft 2.0': 'voicecraft',
    # 'Parler TTS': 'parler',
    # 'Parler TTS Large': 'parlerlarge',
    # 'Fish Speech v1.4': 'fish',

    # HF Gradio Spaces. Trailing comments: Gradio version(s) the Space works with.
    # Gradio version that works with most Spaces: 4.29
    'coqui/xtts': 'coqui/xtts',  # 4.29 4.32
    # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech',  # 4.32 4.36.1
    # 'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice',  # same devs as MeloTTS, which scores higher # 4.29
    # 'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2',  # same devs as MeloTTS, which scores higher # 4.29
    # 'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1',  # 4.29 4.32
    'Pendrokar/xVASynth-TTS': 'Pendrokar/xVASynth-TTS',  # 4.29 4.32 4.42.0
    # 'coqui/CoquiTTS': 'coqui/CoquiTTS',
    'mrfakename/MeloTTS': 'mrfakename/MeloTTS',  # 4.29 4.32
    'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1',  # 4.29 4.32 4.36.1

    # E2 & F5 TTS
    # F5 model
    'mrfakename/E2-F5-TTS': 'mrfakename/E2-F5-TTS',  # 5.0

    # Parler
    # Parler Large model (disabled; same Space id as the Mini entry below)
    # 'parler-tts/parler_tts': 'parler-tts/parler_tts',  # 4.29 4.32 4.36.1 4.42.0
    # Parler Mini model
    'parler-tts/parler_tts': 'parler-tts/parler_tts',  # 4.29 4.32 4.36.1 4.42.0
    # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini',  # Mini is the default model of parler_tts
    # 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso',  # 4.29 4.32 4.36.1 4.42.0

    # Microsoft Edge TTS
    # 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech',  # 4.29

    # IMS-Toucan
    # 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS',  # 5.1
    # StyleTTS v2
    # 'Pendrokar/style-tts-2': 'Pendrokar/style-tts-2',  # more votes in OG arena; emotionless
    # StyleTTS kokoro
    'hexgrad/kokoro': 'hexgrad/kokoro',

    # MaskGCT (by Amphion)
    # DEMANDS 300 seconds of ZeroGPU
    # 'amphion/maskgct': 'amphion/maskgct',
    # default ZeroGPU borrow time
    'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab',

    # HF TTS Spaces with issues
    # 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS',  # unresponsive to exclamation marks # 4.29
    # 'PolyAI/pheme': '/predict#0',  # sleepy HF Space
    # 'amphion/Text-to-Speech': '/predict#0',  # disabled also on original HF space due to poor ratings
    # 'suno/bark': '3#0',  # Hallucinates
    # 'shivammehta25/Matcha-TTS': '5#0',  # seems to require multiple requests for setup
    # 'styletts2/styletts2': '0#0',  # API disabled, awaiting approval of PR #15
    # 'Manmay/tortoise-tts': '/predict#0',  # Cannot retrieve streamed file; 403
    # 'pytorch/Tacotron2': '0#0',  # old gradio
}

# Per-Space settings, keyed by Hugging Face Space id.
#   'name'               - display name; make_link_to_space() uses it as the link text
#   'function'           - presumably the Gradio API endpoint name or fn index (verify against caller)
#   'text_param_index'   - presumably the position of the text input among the endpoint inputs
#   'return_audio_index' - presumably the position of the audio among the endpoint outputs
#   'is_zero_gpu_space'  - optional flag; presumably marks Spaces running on ZeroGPU
#   'is_closed_source'   - optional flag; make_link_to_space() adds a lock marker when set
#   'series'             - model family label shared by related entries
HF_SPACES = {
    # XTTS v2
    'coqui/xtts': {
        'name': 'XTTS v2',
        'function': '1',
        'text_param_index': 0,
        'return_audio_index': 1,
        'series': 'XTTS',
    },
    # WhisperSpeech
    'collabora/WhisperSpeech': {
        'name': 'WhisperSpeech',
        'function': '/whisper_speech_demo',
        'text_param_index': 0,
        'return_audio_index': 0,
        'series': 'WhisperSpeech',
    },
    # OpenVoice (MyShell.ai)
    'myshell-ai/OpenVoice': {
        'name': 'OpenVoice',
        'function': '1',
        'text_param_index': 0,
        'return_audio_index': 1,
        'series': 'OpenVoice',
    },
    # OpenVoice v2 (MyShell.ai)
    'myshell-ai/OpenVoiceV2': {
        'name': 'OpenVoice v2',
        'function': '1',
        'text_param_index': 0,
        'return_audio_index': 1,
        'series': 'OpenVoice',
    },
    # MetaVoice
    'mrfakename/MetaVoice-1B-v0.1': {
        'name': 'MetaVoice-1B',
        'function': '/tts',
        'text_param_index': 0,
        'return_audio_index': 0,
        'series': 'MetaVoice-1B',
    },
    # xVASynth (CPU)
    'Pendrokar/xVASynth-TTS': {
        'name': 'xVASynth v3',
        'function': '/predict',
        'text_param_index': 0,
        'return_audio_index': 0,
        'series': 'xVASynth',
    },
    # CoquiTTS (CPU)
    'coqui/CoquiTTS': {
        'name': 'CoquiTTS',
        'function': '0',
        'text_param_index': 0,
        'return_audio_index': 0,
        'series': 'CoquiTTS',
    },
    # HierSpeech_TTS
    'LeeSangHoon/HierSpeech_TTS': {
        'name': 'HierSpeech++',
        'function': '/predict',
        'text_param_index': 0,
        'return_audio_index': 0,
        'series': 'HierSpeech++',
    },
    # MeloTTS (MyShell.ai)
    'mrfakename/MeloTTS': {
        'name': 'MeloTTS',
        'function': '/synthesize',
        'text_param_index': 0,
        'return_audio_index': 0,
        'series': 'MeloTTS',
    },

    # Parler Mini (the active entry for this Space id)
    'parler-tts/parler_tts': {
        'name': 'Parler Mini',
        'function': '/gen_tts',
        'text_param_index': 0,
        'return_audio_index': 0,
        'is_zero_gpu_space': True,
        'series': 'Parler',
    },
    # Parler Large (disabled; it shares the Space id with Parler Mini above)
    # 'parler-tts/parler_tts': {
    #     'name': 'Parler Large',
    #     'function': '/gen_tts',
    #     'text_param_index': 0,
    #     'return_audio_index': 0,
    #     'is_zero_gpu_space': True,
    #     'series': 'Parler',
    # },
    # Parler Mini trained with the Expresso dataset
    'parler-tts/parler-tts-expresso': {
        'name': 'Parler Mini Expresso',
        'function': '/gen_tts',
        'text_param_index': 0,
        'return_audio_index': 0,
        'is_zero_gpu_space': True,
        'series': 'Parler',
    },

    # Microsoft Edge TTS
    'innoai/Edge-TTS-Text-to-Speech': {
        'name': 'Edge TTS',
        'function': '/predict',
        'text_param_index': 0,
        'return_audio_index': 0,
        'is_closed_source': True,
        'series': 'Edge TTS',
    },

    # Fish Speech
    'fishaudio/fish-speech-1': {
        'name': 'Fish Speech',
        'function': '/inference_wrapper',
        'text_param_index': 0,
        'return_audio_index': 0,
        'series': 'Fish Speech',
    },

    # E2/F5 TTS
    'mrfakename/E2-F5-TTS': {
        'name': 'F5 TTS',
        'function': '/basic_tts',
        'text_param_index': 2,
        'return_audio_index': 0,
        'is_zero_gpu_space': True,
        'series': 'E2/F5 TTS',
    },

    # IMS-Toucan
    'Flux9665/MassivelyMultilingualTTS': {
        'name': 'IMS-Toucan',
        'function': '/predict',
        'text_param_index': 0,
        'return_audio_index': 0,
        'series': 'IMS-Toucan',
    },

    # IMS-Toucan English non-artificial
    'Flux9665/EnglishToucan': {
        'name': 'IMS-Toucan EN',
        'function': '/predict',
        'text_param_index': 0,
        'return_audio_index': 0,
        'series': 'IMS-Toucan',
    },

    # StyleTTS v2
    'Pendrokar/style-tts-2': {
        'name': 'StyleTTS v2',
        'function': '/synthesize',
        'text_param_index': 0,
        'return_audio_index': 0,
        'is_zero_gpu_space': True,
        'series': 'StyleTTS',
    },

    # StyleTTS v2 kokoro fine-tune
    'hexgrad/kokoro': {
        'name': 'StyleTTS Kokoro',
        'function': '/generate',
        'text_param_index': 0,
        'return_audio_index': 0,
        'is_zero_gpu_space': True,
        'series': 'StyleTTS',
    },

    # MaskGCT (by Amphion)
    'amphion/maskgct': {
        'name': 'MaskGCT',
        'function': '/predict',
        'text_param_index': 1,
        'return_audio_index': 0,
        'is_zero_gpu_space': True,
        'series': 'MaskGCT',
    },
    # MaskGCT hosted in a different Space
    'Svngoku/maskgct-audio-lab': {
        'name': 'MaskGCT',
        'function': '/predict',
        'text_param_index': 1,
        'return_audio_index': 0,
        'is_zero_gpu_space': True,
        'series': 'MaskGCT',
    },
}

# Reference voice for zero-shot TTS: the sample used by XTTS (11 seconds).
# Kept both as the raw URL string and wrapped with gradio_client.handle_file().
DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav'
DEFAULT_VOICE_SAMPLE = handle_file(DEFAULT_VOICE_SAMPLE_STR)
# Transcript that accompanies the voice sample above.
DEFAULT_VOICE_TRANSCRIPT = (
    "The Hispaniola was rolling scuppers under in the ocean swell. "
    "The booms were tearing at the blocks, the rudder was banging to and fro, "
    "and the whole ship creaking, groaning, and jumping like a manufactory."
)

# Fixed input values per Space, keyed by Space id and then by integer input
# position. The positions skip the one named by 'text_param_index' in
# HF_SPACES (presumably filled with the user's text by the caller - verify).
OVERRIDE_INPUTS = {
    'coqui/xtts': {
        1: 'en',
        2: DEFAULT_VOICE_SAMPLE_STR,  # voice sample
        3: None,  # mic voice sample
        4: False,  # use_mic
        5: False,  # cleanup_reference
        6: False,  # auto_detect
    },
    'collabora/WhisperSpeech': {
        1: DEFAULT_VOICE_SAMPLE,  # voice sample
        2: DEFAULT_VOICE_SAMPLE,  # voice sample URL
        3: 14.0,  # Tempo - Gradio Slider issue: takes min. rather than value
    },
    'myshell-ai/OpenVoice': {
        1: 'default',  # style
        2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3',  # voice sample
    },
    'myshell-ai/OpenVoiceV2': {
        1: 'en_us',  # style
        2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3',  # voice sample
    },
    'PolyAI/pheme': {
        1: 'YOU1000000044_S0000798',  # voice
        2: 210,
        3: 0.7,  # Tempo - Gradio Slider issue: takes min. rather than value
    },
    'Pendrokar/xVASynth-TTS': {
        1: 'x_ex04',  # fine-tuned voice model name
        3: 1.0,  # pacing/duration - Gradio Slider issue: takes min. rather than value
    },
    'suno/bark': {
        1: 'Speaker 3 (en)',  # voice
    },
    'amphion/Text-to-Speech': {
        1: 'LikeManyWaters',  # voice
    },
    'LeeSangHoon/HierSpeech_TTS': {
        1: handle_file('https://huggingface.co/spaces/LeeSangHoon/HierSpeech_TTS/resolve/main/example/female.wav'),  # voice sample
        2: 0.333,
        3: 0.333,
        4: 1,
        5: 1,
        6: 0,
        7: 1111,
    },
    'Manmay/tortoise-tts': {
        1: None,  # text-from-file
        2: 'angie',  # voice
        3: 'disabled',  # second voice for a dialogue
        4: 'No',  # split by newline
    },
    'mrfakename/MeloTTS': {
        1: 'EN-Default',  # speaker; DEFAULT_VOICE_SAMPLE=EN-Default
        2: 1,  # speed
        3: 'EN',  # language
    },
    'mrfakename/MetaVoice-1B-v0.1': {
        1: 5,  # float (numeric value between 0.0 and 10.0) in 'Speech Stability - improves text following for a challenging speaker' Slider component
        2: 5,  # float (numeric value between 1.0 and 5.0) in 'Speaker similarity - How closely to match speaker identity and speech style.' Slider component
        3: 'Preset voices',  # Literal['Preset voices', 'Upload target voice'] in 'Choose voice' Radio component
        4: 'Bria',  # Literal['Bria', 'Alex', 'Jacob'] in 'Preset voices' Dropdown component
        5: None,  # filepath in 'Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.' Audio component
    },
    'parler-tts/parler_tts': {
        1: 'Laura; Laura\'s female voice; very clear audio',  # description/prompt
    },
    'parler-tts/parler-tts-expresso': {
        1: 'Elisabeth; Elisabeth\'s female voice; very clear audio',  # description/prompt
    },
    'innoai/Edge-TTS-Text-to-Speech': {
        1: 'en-US-EmmaMultilingualNeural - en-US (Female)',  # voice
        2: 0,  # pace rate
        3: 0,  # pitch
    },

    'fishaudio/fish-speech-1': {
        1: False,  # normalize
        2: handle_file('https://huggingface.co/spaces/fishaudio/fish-speech-1/resolve/main/examples/English.wav'),  # reference_audio
        3: 'In the ancient land of Eldoria, where the skies were painted with shades of mystic hues and the forests whispered secrets of old, there existed a dragon named Zephyros. Unlike the fearsome tales of dragons that plagued human hearts with terror, Zephyros was a creature of wonder and wisdom, revered by all who knew of his existence.',  # reference_text
        4: 0,  # max_new_tokens
        5: 200,  # chunk_length
        6: 0.7,  # top_p
        7: 1.2,  # repetition_penalty
        8: 0.7,  # temperature
        9: 0,  # seed
        10: 'never',  # use_memory_cache
    },

    'mrfakename/E2-F5-TTS': {
        0: DEFAULT_VOICE_SAMPLE,  # voice sample
        1: DEFAULT_VOICE_TRANSCRIPT,  # transcript of sample (< 15 seconds required)
        3: False,  # cleanup silence
        4: 0.15,  # crossfade
        5: 32,  # nfe_slider
        6: 1,  # speed
    },

    # IMS-Toucan
    'Flux9665/MassivelyMultilingualTTS': {
        1: 'English (eng)',  # language
        2: 0.6,  # prosody_creativity
        3: 1,  # duration_scaling_factor
        4: 41,  # voice_seed
        5: -7.5,  # emb1
        6: None,  # reference_audio
    },

    # StyleTTS 2
    'Pendrokar/style-tts-2': {
        1: 'f-us-2',  # voice
        2: 'en-us',  # lang
        3: 8,  # lngsteps
    },

    # StyleTTS 2 kokoro
    'hexgrad/kokoro': {
        1: 'af',  # voice
        2: None,  # ps
        3: 1,  # speed
        4: 3000,  # trim
        5: False,  # use_gpu; fast enough multithreaded on CPU
        6: os.getenv('KOKORO'),  # sk (read from the KOKORO environment variable)
    },

    # MaskGCT (by Amphion)
    'amphion/maskgct': {
        0: DEFAULT_VOICE_SAMPLE,  # prompt_wav
        2: -1,  # target_len
        3: 25,  # n_timesteps
    },
    'Svngoku/maskgct-audio-lab': {
        0: DEFAULT_VOICE_SAMPLE,  # prompt_wav
        2: -1,  # target_len
        3: 25,  # n_timesteps
    },
}


# Maps short model ids to human-readable display names.
# May include models that users cannot vote on (i.e. ids that are absent from
# AVAILABLE_MODELS).
model_names = {
    'styletts2': 'StyleTTS 2',
    'tacotron': 'Tacotron',
    'tacotronph': 'Tacotron Phoneme',
    'tacotrondca': 'Tacotron DCA',
    'speedyspeech': 'Speedy Speech',
    'overflow': 'Overflow TTS',
    'vits': 'VITS',
    'vitsneon': 'VITS Neon',
    'neuralhmm': 'Neural HMM',
    'glow': 'Glow TTS',
    'fastpitch': 'FastPitch',
    'jenny': 'Jenny',
    'tortoise': 'Tortoise TTS',
    'xtts2': 'Coqui XTTSv2',
    'xtts': 'Coqui XTTS',
    'openvoice': 'MyShell OpenVoice',
    'elevenlabs': 'ElevenLabs',
    'openai': 'OpenAI',
    'hierspeech': 'HierSpeech++',
    'pheme': 'PolyAI Pheme',
    'speecht5': 'SpeechT5',
    'metavoice': 'MetaVoice-1B',
}

def make_link_to_space(model_name, for_leaderboard=False):
    """Return an HTML snippet labelling a model, linked to its HF Space if any.

    Args:
        model_name: Model identifier. Identifiers containing '/' are treated
            as Hugging Face Space ids and rendered as an anchor.
        for_leaderboard: When true, Spaces flagged 'is_closed_source' in
            HF_SPACES get a lock marker appended to the label and tooltip.

    Returns:
        An '<a>' element string for Space ids, otherwise a '<span>' string.
    """
    style = 'text-decoration: underline;text-decoration-style: dotted;'

    if model_name in AVAILABLE_MODELS:
        style += 'color: var(--link-text-color);'
        title = model_name
    else:
        # Known to the code base but not open for voting: render in italics.
        style += 'font-style: italic;'
        title = 'Disabled for Arena (See AVAILABLE_MODELS within code for why)'

    # Explicit lookups with defaults replace the former bare `except: pass`,
    # which hid every error just to tolerate a missing Space or missing flag.
    space = HF_SPACES.get(model_name, {})
    model_basename = space.get('name', model_name)

    if for_leaderboard and space.get('is_closed_source'):
        model_basename += ' 🔐'
        title += '; 🔐 = online only or proprietary'

    if '/' in model_name:
        return (
            f'🤗 <a target="_blank" style="{style}" title="{title}" '
            f'href="https://huggingface.co/spaces/{model_name}">{model_basename}</a>'
        )

    # Not a Space id: plain label showing the raw model name.
    # NOTE(review): the href attribute has no effect on a <span>; it is kept
    # only so the returned markup stays identical for existing consumers.
    return (
        f'<span style="{style}" title="{title}" '
        f'href="https://huggingface.co/spaces/{model_name}">{model_name}</span>'
    )

def markdown_link_to_space(model_name):
    """Return a Markdown link to the model's HF Space, or the bare name.

    Identifiers containing '/' are treated as Hugging Face Space ids; any
    other name is returned unchanged.
    """
    is_space_id = '/' in model_name
    if not is_space_id:
        return model_name

    pieces = ('🤗 [', model_name, '](https://huggingface.co/spaces/', model_name, ')')
    return ''.join(pieces)