File size: 5,331 Bytes
88f7073
3cdb410
 
 
11023cf
e37c63c
fa56bd3
 
 
f4d5ad2
fa56bd3
 
 
 
 
 
 
 
 
 
5756bb8
fa56bd3
e37c63c
3cdb410
 
e37c63c
 
 
 
 
 
 
 
fa56bd3
 
 
 
 
 
 
 
 
65cb2a1
 
 
fa56bd3
65cb2a1
 
 
 
fa56bd3
 
 
65cb2a1
 
 
 
 
 
f4d5ad2
 
 
 
57c110b
 
 
f4d5ad2
57c110b
f4d5ad2
57c110b
f4d5ad2
726d00d
57c110b
 
 
1d877e3
471fe68
57c110b
 
471fe68
 
 
57c110b
 
 
 
fa56bd3
57c110b
 
 
 
 
 
 
 
 
 
 
e37c63c
471fe68
57c110b
6cde02f
471fe68
6cde02f
e37c63c
 
 
57c110b
 
 
 
 
 
 
 
 
 
 
e37c63c
 
 
88f7073
471fe68
 
 
65cb2a1
 
 
 
471fe68
 
 
 
 
2ea96fb
471fe68
 
 
 
 
 
 
 
 
 
 
 
 
57c110b
471fe68
 
 
 
 
 
 
 
88f7073
 
d536f9b
88f7073
 
e37c63c
 
d54243a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import gradio as gr
import torch
from TTS.api import TTS
import os
import spaces
import tempfile
from pymongo import MongoClient
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

# Load environment variables
# load_dotenv() is called without arguments, so python-dotenv's default
# .env lookup is used.
load_dotenv()

# Get MongoDB URI and Hugging Face token from .env file
# os.getenv returns None when a variable is not set; neither value is
# validated here.
mongodb_uri = os.getenv('MONGODB_URI')
hf_token = os.getenv('HF_TOKEN')

# Connect to MongoDB
# `voices_collection` (database 'mitra', collection 'voices') is the only
# handle read later in this file, by get_celebrity_voices().
# NOTE(review): if MONGODB_URI is unset, None is passed to MongoClient --
# confirm that the resulting default connection target is acceptable.
client = MongoClient(mongodb_uri)
db = client['mitra']
voices_collection = db['voices']

# Must be set before the TTS model is constructed further down.
# NOTE(review): presumably pre-accepts the Coqui terms-of-service prompt so
# the model download does not block waiting for input -- confirm.
os.environ["COQUI_TOS_AGREED"] = "1"

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the shared TTS model once, at import time
def load_tts_model():
    """Instantiate the ``xtts_v2`` multilingual model and move it to ``device``."""
    model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    return model.to(device)

tts = load_tts_model()

# Build the voice-name -> repository-path lookup from MongoDB
def get_celebrity_voices():
    """Map every voice name found in ``voices_collection`` to its clip path.

    Each document returned by ``voices_collection.find()`` is read for its
    ``'voices'`` list, and every entry in that list contributes one mapping
    ``name -> "voices/<name>.mp3"``.

    Returns:
        dict: voice name to the file path used inside the Hugging Face repo.
    """
    return {
        entry['name']: f"voices/{entry['name']}.mp3"
        for document in voices_collection.find()
        for entry in document['voices']
    }

celebrity_voices = get_celebrity_voices()

def check_voice_files():
    """
    Verify that every entry of `celebrity_voices` can be fetched from the
    "nikkmitra/clone" Space on the Hugging Face Hub.

    A file counts as missing when `hf_hub_download` raises any exception for
    it. Returns a Markdown string: either a list of "<voice>: <path>" lines
    for the files that could not be fetched, or a confirmation that all of
    them are present.
    """
    def _can_fetch(repo_path):
        # Any failure of the download is treated as "file not available".
        try:
            hf_hub_download(repo_id="nikkmitra/clone", filename=repo_path, repo_type="space", token=hf_token)
        except Exception:
            return False
        return True

    unavailable = [
        f"{name}: {repo_path}"
        for name, repo_path in celebrity_voices.items()
        if not _can_fetch(repo_path)
    ]
    if not unavailable:
        return "**All voice files are present.** 🎉"
    return "**Missing Voice Files:**\n" + "\n".join(unavailable)



# Split text into newline-separated chunks of at most `max_tokens` words
def split_text_into_chunks(text, max_tokens=100, language="en"):
    """
    Splits the input text into chunks with a maximum of `max_tokens` tokens each.

    Tokens are whitespace-separated words; runs of whitespace (including
    existing newlines) collapse into single separators. Words inside a chunk
    are joined with one space and chunks are joined with a newline.

    Args:
        text: The text to split. Empty or None yields an empty string.
        max_tokens: Maximum number of words per chunk; must be >= 1.
        language: Accepted for interface compatibility. It is currently
            unused: the same whitespace tokenization is applied to every
            language.

    Returns:
        str: The chunks joined by "\\n" (no trailing newline).

    Raises:
        ValueError: If `max_tokens` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")
    if not text:
        return ""

    # The original body used `tokens` without ever defining it, so every
    # call failed with NameError. Whitespace splitting matches the
    # ' '.join() used to rebuild each chunk.
    tokens = text.split()
    chunks = [
        ' '.join(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]
    return '\n'.join(chunks)

@spaces.GPU(duration=120)
def tts_generate(text, voice, language):
    """Synthesize `text` in one of the stored celebrity voices.

    Args:
        text: Text to speak. It is passed to the model unchanged; no
            chunking or language-specific splitting is applied here.
        voice: Key into `celebrity_voices` selecting the reference clip.
        language: Language code forwarded to `tts.tts_to_file`.

    Returns:
        str: Path of the generated ``.wav`` file on success. On failure an
        error message string is returned instead (no exception propagates)
        and the unused output file is removed.
    """
    # Fetch the reference clip before reserving an output file, so a failed
    # lookup or download cannot leave an orphaned temp file behind.
    try:
        voice_file = hf_hub_download(repo_id="nikkmitra/clone", filename=celebrity_voices[voice], repo_type="space", token=hf_token)
    except Exception as e:
        return f"Error downloading voice file: {e}"

    # delete=False: the file has to outlive this `with` block because only
    # its name is handed to the model and then returned to the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=voice_file,
            language=language,
            file_path=temp_audio_path
        )
    except Exception as e:
        # The output file is useless after a failed synthesis; remove it so
        # failed requests do not accumulate files in the temp directory.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass  # best-effort cleanup; the synthesis error is what gets reported
        if isinstance(e, AssertionError):
            return f"Error: {e}"
        return f"An unexpected error occurred: {e}"

    return temp_audio_path

@spaces.GPU(duration=120)
def clone_voice(text, audio_file, language):
    """Synthesize `text` using a caller-supplied reference recording.

    Args:
        text: Text to speak.
        audio_file: Reference audio forwarded to `tts.tts_to_file` as
            `speaker_wav`.
        language: Language code forwarded to `tts.tts_to_file`.

    Returns:
        str: Path of the generated ``.wav`` file on success. On failure an
        error message string is returned instead (no exception propagates)
        and the unused output file is removed.
    """
    print("cloning")
    # delete=False: the file has to outlive this `with` block because only
    # its name is handed to the model and then returned to the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=audio_file,
            language=language,
            file_path=temp_audio_path
        )
    except Exception as e:
        # The output file is useless after a failed synthesis; remove it so
        # failed requests do not accumulate files in the temp directory.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass  # best-effort cleanup; the synthesis error is what gets reported
        if isinstance(e, AssertionError):
            return f"Error: {e}"
        return f"An unexpected error occurred: {e}"

    return temp_audio_path

# Define Gradio interface
# Components are created in the order they should appear; the two tabs wire
# tts_generate and clone_voice to their own inputs and audio output.
with gr.Blocks() as demo:
    gr.Markdown("# Advanced Voice Synthesis")

    # Display voice files status
    # check_voice_files() runs once, while the layout is being built, and
    # its Markdown result is shown as static text.
    voice_status = check_voice_files()
    gr.Markdown(voice_status)

    with gr.Tabs():
        # Tab 1: speak text in one of the voices listed in celebrity_voices.
        with gr.TabItem("TTS"):
            with gr.Row():
                tts_text = gr.Textbox(label="Text to speak")
                tts_voice = gr.Dropdown(choices=list(celebrity_voices.keys()), label="Celebrity Voice")
                tts_language = gr.Dropdown(["en", "es", "fr", "de", "it", "ar","hi"], label="Language", value="en")
            tts_generate_btn = gr.Button("Generate")
            tts_output = gr.Audio(label="Generated Audio")

            # NOTE(review): tts_generate returns an error message string on
            # failure and that string is routed to this gr.Audio output --
            # confirm how Gradio presents it to the user.
            tts_generate_btn.click(
                tts_generate,
                inputs=[tts_text, tts_voice, tts_language],
                outputs=tts_output
            )

        # Tab 2: speak text using a reference recording supplied by the user.
        with gr.TabItem("Clone Voice"):
            with gr.Row():
                clone_text = gr.Textbox(label="Text to speak")
                # type="filepath": presumably hands clone_voice a path on
                # disk rather than raw samples -- confirm against Gradio docs.
                clone_audio = gr.Audio(label="Voice reference audio file", type="filepath")
                clone_language = gr.Dropdown(["en", "es", "fr", "de", "it", "ar", "hi"], label="Language", value="en")
            clone_generate_btn = gr.Button("Generate")
            clone_output = gr.Audio(label="Generated Audio")

            # NOTE(review): clone_voice also returns an error message string
            # on failure, which ends up in this gr.Audio output.
            clone_generate_btn.click(
                clone_voice,
                inputs=[clone_text, clone_audio, clone_language],
                outputs=clone_output
            )

# Launch the interface
demo.launch()

# Clean up temporary files (this will run after the Gradio server is closed)
def cleanup_temp_wavs():
    """Delete leftover ``tmp*.wav`` files from the system temporary directory.

    The audio files in this module are created with
    ``tempfile.NamedTemporaryFile(delete=False, suffix=".wav")`` and no
    ``dir=`` argument, so they are written to ``tempfile.gettempdir()`` with
    the prefix ``tempfile.gettempprefix()``. That is the directory scanned
    here; the current working directory is not touched.

    Note that any file in that directory matching the same prefix and
    suffix is removed, regardless of which process created it.

    Returns:
        int: Number of files that were removed.
    """
    temp_dir = tempfile.gettempdir()
    prefix = tempfile.gettempprefix()
    removed = 0
    for file in os.listdir(temp_dir):
        if file.endswith('.wav') and file.startswith(prefix):
            try:
                # os.listdir yields bare names; join with the directory so
                # the removal does not depend on the working directory.
                os.remove(os.path.join(temp_dir, file))
            except OSError:
                # Best effort: skip entries that vanished, are directories,
                # or are not removable by this process.
                continue
            removed += 1
    return removed

cleanup_temp_wavs()