Spaces:

TDN-M
/

GV-a

Sleeping

File size: 3,498 Bytes

4eeb5fe
38b7ebd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eed20c1
38b7ebd
 
 
 
 
93ff1fe
38b7ebd
93ff1fe
38b7ebd
 
 
 
 
 
 
 
 
 
 
 
b7935b6
93ff1fe
38b7ebd
b7935b6
38b7ebd
b7935b6
38b7ebd
 
 
 
 
 
93ff1fe
38b7ebd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93ff1fe
38b7ebd

import os
import re
import torch
import torchaudio
from huggingface_hub import snapshot_download, hf_hub_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from vinorm import TTSnorm

# Checkpoint location and the Hugging Face repository hosting the model
checkpoint_dir = "model/"
repo_id = "capleaf/viXTTS"
use_deepspeed = False

# Make sure the checkpoint directory exists before listing its contents
os.makedirs(checkpoint_dir, exist_ok=True)

# Download the model files when at least one required file is missing locally
required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
files_in_dir = os.listdir(checkpoint_dir)
if any(name not in files_in_dir for name in required_files):
    # The model snapshot comes from `repo_id`; the speaker file is fetched
    # separately from the coqui/XTTS-v2 repository.
    snapshot_download(repo_id=repo_id, repo_type="model", local_dir=checkpoint_dir)
    hf_hub_download(
        repo_id="coqui/XTTS-v2",
        filename="speakers_xtts.pth",
        local_dir=checkpoint_dir,
    )

# Build the XTTS model from the configuration stored next to the checkpoint
config = XttsConfig()
xtts_config = os.path.join(checkpoint_dir, "config.json")
config.load_json(xtts_config)
MODEL = Xtts.init_from_config(config)
MODEL.load_checkpoint(
    config,
    checkpoint_dir=checkpoint_dir,
    use_deepspeed=use_deepspeed,
)

# Keep the model on the CPU
MODEL.to("cpu")

# Languages accepted for synthesis (Vietnamese and English only)
supported_languages = ["vi", "en"]

# Matches the acronym written either as "AI" or as "A.I".
_AI_ACRONYM = re.compile(r"A\.?I")


def _spell_out_ai(match):
    """Return the spoken form for a matched "AI"/"A.I".

    The match is returned unchanged when it touches an upper-case letter on
    either side, i.e. when it is merely part of an upper-case word such as
    "HAI" or "MAIL" rather than the acronym itself.
    """
    source = match.string
    start, end = match.span()
    before = source[start - 1] if start > 0 else ""
    after = source[end] if end < len(source) else ""
    if before.isupper() or after.isupper():
        return match.group(0)
    return "Ây Ai"


def normalize_vietnamese_text(text):
    """Normalize Vietnamese text before it is sent to the TTS model.

    The text is first passed through ``TTSnorm`` and then cleaned up:
    doubled or misplaced punctuation produced by the normalizer is collapsed,
    single and double quotes are removed, and the acronym "AI" / "A.I" is
    rewritten as "Ây Ai" so that it is spelled out.

    Args:
        text: The raw input text.

    Returns:
        The normalized text.
    """
    text = (
        TTSnorm(text, unknown=False, lower=False, rule=True)
        .replace("..", ".")
        .replace("!.", "!")
        .replace("?.", "?")
        .replace(" .", ".")
        .replace(" ,", ",")
        .replace('"', "")
        .replace("'", "")
    )
    # A plain substring replace would also rewrite upper-case words that only
    # contain "AI" (e.g. "HAI" -> "HÂy Ai"), so the acronym is replaced
    # through a callback that inspects the neighbouring characters.
    return _AI_ACRONYM.sub(_spell_out_ai, text)

def generate_speech(
    text,
    language="vi",
    speaker_wav=None,
    normalize_text=True,
    output_file="output.wav",
):
    """Synthesize speech for ``text`` and save it as a WAV file.

    Args:
        text: The text to speak; must contain at least two characters.
        language: Language code; must be one of ``supported_languages``.
        speaker_wav: Reference audio passed to the model as ``audio_path``
            to derive the speaker conditioning.
            NOTE(review): the default ``None`` is forwarded unchanged;
            presumably a real reference recording is required - confirm.
        normalize_text: When true and ``language`` is ``"vi"``, the text is
            run through ``normalize_vietnamese_text`` first.
        output_file: Path the audio is written to. Defaults to
            ``"output.wav"``; pass a distinct path per call to avoid
            successive calls overwriting each other's result.

    Returns:
        The path of the written audio file (``output_file``).

    Raises:
        ValueError: If the language is not supported or the text is shorter
            than two characters.
        RuntimeError: If normalization, inference, or saving fails; the
            original exception is attached as ``__cause__``.
    """
    if language not in supported_languages:
        raise ValueError(f"Ngôn ngữ {language} không được hỗ trợ. Chỉ hỗ trợ tiếng Việt (vi) và tiếng Anh (en).")

    if len(text) < 2:
        raise ValueError("Văn bản quá ngắn. Vui lòng nhập văn bản dài hơn.")

    try:
        # Normalize the text when requested (Vietnamese only)
        if normalize_text and language == "vi":
            text = normalize_vietnamese_text(text)

        # Gradients are not needed for inference; disabling them saves memory
        with torch.no_grad():
            # Derive the conditioning latents and speaker embedding from the
            # reference audio
            gpt_cond_latent, speaker_embedding = MODEL.get_conditioning_latents(
                audio_path=speaker_wav,
                gpt_cond_len=15,  # reduced length to suit CPU execution
                gpt_cond_chunk_len=4,
                max_ref_length=30,  # reduced length to suit CPU execution
            )

            # Generate the speech
            out = MODEL.inference(
                text,
                language,
                gpt_cond_latent,
                speaker_embedding,
                repetition_penalty=5.0,
                temperature=0.75,
                enable_text_splitting=True,
            )

        # Save the audio; unsqueeze(0) adds the leading dimension before saving.
        # NOTE(review): 24000 is presumably the model's output sample rate - confirm.
        torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0), 24000)

        return output_file

    except Exception as e:
        # Chain the original exception so its traceback is preserved as the cause
        raise RuntimeError(f"Lỗi khi tạo giọng nói: {e}") from e