Spaces:

TDN-M
/

GV-a

Sleeping

App Files Community

TDN-M committed 7 days ago

Commit

00954c4

verified ·

1 Parent(s): a02f58f

Update tts.py

Browse files

Files changed (1) hide show

tts.py +65 -79

tts.py CHANGED Viewed

@@ -2,46 +2,35 @@ import os
 import re
 import torch
 import torchaudio
-from huggingface_hub import hf_hub_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
-from torch.amp import autocast
 # Cấu hình đường dẫn và tải mô hình
 checkpoint_dir = "model/"
 repo_id = "capleaf/viXTTS"
 use_deepspeed = False
-# Kiểm tra GPU và hỗ trợ FP16
-if torch.cuda.is_available():
-    device = "cuda"
-    if "A100" in torch.cuda.get_device_name(0):
-        print("Đang sử dụng GPU A100 với hỗ trợ FP16.")
-        use_fp16 = True
-    else:
-        print(f"Đang sử dụng GPU: {torch.cuda.get_device_name(0)}")
-        use_fp16 = False
-else:
-    device = "cpu"
-    use_fp16 = False
 # Tạo thư mục nếu chưa tồn tại
 os.makedirs(checkpoint_dir, exist_ok=True)
 # Kiểm tra và tải các file cần thiết
 required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
-for file in required_files:
-    file_path = os.path.join(checkpoint_dir, file)
-    if not os.path.exists(file_path):
-        try:
-            hf_hub_download(
-                repo_id=repo_id if file != "speakers_xtts.pth" else "coqui/XTTS-v2",
-                filename=file,
-                local_dir=checkpoint_dir,
-            )
-        except Exception as e:
-            raise RuntimeError(f"Không thể tải file {file} từ Hugging Face Hub: {str(e)}")
 # Tải cấu hình và mô hình
 xtts_config = os.path.join(checkpoint_dir, "config.json")
@@ -49,74 +38,71 @@ config = XttsConfig()
 config.load_json(xtts_config)
 MODEL = Xtts.init_from_config(config)
 MODEL.load_checkpoint(config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed)
 MODEL.to(device)
-# Danh sách ngôn ngữ được hỗ trợ
 supported_languages = ["vi", "en"]
 def normalize_vietnamese_text(text):
-    try:
-        text = (
-            TTSnorm(text, unknown=False, lower=False, rule=True)
-            .replace("..", ".")
-            .replace("!.", "!")
-            .replace("?.", "?")
-            .replace(" .", ".")
-            .replace(" ,", ",")
-            .replace('"', "")
-            .replace("'", "")
-            .replace("AI", "Ây Ai")
-            .replace("A.I", "Ây Ai")
-        )
-        return text
-    except Exception as e:
-        raise RuntimeError(f"Lỗi khi chuẩn hóa văn bản: {str(e)}")
-def generate_speech(
-    text,
-    language="vi",
-    speaker_wav=None,
-    normalize_text=True,
-    repetition_penalty=5.0,
-    temperature=0.75,
-):
     if language not in supported_languages:
-        raise ValueError(f"Ngôn ngữ {language} không được hỗ trợ. Các ngôn ngữ được hỗ trợ: {', '.join(supported_languages)}")
     if len(text) < 2:
-        raise ValueError("Văn bản quá ngắn.")
-    if speaker_wav and not os.path.isfile(speaker_wav):
-        raise ValueError(f"File speaker_wav không tồn tại: {speaker_wav}")
     try:
         if normalize_text and language == "vi":
             text = normalize_vietnamese_text(text)
-        with torch.no_grad():
-            with autocast(device_type='cuda', enabled=use_fp16):
-                gpt_cond_latent, speaker_embedding = MODEL.get_conditioning_latents(
-                    audio_path=speaker_wav,
-                    gpt_cond_len=30 if device == "cuda" else 15,
-                    gpt_cond_chunk_len=8 if device == "cuda" else 4,
-                    max_ref_length=60 if device == "cuda" else 30,
-                )
-                out = MODEL.inference(
-                    text,
-                    language,
-                    gpt_cond_latent,
-                    speaker_embedding,
-                    repetition_penalty=repetition_penalty,
-                    temperature=temperature,
-                    enable_text_splitting=True,
-                )
-        output_dir = "outputs/"
-        os.makedirs(output_dir, exist_ok=True)
-        output_file = os.path.join(output_dir, f"output_{os.urandom(4).hex()}.wav")
-        torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0).to("cpu"), 24000)
-        if device == "cuda":
-            torch.cuda.empty_cache()
         return output_file
     except Exception as e:
         raise RuntimeError(f"Lỗi khi tạo giọng nói: {str(e)}")

 import re
 import torch
 import torchaudio
+from huggingface_hub import snapshot_download, hf_hub_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
 # Cấu hình đường dẫn và tải mô hình
 checkpoint_dir = "model/"
 repo_id = "capleaf/viXTTS"
 use_deepspeed = False
+device = "cuda" if torch.cuda.is_available() and "T4" in torch.cuda.get_device_name(0) else "cpu"
 # Tạo thư mục nếu chưa tồn tại
 os.makedirs(checkpoint_dir, exist_ok=True)
 # Kiểm tra và tải các file cần thiết
 required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
+files_in_dir = os.listdir(checkpoint_dir)
+if not all(file in files_in_dir for file in required_files):
+    snapshot_download(
+        repo_id=repo_id,
+        repo_type="model",
+        local_dir=checkpoint_dir,
+    )
+    hf_hub_download(
+        repo_id="coqui/XTTS-v2",
+        filename="speakers_xtts.pth",
+        local_dir=checkpoint_dir,
+    )
 # Tải cấu hình và mô hình
 xtts_config = os.path.join(checkpoint_dir, "config.json")
 config.load_json(xtts_config)
 MODEL = Xtts.init_from_config(config)
 MODEL.load_checkpoint(config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed)
+# Tải mô hình vào thiết bị phù hợp
 MODEL.to(device)
+# Danh sách ngôn ngữ được hỗ trợ (chỉ tiếng Việt và tiếng Anh)
 supported_languages = ["vi", "en"]
 def normalize_vietnamese_text(text):
+    """
+    Chuẩn hóa văn bản tiếng Việt.
+    """
+    text = (
+        TTSnorm(text, unknown=False, lower=False, rule=True)
+        .replace("..", ".")
+        .replace("!.", "!")
+        .replace("?.", "?")
+        .replace(" .", ".")
+        .replace(" ,", ",")
+        .replace('"', "")
+        .replace("'", "")
+        .replace("AI", "Ây Ai")
+        .replace("A.I", "Ây Ai")
+    )
+    return text
+def generate_speech(text, language="vi", speaker_wav=None, normalize_text=True):
+    """
+    Tạo giọng nói từ văn bản.
+    """
     if language not in supported_languages:
+        raise ValueError(f"Ngôn ngữ {language} không được hỗ trợ. Chỉ hỗ trợ tiếng Việt (vi) và tiếng Anh (en).")
     if len(text) < 2:
+        raise ValueError("Văn bản quá ngắn. Vui lòng nhập văn bản dài hơn.")
     try:
+        # Chuẩn hóa văn bản nếu cần
         if normalize_text and language == "vi":
             text = normalize_vietnamese_text(text)
+        # Lấy latent và embedding từ file âm thanh mẫu
+        with torch.no_grad():  # Tắt tính gradient để tiết kiệm bộ nhớ
+            gpt_cond_latent, speaker_embedding = MODEL.get_conditioning_latents(
+                audio_path=speaker_wav,
+                gpt_cond_len=30 if device == "cuda" else 15,  # Tăng độ dài khi dùng GPU
+                gpt_cond_chunk_len=8 if device == "cuda" else 4,
+                max_ref_length=60 if device == "cuda" else 30,
+            )
+            # Tạo giọng nói
+            out = MODEL.inference(
+                text,
+                language,
+                gpt_cond_latent,
+                speaker_embedding,
+                repetition_penalty=5.0,
+                temperature=0.75,
+                enable_text_splitting=True,
+            )
+        # Lưu file âm thanh
+        output_file = "output.wav"
+        torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0).to("cpu"), 24000)
         return output_file
     except Exception as e:
         raise RuntimeError(f"Lỗi khi tạo giọng nói: {str(e)}")