|
import sys |
|
import asyncio |
|
from io import BytesIO |
|
|
|
from fairseq import checkpoint_utils |
|
|
|
import torch |
|
|
|
import edge_tts |
|
import librosa |
|
|
|
|
|
|
|
def has_mps() -> bool:
    """Report whether the Apple Metal (MPS) torch backend is actually usable.

    The check has three stages: the platform must be macOS, torch must
    advertise MPS support, and a one-element tensor must successfully move to
    the ``mps`` device.

    Returns:
        ``True`` only if all three stages pass, otherwise ``False``.
    """
    if sys.platform != "darwin":
        return False

    # Prefer the supported torch.backends.mps API; the top-level
    # ``torch.has_mps`` flag is deprecated and is only consulted as a
    # fallback for builds that do not expose the backend module.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None:
        if not mps_backend.is_available():
            return False
    elif not getattr(torch, "has_mps", False):
        return False

    # Advertised support is not trusted on its own: try a real transfer and
    # treat any failure as "MPS not usable".
    try:
        torch.zeros(1).to(torch.device("mps"))
    except Exception:
        return False
    return True
|
|
|
|
|
def is_half(device: str) -> bool:
    """Decide whether models on *device* should run in half precision.

    Only CUDA devices are candidates; every other device string yields
    ``False``. For CUDA, the GPU's reported name (upper-cased) is matched
    against a small deny-list of substrings for which half precision is
    turned off.

    Args:
        device: Device string such as ``"cpu"``, ``"cuda"`` or ``"cuda:0"``.

    Returns:
        ``True`` when the device is a CUDA GPU whose name does not match the
        deny-list, otherwise ``False``.

    Raises:
        ValueError: If *device* contains ``:`` but the text after the last
            ``:`` is not an integer.
    """
    if not device.startswith('cuda'):
        return False

    # A bare "cuda" carries no index. Passing None lets torch use the current
    # CUDA device instead of crashing on int("cuda").
    index = int(device.rpartition(':')[2]) if ':' in device else None
    gpu_name = torch.cuda.get_device_name(index).upper()

    # Any name containing "16" is rejected unless it also contains "V100"
    # (presumably so that V100 16GB cards are not caught by the "16" rule --
    # TODO confirm), plus a few explicitly listed models.
    on_deny_list = (
        ('16' in gpu_name and 'V100' not in gpu_name)
        or any(tag in gpu_name for tag in ('P40', '1060', '1070', '1080'))
    )
    return not on_deny_list
|
|
|
|
|
def load_hubert_model(device: str, model_path: str = 'hubert_base.pt'):
    """Load a fairseq checkpoint, place it on *device* and fix its precision.

    Args:
        device: Target device string; also passed to ``is_half`` to choose
            the precision.
        model_path: Path of the checkpoint file to load.

    Returns:
        The first model of the loaded ensemble, moved to *device* and cast to
        half precision when ``is_half(device)`` is true, else to float32.
    """
    loaded = checkpoint_utils.load_model_ensemble_and_task([model_path])
    # loaded[0] is the list of models; only the first one is used.
    hubert = loaded[0][0].to(device)
    return hubert.half() if is_half(device) else hubert.float()
|
|
|
|
|
async def _collect_tts_audio(tts_com) -> bytes:
    """Drain an edge-tts stream and return its concatenated audio payload."""
    # Non-audio chunks are skipped. Joining once avoids re-copying the
    # growing buffer for every chunk, as repeated ``bytes +=`` would.
    return b''.join([
        chunk['data']
        async for chunk in tts_com.stream()
        if chunk['type'] == 'audio'
    ])


async def _mp3_to_waveform(mp3_bytes: bytes):
    """Convert MP3 bytes to WAV through ffmpeg and load it with librosa."""
    # ffmpeg reads the MP3 from stdin and writes WAV to stdout, so no
    # temporary files are needed. stderr is left attached to the parent so
    # ffmpeg's own error output stays visible.
    ffmpeg_proc = await asyncio.create_subprocess_exec(
        'ffmpeg',
        '-f', 'mp3',
        '-i', '-',
        '-f', 'wav',
        '-loglevel', 'error',
        '-',
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
    )
    wav_bytes, _ = await ffmpeg_proc.communicate(mp3_bytes)

    # Fail here with a clear message rather than letting librosa choke on
    # empty or truncated output further down.
    if ffmpeg_proc.returncode != 0:
        raise RuntimeError(
            f'ffmpeg exited with status {ffmpeg_proc.returncode} '
            'while converting TTS audio to WAV'
        )

    return librosa.load(BytesIO(wav_bytes))


async def call_edge_tts(speaker_name: str, text: str):
    """Synthesize *text* with edge-tts using the voice's default settings.

    Args:
        speaker_name: Voice identifier handed to ``edge_tts.Communicate``.
        text: Text to synthesize.

    Returns:
        The result of ``librosa.load`` on the converted WAV data (per the
        librosa documentation, a ``(samples, sample_rate)`` pair).

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    tts_com = edge_tts.Communicate(text, speaker_name)
    return await _mp3_to_waveform(await _collect_tts_audio(tts_com))


async def call_edge_tts_config(speaker_name: str, text: str, rate: str, volume: str):
    """Synthesize *text* with edge-tts using an explicit rate and volume.

    Args:
        speaker_name: Voice identifier handed to ``edge_tts.Communicate``.
        text: Text to synthesize.
        rate: Speaking-rate setting forwarded unchanged to edge-tts.
        volume: Volume setting forwarded unchanged to edge-tts.

    Returns:
        The result of ``librosa.load`` on the converted WAV data (per the
        librosa documentation, a ``(samples, sample_rate)`` pair).

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    tts_com = edge_tts.Communicate(
        text=text, voice=speaker_name, rate=rate, volume=volume
    )
    return await _mp3_to_waveform(await _collect_tts_audio(tts_com))
|
|