|
import argparse |
|
import base64 |
|
import wave |
|
|
|
import ormsgpack |
|
import pyaudio |
|
import requests |
|
from pydub import AudioSegment |
|
from pydub.playback import play |
|
|
|
from tools.commons import ServeReferenceAudio, ServeTTSRequest |
|
from tools.file import audio_to_bytes, read_ref_text |
|
|
|
|
|
def _str2bool(value: str) -> bool:
    """Convert a CLI string to a real boolean.

    argparse's ``type=bool`` is a trap: ``bool("False")`` is ``True`` because
    any non-empty string is truthy, so ``--play False`` would still enable
    playback. This converter accepts the usual spellings of true/false and
    raises a proper argparse error for anything else.
    """
    if isinstance(value, bool):  # already a bool (e.g. from a default)
        return value
    lowered = value.strip().lower()
    if lowered in {"true", "t", "yes", "y", "1"}:
        return True
    if lowered in {"false", "f", "no", "n", "0"}:
        return False
    raise argparse.ArgumentTypeError(f"invalid boolean value: {value!r}")


def parse_args():
    """Parse command-line arguments for the TTS client.

    Returns:
        argparse.Namespace: parsed options controlling the request payload
        (text, references, sampling parameters) and local output/playback.
    """
    parser = argparse.ArgumentParser(
        description="Send a WAV file and text to a server and receive synthesized audio."
    )

    parser.add_argument(
        "--url",
        "-u",
        type=str,
        default="http://127.0.0.1:8080/v1/tts",
        help="URL of the server",
    )
    parser.add_argument(
        "--text", "-t", type=str, required=True, help="Text to be synthesized"
    )
    parser.add_argument(
        "--reference_id",
        "-id",
        type=str,
        default=None,
        help="ID of the reference model to be used for the speech",
    )
    parser.add_argument(
        "--reference_audio",
        "-ra",
        type=str,
        nargs="+",
        default=None,
        help="Path to the WAV file",
    )
    parser.add_argument(
        "--reference_text",
        "-rt",
        type=str,
        nargs="+",
        default=None,
        help="Reference text for voice synthesis",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default="generated_audio",
        help="Output audio file name",
    )
    # NOTE: boolean flags use _str2bool, not type=bool, so "--play False"
    # actually disables playback (see _str2bool).
    parser.add_argument(
        "--play",
        type=_str2bool,
        default=True,
        help="Whether to play audio after receiving data",
    )
    parser.add_argument("--normalize", type=_str2bool, default=True)
    parser.add_argument(
        "--format", type=str, choices=["wav", "mp3", "flac"], default="wav"
    )
    parser.add_argument("--mp3_bitrate", type=int, default=64)
    parser.add_argument("--opus_bitrate", type=int, default=-1000)
    parser.add_argument("--latency", type=str, default="normal", help="延迟选项")
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=1024,
        help="Maximum new tokens to generate",
    )
    parser.add_argument(
        "--chunk_length", type=int, default=100, help="Chunk length for synthesis"
    )
    parser.add_argument(
        "--top_p", type=float, default=0.7, help="Top-p sampling for synthesis"
    )
    parser.add_argument(
        "--repetition_penalty",
        type=float,
        default=1.2,
        help="Repetition penalty for synthesis",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.7, help="Temperature for sampling"
    )
    parser.add_argument(
        "--speaker", type=str, default=None, help="Speaker ID for voice synthesis"
    )
    parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion")
    parser.add_argument(
        "--streaming", type=_str2bool, default=False, help="Enable streaming response"
    )
    parser.add_argument(
        "--channels", type=int, default=1, help="Number of audio channels"
    )
    parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")

    return parser.parse_args()
|
|
|
|
|
if __name__ == "__main__":
    args = parse_args()

    idstr: str | None = args.reference_id

    # Local reference audio/text pairs are only needed when the server is not
    # given a pre-registered reference_id.
    if idstr is None:
        byte_audios = [
            audio_to_bytes(ref_audio) for ref_audio in (args.reference_audio or [])
        ]
        ref_texts = [
            read_ref_text(ref_text) for ref_text in (args.reference_text or [])
        ]
    else:
        byte_audios = []
        ref_texts = []

    # Build the request payload; each reference pairs a transcript with its
    # raw audio bytes (extra items in the longer list are dropped by zip).
    data = {
        "text": args.text,
        "references": [
            ServeReferenceAudio(audio=ref_audio, text=ref_text)
            for ref_text, ref_audio in zip(ref_texts, byte_audios)
        ],
        "reference_id": idstr,
        "normalize": args.normalize,
        "format": args.format,
        "mp3_bitrate": args.mp3_bitrate,
        "opus_bitrate": args.opus_bitrate,
        "max_new_tokens": args.max_new_tokens,
        "chunk_length": args.chunk_length,
        "top_p": args.top_p,
        "repetition_penalty": args.repetition_penalty,
        "temperature": args.temperature,
        "speaker": args.speaker,
        "emotion": args.emotion,
        "streaming": args.streaming,
    }

    pydantic_data = ServeTTSRequest(**data)

    # The server expects a msgpack-encoded body, not JSON.
    response = requests.post(
        args.url,
        data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
        stream=args.streaming,
        headers={
            "authorization": "Bearer YOUR_API_KEY",
            "content-type": "application/msgpack",
        },
    )

    if response.status_code == 200:
        if args.streaming:
            # Streaming mode: play raw PCM chunks as they arrive and mirror
            # them into a WAV file on disk.
            p = pyaudio.PyAudio()
            audio_format = pyaudio.paInt16
            stream = p.open(
                format=audio_format, channels=args.channels, rate=args.rate, output=True
            )

            wf = wave.open(f"{args.output}.wav", "wb")
            wf.setnchannels(args.channels)
            wf.setsampwidth(p.get_sample_size(audio_format))
            wf.setframerate(args.rate)

            stream_stopped_flag = False

            try:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        stream.write(chunk)
                        wf.writeframesraw(chunk)
                    elif not stream_stopped_flag:
                        # Empty chunk: stop playback once, keep draining.
                        stream.stop_stream()
                        stream_stopped_flag = True
            finally:
                # Always release the audio device and close the file, even on
                # errors or Ctrl-C mid-stream.
                stream.close()
                p.terminate()
                wf.close()
        else:
            # Non-streaming mode: save the whole response, then optionally
            # play it back via pydub.
            audio_content = response.content
            audio_path = f"{args.output}.{args.format}"
            with open(audio_path, "wb") as audio_file:
                audio_file.write(audio_content)

            audio = AudioSegment.from_file(audio_path, format=args.format)
            if args.play:
                play(audio)
            print(f"Audio has been saved to '{audio_path}'.")
    else:
        print(f"Request failed with status code {response.status_code}")
        print(response.json())
|