Spaces:

amphion
/

Text-to-Speech

Running

App Files Files Community

Text-to-Speech / inference.py

zyingt

support timbre confusion

8d06e90 verified 8 months ago

raw

history blame

4.77 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import argparse
	from argparse import ArgumentParser
	import os

	from models.tts.fastspeech2.fs2_inference import FastSpeech2Inference
	from models.tts.vits.vits_inference import VitsInference
	from models.tts.valle.valle_inference import VALLEInference
	from utils.util import load_config
	import torch


	def build_inference(args, cfg):
	supported_inference = {
	"FastSpeech2": FastSpeech2Inference,
	"VITS": VitsInference,
	"VALLE": VALLEInference,
	}

	inference_class = supported_inference[cfg.model_type]
	inference = inference_class(args, cfg)
	return inference


	def cuda_relevant(deterministic=False):
	torch.cuda.empty_cache()
	# TF32 on Ampere and above
	torch.backends.cuda.matmul.allow_tf32 = True
	torch.backends.cudnn.enabled = True
	torch.backends.cudnn.allow_tf32 = True
	# Deterministic
	torch.backends.cudnn.deterministic = deterministic
	torch.backends.cudnn.benchmark = not deterministic
	torch.use_deterministic_algorithms(deterministic)


	def build_parser():
	parser = argparse.ArgumentParser()

	parser.add_argument(
	"--config",
	type=str,
	required=True,
	help="JSON/YAML file for configurations.",
	)
	parser.add_argument(
	"--dataset",
	type=str,
	help="convert from the source data",
	default=None,
	)
	parser.add_argument(
	"--testing_set",
	type=str,
	help="train, test, golden_test",
	default="test",
	)
	parser.add_argument(
	"--test_list_file",
	type=str,
	help="convert from the test list file",
	default=None,
	)
	parser.add_argument(
	"--speaker_name_1",
	type=str,
	default=None,
	help="first speaker name for multi-speaker synthesis, for single-sentence mode only",
	)
	parser.add_argument(
	"--speaker_name_2",
	type=str,
	default=None,
	help="second speaker name for multi-speaker synthesis, for single-sentence mode only",
	)
	parser.add_argument(
	"--alpha",
	type=float,
	default=0.5,
	help="confusion degree, takes values within [0,1]",
	)
	parser.add_argument(
	"--text",
	help="Text to be synthesized.",
	type=str,
	default="",
	)
	parser.add_argument(
	"--vocoder_dir",
	type=str,
	default=None,
	help="Vocoder checkpoint directory. Searching behavior is the same as "
	"the acoustics one.",
	)
	parser.add_argument(
	"--acoustics_dir",
	type=str,
	default=None,
	help="Acoustic model checkpoint directory. If a directory is given, "
	"search for the latest checkpoint dir in the directory. If a specific "
	"checkpoint dir is given, directly load the checkpoint.",
	)
	parser.add_argument(
	"--checkpoint_path",
	type=str,
	default=None,
	help="Acoustic model checkpoint directory. If a directory is given, "
	"search for the latest checkpoint dir in the directory. If a specific "
	"checkpoint dir is given, directly load the checkpoint.",
	)
	parser.add_argument(
	"--mode",
	type=str,
	choices=["batch", "single"],
	required=True,
	help="Synthesize a whole dataset or a single sentence",
	)
	parser.add_argument(
	"--log_level",
	type=str,
	default="warning",
	help="Logging level. Default: warning",
	)
	parser.add_argument(
	"--pitch_control",
	type=float,
	default=1.0,
	help="control the pitch of the whole utterance, larger value for higher pitch",
	)
	parser.add_argument(
	"--energy_control",
	type=float,
	default=1.0,
	help="control the energy of the whole utterance, larger value for larger volume",
	)
	parser.add_argument(
	"--duration_control",
	type=float,
	default=1.0,
	help="control the speed of the whole utterance, larger value for slower speaking rate",
	)
	parser.add_argument(
	"--output_dir",
	type=str,
	default=None,
	help="Output dir for saving generated results",
	)
	return parser

	def main(args_list):
	# Parse arguments and config
	args = build_parser().parse_args(args_list)
	cfg = load_config(args.config)

	# CUDA settings
	cuda_relevant()

	# Build inference
	inferencer = build_inference(args, cfg)

	# Run inference
	output_audio = inferencer.inference()

	return output_audio


	if __name__ == "__main__":
	main()