Spaces:

joytou
/

my-speech

Running

my-speech / tools /commons.py

init project

882ea5e 2 months ago

1.35 kB

	from typing import Annotated, Literal, Optional

	from pydantic import BaseModel, Field, conint


	class ServeReferenceAudio(BaseModel):
	audio: bytes
	text: str


	class ServeTTSRequest(BaseModel):
	text: str
	chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
	# Audio format
	format: Literal["wav", "pcm", "mp3"] = "wav"
	mp3_bitrate: Literal[64, 128, 192] = 128
	# References audios for in-context learning
	references: list[ServeReferenceAudio] = []
	# Reference id
	# For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
	# Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
	reference_id: str \| None = None
	# Normalize text for en & zh, this increase stability for numbers
	normalize: bool = True
	mp3_bitrate: Optional[int] = 64
	opus_bitrate: Optional[int] = -1000
	# Balance mode will reduce latency to 300ms, but may decrease stability
	latency: Literal["normal", "balanced"] = "normal"
	# not usually used below
	streaming: bool = False
	emotion: Optional[str] = None
	max_new_tokens: int = 1024
	top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
	repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
	temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7