Cheng Jed committed on
Commit
c005bf8
·
1 Parent(s): ad04301

initial commit

Browse files
Files changed (5) hide show
  1. app.py +118 -0
  2. tts.py +140 -0
  3. voices/doraemon3.wav +0 -0
  4. voices/mk_girl.wav +0 -0
  5. voices/sing.mp3 +0 -0
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import asyncio
3
+ import base64
4
+ import os
5
+ from tts import voices, tts, get_task_result, Voice
6
+ import tempfile
7
+
8
def generate_speech(text, voice_name, custom_audio=None, custom_prompt_text=None):
    """Generate speech from text using the selected voice or a custom voice.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected before any request is made.
        voice_name: Key of a predefined entry in ``voices``; used unless a
            complete custom voice is supplied.
        custom_audio: Optional path to an uploaded voice sample.
        custom_prompt_text: Transcription of ``custom_audio``. The custom
            voice is used only when both the audio and a non-blank
            transcription are given.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of the file the decoded audio was written to, or ``None`` when no
        audio was produced.
    """
    # Guard against None as well as blank text so the handler never crashes
    # on `.strip()`.
    if not text or not text.strip():
        return None, "Please enter some text"

    # Handle custom voice upload
    if custom_audio is not None and custom_prompt_text and custom_prompt_text.strip():
        voice = {
            "name": "Custom Voice",
            "promptText": custom_prompt_text,
            "promptAudio": custom_audio,
        }
    else:
        # Use predefined voice; report an unknown key the same way other
        # failures are reported instead of raising KeyError.
        voice = voices.get(voice_name)
        if voice is None:
            return None, f"Error: unknown voice {voice_name!r}"

    async def process_tts():
        try:
            task_id = await tts(text, voice)

            # Poll once per second until the task leaves the PENDING state.
            while True:
                result = await get_task_result(task_id)
                if result['status'] != 'PENDING':
                    break
                await asyncio.sleep(1)

            if result['status'] != 'SUCCESS':
                return None, f"TTS generation failed: {result['message']}"

            audio_data = result['audio_url']
            # Drop a leading "<header>," part (data-URL style) if present.
            if ',' in audio_data:
                audio_data = audio_data.split(',', 1)[1]
            audio_bytes = base64.b64decode(audio_data)

            # Write each result to its own temporary file so successive
            # requests never overwrite one another. delete=False because the
            # path is handed back to the caller after the handle is closed.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
                out.write(audio_bytes)
            return out.name, f"Successfully generated audio using {voice['name']}"
        except Exception as e:
            # UI boundary: surface any failure as a status message.
            return None, f"Error: {str(e)}"

    return asyncio.run(process_tts())
52
+
53
+ # Map each voice key to its display name (the keys fill the dropdown, the names feed the "Selected Voice" label)
54
+ voice_options = {k: v["name"] for k, v in voices.items()}
55
+
56
+ # Build the Gradio Blocks UI: text and voice inputs, the generate button, and the audio/status outputs
57
+ with gr.Blocks(title="Cantonese Text-to-Speech") as demo:
58
+ gr.Markdown("# Cantonese Text-to-Speech Demo")
59
+ gr.Markdown("Enter text in Cantonese and select a voice to generate speech.")
60
+
61
+ with gr.Row():
62
+ with gr.Column(scale=2):
63
+ text_input = gr.Textbox(
64
+ placeholder="輸入廣東話文字...",
65
+ label="Text to convert",
66
+ lines=5
67
+ )
68
+
69
+ with gr.Group():
70
+ gr.Markdown("### Choose a voice option")
71
+ voice_dropdown = gr.Dropdown(
72
+ choices=list(voice_options.keys()),
73
+ value=list(voice_options.keys())[0],
74
+ label="Select Predefined Voice",
75
+ info="Choose a voice for synthesis"
76
+ )
77
+
78
+ # Show the display name of the selected voice; starts on the first entry, matching the dropdown default
79
+ voice_name_display = gr.Markdown(value=f"Selected Voice: {voice_options[list(voice_options.keys())[0]]}")
80
+
81
+ with gr.Column(scale=2):
82
+ with gr.Group():
83
+ gr.Markdown("### Or upload your own voice (optional)")
84
+ custom_audio = gr.Audio(
85
+ label="Upload Voice Sample (WAV format)",
86
+ type="filepath",
87
+ format="wav"
88
+ )
89
+ custom_prompt_text = gr.Textbox(
90
+ placeholder="Enter the exact transcription of the uploaded audio...",
91
+ label="Transcription of Uploaded Audio (required if using custom voice)",
92
+ lines=2
93
+ )
94
+ gr.Markdown("*Note: The custom voice sample should be clear with minimal background noise.*")
95
+
96
+ generate_btn = gr.Button("Generate Speech", variant="primary")
97
+
98
+ with gr.Column(scale=3):
99
+ audio_output = gr.Audio(label="Generated Speech", type="filepath")
100
+ status_text = gr.Markdown("Ready to generate speech")
101
+
102
+ # Refresh the "Selected Voice" label whenever the dropdown value changes
103
+ voice_dropdown.change(
104
+ fn=lambda x: f"Selected Voice: {voice_options[x]}",
105
+ inputs=voice_dropdown,
106
+ outputs=voice_name_display
107
+ )
108
+
109
+ # Run generate_speech on click; concurrency_limit=1 caps this event at one running call at a time
110
+ generate_btn.click(
111
+ fn=generate_speech,
112
+ inputs=[text_input, voice_dropdown, custom_audio, custom_prompt_text],
113
+ outputs=[audio_output, status_text],
114
+ concurrency_limit=1
115
+ )
116
+
117
+ if __name__ == "__main__":
118
+ demo.launch()
tts.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import os
3
+ import io
4
+ from os import path
5
+ from typing import Dict, Literal, TypedDict, Optional
6
+ import argparse
7
+ import asyncio
8
+ import base64
9
+
10
# Service settings, read from the environment at import time
TTS_CLIENT_ID = os.getenv('TTS_CLIENT_ID')
TTS_CLIENT_SECRET = os.getenv('TTS_CLIENT_SECRET')
TTS_API_URL = os.getenv('TTS_API_URL')

# Refuse to load the module when any of the three settings is unset or empty
if not all((TTS_CLIENT_ID, TTS_CLIENT_SECRET, TTS_API_URL)):
    raise ValueError('Missing environment variables')
18
+
19
class TaskResult(TypedDict):
    """Typed view of the status payload returned for a TTS task."""

    task_id: str
    message: str
    # Callers poll while this is 'PENDING' and read the audio on 'SUCCESS'
    status: Literal['PENDING', 'SUCCESS', 'FAILED']
    audio_url: str  # base64 encoded wav audio
24
+
25
class Voice(TypedDict):
    """Voice configuration: a display name plus a prompt recording and its text."""

    name: str  # label shown to the user
    promptText: str  # text sent as the prompt transcription with the audio
    promptAudio: str  # path of the prompt audio file that gets uploaded
29
+
30
# Predefined voices keyed by voice id. Each row below is
# (key, display name, prompt transcription, audio file relative to this module).
voices: Dict[str, Voice] = {
    key: {
        "name": display_name,
        "promptText": prompt_text,
        "promptAudio": path.join(path.dirname(__file__), audio_file),
    }
    for key, display_name, prompt_text, audio_file in (
        (
            "mk_girl",
            "👧 凱婷",
            "我決定咗啦,我要做一件到目前為止又或者永遠都唔會再見到我做嘅事。",
            "./voices/mk_girl.wav",
        ),
        (
            "doraemon",
            "🥸 全叔",
            "各位觀眾大家好,我叮噹呢又同你哋見面啦。好多謝咁多年嚟各位嘅捧場同支持。",
            "./voices/doraemon3.wav",
        ),
        (
            "周星馳",
            "😈 星爺",
            "大家好啊,想唔想同我做好朋友啊。",
            "./voices/sing.mp3",
        ),
    )
}
47
+
48
async def tts(input_text: str, voice: Voice) -> str:
    """
    Send TTS request with voice information

    Args:
        input_text: Text to be converted to speech
        voice: Voice configuration; its ``promptAudio`` file is uploaded
            together with its ``promptText``

    Returns:
        task_id: ID of the TTS task

    Raises:
        OSError: If the prompt audio file cannot be opened
        requests.HTTPError: If the service answers with an error status
    """
    headers = {
        'CF-Access-Client-Id': TTS_CLIENT_ID,
        'CF-Access-Client-Secret': TTS_CLIENT_SECRET
    }

    # Open the prompt audio in a context manager so the file handle is
    # closed as soon as the upload has finished (it used to be leaked).
    with open(voice['promptAudio'], 'rb') as prompt_audio:
        # (None, value) tuples are sent as plain multipart form fields.
        files = {
            'input_text': (None, input_text),
            'prompt_text': (None, voice['promptText']),
            'audio': ('prompt.wav', prompt_audio),
            'speed': (None, '1.0')
        }

        # NOTE: requests is synchronous, so this coroutine blocks the event
        # loop while the request is in flight.
        response = requests.post(f"{TTS_API_URL}/api/tts",
                                 files=files,
                                 headers=headers)

    response.raise_for_status()
    return response.json()['task_id']
77
+
78
async def get_task_result(task_id: str) -> TaskResult:
    """
    Fetch the current state of a TTS task

    Args:
        task_id: ID of the TTS task to look up

    Returns:
        The decoded JSON body of the status response

    Raises:
        requests.HTTPError: If the service answers with an error status
    """
    status_url = f"{TTS_API_URL}/api/tts/{task_id}"
    request_headers = {
        'Content-Type': 'application/json',
        'CF-Access-Client-Id': TTS_CLIENT_ID,
        'CF-Access-Client-Secret': TTS_CLIENT_SECRET,
    }

    reply = requests.get(status_url, headers=request_headers)
    reply.raise_for_status()
    return reply.json()
99
+
100
+
101
async def main():
    """Command-line entry point: synthesize ``--text`` and save the audio.

    Parses ``--text``, ``--voice``/``-v`` and ``--output``/``-o``, submits
    the TTS request, polls once per second until the task is no longer
    ``PENDING``, and writes the decoded audio to the output path. Failures
    are printed rather than raised.
    """
    parser = argparse.ArgumentParser(description='Text-to-Speech with CosyVoice')
    # Required: without it args.text would be None and the request would be
    # submitted with no text to synthesize.
    parser.add_argument('--text', required=True, help='Text to convert to speech')
    parser.add_argument('--voice', '-v', choices=list(voices.keys()), default='mk_girl',
                        help='Voice to use for synthesis')
    parser.add_argument('--output', '-o', default='output.wav',
                        help='Output audio file path')

    args = parser.parse_args()
    voice = voices[args.voice]

    print(f"Converting text to speech using voice: {voice['name']}")
    print(f"Text: {args.text}")

    try:
        task_id = await tts(args.text, voice)
        print(f"TTS request submitted. Task ID: {task_id}")

        while True:
            result = await get_task_result(task_id)
            if result['status'] != 'PENDING':
                break
            print("Waiting for TTS processing...")
            await asyncio.sleep(1)

        if result['status'] == 'SUCCESS':
            audio_data = result['audio_url']
            # Drop a leading "<header>," part (data-URL style) if present;
            # split only on the first comma.
            if ',' in audio_data:
                audio_data = audio_data.split(',', 1)[1]

            with open(args.output, 'wb') as f:
                f.write(base64.b64decode(audio_data))
            print(f"Audio saved to {args.output}")
        else:
            print(f"TTS generation failed: {result['message']}")
    except Exception as e:
        # CLI boundary: report the failure instead of showing a traceback.
        print(f"Error: {str(e)}")


if __name__ == "__main__":
    asyncio.run(main())
voices/doraemon3.wav ADDED
Binary file (624 kB). View file
 
voices/mk_girl.wav ADDED
Binary file (182 kB). View file
 
voices/sing.mp3 ADDED
Binary file (125 kB). View file