import warnings
warnings.filterwarnings("ignore")

# External libraries
import argparse
import os
import time

import gradio as gr
import requests

all_example = "Today is a wonderful day to build something people love!"

microsoft_model_list = [
    "en-US-JennyMultilingualNeural",
    "en-US-RyanMultilingualNeural",
    "en-US-AndrewMultilingualNeural",
    "en-US-AvaMultilingualNeural",
    "en-US-BrianMultilingualNeural",
    "en-US-EmmaMultilingualNeural",
    "en-US-AlloyMultilingualNeural",
    "en-US-EchoMultilingualNeural",
    "en-US-FableMultilingualNeural",
    "en-US-OnyxMultilingualNeural",
    "en-US-NovaMultilingualNeural",
    "en-US-ShimmerMultilingualNeural",
    "en-US-AlloyMultilingualNeuralHD",
    "en-US-EchoMultilingualNeuralHD",
    "en-US-FableMultilingualNeuralHD",
    "en-US-OnyxMultilingualNeuralHD",
    "en-US-NovaMultilingualNeuralHD",
    "en-US-ShimmerMultilingualNeuralHD",
]

openai_model_list = [
    "alloy",
    "echo",
    "fable",
    "onyx",
    "nova",
    "shimmer",
]

eleven_voice_id = [
    "21m00Tcm4TlvDq8ikWAM",
    "29vD33N1CtxCmqQRPOHJ",
    "2EiwWnXFnvU5JabPnv8n",
    "5Q0t7uMcjvnagumLfvZi",
    "AZnzlk1XvdvUeBnXmlld",
    "CYw3kZ02Hs0563khs1Fj",
    "D38z5RcWu1voky8WS1ja",
    "EXAVITQu4vr4xnSDxMaL",
    "ErXwobaYiN019PkySvjV",
    "GBv7mTt0atIp3Br8iCZE",
    "IKne3meq5aSn9XLyUdCD",
    "JBFqnCBsd6RMkjVDRZzb",
    "LcfcDJNUP1GQjkzn1xUU",
    "MF3mGyEYCl7XYWbV9V6O",
    "N2lVS1w4EtoT3dr4eOWO",
    "ODq5zmih8GrVes37Dizd",
    "SOYHLrjzK2X1ezoPC6cr",
    "TX3LPaxmHKxFdv7VOQHJ",
    "ThT5KcBeYPX3keUQqHPh",
    "TxGEqnHWrfWFTfGW9XjX",
    "VR6AewLTigWG4xSOukaG",
    "XB0fDUnXU5powFXDhCwa",
    "Xb7hH8MSUJpSbSDYk0k2",
    "XrExE9yKIg1WjnnlVkGX",
    "ZQe5CZNOzWyzPSCn5a3c",
    "Zlb1dXrM653N07WRdFW3",
    "bVMeCyTHy58xNoL34h3p",
    "flq6f7yk4E4fJM5XTYuZ",
    "g5CIjZEefAph4nQFvHAz",
    "iP95p4xoKVk53GoZ742B",
    "jBpfuIE2acCO8z3wKNLl",
    "jsCqWAovK2LkecY7zXl4",
    "nPczCjzI2devNBz1zQrb",
    "oWAxZDx7w5VEj9dCyTzz",
    "onwK4e9ZLuTAKqWW03F9",
    "pFZP5JQG7iQjIQuC4Bku",
    "pMsXgVXv3BLzUgSXRplE",
    "pNInz6obpgDQGcFmaJgB",
    "piTKgcLEGmPE4e6mEKli",
    "pqHfZKP75CvOlQylNhV4",
    "t0jbNlBVZ17f02VDIeMI",
    "yoZ06aMxZJJ28mfd3POQ",
    "z9fAnlkpzviPz146aGWa",
    "zcAOhNBS3c14rBihAFp1",
    "zrHiDhphv9ZnVXBqCLjz",
]

eleven_name = [
    "Rachel",
    "Drew",
    "Clyde",
    "Paul",
    "Domi",
    "Dave",
    "Fin",
    "Sarah",
    "Antoni",
    "Thomas",
    "Charlie",
    "George",
    "Emily",
    "Elli",
    "Callum",
    "Patrick",
    "Harry",
    "Liam",
    "Dorothy",
    "Josh",
    "Arnold",
    "Charlotte",
    "Alice",
    "Matilda",
    "James",
    "Joseph",
    "Jeremy",
    "Michael",
    "Ethan",
    "Chris",
    "Gigi",
    "Freya",
    "Brian",
    "Grace",
    "Daniel",
    "Lily",
    "Serena",
    "Adam",
    "Nicole",
    "Bill",
    "Jessie",
    "Sam",
    "Glinda",
    "Giovanni",
    "Mimi",
]

# Map human-readable voice names to ElevenLabs voice IDs.
eleven_id_model_name_dict = dict(zip(eleven_name, eleven_voice_id))


def openai(text, name):
    """Call the OpenAI text-to-speech endpoint and return the audio as a file path."""
    headers = {
        # The API key is read from the environment (OPENAI_API_KEY).
        "Authorization": "Bearer " + os.environ.get("OPENAI_API_KEY", ""),
        "Content-Type": "application/json",
    }
    json_data = {
        "model": "tts-1-hd",
        "input": text,
        "voice": name,
    }
    response = requests.post(
        "https://api.openai.com/v1/audio/speech", headers=headers, json=json_data
    )
    # The endpoint returns MP3 bytes; write them to a file so gr.Audio can
    # play them. (Interpreting the raw MP3 bytes as PCM samples would not work.)
    timestamp = int(time.time() * 10000)
    path = f"/tmp/openai_{timestamp}.mp3"
    with open(path, "wb") as f:
        f.write(response.content)
    return "Success", path
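# A minimal usage sketch for openai() above, assuming the OPENAI_API_KEY
# environment variable is set (the text and voice below are illustrative):
#
#   msg, path = openai("Hello from the demo.", "alloy")
#   print(msg, path)  # "Success", /tmp/openai_<timestamp>.mp3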
def elevenlabs(text, name):
    """Call the ElevenLabs text-to-speech endpoint and return the audio as a file path."""
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{eleven_id_model_name_dict[name]}"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        # The API key is read from the environment (ELEVEN_API_KEY).
        "xi-api-key": os.environ.get("ELEVEN_API_KEY", ""),
    }
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5,
        },
    }
    response = requests.post(url, json=data, headers=headers)
    # Write the MP3 bytes to a file; gr.Audio cannot play a raw
    # requests.Response object.
    timestamp = int(time.time() * 10000)
    path = f"/tmp/eleven_{timestamp}.mp3"
    with open(path, "wb") as f:
        f.write(response.content)
    return "Success", path


def microsoft(text, name, style="Neural"):
    """Call the Azure (Microsoft) text-to-speech endpoint and return the audio as a file path.

    :param text: text to synthesize
    :param name: Azure voice name, e.g. "en-US-JennyMultilingualNeural"
    :param style: currently unused; kept for API compatibility
    """
    headers = {
        # The subscription key is read from the environment (AZURE_SPEECH_KEY).
        "Ocp-Apim-Subscription-Key": os.environ.get("AZURE_SPEECH_KEY", ""),
        "Content-Type": "application/ssml+xml",
        "X-Microsoft-OutputFormat": "audio-16khz-128kbitrate-mono-mp3",
        "User-Agent": "curl",
    }
    # SSML request body (the XML tags were lost in extraction and are
    # reconstructed here).
    data = (
        "<speak version='1.0' xml:lang='en-US'>"
        f"<voice xml:lang='en-US' name='{name}'>"  # xml:gender='Female'
        f"{text}"
        "</voice>"
        "</speak>"
    )
    response = requests.post(
        "https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1",
        headers=headers,
        data=data.encode("utf-8"),
    )
    timestamp = int(time.time() * 10000)
    path = f"/tmp/output_{timestamp}.mp3"  # TODO: disk might fill up.
    with open(path, "wb") as f:
        f.write(response.content)
    return "Success", path


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--share", action="store_true", default=True, help="share gradio app")
    parser.add_argument("--port", type=int, default=8081, help="port")
    parser.add_argument("--model_info_path", type=str, default="/gluster/speech_data/info.json")
    args = parser.parse_args()

    app = gr.Blocks()
    with app:
        gr.Markdown("## English TTS Demo")
        with gr.Tabs():
            with gr.TabItem("11Labs"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(eleven_name, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(elevenlabs, [tts_input1, tts_input2], [tts_output1, tts_output2])
            with gr.TabItem("Microsoft"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(microsoft, [tts_input1, tts_input2], [tts_output1, tts_output2])
            with gr.TabItem("OpenAI"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(openai_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(openai, [tts_input1, tts_input2], [tts_output1, tts_output2])

    app.queue(max_size=10)
    app.launch(share=args.share, server_port=args.port)
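# Quick smoke test without launching the UI (a sketch; assumes the
# AZURE_SPEECH_KEY environment variable is set):
#
#   _, path = microsoft(all_example, microsoft_model_list[0])
#   print(path)  # /tmp/output_<timestamp>.mp3
#
# All three backends read their keys from the environment:
#   export OPENAI_API_KEY=...      # used by openai()
#   export ELEVEN_API_KEY=...      # used by elevenlabs()
#   export AZURE_SPEECH_KEY=...    # used by microsoft()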