SayaSS committed
Commit 247e955
Parent: 17b78ec
Files changed (3):
  1. app-slice.py +35 -11
  2. app.py +54 -11
  3. requirements.txt +1 -0
app-slice.py CHANGED
@@ -1,7 +1,6 @@
 import os
 import gradio as gr
-import librosa
-import numpy as np
+import edge_tts
 from pathlib import Path
 import inference.infer_tool as infer_tool
 import utils
@@ -9,6 +8,8 @@ from inference.infer_tool import Svc
 import logging
 import webbrowser
 import argparse
+import asyncio
+import librosa
 import soundfile
 import gradio.processing_utils as gr_processing_utils
 logging.getLogger('numba').setLevel(logging.WARNING)
@@ -29,14 +30,24 @@ def audio_postprocess(self, y):
 
 gr.Audio.postprocess = audio_postprocess
 def create_vc_fn(model, sid):
-    def vc_fn(input_audio, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds):
-        if input_audio is None:
-            return "You need to select an audio", None
-        raw_audio_path = f"raw/{input_audio}"
-        if "." not in raw_audio_path:
-            raw_audio_path += ".wav"
-        infer_tool.format_wav(raw_audio_path)
-        wav_path = Path(raw_audio_path).with_suffix('.wav')
+    def vc_fn(input_audio, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds, tts_text, tts_voice, tts_mode):
+        if tts_mode:
+            if len(tts_text) > 100 and limitation:
+                return "Text is too long", None
+            if tts_text is None or tts_voice is None:
+                return "You need to enter text and select a voice", None
+            asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
+            audio, sr = librosa.load("tts.mp3")
+            soundfile.write("tts.wav", audio, 24000, format="wav")
+            wav_path = "tts.wav"
+        else:
+            if input_audio is None:
+                return "You need to select an audio", None
+            raw_audio_path = f"raw/{input_audio}"
+            if "." not in raw_audio_path:
+                raw_audio_path += ".wav"
+            infer_tool.format_wav(raw_audio_path)
+            wav_path = Path(raw_audio_path).with_suffix('.wav')
         _audio = model.slice_inference(
             wav_path, sid, vc_transform, slice_db,
             cluster_infer_ratio=0,
@@ -50,6 +61,11 @@ def create_vc_fn(model, sid):
 def refresh_raw_wav():
     return gr.Dropdown.update(choices=os.listdir("raw"))
 
+def change_to_tts_mode(tts_mode):
+    if tts_mode:
+        return gr.Audio.update(visible=False), gr.Button.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True)
+    else:
+        return gr.Audio.update(visible=True), gr.Button.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -60,6 +76,10 @@ if __name__ == '__main__':
     args = parser.parse_args()
     hubert_model = utils.get_hubert_model().to(args.device)
     models = []
+    voices = []
+    tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
+    for r in tts_voice_list:
+        voices.append(f"{r['ShortName']}-{r['Gender']}")
     raw = os.listdir("raw")
     for f in os.listdir("models"):
         name = f
@@ -100,12 +120,16 @@ if __name__ == '__main__':
                     noise_scale = gr.Number(label="noise_scale", value=0.4)
                     pad_seconds = gr.Number(label="pad_seconds", value=0.5)
                     auto_f0 = gr.Checkbox(label="auto_f0", value=False)
+                    tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
+                    tts_text = gr.Textbox(visible=False, label="TTS text (100 words limitation)" if limitation else "TTS text")
+                    tts_voice = gr.Dropdown(choices=voices, visible=False)
                     vc_submit = gr.Button("Generate", variant="primary")
                 with gr.Column():
                     vc_output1 = gr.Textbox(label="Output Message")
                     vc_output2 = gr.Audio(label="Output Audio")
-            vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds], [vc_output1, vc_output2])
+            vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds, tts_text, tts_voice, tts_mode], [vc_output1, vc_output2])
             vc_refresh.click(refresh_raw_wav, [], [vc_input])
+            tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, vc_refresh, tts_text, tts_voice])
     if args.colab:
         webbrowser.open("http://127.0.0.1:7860")
     app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
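
The TTS branch added to vc_fn chains edge-tts, librosa, and soundfile before handing the result to slice_inference. A minimal standalone sketch of that pipeline, assuming only that edge-tts, librosa, and soundfile are installed; the voice name below is illustrative, and in the app it is recovered from the dropdown label by stripping the trailing "-Gender" suffix with "-".join(tts_voice.split('-')[:-1]):

    import asyncio
    import edge_tts
    import librosa
    import soundfile

    # Synthesize speech to mp3, then rewrite it as wav for the model.
    asyncio.run(edge_tts.Communicate("Hello world", "en-US-AriaNeural").save("tts.mp3"))
    audio, sr = librosa.load("tts.mp3")  # librosa's default resamples to 22050 Hz
    # Note: the commit writes the wav header at 24000 Hz without resampling,
    # which plays the 22050 Hz samples slightly fast; loading with
    # librosa.load("tts.mp3", sr=24000) would keep pitch and speed intact.
    soundfile.write("tts.wav", audio, 24000, format="wav")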
app.py CHANGED
@@ -7,7 +7,9 @@ import utils
 from inference.infer_tool import Svc
 import logging
 import soundfile
+import asyncio
 import argparse
+import edge_tts
 import gradio.processing_utils as gr_processing_utils
 logging.getLogger('numba').setLevel(logging.WARNING)
 logging.getLogger('markdown_it').setLevel(logging.WARNING)
@@ -27,7 +29,21 @@ def audio_postprocess(self, y):
 
 gr.Audio.postprocess = audio_postprocess
 def create_vc_fn(model, sid):
-    def vc_fn(input_audio, vc_transform, auto_f0):
+    def vc_fn(input_audio, vc_transform, auto_f0, tts_text, tts_voice, tts_mode):
+        if tts_mode:
+            if len(tts_text) > 100 and limitation:
+                return "Text is too long", None
+            if tts_text is None or tts_voice is None:
+                return "You need to enter text and select a voice", None
+            asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
+            audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
+            raw_path = io.BytesIO()
+            soundfile.write(raw_path, audio, 16000, format="wav")
+            raw_path.seek(0)
+            out_audio, out_sr = model.infer(sid, vc_transform, raw_path,
+                                            auto_predict_f0=auto_f0,
+                                            )
+            return "Success", (44100, out_audio.cpu().numpy())
         if input_audio is None:
             return "You need to upload an audio", None
         sampling_rate, audio = input_audio
@@ -48,6 +64,12 @@ def create_vc_fn(model, sid):
         return "Success", (44100, out_audio.cpu().numpy())
     return vc_fn
 
+def change_to_tts_mode(tts_mode):
+    if tts_mode:
+        return gr.Audio.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True), gr.Checkbox.update(value=True)
+    else:
+        return gr.Audio.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False), gr.Checkbox.update(value=False)
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--device', type=str, default='cpu')
@@ -56,6 +78,16 @@ if __name__ == '__main__':
     args = parser.parse_args()
     hubert_model = utils.get_hubert_model().to(args.device)
     models = []
+    others = {
+        "rudolf": "https://huggingface.co/spaces/sayashi/sovits-rudolf",
+        "teio": "https://huggingface.co/spaces/sayashi/sovits-teio",
+        "goldship": "https://huggingface.co/spaces/sayashi/sovits-goldship",
+        "tannhauser": "https://huggingface.co/spaces/sayashi/sovits-tannhauser"
+    }
+    voices = []
+    tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
+    for r in tts_voice_list:
+        voices.append(f"{r['ShortName']}-{r['Gender']}")
     for f in os.listdir("models"):
         name = f
         model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device, hubert_model=hubert_model)
@@ -66,14 +98,9 @@ if __name__ == '__main__':
         "# <center> Sovits Models\n"
         "## <center> The input audio should be clean and pure voice without background music.\n"
         "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=sayashi.Sovits-Umamusume)\n\n"
-        "[Open In Colab](https://colab.research.google.com/drive/1wfsBbMzmtLflOJeqc5ZnJiLY7L239hJW?usp=share_link)"
-        " without queue and length limitation.\n\n"
-        "[Original Repo](https://github.com/svc-develop-team/so-vits-svc)\n\n"
-        "Other models:\n"
-        "[rudolf](https://huggingface.co/spaces/sayashi/sovits-rudolf)\n"
-        "[teio](https://huggingface.co/spaces/sayashi/sovits-teio)\n"
-        "[goldship](https://huggingface.co/spaces/sayashi/sovits-goldship)\n"
-        "[tannhauser](https://huggingface.co/spaces/sayashi/sovits-tannhauser)\n"
+        "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wfsBbMzmtLflOJeqc5ZnJiLY7L239hJW?usp=share_link)\n\n"
+        "[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/sayashi/sovits-models?duplicate=true)\n\n"
+        "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/svc-develop-team/so-vits-svc)"
 
     )
     with gr.Tabs():
@@ -90,9 +117,25 @@ if __name__ == '__main__':
                         vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
                         vc_transform = gr.Number(label="vc_transform", value=0)
                         auto_f0 = gr.Checkbox(label="auto_f0", value=False)
+                        tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
+                        tts_text = gr.Textbox(visible=False, label="TTS text (100 words limitation)" if limitation else "TTS text")
+                        tts_voice = gr.Dropdown(choices=voices, visible=False)
                         vc_submit = gr.Button("Generate", variant="primary")
                     with gr.Column():
                         vc_output1 = gr.Textbox(label="Output Message")
                         vc_output2 = gr.Audio(label="Output Audio")
-                vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0], [vc_output1, vc_output2])
-    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
+                vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0, tts_text, tts_voice, tts_mode], [vc_output1, vc_output2])
+                tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice, auto_f0])
+        for category, link in others.items():
+            with gr.TabItem(category):
+                gr.Markdown(
+                    f'''
+                    <center>
+                      <h2>Click to Go</h2>
+                      <a href="{link}">
+                        <img src="https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-xl-dark.svg">
+                      </a>
+                    </center>
+                    '''
+                )
+    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
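
Both apps toggle the UI through a change_to_tts_mode callback wired via tts_mode.change. A self-contained sketch of that pattern follows; component labels are illustrative, and it assumes Gradio 3.x, where the static gr.Audio.update / gr.Textbox.update / gr.Dropdown.update methods used in this commit still exist (Gradio 4 replaced them with gr.update):

    import gradio as gr

    def change_to_tts_mode(tts_mode):
        # Return one update object per output component, in the same order
        # as the `outputs` list passed to .change() below.
        if tts_mode:
            return (gr.Audio.update(visible=False),
                    gr.Textbox.update(visible=True),
                    gr.Dropdown.update(visible=True))
        return (gr.Audio.update(visible=True),
                gr.Textbox.update(visible=False),
                gr.Dropdown.update(visible=False))

    with gr.Blocks() as demo:
        tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
        vc_input = gr.Audio(label="Input audio")
        tts_text = gr.Textbox(visible=False, label="TTS text")
        tts_voice = gr.Dropdown(choices=["en-US-AriaNeural-Female"], visible=False)
        # Ticking the checkbox swaps the audio input for the TTS controls.
        tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])

    demo.launch()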
requirements.txt CHANGED
@@ -19,3 +19,4 @@ onnxsim
 onnxoptimizer
 fairseq
 librosa
+edge-tts
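
The single new dependency, edge-tts, is what both apps call at startup to populate the voice dropdown. A minimal sketch of that enumeration, assuming pip install edge-tts and the async list_voices API the commit itself uses:

    import asyncio
    import edge_tts

    # Build "ShortName-Gender" labels, mirroring the startup loop in both apps.
    voices = asyncio.run(edge_tts.list_voices())
    labels = [f"{v['ShortName']}-{v['Gender']}" for v in voices]
    print(labels[:3])  # e.g. ['af-ZA-AdriNeural-Female', ...]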