ArkanDash committed on
Commit
d3873a6
·
1 Parent(s): 4d12c76

feat(app): added support for direct upload on Google Colab

Browse files
.gitattributes CHANGED
@@ -32,35 +32,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
- weights/alice/added_IVF141_Flat_nprobe_4.index filter=lfs diff=lfs merge=lfs -text
36
- weights/ayaka-jp/added_IVF1830_Flat_nprobe_9.index filter=lfs diff=lfs merge=lfs -text
37
- weights/nilou-zh/added_IVF1939_Flat_nprobe_9.index filter=lfs diff=lfs merge=lfs -text
38
- weights/teio/added_IVF3421_Flat_nprobe_11.index filter=lfs diff=lfs merge=lfs -text
39
- weights/ayaka-jp/added_IVF415_Flat_nprobe_6.index filter=lfs diff=lfs merge=lfs -text
40
- weights/klee-jp/added_IVF282_Flat_nprobe_5.index filter=lfs diff=lfs merge=lfs -text
41
- weights/klee-jp/cover.png filter=lfs diff=lfs merge=lfs -text
42
- weights/nahida-jp/added_IVF265_Flat_nprobe_5.index filter=lfs diff=lfs merge=lfs -text
43
- weights/nahida-jp/cover.png filter=lfs diff=lfs merge=lfs -text
44
- weights/hutao-jp/added_IVF265_Flat_nprobe_5.index filter=lfs diff=lfs merge=lfs -text
45
- weights/hutao-jp/cover.png filter=lfs diff=lfs merge=lfs -text
46
- weights/raiden-jp/added_IVF783_Flat_nprobe_7.index filter=lfs diff=lfs merge=lfs -text
47
- weights/raiden-jp/cover.png filter=lfs diff=lfs merge=lfs -text
48
- weights/kazuha-jp/added_IVF677_Flat_nprobe_7.index filter=lfs diff=lfs merge=lfs -text
49
- weights/kazuha-jp/cover.png filter=lfs diff=lfs merge=lfs -text
50
- weights/childe-jp/added_IVF428_Flat_nprobe_6.index filter=lfs diff=lfs merge=lfs -text
51
- weights/childe-jp/cover.png filter=lfs diff=lfs merge=lfs -text
52
- weights/noah/added_IVF467_Flat_nprobe_6.index filter=lfs diff=lfs merge=lfs -text
53
- weights/noah/cover.jpeg filter=lfs diff=lfs merge=lfs -text
54
- weights/rie/added_IVF325_Flat_nprobe_5.index filter=lfs diff=lfs merge=lfs -text
55
- weights/rie/cover.png filter=lfs diff=lfs merge=lfs -text
56
- weights/wanderer-jp/added_IVF128_Flat_nprobe_4.index filter=lfs diff=lfs merge=lfs -text
57
- weights/wanderer-jp/cover.png filter=lfs diff=lfs merge=lfs -text
58
- weights/xiao-jp/cover.png filter=lfs diff=lfs merge=lfs -text
59
- weights/xiao-jp/added_IVF233_Flat_nprobe_5.index filter=lfs diff=lfs merge=lfs -text
60
- weights/anji/added_IVF198_Flat_nprobe_4.index filter=lfs diff=lfs merge=lfs -text
61
- weights/anji/cover.png filter=lfs diff=lfs merge=lfs -text
62
- weights/ariana/added_IVF133_Flat_nprobe_4.index filter=lfs diff=lfs merge=lfs -text
63
- weights/ariana/cover.png filter=lfs diff=lfs merge=lfs -text
64
- weights/once/added_IVF229_Flat_nprobe_5.index filter=lfs diff=lfs merge=lfs -text
65
- weights/once/cover.png filter=lfs diff=lfs merge=lfs -text
66
  *.index filter=lfs diff=lfs merge=lfs -text
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  *.index filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.jpg filter=lfs diff=lfs merge=lfs -text
app-full.py CHANGED
@@ -29,6 +29,8 @@ limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingfac
29
  def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
30
  def vc_fn(
31
  input_audio,
 
 
32
  f0_up_key,
33
  f0_method,
34
  index_rate,
@@ -45,20 +47,18 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
45
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
46
  audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
47
  else:
48
- if args.files:
49
- audio, sr = librosa.load(input_audio, sr=16000, mono=True)
50
- else:
51
  if input_audio is None:
52
  return "You need to upload an audio", None
53
- sampling_rate, audio = input_audio
54
  duration = audio.shape[0] / sampling_rate
55
- if duration > 20 and limitation:
56
- return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
57
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
58
  if len(audio.shape) > 1:
59
  audio = librosa.to_mono(audio.transpose(1, 0))
60
  if sampling_rate != 16000:
61
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
 
 
62
  times = [0, 0, 0]
63
  f0_up_key = int(f0_up_key)
64
  audio_opt = vc.pipeline(
@@ -86,31 +86,31 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
86
 
87
  def cut_vocal_and_inst(yt_url):
88
  if yt_url != "":
89
- if not os.path.exists("/content/youtube_audio"):
90
- os.mkdir("/content/youtube_audio")
91
  ydl_opts = {
92
  'format': 'bestaudio/best',
93
  'postprocessors': [{
94
  'key': 'FFmpegExtractAudio',
95
  'preferredcodec': 'wav',
96
  }],
97
- "outtmpl": '/content/youtube_audio/audio',
98
  }
99
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
100
  ydl.download([yt_url])
101
- yt_audio_path = "/content/youtube_audio/audio.wav"
102
  command = f"demucs --two-stems=vocals {yt_audio_path}"
103
  result = subprocess.run(command.split(), stdout=subprocess.PIPE)
104
  print(result.stdout.decode())
105
- return ("/content/rvc-models/separated/htdemucs/audio/vocals.wav", "/content/rvc-models/separated/htdemucs/audio/no_vocals.wav", yt_audio_path, "/content/rvc-models/separated/htdemucs/audio/vocals.wav")
106
 
107
  def combine_vocal_and_inst(audio_data, audio_volume):
108
  print(audio_data)
109
- if not os.path.exists("/content/result"):
110
- os.mkdir("/content/result")
111
- vocal_path = "/content/result/output.wav"
112
- inst_path = "/content/rvc-models/separated/htdemucs/audio/no_vocals.wav"
113
- output_path = "/content/result/combine.mp3"
114
  with wave.open(vocal_path, "w") as wave_file:
115
  wave_file.setnchannels(1)
116
  wave_file.setsampwidth(2)
@@ -140,11 +140,16 @@ def change_to_tts_mode(tts_mode):
140
  else:
141
  return gr.Audio.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
142
 
 
 
 
 
 
 
143
  if __name__ == '__main__':
144
  parser = argparse.ArgumentParser()
145
  parser.add_argument('--api', action="store_true", default=False)
146
- parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
147
- parser.add_argument("--files", action="store_true", default=False, help="load audio from path")
148
  args, unknown = parser.parse_known_args()
149
  load_hubert()
150
  models = []
@@ -182,7 +187,6 @@ if __name__ == '__main__':
182
  "# <center> RVC Models\n"
183
  "## <center> The input audio should be clean and pure voice without background music.\n"
184
  "### <center> More feature will be added soon... \n"
185
- "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=ArkanDash.Rvc-Models)\n\n"
186
  "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1hx6kKvIuv5XNY1Gai2PEuZhpO5z6xpVh?usp=sharing)\n\n"
187
  "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
188
  )
@@ -198,18 +202,16 @@ if __name__ == '__main__':
198
  '</div>'
199
  )
200
  with gr.Row():
201
- if args.files:
202
- with gr.Column():
203
- vc_youtube = gr.Textbox(label="Youtube URL")
204
- vc_convert = gr.Button("Convert", variant="primary")
205
- vc_vocal_preview = gr.Audio(label="Vocal Preview")
206
- vc_inst_preview = gr.Audio(label="Instrumental Preview")
207
- vc_audio_preview = gr.Audio(label="Audio Preview")
208
  with gr.Column():
209
- if args.files:
210
- vc_input = gr.Textbox(label="Input audio path")
211
- else:
212
- vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
 
 
 
 
 
213
  vc_transpose = gr.Number(label="Transpose", value=0)
214
  vc_f0method = gr.Radio(
215
  label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
@@ -227,24 +229,23 @@ if __name__ == '__main__':
227
  tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
228
  tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
229
  tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
230
- vc_submit = gr.Button("Generate", variant="primary")
231
  vc_output1 = gr.Textbox(label="Output Message")
232
  vc_output2 = gr.Audio(label="Output Audio")
233
- if args.files:
234
- with gr.Column():
235
- vc_volume = gr.Slider(
236
- minimum=0,
237
- maximum=10,
238
- label="Vocal volume",
239
- value=5,
240
- interactive=True,
241
- step=1
242
- )
243
- vc_outputCombine = gr.Audio(label="Output Combined Audio")
244
- vc_combine = gr.Button("Combine",variant="primary")
245
- vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
246
- tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
247
- if args.files:
248
- vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
249
- vc_combine.click(combine_vocal_and_inst, [vc_output2, vc_volume], vc_outputCombine)
250
- app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.share)
 
29
  def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
30
  def vc_fn(
31
  input_audio,
32
+ upload_audio,
33
+ upload_mode,
34
  f0_up_key,
35
  f0_method,
36
  index_rate,
 
47
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
48
  audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
49
  else:
50
+ if upload_mode:
 
 
51
  if input_audio is None:
52
  return "You need to upload an audio", None
53
+ sampling_rate, audio = upload_audio
54
  duration = audio.shape[0] / sampling_rate
 
 
55
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
56
  if len(audio.shape) > 1:
57
  audio = librosa.to_mono(audio.transpose(1, 0))
58
  if sampling_rate != 16000:
59
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
60
+ else:
61
+ audio, sr = librosa.load(input_audio, sr=16000, mono=True)
62
  times = [0, 0, 0]
63
  f0_up_key = int(f0_up_key)
64
  audio_opt = vc.pipeline(
 
86
 
87
  def cut_vocal_and_inst(yt_url):
88
  if yt_url != "":
89
+ if not os.path.exists("youtube_audio"):
90
+ os.mkdir("youtube_audio")
91
  ydl_opts = {
92
  'format': 'bestaudio/best',
93
  'postprocessors': [{
94
  'key': 'FFmpegExtractAudio',
95
  'preferredcodec': 'wav',
96
  }],
97
+ "outtmpl": 'youtube_audio/audio',
98
  }
99
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
100
  ydl.download([yt_url])
101
+ yt_audio_path = "youtube_audio/audio.wav"
102
  command = f"demucs --two-stems=vocals {yt_audio_path}"
103
  result = subprocess.run(command.split(), stdout=subprocess.PIPE)
104
  print(result.stdout.decode())
105
+ return ("separated/htdemucs/audio/vocals.wav", "separated/htdemucs/audio/no_vocals.wav", yt_audio_path, "separated/htdemucs/audio/vocals.wav")
106
 
107
  def combine_vocal_and_inst(audio_data, audio_volume):
108
  print(audio_data)
109
+ if not os.path.exists("result"):
110
+ os.mkdir("result")
111
+ vocal_path = "result/output.wav"
112
+ inst_path = "separated/htdemucs/audio/no_vocals.wav"
113
+ output_path = "result/combine.mp3"
114
  with wave.open(vocal_path, "w") as wave_file:
115
  wave_file.setnchannels(1)
116
  wave_file.setsampwidth(2)
 
140
  else:
141
  return gr.Audio.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
142
 
143
+ def change_to_upload_mode(upload_mode):
144
+ if upload_mode:
145
+ return gr.Textbox().update(visible=False), gr.Audio().update(visible=True)
146
+ else:
147
+ return gr.Textbox().update(visible=True), gr.Audio().update(visible=False)
148
+
149
  if __name__ == '__main__':
150
  parser = argparse.ArgumentParser()
151
  parser.add_argument('--api', action="store_true", default=False)
152
+ parser.add_argument("--colab", action="store_true", default=False, help="share gradio app")
 
153
  args, unknown = parser.parse_known_args()
154
  load_hubert()
155
  models = []
 
187
  "# <center> RVC Models\n"
188
  "## <center> The input audio should be clean and pure voice without background music.\n"
189
  "### <center> More feature will be added soon... \n"
 
190
  "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1hx6kKvIuv5XNY1Gai2PEuZhpO5z6xpVh?usp=sharing)\n\n"
191
  "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
192
  )
 
202
  '</div>'
203
  )
204
  with gr.Row():
 
 
 
 
 
 
 
205
  with gr.Column():
206
+ vc_youtube = gr.Textbox(label="Youtube URL")
207
+ vc_convert = gr.Button("Convert", variant="primary")
208
+ vc_vocal_preview = gr.Audio(label="Vocal Preview")
209
+ vc_inst_preview = gr.Audio(label="Instrumental Preview")
210
+ vc_audio_preview = gr.Audio(label="Audio Preview")
211
+ with gr.Column():
212
+ vc_input = gr.Textbox(label="Input audio path")
213
+ vc_upload = gr.Audio(label="Upload audio file", visible=False, interactive=True)
214
+ upload_mode = gr.Checkbox(label="Upload mode", value=False)
215
  vc_transpose = gr.Number(label="Transpose", value=0)
216
  vc_f0method = gr.Radio(
217
  label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
 
229
  tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
230
  tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
231
  tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
 
232
  vc_output1 = gr.Textbox(label="Output Message")
233
  vc_output2 = gr.Audio(label="Output Audio")
234
+ vc_submit = gr.Button("Generate", variant="primary")
235
+ with gr.Column():
236
+ vc_volume = gr.Slider(
237
+ minimum=0,
238
+ maximum=10,
239
+ label="Vocal volume",
240
+ value=4,
241
+ interactive=True,
242
+ step=1
243
+ )
244
+ vc_outputCombine = gr.Audio(label="Output Combined Audio")
245
+ vc_combine = gr.Button("Combine",variant="primary")
246
+ vc_submit.click(vc_fn, [vc_input, vc_upload, upload_mode, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
247
+ vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
248
+ vc_combine.click(combine_vocal_and_inst, [vc_output2, vc_volume], vc_outputCombine)
249
+ tts_mode.change(change_to_tts_mode, [tts_mode, upload_mode], [vc_input, vc_upload, upload_mode, tts_text, tts_voice])
250
+ upload_mode.change(change_to_upload_mode, [upload_mode], [vc_input, vc_upload])
251
+ app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.colab)
app.py CHANGED
@@ -39,20 +39,17 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
39
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
40
  audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
41
  else:
42
- if args.files:
43
- audio, sr = librosa.load(input_audio, sr=16000, mono=True)
44
- else:
45
- if input_audio is None:
46
- return "You need to upload an audio", None
47
- sampling_rate, audio = input_audio
48
- duration = audio.shape[0] / sampling_rate
49
- if duration > 20 and limitation:
50
- return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
51
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
52
- if len(audio.shape) > 1:
53
- audio = librosa.to_mono(audio.transpose(1, 0))
54
- if sampling_rate != 16000:
55
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
56
  times = [0, 0, 0]
57
  f0_up_key = int(f0_up_key)
58
  audio_opt = vc.pipeline(
@@ -101,8 +98,7 @@ def change_to_tts_mode(tts_mode):
101
  if __name__ == '__main__':
102
  parser = argparse.ArgumentParser()
103
  parser.add_argument('--api', action="store_true", default=False)
104
- parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
105
- parser.add_argument("--files", action="store_true", default=False, help="load audio from path")
106
  args, unknown = parser.parse_known_args()
107
  load_hubert()
108
  models = []
@@ -140,8 +136,7 @@ if __name__ == '__main__':
140
  "# <center> RVC Models (Outdated)\n"
141
  "## <center> The input audio should be clean and pure voice without background music.\n"
142
  "### <center> Updated Repository: [NEW RVC Models](https://huggingface.co/spaces/ArkanDash/rvc-models-new).\n"
143
- "#### <center> Recommended to use the Google Colab version for more feature.\n"
144
- "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=ArkanDash.Rvc-Models)\n\n"
145
  "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1hx6kKvIuv5XNY1Gai2PEuZhpO5z6xpVh?usp=sharing)\n\n"
146
  "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
147
  )
@@ -158,10 +153,7 @@ if __name__ == '__main__':
158
  )
159
  with gr.Row():
160
  with gr.Column():
161
- if args.files:
162
- vc_input = gr.Textbox(label="Input audio path")
163
- else:
164
- vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
165
  vc_transpose = gr.Number(label="Transpose", value=0)
166
  vc_f0method = gr.Radio(
167
  label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
@@ -185,4 +177,4 @@ if __name__ == '__main__':
185
  vc_output2 = gr.Audio(label="Output Audio")
186
  vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
187
  tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
188
- app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.share)
 
39
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
40
  audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
41
  else:
42
+ if input_audio is None:
43
+ return "You need to upload an audio", None
44
+ sampling_rate, audio = input_audio
45
+ duration = audio.shape[0] / sampling_rate
46
+ if duration > 20 and limitation:
47
+ return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
48
+ audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
49
+ if len(audio.shape) > 1:
50
+ audio = librosa.to_mono(audio.transpose(1, 0))
51
+ if sampling_rate != 16000:
52
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
 
 
 
53
  times = [0, 0, 0]
54
  f0_up_key = int(f0_up_key)
55
  audio_opt = vc.pipeline(
 
98
  if __name__ == '__main__':
99
  parser = argparse.ArgumentParser()
100
  parser.add_argument('--api', action="store_true", default=False)
101
+ parser.add_argument("--colab", action="store_true", default=False, help="share gradio app")
 
102
  args, unknown = parser.parse_known_args()
103
  load_hubert()
104
  models = []
 
136
  "# <center> RVC Models (Outdated)\n"
137
  "## <center> The input audio should be clean and pure voice without background music.\n"
138
  "### <center> Updated Repository: [NEW RVC Models](https://huggingface.co/spaces/ArkanDash/rvc-models-new).\n"
139
+ "#### <center> [Recommended to use google colab for more features](https://colab.research.google.com/drive/1hx6kKvIuv5XNY1Gai2PEuZhpO5z6xpVh?usp=sharing)\n"
 
140
  "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1hx6kKvIuv5XNY1Gai2PEuZhpO5z6xpVh?usp=sharing)\n\n"
141
  "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
142
  )
 
153
  )
154
  with gr.Row():
155
  with gr.Column():
156
+ vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
 
 
 
157
  vc_transpose = gr.Number(label="Transpose", value=0)
158
  vc_f0method = gr.Radio(
159
  label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
 
177
  vc_output2 = gr.Audio(label="Output Audio")
178
  vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
179
  tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
180
+ app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.colab)
requirements-full.txt ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numba==0.56.4
2
+ numpy==1.23.5
3
+ scipy==1.9.3
4
+ librosa==0.9.2
5
+ llvmlite==0.39.0
6
+ fairseq==0.12.2
7
+ faiss-cpu==1.7.0; sys_platform == "darwin"
8
+ faiss-cpu==1.7.2; sys_platform != "darwin"
9
+ gradio
10
+ Cython
11
+ future>=0.18.3
12
+ pydub>=0.25.1
13
+ soundfile>=0.12.1
14
+ ffmpeg-python>=0.2.0
15
+ tensorboardX
16
+ functorch>=2.0.0
17
+ Jinja2>=3.1.2
18
+ json5>=0.9.11
19
+ Markdown
20
+ matplotlib>=3.7.1
21
+ matplotlib-inline>=0.1.6
22
+ praat-parselmouth>=0.4.3
23
+ Pillow>=9.1.1
24
+ pyworld>=0.3.2
25
+ resampy>=0.4.2
26
+ scikit-learn>=1.2.2
27
+ starlette>=0.26.1
28
+ tensorboard
29
+ tensorboard-data-server
30
+ tensorboard-plugin-wit
31
+ torchgen>=0.0.1
32
+ tqdm>=4.65.0
33
+ tornado>=6.2
34
+ Werkzeug>=2.2.3
35
+ uc-micro-py>=1.0.1
36
+ sympy>=1.11.1
37
+ tabulate>=0.9.0
38
+ PyYAML>=6.0
39
+ pyasn1>=0.4.8
40
+ pyasn1-modules>=0.2.8
41
+ fsspec>=2023.3.0
42
+ absl-py>=1.4.0
43
+ audioread
44
+ uvicorn>=0.21.1
45
+ colorama>=0.4.6
46
+ edge-tts
47
+ demucs
48
+ yt_dlp
49
+ ffmpeg
weights/ayaka-jp/cover.png CHANGED

Git LFS Details

  • SHA256: 1fe85d2c9895d4d0010660eb42ffa154edb7e0decc97f4444ba6009c69d029c0
  • Pointer size: 131 Bytes
  • Size of remote file: 838 kB
weights/nilou-jp/cover.png CHANGED

Git LFS Details

  • SHA256: f1b477f9aaa5837e8e979fd41e2e7e6cc52d61937539af69f6fc5c0537f5b524
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB