import os
from flask import Flask, request, jsonify, send_file, Response
import torch
import torchaudio
import librosa
import yaml
import numpy as np
from pydub import AudioSegment
from modules.commons import build_model, load_checkpoint, recursive_munch
from hf_utils import load_custom_model_from_hf
from modules.campplus.DTDNN import CAMPPlus
from modules.bigvgan import bigvgan
from transformers import AutoFeatureExtractor, WhisperModel
from modules.audio import mel_spectrogram
from modules.rmvpe import RMVPE
from io import BytesIO
# Initialize Flask app
app = Flask(__name__)
# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load model and configuration (same as in the original code)
dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
    "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
    "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
config = yaml.safe_load(open(dit_config_path, 'r'))
model_params = recursive_munch(config['model_params'])
model = build_model(model_params, stage='DiT')
hop_length = config['preprocess_params']['spect_params']['hop_length']
sr = config['preprocess_params']['sr']
# Load checkpoints
model, _, _, _ = load_checkpoint(model, None, dit_checkpoint_path,
                                 load_only_params=True, ignore_modules=[], is_distributed=False)
for key in model:
    model[key].eval()
    model[key].to(device)
model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
# Load additional models
campplus_ckpt_path = load_custom_model_from_hf("funasr/campplus", "campplus_cn_common.bin", config_filename=None)
campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
campplus_model.eval()
campplus_model.to(device)
bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
bigvgan_model.remove_weight_norm()
bigvgan_model = bigvgan_model.eval().to(device)
whisper_name = getattr(model_params.speech_tokenizer, 'whisper_name', "openai/whisper-small")
whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
# Only the encoder is needed for content feature extraction; drop the decoder to save memory
del whisper_model.decoder
whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
# f0 conditioned model
dit_checkpoint_path_f0, dit_config_path_f0 = load_custom_model_from_hf("Plachta/Seed-VC",
    "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
    "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")
config_f0 = yaml.safe_load(open(dit_config_path_f0, 'r'))
model_params_f0 = recursive_munch(config_f0['model_params'])
model_f0 = build_model(model_params_f0, stage='DiT')
hop_length_f0 = config_f0['preprocess_params']['spect_params']['hop_length']
sr_f0 = config_f0['preprocess_params']['sr']
# Load checkpoints for f0 model
model_f0, _, _, _ = load_checkpoint(model_f0, None, dit_checkpoint_path_f0,
                                    load_only_params=True, ignore_modules=[], is_distributed=False)
for key in model_f0:
    model_f0[key].eval()
    model_f0[key].to(device)
model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
# F0 extractor
model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
rmvpe = RMVPE(model_path, is_half=False, device=device)
# Define Mel spectrogram conversion
def to_mel(x):
    mel_fn_args = {
        "n_fft": config['preprocess_params']['spect_params']['n_fft'],
        "win_size": config['preprocess_params']['spect_params']['win_length'],
        "hop_size": config['preprocess_params']['spect_params']['hop_length'],
        "num_mels": config['preprocess_params']['spect_params']['n_mels'],
        "sampling_rate": sr,
        "fmin": 0,
        "fmax": None,
        "center": False
    }
    return mel_spectrogram(x, **mel_fn_args)
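# Note on to_mel: each mel frame covers hop_size input samples, so an N-sample input
# yields roughly N // hop_size frames.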
def adjust_f0_semitones(f0_sequence, n_semitones):
    factor = 2 ** (n_semitones / 12)
    return f0_sequence * factor
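# For example, a shift of +12 semitones doubles every f0 value (2 ** (12 / 12) == 2),
# so adjust_f0_semitones(np.array([110.0, 220.0]), 12) gives array([220., 440.]).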
def crossfade(chunk1, chunk2, overlap):
    fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
    fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
    return chunk2
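# Illustrative sketch, not used by the /convert route below: when audio is generated in
# chunks, each new chunk can be blended into the running output over `overlap_samples`
# samples with the crossfade above (the two cosine-squared fades sum to one across the
# seam). The helper name and arguments here are assumptions, not part of the original API.
def stitch_chunks(chunks, overlap_samples):
    output = chunks[0]
    for chunk in chunks[1:]:
        blended = crossfade(output, chunk.copy(), overlap_samples)
        output = np.concatenate([output[:-overlap_samples], blended])
    return output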
# Define the Flask route for voice conversion
@app.route('/convert', methods=['POST'])
def voice_conversion_api():
    # Get the input files and parameters from the request
    source = request.files['source']
    target = request.files['target']
    diffusion_steps = int(request.form['diffusion_steps'])
    length_adjust = float(request.form['length_adjust'])
    inference_cfg_rate = float(request.form['inference_cfg_rate'])
    # Form values arrive as strings, so compare against "true" instead of calling bool(),
    # which would return True for any non-empty string, including "false"
    f0_condition = request.form['f0_condition'].lower() in ('true', '1')
    auto_f0_adjust = request.form['auto_f0_adjust'].lower() in ('true', '1')
    pitch_shift = int(request.form['pitch_shift'])
    # Read source and target audio
    source_audio = librosa.load(source, sr=sr)[0]
    ref_audio = librosa.load(target, sr=sr)[0]
    # Move audio to the device; the reference is capped at 25 seconds
    source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
    ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)
    # Resample to 16 kHz for the content and speaker-embedding models (same as the original logic)
    ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
    converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
    # You can add further processing and generation logic here (same as the original code)
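    # Hedged sketch of the kind of feature extraction the full pipeline performs before
    # generation (illustrative only; the variable names below are assumptions and the
    # actual synthesis step is still omitted, as noted above):
    # 1) Whisper encoder features of the 16 kHz source act as content features.
    alt_inputs = whisper_feature_extractor([converted_waves_16k.squeeze(0).cpu().numpy()],
                                           return_tensors="pt", sampling_rate=16000)
    content_features = whisper_model.encoder(
        alt_inputs.input_features.to(device).to(torch.float16)).last_hidden_state
    # 2) Mel spectrogram of the reference serves as the acoustic prompt.
    ref_mel = to_mel(ref_audio)
    # 3) A CAMPPlus speaker embedding of the reference conditions the target timbre.
    ref_fbank = torchaudio.compliance.kaldi.fbank(ref_waves_16k, num_mel_bins=80,
                                                  sample_frequency=16000, dither=0)
    ref_fbank = ref_fbank - ref_fbank.mean(dim=0, keepdim=True)
    style_embedding = campplus_model(ref_fbank.unsqueeze(0))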
    # At the end, create the output (this is just an example, adapt based on the real output)
    output_wave = np.random.randn(sr * 10)  # Replace with the actual generated wave
    output_wave = (output_wave * 32768.0).astype(np.int16)
    # Convert to MP3 and send the response
    mp3_file = BytesIO()
    AudioSegment(
        output_wave.tobytes(), frame_rate=sr,
        sample_width=output_wave.dtype.itemsize, channels=1
    ).export(mp3_file, format="mp3", bitrate="320k")
    mp3_file.seek(0)  # Ensure the stream is at the beginning
    return send_file(mp3_file, mimetype="audio/mpeg", as_attachment=True, download_name="converted_audio.mp3")
if __name__ == "__main__":
    # Run the Flask app
    app.run(host='0.0.0.0', debug=True, port=7860)
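# Example request (hypothetical file names; any audio readable by librosa works):
#   curl -X POST http://localhost:7860/convert \
#       -F source=@source.wav -F target=@reference.wav \
#       -F diffusion_steps=30 -F length_adjust=1.0 -F inference_cfg_rate=0.7 \
#       -F f0_condition=false -F auto_f0_adjust=true -F pitch_shift=0 \
#       -o converted_audio.mp3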