import gradio as gr
from share_btn import community_icon_html, loading_icon_html, share_js
import os
import shutil
import re
#from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile
from scipy.io.wavfile import write, read
from pydub import AudioSegment

file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
MAX_NUMBER_SENTENCES = 10

import json
with open("characters.json", "r") as file:
    data = json.load(file)
    characters = [
        {
            "image": item["image"],
            "title": item["title"],
            "speaker": item["speaker"]
        }
        for item in data
    ]

from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)


def cut_wav(input_path, max_duration):
    # Load the WAV file
    audio = AudioSegment.from_wav(input_path)

    # Calculate the duration of the audio in seconds
    audio_duration = len(audio) / 1000  # convert milliseconds to seconds

    # Keep the shorter of max_duration and the actual audio duration
    cut_duration = min(max_duration, audio_duration)

    # Cut the audio
    cut_audio = audio[:int(cut_duration * 1000)]  # convert seconds to milliseconds

    # Get the input file name without extension
    file_name = os.path.splitext(os.path.basename(input_path))[0]

    # Construct the output file path with the original file name and "_cut" suffix
    output_path = f"{file_name}_cut.wav"

    # Save the cut audio as a new WAV file
    cut_audio.export(output_path, format="wav")

    return output_path


def load_hidden(audio_in):
    return audio_in


def load_hidden_mic(audio_in):
    print("USER RECORDED A NEW SAMPLE")

    library_path = 'bark_voices'
    folder_name = 'audio-0-100'
    second_folder_name = 'audio-0-100_cleaned'

    folder_path = os.path.join(library_path, folder_name)
    second_folder_path = os.path.join(library_path, second_folder_name)

    print("We need to clean previous util files, if needed:")
    if os.path.exists(folder_path):
        try:
            shutil.rmtree(folder_path)
            print(f"Successfully deleted the folder previously created from the last raw recorded sample: {folder_path}")
        except OSError as e:
            print(f"Error: {folder_path} - {e.strerror}")
    else:
        print(f"OK, the folder for a raw recorded sample does not exist: {folder_path}")

    if os.path.exists(second_folder_path):
        try:
            shutil.rmtree(second_folder_path)
            print(f"Successfully deleted the folder previously created from the last cleaned recorded sample: {second_folder_path}")
        except OSError as e:
            print(f"Error: {second_folder_path} - {e.strerror}")
    else:
        print(f"OK, the folder for a cleaned recorded sample does not exist: {second_folder_path}")

    return audio_in


def clear_clean_ckeck():
    return False


def wipe_npz_file(folder_path):
    # Placeholder: currently only logs; it does not delete any .npz file.
    print("YO • a user is manipulating audio inputs")


def split_process(audio, chosen_out_track):
    gr.Info("Cleaning your audio sample...")
    os.makedirs("out", exist_ok=True)
    # `audio` is a (sample_rate, samples) tuple, as provided by gr.Audio(type="numpy")
    write('test.wav', audio[0], audio[1])
    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
    #return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
    if chosen_out_track == "vocals":
        print("Audio sample cleaned")
        return "./out/mdx_extra_q/test/vocals.wav"
    elif chosen_out_track == "bass":
        return "./out/mdx_extra_q/test/bass.wav"
    elif chosen_out_track == "drums":
        return "./out/mdx_extra_q/test/drums.wav"
    elif chosen_out_track == "other":
        return "./out/mdx_extra_q/test/other.wav"
    elif chosen_out_track == "all-in":
        return "test.wav"
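# A minimal usage sketch for split_process (hypothetical values; assumes the
# demucs package invoked above is installed and on PATH):
#
#   sr, samples = 44100, np.zeros(44100, dtype=np.int16)  # one second of silence
#   vocals_path = split_process((sr, samples), "vocals")
#   # -> "./out/mdx_extra_q/test/vocals.wav"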
def update_selection(selected_state: gr.SelectData):
    c_image = characters[selected_state.index]["image"]
    c_title = characters[selected_state.index]["title"]
    c_speaker = characters[selected_state.index]["speaker"]
    return c_title, selected_state


def infer(prompt, input_wav_file, clean_audio, hidden_numpy_audio):
    print("""
—————
NEW INFERENCE:
———————
""")
    if prompt == "":
        gr.Warning("Do not forget to provide a TTS prompt!")

    if clean_audio is True:
        print("We want to clean the audio sample")
        # Extract the file name without the extension
        new_name = os.path.splitext(os.path.basename(input_wav_file))[0]
        print(f"FILE BASENAME is: {new_name}")

        if os.path.exists(os.path.join("bark_voices", f"{new_name}_cleaned")):
            print("This file has already been cleaned")
            check_name = os.path.join("bark_voices", f"{new_name}_cleaned")
            source_path = os.path.join(check_name, f"{new_name}_cleaned.wav")
        else:
            print("This file is new, we need to clean and store it")
            source_path = split_process(hidden_numpy_audio, "vocals")

            # Rename the cleaned file after the original sample
            new_path = os.path.join(os.path.dirname(source_path), f"{new_name}_cleaned.wav")
            os.rename(source_path, new_path)
            source_path = new_path
    else:
        print("We do NOT want to clean the audio sample")
        # Path to the user's WAV file
        source_path = input_wav_file

    # Destination directory
    destination_directory = "bark_voices"

    # Extract the file name without the extension
    file_name = os.path.splitext(os.path.basename(source_path))[0]

    # Construct the full destination directory path
    destination_path = os.path.join(destination_directory, file_name)

    # Create the new directory
    os.makedirs(destination_path, exist_ok=True)

    # Move the WAV file to the new directory
    shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))

    # —————
    # Split the text into sentences based on common punctuation marks
    sentences = re.split(r'(?<=[.!?])\s+', prompt)

    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
        # Keep only the first MAX_NUMBER_SENTENCES sentences
        first_nb_sentences = sentences[:MAX_NUMBER_SENTENCES]
        # Join the selected sentences back into a single string
        prompt = ' '.join(first_nb_sentences)

    gr.Info("Generating audio from prompt")
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=f"{file_name}")

    # List and log the contents of the speaker directory
    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")

    # NOTE: contents[1] relies on os.listdir() ordering to pick the generated .npz file
    return "output.wav", tts_video, gr.update(value=f"bark_voices/{file_name}/{contents[1]}", visible=True), gr.Group.update(visible=True), destination_path
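# For reference, after a successful clone run the speaker folder is expected to
# look roughly like this (speaker name hypothetical); Coqui TTS reads the .wav
# under voice_dir/speaker/ and caches a speaker embedding alongside it:
#
#   bark_voices/
#   └── my_sample/
#       ├── my_sample.wav   # reference audio moved in by infer()
#       └── my_sample.npz   # embedding generated by the Bark model on first use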
def infer_from_c(prompt, c_name):
    print("""
—————
NEW INFERENCE:
———————
""")
    if prompt == "":
        gr.Warning("Do not forget to provide a TTS prompt!")
        print("Warning about prompt sent to user")

    print(f"USING VOICE LIBRARY: {c_name}")

    # Split the text into sentences based on common punctuation marks
    sentences = re.split(r'(?<=[.!?])\s+', prompt)

    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
        # Keep only the first MAX_NUMBER_SENTENCES sentences
        first_nb_sentences = sentences[:MAX_NUMBER_SENTENCES]
        # Join the selected sentences back into a single string
        prompt = ' '.join(first_nb_sentences)

    if c_name == "":
        gr.Warning("Voice character is not properly selected. Please ensure that the name of the chosen voice is specified in the Character Name input.")
        print("Warning about Voice Name sent to user")
    else:
        print(f"Generating audio from prompt with {c_name} ;)")
        tts.tts_to_file(text=prompt,
                        file_path="output.wav",
                        voice_dir="examples/library/",
                        speaker=f"{c_name}")

        print("Preparing final waveform video ...")
        tts_video = gr.make_waveform(audio="output.wav")
        print(tts_video)
        print("FINISHED")
        return "output.wav", tts_video, gr.update(value=f"examples/library/{c_name}/{c_name}.npz", visible=True), gr.Group.update(visible=True)


css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
a {text-decoration-line: underline; font-weight: 600;}
.mic-wrap > button {
    width: 100%;
    height: 60px;
    font-size: 1.4em !important;
}
.record-icon.svelte-1thnwz {
    display: flex;
    position: relative;
    margin-right: var(--size-2);
    width: unset;
    height: unset;
}
span.record-icon > span.dot.svelte-1thnwz {
    width: 20px !important;
    height: 20px !important;
}
.animate-spin {
    animation: spin 1s linear infinite;
}
@keyframes spin {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
}
#share-btn-container {
    display: flex;
    padding-left: 0.5rem !important;
    padding-right: 0.5rem !important;
    background-color: #000000;
    justify-content: center;
    align-items: center;
    border-radius: 9999px !important;
    max-width: 15rem;
    height: 36px;
}
div#share-btn-container > div {
    flex-direction: row;
    background: black;
    align-items: center;
}
#share-btn-container:hover {
    background-color: #060606;
}
#share-btn {
    all: initial;
    color: #ffffff;
    font-weight: 600;
    cursor: pointer;
    font-family: 'IBM Plex Sans', sans-serif;
    margin-left: 0.5rem !important;
    padding-top: 0.5rem !important;
    padding-bottom: 0.5rem !important;
    right: 0;
}
#share-btn * {
    all: unset;
}
#share-btn-container div:nth-child(-n+2) {
    width: auto !important;
    min-height: 0px !important;
}
#share-btn-container .wrap {
    display: none !important;
}
#share-btn-container.hidden {
    display: none !important;
}
img[src*='#center'] {
    display: block;
    margin: auto;
}
.footer {
    margin-bottom: 45px;
    margin-top: 10px;
    text-align: center;
    border-bottom: 1px solid #e5e5e5;
}
.footer > p {
    font-size: .8rem;
    display: inline-block;
    padding: 0 10px;
    transform: translateY(10px);
    background: white;
}
.dark .footer {
    border-color: #303030;
}
.dark .footer > p {
    background: #0b0f19;
}
.disclaimer {
    text-align: left;
}
.disclaimer > p {
    font-size: .8rem;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""