# Streamlit app that sends text plus a reference recording to a remote
# voice-cloning HTTP API. Text longer than the token limit is split into
# sentence-aligned chunks, each chunk is sent to the API separately, and the
# returned audio clips are concatenated into a single MP3 for playback.

import os
import tempfile
from io import BytesIO

import numpy as np
import requests
import streamlit as st
from pydub import AudioSegment
from transformers import AutoTokenizer

# Endpoint that receives the text, the language code and the reference audio.
API_URL = "https://tellergen.com/api/clone-voice"

# (connect, read) timeouts in seconds. Without a timeout ``requests`` waits
# forever on a stalled server, which would hang the whole Streamlit session.
# The read timeout is generous because the server may need a while per chunk.
REQUEST_TIMEOUT = (10, 300)

# Set the page configuration
st.set_page_config(
    page_title="Voice Cloning App",
    layout="centered",
    initial_sidebar_state="auto",
)


@st.cache_resource
def load_tokenizer():
    """Return the tokenizer used to measure how long the input text is.

    Decorated with ``st.cache_resource`` so Streamlit can reuse the loaded
    tokenizer instead of loading it again on every script rerun.
    """
    return AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-ca")


def split_text_into_chunks(text, tokenizer, max_tokens=100):
    """Split *text* into sentence-aligned chunks of roughly *max_tokens* tokens.

    Args:
        text: The full text to split.
        tokenizer: Any object with a ``tokenize(str)`` method returning a
            sequence; only the length of the result is used.
        max_tokens: Token budget per chunk.

    Returns:
        A non-empty list of strings. If the whole text fits in the budget it
        is returned unchanged as the only element. Otherwise the text is
        split on ``"."``, each non-blank sentence is stripped and terminated
        with a period, and consecutive sentences are joined with single
        spaces for as long as they fit in the budget.

    Note:
        A single sentence that is longer than *max_tokens* is not split any
        further; it becomes a chunk of its own that exceeds the budget.
    """
    # Short text: nothing to split.
    if len(tokenizer.tokenize(text)) <= max_tokens:
        return [text]

    chunks = []
    current_sentences = []
    current_token_count = 0

    for raw_sentence in text.split("."):
        stripped = raw_sentence.strip()
        if not stripped:
            continue

        sentence = stripped + "."
        sentence_token_count = len(tokenizer.tokenize(sentence))

        if current_token_count + sentence_token_count <= max_tokens:
            current_sentences.append(sentence)
            current_token_count += sentence_token_count
        else:
            # The sentence does not fit: close the running chunk (if any)
            # and start a new one with this sentence.
            if current_sentences:
                chunks.append(" ".join(current_sentences))
            current_sentences = [sentence]
            current_token_count = sentence_token_count

    if current_sentences:
        chunks.append(" ".join(current_sentences))

    # Text made only of periods/whitespace yields no sentences; fall back to
    # the original text so callers never receive an empty list.
    return chunks or [text]


def merge_audio_segments(audio_contents):
    """Concatenate encoded audio clips into one MP3 byte string.

    Args:
        audio_contents: Iterable of ``bytes`` objects, each holding one
            encoded audio clip, in playback order.

    Returns:
        The MP3-encoded bytes of all clips played back to back.

    Raises:
        ValueError: If *audio_contents* contains no clips.
    """
    combined = None

    for audio_content in audio_contents:
        # The clip is written to a temporary file and loaded from its path.
        # ``delete=False`` lets the file be closed before it is reopened for
        # reading; it is removed explicitly below.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        temp_file_path = temp_file.name
        try:
            with temp_file:
                temp_file.write(audio_content)
            segment = AudioSegment.from_file(temp_file_path)
        finally:
            # Always remove the temporary file, even when decoding fails.
            os.unlink(temp_file_path)

        combined = segment if combined is None else combined + segment

    if combined is None:
        raise ValueError("No audio segments to merge.")

    # Export the combined audio to a byte stream
    output = BytesIO()
    combined.export(output, format="mp3")
    return output.getvalue()


st.title("📢 Voice Cloning Application")
st.write("Enter the details below and upload an audio file to clone the voice.")

# Load the tokenizer
tokenizer = load_tokenizer()

# Create a form for input
with st.form("voice_clone_form"):
    # Text input
    text = st.text_input(
        "Text",
        value="مرحباً بكم في تطبيق استنساخ الصوت. يمكنك استخدام هذا التطبيق لإنشاء نسخة من صوتك باللغة العربية.",
    )

    # Language selection
    language = st.selectbox("Language", options=["ar"], index=0)

    # File uploader for audio file
    audio_file = st.file_uploader("Upload Audio File", type=["wav", "mp3", "ogg"])

    # Submit button
    submit_button = st.form_submit_button(label="Clone Voice")

if submit_button:
    if not audio_file:
        st.error("Please upload an audio file.")
    elif not text.strip():
        # Do not spend an API call on blank text.
        st.error("Please enter some text.")
    else:
        try:
            # Split text into chunks if necessary
            text_chunks = split_text_into_chunks(text, tokenizer)
            total_chunks = len(text_chunks)

            if total_chunks > 1:
                st.info(f"Text will be processed in {total_chunks} chunks due to length.")

            # The same reference recording accompanies every chunk, so read
            # the upload once instead of re-reading it on each iteration.
            audio_bytes = audio_file.getvalue()

            audio_contents = []
            progress_bar = st.progress(0)

            # Process each chunk; stop at the first failure.
            for chunk_number, chunk in enumerate(text_chunks, start=1):
                payload = {
                    "text": chunk,
                    "language": language,
                }
                files = {
                    "audio_file": (audio_file.name, audio_bytes, audio_file.type),
                }

                with st.spinner(f"Processing chunk {chunk_number}/{total_chunks}..."):
                    response = requests.post(
                        API_URL,
                        data=payload,
                        files=files,
                        timeout=REQUEST_TIMEOUT,
                    )

                if response.status_code != 200:
                    st.error(
                        f"API request failed for chunk {chunk_number} "
                        f"with status code {response.status_code}"
                    )
                    try:
                        st.error(response.json())
                    except ValueError:
                        # Body is not JSON; show it verbatim.
                        st.error(response.text)
                    break

                # The header may be absent, so default to an empty string
                # rather than testing membership on None.
                content_type = response.headers.get("Content-Type") or ""
                if "audio" not in content_type.lower():
                    st.error(f"Unexpected response format for chunk {chunk_number}")
                    try:
                        st.json(response.json())
                    except ValueError:
                        st.text(response.text)
                    break

                audio_contents.append(response.content)
                progress_bar.progress(chunk_number / total_chunks)

            # Only produce output when every chunk came back as audio.
            if audio_contents and len(audio_contents) == total_chunks:
                if len(audio_contents) > 1:
                    with st.spinner("Merging audio segments..."):
                        final_audio = merge_audio_segments(audio_contents)
                else:
                    final_audio = audio_contents[0]

                # Report success only after merging, which can itself fail.
                st.success("Voice cloning completed successfully!")
                st.audio(final_audio, format="audio/mp3")

        except Exception as e:
            # Top-level UI boundary: surface any failure to the user instead
            # of letting the script crash with a traceback.
            st.error(f"An error occurred: {e}")