import streamlit as st
import requests
from io import BytesIO
from transformers import AutoTokenizer
import numpy as np
from pydub import AudioSegment
import tempfile
import os

# Configure the Streamlit page: browser-tab title, layout and sidebar state.
st.set_page_config(page_title="Voice Cloning App", layout="centered", initial_sidebar_state="auto")

@st.cache_resource
def load_tokenizer():
    """Load the pretrained CAMeLBERT Arabic tokenizer.

    The function is decorated with ``st.cache_resource`` so the loaded
    tokenizer object can be reused instead of being loaded again on each call.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``.
    """
    model_name = "CAMeL-Lab/bert-base-arabic-camelbert-ca"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return tokenizer

def split_text_into_chunks(text, tokenizer, max_tokens=100):
    """Split ``text`` into chunks of at most ``max_tokens`` tokens each.

    Text that already fits is returned unchanged as a single chunk. Otherwise
    the text is cut after sentence terminators (``.``, ``!``, ``?`` and the
    Arabic question mark) and consecutive sentences are greedily packed into
    chunks. The original punctuation is preserved; no terminator is invented
    for a trailing fragment that did not have one.

    A sentence that is too long on its own is broken on whitespace so it
    cannot produce an oversized chunk. Only a single whitespace-free word
    longer than the limit can still exceed it.

    The size of a packed chunk is estimated as the sum of the token counts of
    its pieces, which may differ slightly from tokenizing the joined chunk.

    Args:
        text: The input string to split.
        tokenizer: Any object exposing ``tokenize(str)`` that returns a sized
            sequence of tokens.
        max_tokens: Upper bound on the estimated token count of each chunk.

    Returns:
        list[str]: The chunks in original order. ``[text]`` when the whole
        text already fits.
    """
    import re  # Local import keeps this function self-contained.

    # Short text needs no splitting at all.
    if len(tokenizer.tokenize(text)) <= max_tokens:
        return [text]

    # A sentence is a run of non-terminators followed by its terminators;
    # the second alternative catches text that starts with terminators.
    sentence_pattern = r"[^.!?\u061f]+[.!?\u061f]*|[.!?\u061f]+"

    # Build the list of (piece, token_count) units to pack.
    units = []
    for fragment in re.findall(sentence_pattern, text):
        sentence = fragment.strip()
        if not sentence:
            continue

        token_count = len(tokenizer.tokenize(sentence))
        if token_count <= max_tokens:
            units.append((sentence, token_count))
        else:
            # The sentence alone exceeds the limit: fall back to word units
            # so the packing loop below can keep every chunk within bounds.
            units.extend(
                (word, len(tokenizer.tokenize(word)))
                for word in sentence.split()
            )

    # Greedily pack units, starting a new chunk whenever the next unit
    # would push the running total over the limit.
    chunks = []
    parts = []
    parts_tokens = 0
    for unit, token_count in units:
        if parts and parts_tokens + token_count > max_tokens:
            chunks.append(" ".join(parts))
            parts, parts_tokens = [], 0
        parts.append(unit)
        parts_tokens += token_count

    if parts:
        chunks.append(" ".join(parts))

    # Never return an empty list for text that had to be split.
    return chunks or [text]

def merge_audio_segments(audio_contents):
    """Concatenate encoded audio clips into a single MP3 byte string.

    Each clip is written to a temporary file, decoded with
    ``AudioSegment.from_file``, and appended to the running result. The
    temporary file is removed even when writing or decoding fails.

    Args:
        audio_contents: Iterable of ``bytes`` objects, each holding one
            encoded audio clip, in playback order.

    Returns:
        bytes: The clips joined end to end and exported as MP3.

    Raises:
        ValueError: If ``audio_contents`` yields no clips.
    """
    combined = None

    for audio_content in audio_contents:
        # delete=False so the file can be reopened by path after closing it.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        try:
            with temp_file:
                temp_file.write(audio_content)
            segment = AudioSegment.from_file(temp_file.name)
        finally:
            # Always clean up the staging file, including on failure.
            os.unlink(temp_file.name)

        if combined is None:
            combined = segment
        else:
            combined += segment

    if combined is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("No audio segments to merge.")

    # Export the combined audio to an in-memory byte stream.
    output = BytesIO()
    combined.export(output, format='mp3')
    return output.getvalue()

# Page header
st.title("📢 Voice Cloning Application")
st.write("Enter the details below and upload an audio file to clone the voice.")

# Tokenizer passed to split_text_into_chunks when the form is submitted
tokenizer = load_tokenizer()

# Collect every input in one form with a single submit button
with st.form("voice_clone_form"):
    # Text to synthesize
    text = st.text_input(
        "Text",
        value="مرحباً بكم في تطبيق استنساخ الصوت. يمكنك استخدام هذا التطبيق لإنشاء نسخة من صوتك باللغة العربية.",
    )

    # Target language (only Arabic is offered)
    language = st.selectbox(
        "Language",
        options=["ar"],
        index=0,
    )

    # Audio file sent to the API along with the text
    audio_file = st.file_uploader(
        "Upload Audio File",
        type=["wav", "mp3", "ogg"],
    )

    # Submitting the form triggers the processing below
    submit_button = st.form_submit_button(label="Clone Voice")

# Handle a form submission: validate the inputs, send each text chunk to the
# voice-cloning API together with the uploaded audio, then play the merged
# result.
if submit_button:
    if not audio_file:
        st.error("Please upload an audio file.")
    elif not text.strip():
        # Avoid sending an empty request to the API.
        st.error("Please enter some text.")
    else:
        try:
            # Split text into chunks if necessary
            text_chunks = split_text_into_chunks(text, tokenizer)

            if len(text_chunks) > 1:
                st.info(f"Text will be processed in {len(text_chunks)} chunks due to length.")

            # Loop-invariant request settings, computed once.
            api_url = "https://tellergen.com/api/clone-voice"
            # (connect, read) seconds. Without a timeout a stalled server
            # would hang the app forever. NOTE(review): the read timeout is a
            # guess at how long synthesis can take — tune as needed.
            request_timeout = (10, 300)
            # Read the upload once; getvalue() is independent of the file
            # pointer, so no seek is needed between requests.
            reference_audio = audio_file.getvalue()

            audio_contents = []

            # Process each chunk
            progress_bar = st.progress(0)
            for i, chunk in enumerate(text_chunks):
                payload = {
                    'text': chunk,
                    'language': language
                }
                files = {
                    'audio_file': (audio_file.name, reference_audio, audio_file.type)
                }

                with st.spinner(f"Processing chunk {i+1}/{len(text_chunks)}..."):
                    response = requests.post(
                        api_url,
                        data=payload,
                        files=files,
                        timeout=request_timeout,
                    )

                if response.status_code == 200:
                    # Default to '' so a missing header is reported as an
                    # unexpected format instead of raising TypeError.
                    content_type = response.headers.get('Content-Type', '')
                    if 'audio' in content_type:
                        audio_contents.append(response.content)
                    else:
                        st.error(f"Unexpected response format for chunk {i+1}")
                        try:
                            st.json(response.json())
                        except ValueError:
                            st.text(response.text)
                        break
                else:
                    st.error(f"API request failed for chunk {i+1} with status code {response.status_code}")
                    try:
                        error_data = response.json()
                        st.error(error_data)
                    except ValueError:
                        st.error(response.text)
                    break

                progress_bar.progress((i + 1) / len(text_chunks))

            # Only produce output when every chunk succeeded.
            if len(audio_contents) == len(text_chunks):
                st.success("Voice cloning completed successfully!")

                if len(audio_contents) > 1:
                    with st.spinner("Merging audio segments..."):
                        final_audio = merge_audio_segments(audio_contents)
                else:
                    final_audio = audio_contents[0]

                st.audio(final_audio, format='audio/mp3')

        except Exception as e:
            # Top-level UI boundary: surface any failure (including request
            # timeouts and connection errors) to the user instead of crashing.
            st.error(f"An error occurred: {e}")