LivePortrait

Running on Zero

File size: 5,051 Bytes

import os
from io import BytesIO
from typing import IO, Optional
import time
import uuid
from pathlib import Path

from pydub import AudioSegment
import gradio as gr
from elevenlabs import Voice, VoiceSettings, save
from elevenlabs.client import ElevenLabs




def generate_random_filename(parent, extension="txt"):
    """
    Build a unique file path inside ``parent``.

    The file name has the form ``<uuid4>_<unix-seconds>.<extension>``; no file
    is created on disk.

    Args:
        parent (str): Directory the generated file name is joined onto.
        extension (str): File extension without the leading dot. Default is 'txt'.

    Returns:
        str: ``parent`` joined with the generated file name.
    """
    # Whole-second timestamp; uniqueness comes from the UUID part.
    stamp = int(time.time())
    token = uuid.uuid4()

    return os.path.join(parent, f"{token}_{stamp}.{extension}")

# Model identifier passed to the ElevenLabs generate call. It can be overridden
# through the ELEVEN_LABS_MODEL environment variable; the fallback is
# "eleven_multilingual_v2".
ELEVEN_LABS_MODEL = os.getenv("ELEVEN_LABS_MODEL", "eleven_multilingual_v2")
# Human-readable language names. NOTE(review): presumably the languages the
# default multilingual model supports; nothing in the visible code reads this
# list, so confirm how callers use it before relying on it.
ELEVEN_LABS_LANGUAGE_SUPPORTS = [
    "English",
    "Chinese",
    "Spanish",
    "Hindi",
    "Portuguese",
    "French",
    "German",
    "Japanese",
    "Arabic",
    "Korean",
    "Indonesian",
    "Italian",
    "Dutch",
    "Turkish",
    "Polish",
    "Swedish",
    "Filipino",
    "Malay",
    "Russian",
    "Romanian",
    "Ukrainian",
    "Greek",
    "Czech",
    "Danish",
    "Finnish",
    "Bulgarian",
    "Croatian",
    "Slovak",
    "Tamil",
]


class ElevenLabsPipeline:
    """Wrapper around the ElevenLabs client for voice cloning and speech synthesis.

    The API key is read from the ``ELEVENLABS_TOKEN`` environment variable and
    generated audio is written as MP3 files under ``OUTPUT_DIR``.
    """

    # Directory that receives the generated MP3 files.
    OUTPUT_DIR = "/tmp/elevenlabs"

    def __init__(self):
        """Create the ElevenLabs client and make sure the output directory exists.

        Raises:
            Exception: If the ``ELEVENLABS_TOKEN`` environment variable is not set.
        """
        eleven_labs_api_key = os.getenv("ELEVENLABS_TOKEN")
        if eleven_labs_api_key is None:
            # The message must name the variable that is actually read above.
            raise Exception("ELEVENLABS_TOKEN 환경변수를 설정해주세요.")
        self.client = ElevenLabs(
            api_key=eleven_labs_api_key,
        )
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

    def clone_voice(self, audio, name, description=None):
        """Clone a voice from an audio sample unless one named ``name`` already exists.

        Args:
            audio: Audio sample handed to the client's ``clone`` call as the
                single entry of ``files``.
            name: Name of the voice to create (also used for the existence check).
            description: Optional description forwarded to the ``clone`` call.

        Returns:
            str: A status message. On failure of the clone call this is the
            text of the raised exception rather than a raised error.
        """
        if self._get_voice(name) is not None:
            return "존재하는 음성입니다. 음성 생성을 시작해주세요."

        try:
            self.client.clone(
                name=name,
                description=description,
                files=[audio],
            )
        except Exception as e:
            # Best-effort by design: callers receive the failure as text.
            return str(e)
        return "Voice Clone을 성공적으로 생성했습니다."

    def _get_voice(self, name: str):
        """Return the first voice whose ``name`` attribute equals ``name``, or None."""
        response = self.client.voices.get_all()
        return next((voice for voice in response.voices if voice.name == name), None)

    def generate_voice(
        self,
        text: str,
        audio: Optional[str] = None,
        language: str = "ko",
        mute_before_ms: Optional[int] = 0,
        mute_after_ms: Optional[int] = 0,
        stability: float = 0.5,
        similarity_boost: float = 0.75,
        style: float = 0.0,
        use_speaker_boost=True,
    ) -> str:
        """Synthesize ``text`` to an MP3 file and return the file's path.

        When ``audio`` is given, a voice named after the file's stem is cloned
        (or reused if it already exists); otherwise the voice named "Laura" is
        used and a Gradio info message is shown.

        Args:
            text: Text to synthesize.
            audio: Path to a voice sample, or None to use the default voice.
            language: Value forwarded as ``language`` to ``VoiceSettings``.
            mute_before_ms: Milliseconds of silence prepended; None means 0.
            mute_after_ms: Milliseconds of silence appended; None means 0.
            stability: Forwarded to ``VoiceSettings``.
            similarity_boost: Forwarded to ``VoiceSettings``.
            style: Forwarded to ``VoiceSettings``.
            use_speaker_boost: Forwarded to ``VoiceSettings``.

        Returns:
            str: Path of the exported MP3 file inside ``OUTPUT_DIR``.

        Raises:
            ValueError: If no voice with the resolved name can be found.
        """
        clone_status = None
        if audio is not None:
            name = Path(audio).stem
            clone_status = self.clone_voice(audio, name)
        else:
            gr.Info("음성이 안주어졌습니다. 기본 음성으로 생성하겠습니다.", duration=2)
            name = "Laura"

        current_voice = self._get_voice(name)
        if current_voice is None:
            # NOTE(review): the second lookup is kept from the original code;
            # presumably it rides out a short delay after cloning - confirm.
            current_voice = self._get_voice(name)
        if current_voice is None:
            # Fail with a clear message instead of an AttributeError on None,
            # and surface the clone status so a failed clone is not hidden.
            detail = f" ({clone_status})" if clone_status else ""
            raise ValueError(f"Voice '{name}' was not found{detail}")

        response = self.client.generate(
            text=text,
            model=ELEVEN_LABS_MODEL,
            voice=Voice(
                voice_id=current_voice.voice_id,
                settings=VoiceSettings(
                    stability=stability,
                    similarity_boost=similarity_boost,
                    style=style,
                    use_speaker_boost=use_speaker_boost,
                    language=language,
                ),
            ),
        )

        # Collect the streamed chunks in memory, skipping empty ones.
        audio_stream = BytesIO(b"".join(chunk for chunk in response if chunk))

        audio_segment = AudioSegment.from_file(audio_stream, format="mp3")

        # The parameters are Optional, so treat None as "no padding".
        mute_before = AudioSegment.silent(duration=mute_before_ms or 0)
        mute_after = AudioSegment.silent(duration=mute_after_ms or 0)

        combined_segment = mute_before + audio_segment + mute_after

        tmp_file = generate_random_filename(self.OUTPUT_DIR, "mp3")
        combined_segment.export(tmp_file, format="mp3", bitrate="128k")

        return tmp_file