import gradio as gr
from transformers import pipeline
from gradio_client import Client 

# Load the image-classification pipeline once at import time so every request
# handled by the app reuses the same model instance.
image_model = pipeline(
    task="image-classification",
    model="google/vit-base-patch16-224",
)

def generate_music(prompt):
    """Ask the remote AudioLDM Space to generate audio for a text prompt.

    Args:
        prompt: Text description sent as the first input of the endpoint.

    Returns:
        Whatever ``Client.predict`` returns for the ``/text2audio`` endpoint.
        The value is also printed to stdout before being returned.
    """
    space_url = "https://haoheliu-audioldm-48k-text-to-hifiaudio-generation.hf.space/"

    # Positional inputs for the endpoint. According to the original author's
    # notes the values after the prompt are: duration in seconds, guidance
    # scale, seed, and number of waveforms to generate.
    # NOTE(review): this mapping is not verified against the Space's API page.
    endpoint_inputs = (prompt, 5, 0, 5, 1)

    # A new client is created on every call, exactly as before.
    response = Client(space_url).predict(*endpoint_inputs, api_name="/text2audio")

    print(response)
    return response

# Example usage, executed at import time before the UI is built.
# The call goes to a remote Space that may be asleep or unreachable. A failure
# here is reported on stdout but must not abort the script, otherwise the
# Gradio app defined further down would never start.
prompt = "A serene and peaceful melody to relax."
try:
    music_result = generate_music(prompt)
except Exception as exc:  # top-level boundary: report and carry on
    music_result = None
    print(f"Example music generation failed: {exc!r}")

def generate_voice(prompt):
    """Ask the remote Tango Space to generate audio for a text prompt.

    Args:
        prompt: Text sent as the first input of the endpoint; the caller in
            this file builds it from the image classification label.

    Returns:
        Whatever ``Client.predict`` returns for the ``/predict`` endpoint,
        passed through without any post-processing.
    """
    tango_client = Client("https://declare-lab-tango.hf.space/")

    # Inputs after the prompt are, per the original author's notes, the number
    # of steps and the guidance scale.
    # NOTE(review): confirm the order against the Space's API page.
    endpoint_inputs = (prompt, 100, 1)

    return tango_client.predict(*endpoint_inputs, api_name="/predict")

def classify_and_generate_voice(uploaded_image):
    """Classify an uploaded image, then generate speech and music for its label.

    Args:
        uploaded_image: The image handed over by the Gradio image input
            (configured in this file with ``type="pil"``).

    Returns:
        A 3-tuple ``(label, voice_result, music_result)``: the label of the
        first classification result, the value returned by ``generate_voice``
        and the value returned by ``generate_music``.

    Raises:
        gr.Error: If no image was provided.
    """
    # An empty image input arrives as None; fail with a readable message in the
    # UI instead of an opaque error raised from inside the classifier.
    if uploaded_image is None:
        raise gr.Error("이미지를 먼저 업로드해 주세요. (Please upload an image first.)")

    # Image classification: the first entry is used as the top prediction.
    predictions = image_model(uploaded_image)
    top_prediction = predictions[0]["label"]

    # Speech generation from the recognized label.
    voice_result = generate_voice("this is " + top_prediction)

    # Music generation. The space after "drums." keeps the label from being
    # glued onto the previous sentence in the prompt.
    music_result = generate_music(
        "The rnb beat of 85BPM drums. " + top_prediction + "."
    )

    # Order matches the Label / Audio / Audio outputs of the interface.
    return top_prediction, voice_result, music_result
    
# Build the Gradio interface: one image input, and three outputs that receive
# the label, the voice result and the music result returned by the handler.
iface = gr.Interface(
    title="msVision_3",
    description="이미지를 업로드하면, 사물을 인식하고 해당하는 음성 및 음악을 생성합니다.(recognizes object and generate Voice&Music)",
    fn=classify_and_generate_voice,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Label(),
        gr.Audio(),
        gr.Audio(),
    ],
    # Example images offered in the UI; paths are relative to the working directory.
    examples=[
        "dog.jpg",
        "cafe.jpg",
        "seoul.png",
    ],
)

# Start the interface.
iface.launch()