msVision_3

Runtime error

File size: 2,616 Bytes

2bd9468
 
a6d7b81
dedab71
9902a40
8770d52
2bd9468
ba5f8a7
 
ebcd803
ba5f8a7
59ff24b
ba5f8a7
 
db7dc29
b8bb042
ba5f8a7
 
ebcd803
3ba852d
ba5f8a7
ebcd803
 
3377e03
a6d7b81
 
 
 
 
 
 
 
 
 
 
87c119f
3377e03
8770d52
 
a6d7b81
 
3377e03
59ff24b
ebcd803
59ff24b
a6d7b81
ebcd803
a6d7b81
1e442f4
ad7babb
a6d7b81
23708c8
3377e03
23708c8
4d95222
872e164
5174dc4
 
23708c8
2bd9468
ad7babb
3382a71
a6d7b81

import gradio as gr
from transformers import pipeline
from gradio_client import Client  # 가정: gradio_client 라이브러리가 사용 가능하다.

# Load the image-classification pipeline once at import time so every
# request reuses the same model instance.
image_model = pipeline(
    task="image-classification",
    model="google/vit-base-patch16-224",
)

def generate_music(prompt):
    """Generate music for *prompt* using the hosted AudioLDM2 Space.

    Args:
        prompt: Text description of the music to generate; sent as the
            'Input text' field of the remote endpoint.

    Returns:
        The value returned by ``Client.predict`` for this endpoint,
        passed through unmodified.
    """
    client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
    result = client.predict(
        # 'Input text' Textbox. This used to be the hard-coded string
        # "playing piano.", which silently discarded the caller's prompt.
        prompt,
        "Low quality.",  # 'Negative prompt' Textbox
        5,  # 'Duration (seconds)' Slider, numeric value between 5 and 15
        5.5,  # 'Guidance scale' Slider, numeric value between 0 and 7
        5,  # 'Seed' Number
        3,  # 'Number waveforms to generate' Slider, numeric value between 1 and 5
        fn_index=1,
    )
    # Echo the raw response to stdout to help inspect what the Space returned.
    print(result)
    return result

def generate_voice(prompt):
    """Generate audio for *prompt* through the hosted Tango Space.

    Args:
        prompt: Text sent as the first input of the ``/predict`` endpoint
            (the caller builds it from the image-classification label).

    Returns:
        The raw value returned by ``Client.predict``; no post-processing
        is applied here.
    """
    tango = Client("https://declare-lab-tango.hf.space/")
    steps = 100
    guidance_scale = 1
    return tango.predict(prompt, steps, guidance_scale, api_name="/predict")

def classify_and_generate_voice(uploaded_image):
    """Classify an image, then generate voice and music from its top label.

    Args:
        uploaded_image: Image handed directly to ``image_model``.

    Returns:
        A 3-tuple ``(top_prediction, voice_result, music_result)``: the
        label string, the result of ``generate_voice`` and the result of
        ``generate_music``, in the order the interface outputs expect.
    """
    # Classify the image and take the first entry as the best guess
    # (assumes the pipeline returns predictions sorted by score — TODO confirm).
    predictions = image_model(uploaded_image)
    top_prediction = predictions[0]['label']

    # Generate the spoken description and the background music.
    voice_result = generate_voice("this is " + top_prediction)
    music_result = generate_music("The rnb beat of 85BPM drums." + top_prediction + ".")

    # The previous return statement also yielded an undefined name `caption`,
    # which raised NameError on every call and produced four values for an
    # interface that declares three outputs (label, audio, audio).
    return top_prediction, voice_result, music_result
    
# Build the Gradio interface: one image in; a label and two audio outputs back.
iface = gr.Interface(
    title="msVision_3",
    description="이미지를 업로드하면, 사물을 인식하고 해당하는 음성 및 음악을 생성합니다.(recognizes object and generate Voice&Music)",
    fn=classify_and_generate_voice,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Label(), gr.Audio(), gr.Audio()],
    examples=["dog.jpg", "cat.png", "cafe.jpg"],
)


# Start serving the interface.
iface.launch()