Spaces:

rohith2812
/

AtoI-App

Runtime error

File size: 9,256 Bytes

import pinecone
import torch
import numpy as np
from PIL import Image
from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
from datasets import load_dataset
from transformers import pipeline
import gradio as gr

import pinecone

from pinecone import Pinecone

import os

# The Pinecone API key is read from the environment (for example a Space secret
# named PINECONE_API_KEY) instead of being committed to the source file, where
# anyone able to read the file could use it.
# NOTE: a key that was previously committed in this file should be treated as
# leaked and rotated in the Pinecone console.
_pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not _pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; provide it as an environment variable or Space secret."
    )
pc = Pinecone(api_key=_pinecone_api_key)

from datasets import load_dataset

# Load OpenAI CLIP model for embedding generation
import open_clip
# Text/image encoder whose text branch embeds prompts for the Pinecone lookup.
# The middle element of the returned triple is discarded.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name='ViT-B-32',
    pretrained='openai',
)
# Tokenizer for the same architecture as the encoder above.
tokenizer = open_clip.get_tokenizer(model_name='ViT-B-32')

# Depth-estimation pipeline; no checkpoint is named, so the library default is used.
depth_estimator = pipeline(task="depth-estimation")

# Initialize Stable Diffusion ControlNet

from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel
from diffusers.utils.import_utils import is_xformers_available
from diffusers.schedulers import UniPCMultistepScheduler
import torch

# Depth-conditioned ControlNet and the Stable Diffusion 1.5 img2img pipeline, both
# loaded in float16. Neither is moved to the GPU by hand: enable_model_cpu_offload()
# below takes over device placement, and moving the whole pipeline to CUDA first
# only adds a full GPU round-trip and a higher peak memory use at start-up.
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/control_v11f1p_sd15_depth", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)

# Load your fine-tuned LoRA adapter
lora_weights_path = "rohith2812/atoi-lora-finetuned-v1"  # Replace with your LoRA weight file path
pipe.unet.load_attn_procs(lora_weights_path)

# Swap in the UniPC multistep scheduler, reusing the existing scheduler's configuration.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# Keep sub-models on the CPU and move each one to the GPU only while it runs.
pipe.enable_model_cpu_offload()

# Memory-efficient attention is optional and only enabled when xformers is installed.
if is_xformers_available():
    pipe.enable_xformers_memory_efficient_attention()

print("Pipeline is ready with fine-tuned LoRA adapter!")

# Speech-recognition pipeline (Whisper "base.en" checkpoint) used to turn the
# recorded prompt into text.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)

def retrieve_image_from_text_prompt(prompt, selected_index, knowledge_database):
    """
    Retrieve the most relevant image based on the text prompt from the selected Pinecone index.

    The prompt is embedded with the module-level open_clip text encoder, the
    single nearest vector in the ``text_embeddings`` namespace is fetched, and
    the ``image_path`` stored in its metadata is matched against the
    ``image_path`` column of the knowledge dataset to obtain the image itself.

    Args:
        prompt: Text to search for.
        selected_index: Name of the Pinecone index to query.
        knowledge_database: Dataset identifier passed to ``load_dataset``; its
            ``train`` split must provide ``image_path`` and ``image`` columns.

    Returns:
        ``{"image": ..., "description": ...}`` for the best match, or ``None``
        when the prompt is blank, the index returns no match, or no dataset row
        has a path ending with the matched ``image_path``.
    """
    # A blank prompt would still embed to some vector and return an arbitrary
    # nearest neighbour, so treat it as "nothing to search for".
    if not prompt or not prompt.strip():
        return None

    # Initialize the Pinecone index dynamically based on user selection
    index = pc.Index(selected_index)

    # Generate Embedding for Text
    text_tokens = tokenizer([prompt])
    with torch.no_grad():
        query_embedding = model.encode_text(text_tokens).cpu().numpy().flatten()

    # Query Pinecone
    results = index.query(
        vector=query_embedding.tolist(),
        top_k=1,
        include_metadata=True,
        namespace="text_embeddings",
    )
    if not (results and "matches" in results and results["matches"]):
        return None

    metadata = results["matches"][0]["metadata"]
    image_path = metadata["image_path"]
    description = metadata["description"]

    # The dataset is only needed once a match exists, so it is loaded here
    # rather than before the query.
    dataset = load_dataset(knowledge_database, split="train")

    # Scan only the path column and fetch the single matching row afterwards;
    # iterating whole rows would materialise the image field of every row.
    for row_idx, candidate_path in enumerate(dataset["image_path"]):
        if candidate_path.endswith(image_path):
            return {"image": dataset[row_idx]["image"], "description": description}
    return None

# Function to Generate Depth Map
def get_depth_map(image):
    """Estimate depth for ``image`` and return it as a conditioning tensor.

    The ``"depth"`` output of the module-level depth estimator is converted to
    an array, copied into three identical channels, divided by 255 and returned
    as a half-precision CUDA tensor with a leading batch axis, i.e. shaped
    ``(1, 3, rows, cols)``.
    """
    depth = np.array(depth_estimator(image)["depth"])
    # Add a trailing channel axis, then replicate it three times.
    channel = depth[:, :, None]
    stacked = np.concatenate([channel] * 3, axis=2)
    scaled = torch.from_numpy(stacked).float() / 255.0
    # Channels-last -> channels-first, plus the batch dimension.
    batched = scaled.permute(2, 0, 1).unsqueeze(0)
    return batched.half().to("cuda")

from transformers import CLIPProcessor, CLIPModel
import torch

# CLIP checkpoint and matching processor used for scoring an image against a
# text; the model is placed on the GPU.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to("cuda")

def calculate_clip_score(image, text):
    """Calculate CLIP score for an image and text pair.

    The score is ``100 * cosine_similarity(image_embedding, text_embedding)``,
    floored at 0, so it varies with how well the image matches the text.

    The earlier implementation took a softmax of ``logits_per_image`` across the
    text axis; with a single text that axis has length one, so the result was
    always exactly 1.0 whatever the inputs were.

    Args:
        image: Image accepted by the CLIP processor.
        text: Text to compare the image against.

    Returns:
        The score as a Python float in the range [0, 100].
    """
    # CLIP's text encoder has a fixed maximum sequence length; truncate long
    # transcriptions instead of failing on them.
    inputs = clip_processor(
        text=[text], images=image, return_tensors="pt", padding=True, truncation=True
    ).to("cuda")
    # Scoring only: no gradients are needed.
    with torch.no_grad():
        outputs = clip_model(**inputs)

    # Normalise explicitly so the dot product is a cosine similarity.
    image_emb = outputs.image_embeds
    text_emb = outputs.text_embeds
    image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)
    text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
    cosine = (image_emb * text_emb).sum(dim=-1).item()

    clip_score = max(100.0 * cosine, 0.0)
    return clip_score


def audio_to_image(audio, guidance_scale, num_inference_steps, selected_index, knowledge_database):
    """Transcribe spoken audio, retrieve a matching image and enhance it.

    Steps: transcribe the audio, look up the closest image for the transcription
    in the selected Pinecone index / knowledge dataset, build a depth map of
    that image, run the ControlNet img2img pipeline on it, and score the result
    against the transcription.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was recorded.
        guidance_scale: Guidance scale forwarded to the diffusion pipeline.
        num_inference_steps: Number of denoising steps (coerced to ``int``).
        selected_index: Name of the Pinecone index to query.
        knowledge_database: Dataset identifier holding the images.

    Returns:
        A 5-tuple ``(transcription, retrieved_image, enhanced_image,
        retrieved_description, clip_score)``. When there is no usable audio or
        no image is found, the image and score slots are ``None`` and the
        description slot carries an explanatory message.
    """
    # Nothing recorded/uploaded: unpacking None below would raise.
    if audio is None:
        return "", None, None, "No audio provided.", None

    sr, y = audio
    if y.size == 0:
        return "", None, None, "No audio provided.", None

    # The index connection and dataset load happen inside the retrieval helper;
    # repeating them here only duplicated the work.
    print(f"Using Pinecone index: {selected_index}")

    # Step 1: Transcribe Audio
    if y.ndim > 1:
        y = y.mean(axis=1)  # Convert to mono
    y = y.astype(np.float32)
    # Peak-normalise, but skip it for all-zero (silent) input, where dividing
    # by the peak would fill the signal with NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    transcription = transcriber({"sampling_rate": sr, "raw": y})["text"]
    print(f"Transcribed Text: {transcription}")

    # Step 2: Retrieve Image Based on Text Prompt
    print("Retrieving image from vector database...")
    retrieved_data = retrieve_image_from_text_prompt(transcription, selected_index, knowledge_database)
    if not retrieved_data:
        return transcription, None, None, "No relevant image found.", None

    retrieved_image = retrieved_data["image"]
    retrieved_description = retrieved_data["description"]

    # Step 3: Generate Depth Map
    print("Generating depth map...")
    depth_map = get_depth_map(retrieved_image)

    # Step 4: Enhance Image Using Stable Diffusion
    print("Enhancing image with Stable Diffusion...")
    enhanced_image = pipe(
        prompt=f"{transcription}. Ensure formulas are accurate and text is clean and legible.",
        image=retrieved_image,
        control_image=depth_map,
        guidance_scale=guidance_scale,
        # UI sliders may deliver a float; the step count must be an integer.
        num_inference_steps=int(num_inference_steps),
    ).images[0]

    # Step 5: Calculate CLIP Score
    print("Calculating CLIP Score...")
    clip_score = calculate_clip_score(enhanced_image, transcription)

    # Return Retrieved and Enhanced Images with CLIP Score
    return transcription, retrieved_image, enhanced_image, retrieved_description, clip_score

# Gradio Interface Function
def gradio_interface(audio, guidance_scale, num_inference_steps, selected_index, knowledge_database):
    """Adapt ``audio_to_image`` to the five output components of the UI.

    Args:
        audio: Value of the audio input component.
        guidance_scale: Value of the guidance-scale slider.
        num_inference_steps: Value of the inference-steps slider.
        selected_index: Selected Pinecone index name.
        knowledge_database: Selected knowledge dataset identifier.

    Returns:
        ``(transcription, retrieved_image, enhanced_image, description,
        clip_score)``. When no enhanced image was produced, both image slots
        are ``None`` and the score slot is the string ``"N/A"``.
    """
    transcription, retrieved_image, enhanced_image, retrieved_description, clip_score = audio_to_image(
        audio, guidance_scale, num_inference_steps, selected_index, knowledge_database
    )
    if enhanced_image is None:
        # The image slots must stay empty (None). A message string placed there
        # would be interpreted by the image component as a file path or URL.
        # The explanatory message is already carried by retrieved_description.
        return transcription, None, None, retrieved_description, "N/A"
    return transcription, retrieved_image, enhanced_image, retrieved_description, clip_score

# Enhanced Gradio UI
with gr.Blocks(title="Audio-to-Image Generation") as demo:
    gr.Markdown(
        """
        # 🎨 Audio-to-Image Generation with AI
        Speak into the microphone, and watch as this AI application retrieves a relevant image from the database,
        enhances it based on your input, and displays its description and CLIP Score.
        """
    )

    with gr.Row():
        # Left column: inputs and generation settings.
        with gr.Column():
            audio_input = gr.Audio(type="numpy", label="🎤 Speak Your Prompt")
            guidance_scale_input = gr.Slider(
                minimum=1.0, maximum=20.0, step=0.5, value=8.5, label="🎛️ Guidance Scale"
            )
            num_inference_steps_input = gr.Slider(
                minimum=10, maximum=200, step=10, value=100, label="🔢 Number of Inference Steps"
            )
            index_selection = gr.Dropdown(
                choices=["project-atoi-v2", "project-atoi"],
                value="project-atoi-v2",
                label="🗂️ Select Pinecone Index"
            )
            knowledge_database_selection = gr.Dropdown(
                choices=["rohith2812/atoigeneration-final-data", "rxc5667/3wordsdataset_noduplicates"],
                value="rxc5667/3wordsdataset_noduplicates",
                label="📚 Select Knowledge Database"
            )
            submit_button = gr.Button("Generate Image")
        # Right column: results.
        with gr.Column():
            transcription_output = gr.Textbox(label="📝 Transcribed Prompt")
            retrieved_image_output = gr.Image(label="🖼️ Retrieved Image")
            enhanced_image_output = gr.Image(label="✨ Enhanced Image")
            retrieved_description_output = gr.Textbox(label="📜 Retrieved Description")
            clip_score_output = gr.Textbox(label="📊 CLIP Score")

    # Example prompts are shown as text to read aloud. They were previously
    # passed to gr.Examples, whose first input is the audio component, so each
    # prompt string was handed to the audio input as if it were an audio file.
    example_prompts = [
        "a picture explain line of best fit in linear regression",
        "Support vector machines",
        "A picture explaining multiple components in PCA",
    ]
    examples = gr.Markdown(
        "### Examples\nTry speaking one of these prompts:\n\n"
        + "\n".join(f"- {example_prompt}" for example_prompt in example_prompts)
    )

    # Run the full audio -> image pipeline when the button is pressed.
    submit_button.click(
        fn=gradio_interface,
        inputs=[
            audio_input,
            guidance_scale_input,
            num_inference_steps_input,
            index_selection,
            knowledge_database_selection,
        ],
        outputs=[
            transcription_output,
            retrieved_image_output,
            enhanced_image_output,
            retrieved_description_output,
            clip_score_output,
        ],
    )

# Launch the Gradio interface only when this file is run as a script, so that
# importing the module does not start the server.
if __name__ == "__main__":
    demo.launch()