import os

import numpy as np
import torch
import gradio as gr
from PIL import Image
from datasets import load_dataset
from transformers import pipeline
from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils.import_utils import is_xformers_available
from pinecone import Pinecone

# Read the Pinecone API key from the environment (e.g. a Space secret named PINECONE_API_KEY)
# instead of hard-coding the credential in source.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# Load OpenAI CLIP model for embedding generation
import open_clip
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')
tokenizer = open_clip.get_tokenizer('ViT-B-32')
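# Monocular depth-estimation pipeline; its output conditions the depth ControlNet loaded below.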
depth_estimator = pipeline("depth-estimation")
# Initialize Stable Diffusion ControlNet
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/control_v11f1p_sd15_depth", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
# Load the fine-tuned LoRA adapter from the Hugging Face Hub
lora_weights_path = "rohith2812/atoi-lora-finetuned-v1"
pipe.unet.load_attn_procs(lora_weights_path)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# Model CPU offload manages device placement itself, so the pipeline is not moved to CUDA explicitly.
pipe.enable_model_cpu_offload()
if is_xformers_available():
    pipe.enable_xformers_memory_efficient_attention()
print("Pipeline is ready with fine-tuned LoRA adapter!")
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
def retrieve_image_from_text_prompt(prompt, selected_index, knowledge_database):
"""
Retrieve the most relevant image based on the text prompt from the selected Pinecone index.
"""
dataset = load_dataset(knowledge_database, split="train")
# Initialize the Pinecone index dynamically based on user selection
index = pc.Index(selected_index)
# Generate Embedding for Text
text_tokens = tokenizer([prompt])
with torch.no_grad():
query_embedding = model.encode_text(text_tokens).cpu().numpy().flatten()
# Query Pinecone
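    # The "text_embeddings" namespace must match the namespace the embeddings were upserted into.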
results = index.query(vector=query_embedding.tolist(), top_k=1, include_metadata=True, namespace="text_embeddings")
if results and "matches" in results and results["matches"]:
best_match = results["matches"][0]
image_path = best_match["metadata"]["image_path"]
description = best_match["metadata"]["description"]
# Match the image path to the dataset to retrieve the image
for item in dataset:
if item["image_path"].endswith(image_path):
return {"image": item["image"], "description": description}
return None
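
# A minimal sketch (not called by the app) of how the "text_embeddings" namespace queried above is
# assumed to have been populated: a CLIP text embedding of each item's description, upserted with
# the image path and description as metadata (the same fields read back from the query results).
def build_text_index(dataset, index):
    for i, item in enumerate(dataset):
        tokens = tokenizer([item["description"]])
        with torch.no_grad():
            embedding = model.encode_text(tokens).cpu().numpy().flatten()
        index.upsert(
            vectors=[{
                "id": str(i),
                "values": embedding.tolist(),
                "metadata": {"image_path": item["image_path"], "description": item["description"]},
            }],
            namespace="text_embeddings",
        )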
# Function to Generate Depth Map
def get_depth_map(image):
image = depth_estimator(image)["depth"]
image = np.array(image)
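    # Replicate the single-channel depth map across three channels to form the RGB-like control image.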
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
detected_map = torch.from_numpy(image).float() / 255.0
return detected_map.permute(2, 0, 1).unsqueeze(0).half().to("cuda")
from transformers import CLIPProcessor, CLIPModel
# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cuda")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def calculate_clip_score(image, text):
    """Calculate a CLIP similarity score for an image and text pair."""
    inputs = clip_processor(text=[text], images=image, return_tensors="pt", padding=True).to("cuda")
    with torch.no_grad():
        outputs = clip_model(**inputs)
    # logits_per_image is the scaled image-text cosine similarity; a softmax over a single caption
    # would always return 1.0, so report the raw similarity instead.
    logits_per_image = outputs.logits_per_image
    clip_score = logits_per_image.max().item()
    return clip_score
def audio_to_image(audio, guidance_scale, num_inference_steps, selected_index, knowledge_database):
    # The Pinecone index and dataset are loaded inside retrieve_image_from_text_prompt,
    # so just log the user's selection here.
    print(f"Using Pinecone index: {selected_index} with knowledge base: {knowledge_database}")
# Step 1: Transcribe Audio
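    # With type="numpy", Gradio delivers the audio as a (sample_rate, waveform) tuple.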
sr, y = audio
if y.ndim > 1:
y = y.mean(axis=1) # Convert to mono
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak  # Normalize amplitude; skip for silent clips to avoid division by zero.
transcription = transcriber({"sampling_rate": sr, "raw": y})["text"]
print(f"Transcribed Text: {transcription}")
# Step 2: Retrieve Image Based on Text Prompt
print("Retrieving image from vector database...")
retrieved_data = retrieve_image_from_text_prompt(transcription, selected_index, knowledge_database)
if not retrieved_data:
return transcription, None, None, "No relevant image found.", None
retrieved_image = retrieved_data["image"]
retrieved_description = retrieved_data["description"]
# Step 3: Generate Depth Map
print("Generating depth map...")
depth_map = get_depth_map(retrieved_image)
# Step 4: Enhance Image Using Stable Diffusion
print("Enhancing image with Stable Diffusion...")
enhanced_image = pipe(
prompt=f"{transcription}. Ensure formulas are accurate and text is clean and legible.",
image=retrieved_image,
control_image=depth_map,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps
).images[0]
# Step 5: Calculate CLIP Score
print("Calculating CLIP Score...")
clip_score = calculate_clip_score(enhanced_image, transcription)
# Return Retrieved and Enhanced Images with CLIP Score
return transcription, retrieved_image, enhanced_image, retrieved_description, clip_score
# Gradio Interface Function
def gradio_interface(audio, guidance_scale, num_inference_steps, selected_index, knowledge_database):
transcription, retrieved_image, enhanced_image, retrieved_description, clip_score = audio_to_image(
audio, guidance_scale, num_inference_steps, selected_index, knowledge_database
)
    if enhanced_image is None:
        # Return None for the image slots so the gr.Image components stay empty.
        return transcription, None, None, retrieved_description, "N/A"
return transcription, retrieved_image, enhanced_image, retrieved_description, clip_score
# Enhanced Gradio UI
with gr.Blocks(title="Audio-to-Image Generation") as demo:
gr.Markdown(
"""
# 🎨 Audio-to-Image Generation with AI
Speak into the microphone: the app transcribes your prompt, retrieves the most relevant image from the
selected knowledge base, enhances it with Stable Diffusion + ControlNet, and reports the retrieved
description and a CLIP score.
"""
)
with gr.Row():
with gr.Column():
audio_input = gr.Audio(type="numpy", label="🎀 Speak Your Prompt")
guidance_scale_input = gr.Slider(
minimum=1.0, maximum=20.0, step=0.5, value=8.5, label="πŸŽ›οΈ Guidance Scale"
)
num_inference_steps_input = gr.Slider(
minimum=10, maximum=200, step=10, value=100, label="πŸ”’ Number of Inference Steps"
)
index_selection = gr.Dropdown(
choices=["project-atoi-v2", "project-atoi"],
value="project-atoi-v2",
label="πŸ—‚οΈ Select Pinecone Index"
)
knowledge_database_selection = gr.Dropdown(
choices=["rohith2812/atoigeneration-final-data", "rxc5667/3wordsdataset_noduplicates"],
value="rxc5667/3wordsdataset_noduplicates",
label="πŸ“š Select Knowledge Database"
)
submit_button = gr.Button("Generate Image")
with gr.Column():
transcription_output = gr.Textbox(label="πŸ“ Transcribed Prompt")
retrieved_image_output = gr.Image(label="πŸ–ΌοΈ Retrieved Image")
enhanced_image_output = gr.Image(label="✨ Enhanced Image")
retrieved_description_output = gr.Textbox(label="πŸ“œ Retrieved Description")
clip_score_output = gr.Textbox(label="πŸ“Š CLIP Score")
    # gr.Examples cannot populate an Audio input from text strings, so the sample prompts
    # are listed as suggestions for the user to speak instead.
    gr.Markdown(
        """
        **Example prompts to try:**
        - "A picture explaining the line of best fit in linear regression"
        - "Support vector machines"
        - "A picture explaining multiple components in PCA"
        """
    )
submit_button.click(
fn=gradio_interface,
inputs=[
audio_input,
guidance_scale_input,
num_inference_steps_input,
index_selection,
knowledge_database_selection,
],
outputs=[
transcription_output,
retrieved_image_output,
enhanced_image_output,
retrieved_description_output,
clip_score_output,
],
)
# Launch Gradio Interface
if __name__ == "__main__":
demo.launch()