Spaces:

Sanshruth
/

VQGAN_CLIP

Running

App Files Files Community

VQGAN_CLIP / app.py

Sanshruth

Update app.py

31573b4 verified about 2 months ago

raw

history blame

10.9 kB

	import subprocess
	import threading
	import time
	import zipfile
	def unzip_content():
	    """Extract ``./content.zip`` into the current working directory.

	    Python's ``zipfile`` module is tried first. If that raises for any
	    reason, the system ``unzip -o`` command is used as a fallback.

	    Raises:
	        RuntimeError: If both extraction methods fail. The error raised by
	            the fallback is attached as the cause.
	    """
	    try:
	        # First try using Python's zipfile
	        print("Attempting to unzip content using Python...")
	        with zipfile.ZipFile('./content.zip', 'r') as zip_ref:
	            zip_ref.extractall('.')
	    except Exception as py_err:
	        # Deliberately broad: any failure here should trigger the fallback.
	        print(f"Python unzip failed: {py_err}")
	        try:
	            # Fallback to system unzip command (-o overwrites without prompting)
	            print("Attempting to unzip content using system command...")
	            subprocess.run(['unzip', '-o', './content.zip'], check=True)
	        except Exception as sys_err:
	            print(f"System unzip failed: {sys_err}")
	            # Chain the cause so the real reason is visible in tracebacks.
	            raise RuntimeError("Failed to unzip content using both methods") from sys_err
	    print("Content successfully unzipped!")

	# Extract the bundled archive before the imports that follow. A failure is
	# only reported as a warning, so start-up continues either way.
	# NOTE(review): presumably the archive supplies packages imported further
	# down (e.g. CLIP / taming-transformers) - confirm.
	try:
	    unzip_content()
	except Exception as unzip_error:
	    print(f"Warning: Could not unzip content: {unzip_error}")

	import gradio as gr
	import numpy as np
	import torch
	import torchvision
	import torchvision.transforms
	import torchvision.transforms.functional
	import PIL
	import matplotlib.pyplot as plt
	import yaml
	from omegaconf import OmegaConf
	from CLIP import clip
	import os
	import sys

	# Put the local taming-transformers directory on sys.path so the ``taming``
	# package can be imported without changing the working directory.
	taming_path = os.path.join(os.getcwd(), 'taming-transformers')
	sys.path.append(taming_path)
	from taming.models.vqgan import VQModel


	from PIL import Image
	import cv2
	import imageio

	# Single torch device used by every model and tensor in this module:
	# CUDA when available, otherwise the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



	def create_video(images_list):
	    """Encode a list of image tensors into an MP4 file and return its path.

	    Args:
	        images_list: Sequence of 3-D tensors. Each one is moved to the CPU,
	            transposed with axes ``(1, 2, 0)`` and scaled by 255 before being
	            cast to ``uint8`` - presumably (C, H, W) with values in [0, 1];
	            confirm against the caller.

	    Returns:
	        The path of the written video inside the current working directory,
	        or ``None`` when ``images_list`` is empty.

	    Raises:
	        Exception: Any error raised while writing is re-raised after the
	            partially written file has been removed.
	    """
	    if not images_list:
	        print("No images provided.")
	        return None

	    # File name is derived from the current time in whole seconds.
	    video_path = os.path.join(os.getcwd(), f"output_{int(time.time())}.mp4")

	    try:
	        # The context manager closes the writer even if a frame fails, so
	        # the file is released before the cleanup below removes it.
	        with imageio.get_writer(video_path, fps=10, codec='libx264', quality=7,
	                                output_params=['-movflags', 'faststart']) as video_writer:
	            for img_tensor in images_list:
	                # detach() keeps .numpy() valid even for tensors that track gradients.
	                frame = img_tensor.detach().cpu().numpy().transpose((1, 2, 0))
	                video_writer.append_data((frame * 255).astype('uint8'))
	    except Exception:
	        # Do not leave a truncated video behind.
	        if os.path.exists(video_path):
	            os.remove(video_path)
	        raise

	    return video_path


	def save_from_tensors(tensor):
	    """Convert an image tensor into a ``uint8`` NumPy array.

	    A copy of ``tensor`` is multiplied by 255, cast to bytes, moved to the
	    CPU and transposed with axes ``(1, 2, 0)``. The input is not modified.
	    Presumably the input is (C, H, W) in [0, 1] - confirm with callers.
	    """
	    scaled = tensor.clone().mul(255).byte()
	    as_array = scaled.cpu().numpy()
	    return as_array.transpose((1, 2, 0))

	def norm_data(data):
	    """Clip ``data`` to [-1, 1] and rescale it linearly to [0, 1]."""
	    clipped = data.clip(-1, 1)
	    return (clipped + 1) / 2

	def setup_clip_model():
	    """Load the non-JIT ``ViT-B/32`` CLIP model in eval mode on ``device``.

	    The second value returned by ``clip.load`` is discarded.
	    """
	    clip_model, _unused = clip.load('ViT-B/32', jit=False)
	    clip_model.eval()
	    clip_model.to(device)
	    return clip_model

	def setup_vqgan_model(config_path, checkpoint_path):
	    """Build a ``VQModel`` from a config file and load checkpoint weights.

	    Args:
	        config_path: Path to an OmegaConf-loadable config; the model is
	            constructed from its ``model.params`` section.
	        checkpoint_path: Path to a checkpoint whose ``"state_dict"`` entry
	            holds the weights.

	    Returns:
	        The model in eval mode, moved to ``device``.
	    """
	    cfg = OmegaConf.load(config_path)
	    vqgan = VQModel(**cfg.model.params)
	    # NOTE(review): torch.load unpickles the file - only use trusted checkpoints.
	    checkpoint = torch.load(checkpoint_path, map_location="cpu")
	    # strict=False tolerates keys that do not match the model.
	    vqgan.load_state_dict(checkpoint["state_dict"], strict=False)
	    vqgan.eval()
	    return vqgan.to(device)

	def generator(x, model):
	    """Decode ``x`` by applying ``model.post_quant_conv`` then ``model.decoder``."""
	    return model.decoder(model.post_quant_conv(x))

	def encode_text(text, clip_model):
	    """Tokenize ``text`` and return a detached copy of its CLIP text encoding."""
	    tokens = clip.tokenize(text).to(device)
	    encoding = clip_model.encode_text(tokens)
	    # Detach and clone so the result carries no autograd history.
	    return encoding.detach().clone()

	def create_encoding(include, exclude, extras, clip_model):
	    """Encode three prompt lists with ``encode_text``.

	    Returns:
	        A tuple ``(include_enc, exclude_enc, extras_enc)`` of lists, each
	        holding one encoding per prompt in the matching input list.
	    """
	    def encode_all(prompts):
	        return [encode_text(prompt, clip_model) for prompt in prompts]

	    return encode_all(include), encode_all(exclude), encode_all(extras)

	def create_crops(img, num_crops=32, size1=225, noise_factor=0.05):
	    """Build a batch of random, noisy 224x224 crops from an image tensor.

	    Steps:
	      1. Zero-pad the last two dimensions of ``img`` by ``size1 // 2`` on
	         every side.
	      2. Apply a random horizontal flip followed by
	         ``RandomAffine(30, translate=(0.1, 0.1), fill=0)``.
	      3. Cut ``num_crops`` square windows. The side length is drawn from a
	         normal(1.2, 0.3) clipped to [0.43, 1.9] and multiplied by
	         ``size1``; both offsets are uniform integers in
	         ``[0, 2 * size1 - side)``.
	      4. Resize every window to 224x224 with bilinear interpolation.
	      5. Add Gaussian noise scaled by ``noise_factor`` and by one uniform
	         random factor per crop.

	    Args:
	        img: 4-D tensor, indexed as ``img[:, :, rows, cols]`` - presumably
	            (N, C, H, W); confirm.
	        num_crops: Number of windows to cut.
	        size1: Reference size used for padding, window size and offsets.
	        noise_factor: Multiplier for the added noise.

	    Returns:
	        All crops concatenated along dimension 0.
	    """
	    augment = torch.nn.Sequential(
	        torchvision.transforms.RandomHorizontalFlip(),
	        torchvision.transforms.RandomAffine(30, translate=(0.1, 0.1), fill=0),
	    ).to(device)

	    pad = size1 // 2
	    padded = torch.nn.functional.pad(img, (pad, pad, pad, pad), mode='constant', value=0)
	    augmented = augment(padded)

	    def random_crop():
	        # Order of the random draws matters for reproducibility:
	        # side length first, then the dim-2 offset, then the dim-3 offset.
	        side = int(torch.normal(1.2, .3, ()).clip(.43, 1.9) * size1)
	        row_start = torch.randint(0, int(size1 * 2 - side), ())
	        col_start = torch.randint(0, int(size1 * 2 - side), ())
	        window = augmented[:, :, row_start:row_start + side, col_start:col_start + side]
	        return torch.nn.functional.interpolate(window, (224, 224), mode='bilinear', align_corners=True)

	    img_crops = torch.cat([random_crop() for _ in range(num_crops)], 0)

	    gaussian = torch.randn_like(img_crops, requires_grad=False)
	    # One noise strength per crop, broadcast over the remaining dimensions.
	    per_crop_scale = torch.rand((img_crops.shape[0], 1, 1, 1)).to(device)
	    return img_crops + noise_factor * per_crop_scale * gaussian

	def optimize_result(params, prompt, vqgan_model, clip_model, w1, w2, extras_enc, exclude_enc):
	    """Compute the CLIP-guided loss for the current latent ``params``.

	    The latent is decoded with ``generator``, rescaled with ``norm_data``,
	    turned into random crops, normalized per channel and encoded by
	    ``clip_model.encode_image``. The loss rewards cosine similarity with the
	    normalized target ``w1 * prompt + w2 * extras_enc[0]`` and penalizes
	    similarity with ``exclude_enc[0]``.

	    Only the first entry of ``extras_enc`` and of ``exclude_enc`` is used.

	    Returns:
	        A scalar tensor: ``-1 * mean(target similarity)
	        + 0.5 * mean(exclude similarity)``.
	    """
	    alpha = 1
	    beta = 0.5
	    # NOTE(review): these look like CLIP's standard preprocessing mean/std - confirm.
	    clip_normalize = torchvision.transforms.Normalize(
	        (0.48145466, 0.4578275, 0.40821073),
	        (0.26862954, 0.26130258, 0.27577711),
	    )

	    decoded = norm_data(generator(params, vqgan_model))
	    crops = clip_normalize(create_crops(decoded))
	    img_enc = clip_model.encode_image(crops)

	    blended = w1 * prompt + w2 * extras_enc[0]
	    target_enc = blended / blended.norm(dim=-1, keepdim=True)
	    avoid_enc = exclude_enc[0]

	    main_loss = torch.cosine_similarity(target_enc, img_enc, dim=-1)
	    penalize_loss = torch.cosine_similarity(avoid_enc, img_enc, dim=-1)

	    return -alpha * main_loss.mean() + beta * penalize_loss.mean()

	def optimize(params, optimizer, prompt, vqgan_model, clip_model, w1, w2, extras_enc, exclude_enc):
	    """Run one optimizer step on ``params`` and return the loss tensor.

	    The returned loss is the value computed by ``optimize_result`` before
	    the step was applied.
	    """
	    # Clearing gradients before the forward pass is equivalent to clearing
	    # them just before backward().
	    optimizer.zero_grad()
	    step_loss = optimize_result(params, prompt, vqgan_model, clip_model, w1, w2, extras_enc, exclude_enc)
	    step_loss.backward()
	    optimizer.step()
	    return step_loss

	def training_loop(params, optimizer, include_enc, exclude_enc, extras_enc, vqgan_model, clip_model, w1, w2,
	                  total_iter=200, show_step=1):
	    """Optimize ``params`` towards each include prompt and record snapshots.

	    For every encoding in ``include_enc``, ``total_iter`` optimizer steps are
	    run. On every ``show_step``-th iteration the current latent is decoded
	    without gradients, rescaled with ``norm_data`` and stored, together with
	    a detached copy of the latent, and the loss is printed.

	    Returns:
	        ``(res_img, res_z)``: the list of decoded images (first batch
	        element only) and the list of latent snapshots, in recording order.
	    """
	    frames = []
	    latents = []

	    for prompt in include_enc:
	        for step in range(total_iter):
	            loss = optimize(params, optimizer, prompt, vqgan_model, clip_model, w1, w2, extras_enc, exclude_enc)

	            if step % show_step != 0:
	                continue

	            with torch.no_grad():
	                decoded = generator(params, vqgan_model)
	                frame = norm_data(decoded[0].to(device))
	            frames.append(frame)
	            latents.append(params.clone().detach())
	            print(f"loss: {loss.item():.4f}\nno. of iteration: {step}")

	        torch.cuda.empty_cache()
	    return frames, latents

	def _split_prompts(text):
	    """Split a comma-separated string into stripped, non-empty prompts.

	    Falls back to a single empty prompt when nothing remains, so callers
	    always receive at least one entry.
	    """
	    prompts = [part.strip() for part in text.split(',')]
	    return [prompt for prompt in prompts if prompt] or ['']


	def generate_art(include_text, exclude_text, extras_text, num_iterations):
	    """Generate a morphing video from text prompts and return its path.

	    The CLIP and VQGAN models are loaded on every call. The latent is
	    initialised from ``./gradient1.png`` (resized to 400x225), blended with
	    Gaussian noise, and optimised with AdamW for ``num_iterations`` steps per
	    include prompt. The decoded snapshots are written out by
	    ``create_video``.

	    Args:
	        include_text: Comma-separated prompts to optimise towards, one
	            optimisation run per prompt.
	        exclude_text: Comma-separated prompts to steer away from.
	        extras_text: Comma-separated extra prompts blended into the target.
	        num_iterations: Optimisation steps per include prompt.

	    Returns:
	        The path returned by ``create_video`` (``None`` if no frame was
	        produced).

	    Raises:
	        Exception: Errors from model loading, optimisation or video writing
	            propagate unchanged.
	    """
	    # Empty entries (e.g. from a trailing comma) are dropped: an empty
	    # include prompt would otherwise trigger a full extra optimisation run.
	    include = _split_prompts(include_text)
	    exclude = _split_prompts(exclude_text)
	    extras = _split_prompts(extras_text)

	    w1, w2 = 1.0, 0.9

	    # Setup models
	    clip_model = setup_clip_model()
	    vqgan_model = setup_vqgan_model(
	        "./models/vqgan_imagenet_f16_16384/configs/model.yaml",
	        "./models/vqgan_imagenet_f16_16384/checkpoints/last.ckpt",
	    )

	    # Parameters
	    learning_rate = 0.1
	    wd = 0.1
	    size1, size2 = 225, 400

	    # Initialize parameters. The file handle is closed as soon as the pixels
	    # are copied; RGB is forced so the tensor always has three channels.
	    with PIL.Image.open('./gradient1.png') as seed_file:
	        seed_image = seed_file.convert('RGB').resize((size2, size1))
	    initial_image = torchvision.transforms.ToTensor()(seed_image).unsqueeze(0).to(device)

	    with torch.no_grad():
	        z, _, _ = vqgan_model.encode(initial_image)

	    # Move the tensor first and wrap it afterwards, so the optimised object
	    # is always a leaf Parameter.
	    params = torch.nn.Parameter(z.to(device))
	    optimizer = torch.optim.AdamW([params], lr=learning_rate, weight_decay=wd)
	    # Start from a 60/40 blend of the encoded image and Gaussian noise.
	    params.data = params.data * 0.6 + torch.randn_like(params.data) * 0.4

	    # Encode prompts
	    include_enc, exclude_enc, extras_enc = create_encoding(include, exclude, extras, clip_model)

	    # Run training loop; the latent snapshots are not needed here.
	    res_img, _ = training_loop(params, optimizer, include_enc, exclude_enc, extras_enc,
	                               vqgan_model, clip_model, w1, w2, total_iter=num_iterations)

	    # Create video directly from tensors
	    return create_video(res_img)


	def gradio_interface(include_text, exclude_text, extras_text, num_iterations):
	    """Gradio callback: generate the video and return its path.

	    The generated file is scheduled for deletion 10 seconds after this
	    function returns.
	    NOTE(review): this relies on Gradio having copied or served the file by
	    then - confirm for the Gradio version in use.

	    Args:
	        include_text: Comma-separated include prompts.
	        exclude_text: Comma-separated exclude prompts.
	        extras_text: Comma-separated extra style prompts.
	        num_iterations: Iteration count from the UI; converted with ``int``.

	    Returns:
	        Path of the generated video file.

	    Raises:
	        gr.Error: If generation fails or produces no file. The output
	            component is a video, so an error message returned as a string
	            would be treated as a file path; raising shows it in the UI.
	    """
	    try:
	        video_path = generate_art(include_text, exclude_text, extras_text, int(num_iterations))
	    except Exception as err:
	        raise gr.Error(f"An error occurred: {err}") from err

	    # create_video returns None when there are no frames (e.g. 0 iterations).
	    if not video_path or not os.path.exists(video_path):
	        raise gr.Error("Video generation failed")

	    def cleanup():
	        # Best-effort removal: the file may already be gone.
	        try:
	            os.remove(video_path)
	        except OSError:
	            pass

	    # Daemon timer, so a pending cleanup never blocks interpreter shutdown.
	    timer = threading.Timer(10.0, cleanup)
	    timer.daemon = True
	    timer.start()

	    return video_path

	# Second extraction attempt at import time; failures are only reported.
	# NOTE(review): the same call already runs near the top of this file, so
	# this one looks redundant - confirm before removing it.
	try:
	    unzip_content()
	except Exception as unzip_error:
	    print(f"Warning: Could not unzip content: {unzip_error}")

	# Gradio UI definition: three comma-separated prompt boxes and an iteration
	# count go in, one generated video comes out. Launching happens in the
	# ``__main__`` guard below.
	# NOTE(review): css="allow" is not CSS and probably has no effect - confirm.
	# NOTE(review): newer Gradio releases may expect ``flagging_mode`` instead
	# of ``allow_flagging`` - verify against the installed version.
	iface = gr.Interface(
	    fn=gradio_interface,
	    inputs=[
	        gr.Textbox(
	            label="Include Prompts (comma-separated)",
	            value="desert, heavy rain, cactus",
	        ),
	        gr.Textbox(
	            label="Exclude Prompts (comma-separated)",
	            value="confusing, blurry",
	        ),
	        gr.Textbox(
	            label="Extra Style Prompts (comma-separated)",
	            value="desert, clear, detailed, beautiful, good shape, detailed",
	        ),
	        gr.Number(
	            label="Number of Iterations",
	            value=200,
	            minimum=1,
	            maximum=1000,
	        ),
	    ],
	    outputs=gr.Video(label="Generated Morphing Video", format="mp4", autoplay=True),
	    title="VQGAN-CLIP Art Generator",
	    description="""
	Generate artistic videos using VQGAN-CLIP. Enter your prompts separated by commas and adjust the number of iterations. The model will generate a morphing video based on your inputs.
	Note: This application requires GPU access. Please either:
	1. Use the Colab notebook available at https://github.com/SanshruthR/VQGAN-CLIP
	2. Clone this space and enable GPU in your personal copy.
	""",
	    css="allow",
	    allow_flagging="never",
	)

	# Script entry point: report whether CUDA is usable, then start the app.
	if __name__ == "__main__":
	    gpu_status = "GPU AVAILABLE" if torch.cuda.is_available() else "NO GPU FOUND"
	    print("Checking GPU availability:", gpu_status)
	    iface.launch()