import os

import torch
import torchvision
import clip
import numpy as np
from huggingface_hub import hf_hub_download
from PIL import Image

from utils import load_model_weights
from model import NetG, CLIP_TXT_ENCODER

# Run on the first GPU when available, otherwise fall back to the CPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Hugging Face access token, read from the environment
HF_TOKEN = os.getenv("HF_TOKEN")

# Pretrained EfficientCLIP-GAN checkpoint hosted on the Hugging Face Hub
repo_id = "VinayHajare/EfficientCLIP-GAN"
file_name = "saved_models/state_epoch_1480.pth"

# Load the CLIP ViT-B/32 backbone and wrap its text tower in the project's encoder
clip_text = "ViT-B/32"
clip_model, preprocessor = clip.load(clip_text, device=device)
clip_model = clip_model.eval()
text_encoder = CLIP_TXT_ENCODER(clip_model).to(device)

# Download the checkpoint, build the generator, and load its weights
model_path = hf_hub_download(repo_id=repo_id, filename=file_name, token=HF_TOKEN)
checkpoint = torch.load(model_path, map_location=torch.device(device))
netG = NetG(64, 100, 512, 256, 3, False, clip_model).to(device)
generator = load_model_weights(netG, checkpoint['model']['netG'], multi_gpus=False)


def generate_image_from_text(caption, batch_size=4):
    """Generate `batch_size` images for `caption` and return them as PIL images."""
    # One latent noise vector per requested image
    noise = torch.randn((batch_size, 100)).to(device)
    with torch.no_grad():
        # Encode the caption with CLIP; repeat the sentence embedding across the batch
        tokenized_text = clip.tokenize([caption]).to(device)
        sent_emb, word_emb = text_encoder(tokenized_text)
        sent_emb = sent_emb.repeat(batch_size, 1)
        generated_images = generator(noise, sent_emb, eval=True).float()

    # Map each tensor from [-1, 1] to [0, 255] and convert to PIL
    pil_images = []
    for image_tensor in generated_images.unbind(0):
        image_tensor = image_tensor.clamp(-1, 1)
        image_tensor = (image_tensor + 1.0) / 2.0
        image_numpy = image_tensor.permute(1, 2, 0).cpu().numpy()
        image_numpy = np.clip(image_numpy, 0, 1)
        pil_image = Image.fromarray((image_numpy * 255).astype(np.uint8))
        pil_images.append(pil_image)

    return pil_images


def generate_image_from_text_with_persistent_storage(caption, batch_size=4):
    """Generate `batch_size` images for `caption`, save them as PNGs, and return the file paths."""
    # One latent noise vector per requested image
    noise = torch.randn((batch_size, 100)).to(device)
    with torch.no_grad():
        # Encode the caption with CLIP; repeat the sentence embedding across the batch
        tokenized_text = clip.tokenize([caption]).to(device)
        sent_emb, word_emb = text_encoder(tokenized_text)
        sent_emb = sent_emb.repeat(batch_size, 1)
        generated_images = generator(noise, sent_emb, eval=True).float()

    # Write each image to a persistent directory; save_image rescales from [-1, 1]
    permanent_dir = "generated_images"
    os.makedirs(permanent_dir, exist_ok=True)

    image_paths = []
    for idx, image_tensor in enumerate(generated_images.unbind(0)):
        image_path = os.path.join(permanent_dir, f"image_{idx}.png")
        torchvision.utils.save_image(image_tensor, image_path, value_range=(-1, 1), normalize=True)
        image_paths.append(image_path)

    return image_paths
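

# Minimal usage sketch of the two entry points above, assuming the script is run
# directly and the checkpoint download succeeds. The caption string is a made-up
# example, not taken from the project.
if __name__ == "__main__":
    example_caption = "a small bird with a red head and black wings"

    # In-memory path: returns PIL images, saved here only for inspection
    images = generate_image_from_text(example_caption, batch_size=2)
    for i, img in enumerate(images):
        img.save(f"preview_{i}.png")

    # Persistent path: writes PNGs under generated_images/ and returns their paths
    paths = generate_image_from_text_with_persistent_storage(example_caption, batch_size=2)
    print("Saved images:", paths)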