Spaces:

adirik
/

ALIGN-zero-shot-image-classification

Runtime error

App Files Files Community

ALIGN-zero-shot-image-classification / app.py

adirik

clean up

2b592a0 almost 2 years ago

raw

history blame

2.58 kB

	import torch
	import gradio as gr
	from transformers import AlignProcessor, AlignModel


	# Checkpoint name shared by the processor and the model below.
	_CHECKPOINT = "kakaobrain/align-base"

	# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Load the preprocessing pipeline and the model weights from the same checkpoint.
	processor = AlignProcessor.from_pretrained(_CHECKPOINT)
	model = AlignModel.from_pretrained(_CHECKPOINT)
	model = model.to(device)
	model.eval()  # put the module in evaluation mode; the app only runs inference


	def predict(image, labels):
	    """Score an image against comma-separated candidate labels with ALIGN.

	    Args:
	        image: Image to classify, as delivered by the Gradio image input.
	        labels: Candidate labels as one comma-separated string.

	    Returns:
	        A dict mapping each candidate label to its softmax probability
	        (as a Python float) for the given image.

	    Raises:
	        ValueError: If no image was supplied or no non-empty label remains
	            after parsing.
	    """
	    if image is None:
	        raise ValueError("Please provide an image to classify.")

	    # Split on bare commas and trim whitespace so that "a,b" and "a ,  b"
	    # behave like "a, b"; drop empty entries left by stray/trailing commas.
	    candidates = [part.strip() for part in (labels or "").split(",")]
	    candidates = [label for label in candidates if label]
	    if not candidates:
	        raise ValueError("Please enter at least one candidate label.")

	    inputs = processor(images=image, text=candidates, return_tensors="pt").to(device)

	    # Inference only: skip gradient tracking.
	    with torch.no_grad():
	        outputs = model(**inputs)

	    # Softmax over dim 1 normalizes the scores across the candidate labels;
	    # row 0 is then paired with the labels in order.
	    probs = outputs.logits_per_image.softmax(dim=1).cpu().numpy()
	    return {label: float(score) for label, score in zip(candidates, probs[0])}


	# Blurb handed to gr.Interface as `description`. The first part is raw HTML;
	# the literal "\n\n" below inserts a blank line into the string, presumably so
	# the Markdown that follows (backticks, [COYO] link) is parsed as Markdown
	# rather than as part of the HTML block -- confirm against the rendered page.
	description = """
	<div class="container" style="display:flex;">
	<div class="image">
	<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/132_vit_align/align.png" alt="ALIGN performance" />
	</div>
	<div class="text">
	<p>Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/align">ALIGN</a>,
	as introduced in <a href="https://arxiv.org/abs/2102.05918"><i>"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision"</i></a>. ALIGN features a dual-encoder architecture with EfficientNet and BERT as its vision and text encoders, and learns to align visual and text representations with contrastive learning.
	Unlike previous work, ALIGN leverages a massive noisy dataset and shows that the scale of the corpus can be used to achieve SOTA representations with a simple recipe.
	\n\nALIGN is not open-sourced and the `kakaobrain/align-base` model used for this demo is based on the Kakao Brain implementation that follows the original paper. The model is trained on the open source [COYO](https://github.com/kakaobrain/coyo-dataset) dataset by the Kakao Brain team. To perform zero-shot image classification with ALIGN, upload an image and enter your candidate labels as free-form text separated by a comma followed by a space.</p>
	</div>
	</div>
	"""

	# Build the demo UI: one image input and one text box for the candidate
	# labels, wired to `predict`, with the result shown by a "label" output.
	# The components use the top-level gr.Image / gr.Textbox classes instead of
	# the deprecated `gr.inputs.*` namespace, which newer Gradio releases removed.
	demo = gr.Interface(
	    fn=predict,
	    inputs=[
	        gr.Image(label="Image to classify", type="pil"),
	        gr.Textbox(
	            lines=1,
	            label="Comma separated candidate labels",
	            placeholder="Enter labels separated by ', '",
	        ),
	    ],
	    # NOTE(review): "grass" is a legacy theme name; confirm the installed
	    # Gradio version still recognizes it (it may fall back to the default).
	    theme="grass",
	    outputs="label",
	    examples=[
	        ["assets/cartoon.jpeg", "dinosaur, drawing, forest"],
	        ["assets/painting.jpeg", "watercolor painting, oil painting, boats"],
	    ],
	    title="Zero-Shot Image Classification with ALIGN",
	    description=description,
	)
	demo.launch()