import gradio as gr
import torch
from datasets import load_dataset
from transformers import ViTFeatureExtractor, ViTForImageClassification

# Load the classifier once at startup rather than on every request
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

# CIFAR-100 is loaded only as sample data; the interface below classifies the
# uploaded image. The "img" column holds the pictures ("fine_label" is just the label id).
dataset = load_dataset("cifar100")
sample_image = dataset["train"][0]["img"]

def classify(image):
    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    # the model predicts one of the 1000 ImageNet classes
    predicted_class_idx = logits.argmax(-1).item()
    return model.config.id2label[predicted_class_idx]

# Text-to-speech model loaded from the Hugging Face Hub (gr.load in newer Gradio versions)
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")

def image2speech(image):
    txt = classify(image)
    return fastspeech(txt), txt

app = gr.Interface(fn=image2speech,
                   inputs="image",
                   outputs=["audio", "text"],
                   title="Image to speech",
                   description="Classifies an image and tells you what it is, intended to help the visually impaired",
                   examples=["remotecontrol.jpg", "calculator.jpg", "cellphone.jpg"],
                   cache_examples=True,  # cache_examples is an Interface argument, not a launch() argument
                   allow_flagging="never")
app.launch()