from typing import Optional
import os

os.environ["WANDB_DISABLED"] = "true"

import numpy as np
import gradio as gr
import torch
import torch.nn as nn
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
from torchvision.transforms.functional import InterpolationMode
from torchvision.io import ImageReadMode, read_image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer, CLIPModel
from huggingface_hub import hf_hub_download
from safetensors.torch import load_model
class Transform(torch.nn.Module):
    """CLIP-style image preprocessing: resize, center-crop, convert to float, normalize."""

    def __init__(self, image_size, mean, std):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            Normalize(mean, std),
        )

    def forward(self, x) -> torch.Tensor:
        """`x` is a `torch.Tensor` image of shape (C, H, W), e.g. as returned by `read_image`."""
        with torch.no_grad():
            x = self.transforms(x)
        return x
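# Usage sketch (assumption, not executed by the app): Transform consumes the uint8
# (3, H, W) tensor produced by torchvision.io.read_image and returns a normalized
# float32 tensor of shape (3, image_size, image_size), matching what CLIP's vision
# tower expects, e.g.:
#   preprocess = Transform(224, image_processor.image_mean, image_processor.image_std)
#   pixel_values = preprocess(read_image("photo.jpg", mode=ImageReadMode.RGB))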
class VisionTextDualEncoderModel(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Text encoder: multilingual XLM-RoBERTa fine-tuned for sentiment
        self.text_encoder = AutoModel.from_pretrained(
            "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
        )
        # Vision encoder: CLIP ViT-B/32 (only its vision tower is used in forward)
        self.vision_encoder = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        vision_output_dim = self.vision_encoder.config.vision_config.hidden_size
        # Classification head over the concatenated text and vision features
        self.fc = nn.Linear(
            self.text_encoder.config.hidden_size + vision_output_dim, num_classes
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ):
        # Encode text inputs and take the pooled representation
        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        ).pooler_output
        # Encode vision inputs with CLIP's vision tower
        vision_outputs = self.vision_encoder.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Concatenate text and vision features
        combined_features = torch.cat(
            (text_outputs, vision_outputs.pooler_output), dim=1
        )
        # Forward through a linear layer for classification
        logits = self.fc(combined_features)
        return {"logits": logits}
# Label mappings for the three sentiment classes
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")
model = VisionTextDualEncoderModel(num_classes=3)
config = model.vision_encoder.config

# Load the fine-tuned weights from the Hub:
# https://huggingface.co/FFZG-cleopatra/M2SA/blob/main/model.safetensors
sf_filename = hf_hub_download("FFZG-cleopatra/M2SA", filename="model.safetensors")
load_model(model, sf_filename)
model.eval()  # inference only: disable dropout

image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
def predict_sentiment(text, image):
    print(text, image)
    # Load the image file as a uint8 RGB tensor of shape (3, H, W)
    image = read_image(image, mode=ImageReadMode.RGB)
    # Tokenize the input text for XLM-RoBERTa
    text_inputs = tokenizer(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # CLIP-style preprocessing: resize, crop, convert to float, normalize
    image_transformations = Transform(
        config.vision_config.image_size,
        image_processor.image_mean,
        image_processor.image_std,
    )
    image_transformations = torch.jit.script(image_transformations)
    pixel_values = image_transformations(image)
    text_inputs["pixel_values"] = pixel_values.unsqueeze(0)

    with torch.no_grad():
        outputs = model(**text_inputs)
        print(outputs)
        # Pick the highest-scoring class and map it back to its label
        prediction = np.argmax(outputs["logits"], axis=-1)
    print(id2label[prediction[0].item()])
    return id2label[prediction[0].item()]
interface = gr.Interface(
    fn=predict_sentiment,
    inputs=[gr.Textbox(), gr.Image(type="filepath")],
    outputs=["text"],
    title="Multilingual Multimodal Sentiment Analysis",
    examples=[["I am enjoying", "A_Sep20_14_1189155141.jpg"]],
    description="Get the positive/neutral/negative sentiment for the given input.",
)
interface.launch(inline=False)
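# Quick smoke test without the UI (sketch; assumes the example image above is present):
#   print(predict_sentiment("I am enjoying", "A_Sep20_14_1189155141.jpg"))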