Spaces:
Sleeping
Sleeping
File size: 2,088 Bytes
ca25718 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import ImageReward as RM
import torch
from rewards.base_reward import BaseRewardLoss
class ImageRewardLoss:
"""Image reward loss for optimization."""
def __init__(
self,
weighting: float,
dtype: torch.dtype,
device: torch.device,
cache_dir: str,
memsave: bool = False,
):
self.name = "ImageReward"
self.weighting = weighting
self.dtype = dtype
self.imagereward_model = RM.load("ImageReward-v1.0", download_root=cache_dir)
self.imagereward_model = self.imagereward_model.to(
device=device, dtype=self.dtype
)
self.imagereward_model.eval()
BaseRewardLoss.freeze_parameters(self.imagereward_model.parameters())
def __call__(self, image: torch.Tensor, prompt: str) -> torch.Tensor:
imagereward_score = self.score_diff(prompt, image)
return (2 - imagereward_score).mean()
def score_diff(self, prompt, image):
# text encode
text_input = self.imagereward_model.blip.tokenizer(
prompt,
padding="max_length",
truncation=True,
max_length=35,
return_tensors="pt",
).to(self.imagereward_model.device)
image_embeds = self.imagereward_model.blip.visual_encoder(image)
# text encode cross attention with image
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
self.imagereward_model.device
)
text_output = self.imagereward_model.blip.text_encoder(
text_input.input_ids,
attention_mask=text_input.attention_mask,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_atts,
return_dict=True,
)
txt_features = text_output.last_hidden_state[:, 0, :].to(
self.imagereward_model.device, dtype=self.dtype
)
rewards = self.imagereward_model.mlp(txt_features)
rewards = (rewards - self.imagereward_model.mean) / self.imagereward_model.std
return rewards
|