# Import packages
import pickle
import torch
import torch.nn as nn
from torchvision import transforms, models
from PIL import Image, ImageDraw, ImageFont
import gradio as gr
import numpy as np
import pandas as pd
import torch.nn.functional as F
from torchcam.methods import SmoothGradCAMpp
from torchcam.utils import overlay_mask
from torchvision.transforms.functional import to_pil_image
from sklearn.metrics.pairwise import cosine_similarity

# Import specific Detectron2 packages
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer, ColorMode
from detectron2.data import MetadataCatalog
from detectron2 import model_zoo

# Load detectron2 model and set device to CPU.
# The same model-zoo entry supplies both the config file and the checkpoint.
_MASK_RCNN_CONFIG = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"

cfg = get_cfg()
cfg.MODEL.DEVICE = "cpu"
cfg.merge_from_file(model_zoo.get_config_file(_MASK_RCNN_CONFIG))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # score threshold used at test time
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(_MASK_RCNN_CONFIG)
predictor = DefaultPredictor(cfg)


# Presteps to combine models from Assignment 3
class Classifier(nn.Module):
    """Single linear layer mapping a feature vector to raw class scores."""

    def __init__(self, input_channels, num_classes):
        """Create the layer.

        Args:
            input_channels: Size of the incoming feature vector.
            num_classes: Number of output scores.
        """
        super().__init__()
        # The attribute name becomes part of the state-dict keys, so it must
        # stay ``Linear1`` for the saved classifier weights to load.
        self.Linear1 = nn.Linear(input_channels, num_classes)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        return self.Linear1(x)


class FeatureExtractor(nn.Module):
    """Pretrained ResNet-18 without its last child module, with flat output."""

    def __init__(self):
        """Build the truncated ResNet-18 backbone."""
        super().__init__()
        # NOTE(review): ``pretrained=True`` is kept as in the original; newer
        # torchvision releases prefer the ``weights=`` argument - confirm the
        # installed version before changing it.
        backbone = models.resnet18(pretrained=True)
        layers = list(backbone.children())[:-1]
        self.feature_extractor = nn.Sequential(*layers)

    def forward(self, x):
        """Run the backbone and flatten everything after the batch dimension."""
        features = self.feature_extractor(x)
        return torch.flatten(features, start_dim=1)


input_channels = 512  # Output size of the ResNet18 feature extractor
num_classes = 5  # Number of year range categories from Assignment 3

# Initialize the classifier
classifier = Classifier(input_channels, num_classes)

# Load the trained classifier weights
# map_location keeps the checkpoint loadable on CPU-only hosts even if it was
# saved from a GPU; the whole app is configured to run on CPU.
classifier.load_state_dict(torch.load('class_model_state.pt', map_location='cpu'))


# Combine feature extractor and fully connected layer
class CombinedModel(nn.Module):
    """Apply a feature extractor followed by a classifier in one forward pass."""

    def __init__(self, feature_extractor, classifier):
        """Store the two sub-modules.

        Args:
            feature_extractor: Module turning an image batch into features.
            classifier: Module turning those features into class scores.
        """
        super().__init__()
        self.feature_extractor = feature_extractor
        self.classifier = classifier

    def forward(self, x):
        """Return the classifier output for the features extracted from ``x``."""
        x = self.feature_extractor(x)
        x = self.classifier(x)
        return x


# Initialize the feature extractor
feature_extractor = FeatureExtractor()

# Combine the models and put model in evaluation mode for prediction later
combined_model = CombinedModel(feature_extractor, classifier)
combined_model.eval()

# Image transformer; uses the mean and standard deviation computed for the
# ImageNet dataset for normalization.
imgTransforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load bodytype model from Assignment 3 (only predicts 9 classes as already
# mentioned in Assignment 3).
# NOTE(review): this file holds a fully pickled model, not a state dict. Recent
# PyTorch releases default ``torch.load`` to ``weights_only=True``; if loading
# fails there, pass ``weights_only=False`` - confirm against the installed version.
resnet_18_trained = torch.load("resnet_18_trained.pth", map_location="cpu")
resnet_18_trained.eval()

# Load feature extractor for the cosine similarity
features_mod = nn.Sequential(*(list(resnet_18_trained.children())[:-1]))
features_mod.eval()

# Load morphs dictionary from Assignment 3.
# NOTE: pickle can execute arbitrary code while loading; only ever point this
# at a trusted file shipped with the app.
with open('saved_dictionary.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

# Class index used for cars in the predictor output (per the original author's
# research on the COCO label set).
COCO_CAR_CLASS_ID = 2

# Mapping dictionary for year categories as defined in Assignment 3
YEAR_CATEGORY_MAP = {
    0: '2000-2003',
    1: '2004-2008',
    2: '2009-2011',
    3: '2012-2014',
    4: '2015-2017',
}

# Mapping dictionary for car body types as defined in Assignment 3
BODY_TYPE_MAP = {
    0: 'Hatchback',
    1: 'SUV',
    2: 'MPV',
    3: 'Saloon',
    4: 'Convertible',
    5: 'Coupe',
    6: 'Pickup',
    7: 'Estate',
    8: 'Unknown',
}


def _crop_car_on_white(input_img, mask):
    """Crop the masked object to its bounding box and paste it on white.

    Returns ``None`` when the mask selects no pixels.
    """
    rows, cols = np.where(mask)
    if rows.size == 0:
        return None
    min_y, max_y = rows.min(), rows.max()
    min_x, max_x = cols.min(), cols.max()

    cropped_car = input_img[min_y:max_y + 1, min_x:max_x + 1]
    cropped_mask = mask[min_y:max_y + 1, min_x:max_x + 1]

    # White canvas of the crop's size; only the masked pixels are copied over.
    white_back = np.full_like(cropped_car, 255)
    white_back[cropped_mask] = cropped_car[cropped_mask]
    return white_back


def _cam_overlay(model, inp, base_image):
    """Overlay the SmoothGradCAM++ map of the model's top class on ``base_image``."""
    # Gradients must stay enabled here, so this runs outside torch.no_grad().
    with SmoothGradCAMpp(model) as cam_extractor:
        out = model(inp)
        # Retrieve the CAM by passing the class index and the model output
        activation_map = cam_extractor(out.squeeze(0).argmax().item(), out)
    return overlay_mask(
        base_image,
        to_pil_image(activation_map[0].squeeze(0), mode='F'),
        alpha=0.6,
    )


# Detect function for the App Interface
def detect(input_img):
    """Find the largest car in an image and score its design.

    Args:
        input_img: RGB image as a NumPy array of shape (H, W, 3), as delivered
            by the Gradio image input, or ``None`` when nothing was uploaded.

    Returns:
        A 7-tuple matching the interface outputs: status text, the cropped car
        on a white background, a prediction summary string, the modernity
        score, the typicality score (``None`` if no morph is stored for the
        predicted year/body combination), the year-category heatmap and the
        body-type heatmap. All entries after the status text are ``None`` when
        no image was given or no car was found.
    """
    if input_img is None:
        return "No image provided", None, None, None, None, None, None

    no_car = ("No car found", None, None, None, None, None, None)

    # DefaultPredictor documents its input as BGR, while Gradio delivers RGB,
    # so flip the channel axis for detection only.
    outputs = predictor(np.ascontiguousarray(input_img[:, :, ::-1]))
    instances = outputs["instances"]
    pred_classes = instances.pred_classes  # predicted classes
    pred_masks = instances.pred_masks  # predicted masks for cropping the image

    # Filter out indices of car objects in the image
    automobile_indices = (pred_classes == COCO_CAR_CLASS_ID).nonzero(as_tuple=True)[0].tolist()
    if not automobile_indices:
        return no_car

    # Find car with the largest number of pixels
    car_index = max(automobile_indices, key=lambda i: pred_masks[i].sum().item())
    mask = pred_masks[car_index].cpu().numpy()

    white_back = _crop_car_on_white(input_img, mask)
    if white_back is None:
        return no_car

    # Transform cropped image for model predictions
    cropped_image_pil = Image.fromarray(white_back.astype(np.uint8))
    inp = imgTransforms(cropped_image_pil).unsqueeze(0)

    # Predict year category and get probabilities for the modernity score
    with torch.no_grad():
        out = combined_model(inp)
        prediction = F.softmax(out, dim=1)
    pred_year_idx = torch.argmax(prediction, dim=1).item()

    # Modernity score: expected category index under the predicted distribution
    year_categories = torch.arange(prediction.shape[1], dtype=torch.float32)
    modernity_sc = (prediction * year_categories).sum(dim=1).item()
    modernity = round(float(modernity_sc), 2)

    # Create Year Category heatmap
    heatmap_modernity = _cam_overlay(combined_model, inp, cropped_image_pil)

    # Get year category from prediction
    pred_year_cat = YEAR_CATEGORY_MAP[pred_year_idx]

    # Predict bodytype
    with torch.no_grad():
        output = resnet_18_trained(inp)
    body_type = BODY_TYPE_MAP[torch.argmax(output, dim=1).item()]

    # Create bodytype heatmap
    heatmap_body = _cam_overlay(resnet_18_trained, inp, cropped_image_pil)

    # Extract features for cosine similarity
    with torch.no_grad():
        features = features_mod(inp).view(1, -1)

    # Combine predicted year category and predicted body type to get key for morph dictionary
    year_body_cat = pred_year_cat + '_' + body_type

    # Calculate cosine similarity as typicality score; a combination without a
    # stored morph yields no score instead of crashing the app.
    mean_features = loaded_dict.get(year_body_cat)
    if mean_features is None:
        cosin = None
    else:
        cosin_sim = cosine_similarity(features, mean_features.unsqueeze(0)).item()
        cosin = round(float(cosin_sim), 2)

    year_body = f"Predicted Year Category: {pred_year_cat}, Predicted body type: {body_type}"

    return "Car found", white_back, year_body, modernity, cosin, heatmap_modernity, heatmap_body


demo = gr.Interface(
    fn=detect,
    inputs=gr.Image(type="numpy", label="Upload an image"),
    outputs=[
        gr.Textbox(label = "Car detection"),
        gr.Image(label="Car Image", type = "pil"),
        gr.Textbox(label = "Predictions"),
        gr.Number(label="Design Modernity Score"),
        gr.Number(label="Typicality Score"),
        gr.Image(label="Heatmap for Production Year Category", type="pil"),
        gr.Image(label="Heatmap for Car Body Type Category", type="pil"),
    ],
    title='Car detection and Modernity and Typicality Scores of Cars',
    description='Select one of the provided example images or upload your own image. The model will then search for a car in the image. If one or more cars are being detected, the car with the largest number of pixels is being extracted, and its design modernity and typicality scores are being calculated.',
    examples=[["Old_car.jpg"], ['Auto-ueberholt-Fahrrad-scaled.jpeg'], ['160923-nabu-autobahn-helge-may7.jpeg']]
)

demo.launch()