import json
from collections import defaultdict

import numpy as np
from shapely.wkt import loads
from tqdm import tqdm

from infer_utils import create_mask


def clean_string(s):
    """Normalize a label for lookup: spaces -> dashes, dots stripped, lowercased."""
    return s.replace(' ', '-').replace('.', '').lower()


def get_class_dict(dataset):
    """Map each task prompt to its label-name -> mask-value dictionary."""
    if dataset == "qfabric":
        class_dict = {
            "temporal_region_based_question_answering: What is the development status in this region [bbox] in image N?":
            {
                "prior-construction": 1,
                "greenland": 2,
                "land-cleared": 3,
                "excavation": 4,
                "materials-dumped": 5,
                "construction-started": 6,
                "construction-midway": 7,
                "construction-done": 8,
                "operational": 9
            },
            "region_based_question_answering: Identify the type of urban development that has occurred in this area [bbox].":
            {
                "residential": 10,
                "commercial": 11,
                "industrial": 12,
                "road": 13,
                "demolition": 14,
                "mega-projects": 15
            }
        }
    elif dataset == "xbd":
        class_dict = {
            "classification: Classify the level of damage experienced by the building at location [bbox] in the second image. Choose from: No damage, Minor Damage, Major Damage, Destroyed.":
            {
                "no-damage": 1,
                "minor-damage": 2,
                "major-damage": 3,
                "destroyed": 4,
            }
        }
    else:
        raise ValueError(f"Dataset {dataset} should not be evaluated on segmentation classification.")
    return class_dict
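
# Example round-trip (illustrative values): a raw model label such as
# "Minor Damage" normalizes to a key of the xbd mapping:
#
#   clean_string("Minor Damage")                 -> "minor-damage"
#   get_class_dict("xbd")[task]["minor-damage"]  -> 2   (task = the xbd prompt key)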


def classification_segmentation(answer_path, dataset, per_class_f1=False, height=256, width=256):
    """
    Paint the predicted and ground-truth classes onto segmentation masks of the
    original polygon for every answer in `answer_path`, then report per-class
    and class-weighted per-pixel F1 scores between the two sets of masks.
    """
    with open(answer_path) as f:
        results = json.load(f)
    classes = get_class_dict(dataset)
    # Per-label pixel counts, accumulated across the whole answer file.
    class_stats = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0, 'count': 0})
    for result in tqdm(results.values()):
        if result['task'] not in classes:
            continue
        class_dict = classes[result['task']]
        predicted_class = clean_string(result['predicted'])
        try:
            ground_truth_class = clean_string(result["ground_truth"])
        except KeyError:
            ground_truth_class = clean_string(result["original_answer"])
        # Skip answers whose labels fall outside the task's class set before
        # doing any mask work.
        if predicted_class not in class_dict or ground_truth_class not in class_dict:
            continue
        # Rasterize the original polygon once, then paint the predicted and
        # ground-truth labels onto otherwise-empty masks.
        original_polygon = loads(result['original_input_polygon'])
        pred_msk = np.zeros((height, width), dtype='uint8')
        gt_msk = np.zeros((height, width), dtype='uint8')
        _msk = create_mask(original_polygon, im_size=(height, width))
        pred_msk[_msk > 0] = class_dict[predicted_class]
        gt_msk[_msk > 0] = class_dict[ground_truth_class]
        # Accumulate per-pixel confusion counts for every label of this task.
        for label in class_dict.values():
            pred_mask = (pred_msk == label)
            gt_mask = (gt_msk == label)
            class_stats[label]['tp'] += np.sum(pred_mask & gt_mask)
            class_stats[label]['fp'] += np.sum(pred_mask & ~gt_mask)
            class_stats[label]['fn'] += np.sum(~pred_mask & gt_mask)
            class_stats[label]['count'] += np.sum(gt_mask)
    scores_dict = {}
    for task, class_info in classes.items():
        print(f"Task: {task}")
        class_f1_scores = {}
        weighted_f1_score = 0
        total_weight = 0
        tp = 0
        fp = 0
        fn = 0
        # Total ground-truth pixels across this task's classes; constant for
        # the task, so compute it once rather than per class.
        total_samples = sum(s['count'] for label, s in class_stats.items() if label in class_info.values())
        for class_name, class_label in class_info.items():
            stats = class_stats[class_label]
            # Per-class F1 from the accumulated pixel counts.
            if stats['tp'] + stats['fp'] == 0 or stats['tp'] + stats['fn'] == 0:
                f1 = 0.0
            else:
                precision = stats['tp'] / (stats['tp'] + stats['fp'])
                recall = stats['tp'] / (stats['tp'] + stats['fn'])
                if precision + recall == 0:
                    f1 = 0.0
                else:
                    f1 = 2 * (precision * recall) / (precision + recall)
            class_f1_scores[class_name] = f1
            if stats['count'] > 0:
                # Weight each class by inverse prevalence, so rare classes
                # contribute as much to the average as common ones.
                prevalence_inv = total_samples / stats['count']
                weighted_f1_score += f1 * prevalence_inv
                total_weight += prevalence_inv
            tp += stats['tp']
            fp += stats['fp']
            fn += stats['fn']
        # Micro F1 over all classes: 2*tp / (2*tp + fp + fn), written here as
        # tp / (tp + 0.5 * (fp + fn)).
        if tp + fp == 0 or tp + fn == 0:
            micro_f1 = 0.0
        else:
            micro_f1 = tp / (tp + 0.5 * (fp + fn))
        if total_weight > 0:
            weighted_f1_score /= total_weight
        else:
            weighted_f1_score = 0.0
        scores_dict[task] = (class_f1_scores, weighted_f1_score)
        print(f"Per-class F1 scores: {class_f1_scores}")
        if dataset == 'qfabric':
            print(f"Micro average F1 score: {micro_f1}")
        else:
            print(f"Weighted average F1 score: {weighted_f1_score}")
    return scores_dict
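

if __name__ == "__main__":
    # Minimal usage sketch. "answers.json" is a hypothetical path; any answer
    # file with the expected fields (task, predicted, ground_truth or
    # original_answer, and original_input_polygon as WKT) works the same way.
    scores = classification_segmentation("answers.json", dataset="xbd")
    for task, (per_class_f1s, weighted_f1) in scores.items():
        print(task, per_class_f1s, weighted_f1)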