Update app.py
app.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 import time
 import traceback
 import spaces
-
+import timm
 from torchvision.ops import nms, box_iou
 import torch.nn.functional as F
 from torchvision import transforms
@@ -72,56 +72,77 @@ dog_breeds = ["Afghan_Hound", "African_Hunting_Dog", "Airedale", "American_Staff
 class MultiHeadAttention(nn.Module):
 
     def __init__(self, in_dim, num_heads=8):
+        """
+        Initializes the MultiHeadAttention module.
+        Args:
+            in_dim (int): Dimension of the input features.
+            num_heads (int): Number of attention heads. Defaults to 8.
+        """
         super().__init__()
         self.num_heads = num_heads
-        self.head_dim = max(1, in_dim // num_heads)
-        self.scaled_dim = self.head_dim * num_heads
-        self.fc_in = nn.Linear(in_dim, self.scaled_dim)
-        self.query = nn.Linear(self.scaled_dim, self.scaled_dim)
-        self.key = nn.Linear(self.scaled_dim, self.scaled_dim)
-        self.value = nn.Linear(self.scaled_dim, self.scaled_dim)
-        self.fc_out = nn.Linear(self.scaled_dim, in_dim)
+        self.head_dim = max(1, in_dim // num_heads)  # Compute dimension per head
+        self.scaled_dim = self.head_dim * num_heads  # Scaled dimension after splitting into heads
+        self.fc_in = nn.Linear(in_dim, self.scaled_dim)  # Linear layer to project input to scaled_dim
+        self.query = nn.Linear(self.scaled_dim, self.scaled_dim)  # Query projection
+        self.key = nn.Linear(self.scaled_dim, self.scaled_dim)  # Key projection
+        self.value = nn.Linear(self.scaled_dim, self.scaled_dim)  # Value projection
+        self.fc_out = nn.Linear(self.scaled_dim, in_dim)  # Linear layer to project output back to in_dim
 
     def forward(self, x):
-        N = x.shape[0]
-        x = self.fc_in(x)
-        q = self.query(x).view(N, self.num_heads, self.head_dim)
-        k = self.key(x).view(N, self.num_heads, self.head_dim)
-        v = self.value(x).view(N, self.num_heads, self.head_dim)
+        """
+        Forward pass for multi-head attention mechanism.
+        Args:
+            x (Tensor): Input tensor of shape (batch_size, input_dim).
+        Returns:
+            Tensor: Output tensor after applying attention mechanism.
+        """
+        N = x.shape[0]  # Batch size
+        x = self.fc_in(x)  # Project input to scaled_dim
+        q = self.query(x).view(N, self.num_heads, self.head_dim)  # Compute queries
+        k = self.key(x).view(N, self.num_heads, self.head_dim)  # Compute keys
+        v = self.value(x).view(N, self.num_heads, self.head_dim)  # Compute values
 
-        energy = torch.einsum("nqd,nkd->nqk", [q, k])
-        attention = F.softmax(energy / (self.head_dim ** 0.5), dim=2)
+        # Calculate attention scores
+        energy = torch.einsum("nqd,nkd->nqk", [q, k])  # Dot product between queries and keys
+        attention = F.softmax(energy / (self.head_dim ** 0.5), dim=2)  # Apply softmax with scaling
 
+        # Compute weighted sum of values based on attention scores
         out = torch.einsum("nqk,nvd->nqd", [attention, v])
-        out = out.reshape(N, self.scaled_dim)
-        out = self.fc_out(out)
+        out = out.reshape(N, self.scaled_dim)  # Concatenate all heads
+        out = self.fc_out(out)  # Project back to original input dimension
         return out
 
+
 class BaseModel(nn.Module):
 
     def __init__(self, num_classes, device='cuda' if torch.cuda.is_available() else 'cpu'):
         super().__init__()
         self.device = device
 
-        # 1.
-        self.backbone =
+        # 1. Initialize backbone, num_classes=0 to remove classifier layer
+        self.backbone = timm.create_model(
+            'convnextv2_base',
+            pretrained=True,
+            num_classes=0
+        )
 
-        # 2.
-        with torch.no_grad():
-            dummy_input = torch.randn(1, 3, 224, 224)
+        # 2. Use test data to determine actual feature dimensions
+        with torch.no_grad():  # No need to compute gradients
+            dummy_input = torch.randn(1, 3, 224, 224)  # Create example input
             features = self.backbone(dummy_input)
-            if len(features.shape) > 2:  # If the features are multi-dimensional
-                features = features.mean([-2, -1])  # Apply global average pooling
-            self.feature_dim = features.shape[1]  # Get the correct feature dimension
 
+            if len(features.shape) > 2:  # If features are multi-dimensional
+                features = features.mean([-2, -1])  # Apply global average pooling
+
+            self.feature_dim = features.shape[1]  # Get correct feature dimension
+
+        print(f"Feature Dimension from V2 backbone: {self.feature_dim}")
 
-        # 3.
+        # 3. Setup multi-head attention layer
         self.num_heads = max(1, min(8, self.feature_dim // 64))
         self.attention = MultiHeadAttention(self.feature_dim, num_heads=self.num_heads)
 
-        # 4.
+        # 4. Setup classifier
         self.classifier = nn.Sequential(
             nn.LayerNorm(self.feature_dim),
             nn.Dropout(0.3),
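
As a quick sanity check of the attention block defined above, the following minimal sketch (assuming the MultiHeadAttention class from this file is in scope, and picking in_dim=1024 purely as an illustrative size) pushes a batch of pooled feature vectors through it; the output keeps the (batch_size, in_dim) shape because fc_out projects back to in_dim.

import torch

attn = MultiHeadAttention(in_dim=1024, num_heads=8)   # head_dim = 1024 // 8 = 128
pooled = torch.randn(4, 1024)                         # 4 pooled feature vectors
with torch.no_grad():
    out = attn(pooled)
print(out.shape)   # torch.Size([4, 1024]): same shape as the input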
@@ -130,27 +151,27 @@ class BaseModel(nn.Module):
 
     def forward(self, x):
         """
-
+        The forward propagation process combines V2's FCCA and the multi-head attention mechanism.
         Args:
-            x (Tensor):
+            x (Tensor): Input image tensor with shape [batch_size, channels, height, width]
         Returns:
-            Tuple[Tensor, Tensor]:
+            Tuple[Tensor, Tensor]: Classification logits and attention features.
         """
         x = x.to(self.device)
 
-        # 1.
+        # 1. Extract base features
         features = self.backbone(x)
 
-        # 2.
+        # 2. Process feature dimensions
         if len(features.shape) > 2:
-            #
-            #
-            features = features.mean([-2, -1])
+            # If feature dimensions are [batch_size, channels, height, width]
+            # Convert to [batch_size, channels]
+            features = features.mean([-2, -1])  # Use global average pooling
 
-        # 3.
+        # 3. Apply attention mechanism
         attended_features = self.attention(features)
 
-        # 4.
+        # 4. Final classification
         logits = self.classifier(attended_features)
 
         return logits, attended_features
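
For context, this is roughly how the rebuilt BaseModel is exercised end to end. It is a sketch only: it assumes the BaseModel class above is in scope, that timm can fetch the convnextv2_base weights, and that the classifier head (not fully shown in this diff) ends in a Linear layer sized to num_classes.

import torch

model = BaseModel(num_classes=120, device='cpu')   # 120 breeds, as an example
model.eval()
batch = torch.randn(2, 3, 224, 224)                # two RGB images at the 224x224 size probed in __init__
with torch.no_grad():
    logits, attended = model(batch)
print(logits.shape)     # expected (2, 120) if the classifier maps feature_dim to num_classes
print(attended.shape)   # (2, feature_dim); feature_dim is printed by __init__ for the V2 backbone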
@@ -211,7 +232,7 @@ class ModelManager:
         ).to(self.device)
 
         checkpoint = torch.load(
-            '
+            'ConvNextV2Base_best_model_dog.pth',
             map_location=self.device  # Ensure the checkpoint is loaded onto the correct device
         )
         self._breed_model.load_state_dict(checkpoint['base_model'], strict=False)
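
The checkpoint path is the only change in this hunk; the loading pattern itself is sketched below, assuming the .pth file exists locally, that it stores the weights under the 'base_model' key as the surrounding code implies, and that model is a BaseModel instance.

import torch

checkpoint = torch.load('ConvNextV2Base_best_model_dog.pth', map_location='cpu')
# strict=False tolerates missing or renamed keys; inspect what did not match:
result = model.load_state_dict(checkpoint['base_model'], strict=False)
print(result.missing_keys, result.unexpected_keys)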
@@ -271,7 +292,7 @@ def predict_single_dog(image):
     return probabilities[0], breeds[:3], relative_probs
 
 @spaces.GPU
-def detect_multiple_dogs(image, conf_threshold=0.3, iou_threshold=0.55):
+def detect_multiple_dogs(image, conf_threshold=0.3, iou_threshold=0.3):
     """
     Detect dogs in the image with the YOLO model.
     Keep only objects identified as dogs (class 16) and mark their status.
@@ -310,10 +331,10 @@ def detect_multiple_dogs(image, conf_threshold=0.3, iou_threshold=0.55):
         x1, y1, x2, y2 = box
         w, h = x2 - x1, y2 - y1
         # Expand the detection box to include the whole dog
-        x1 = max(0, x1 - w * 0.
-        y1 = max(0, y1 - h * 0.
-        x2 = min(image.width, x2 + w * 0.
-        y2 = min(image.height, y2 + h * 0.
+        x1 = max(0, x1 - w * 0.02)
+        y1 = max(0, y1 - h * 0.02)
+        x2 = min(image.width, x2 + w * 0.02)
+        y2 = min(image.height, y2 + h * 0.02)
         cropped_image = image.crop((x1, y1, x2, y2))
         detected_objects.append((cropped_image, confidence, [x1, y1, x2, y2], is_dog))
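
The margin added around each detection box is 2% of the box width and height in this revision. A hypothetical standalone helper (not part of app.py) makes the arithmetic easy to check:

def expand_box(box, img_w, img_h, margin=0.02):
    """Grow a box by margin * width/height on each side, clamped to the image bounds."""
    x1, y1, x2, y2 = box
    w, h = x2 - x1, y2 - y1
    return (max(0, x1 - w * margin), max(0, y1 - h * margin),
            min(img_w, x2 + w * margin), min(img_h, y2 + h * margin))

# A 200x300 box in a 640x480 image gains a 4 px horizontal and 6 px vertical border:
print(expand_box((50, 40, 250, 340), 640, 480))   # (46.0, 34.0, 254.0, 346.0)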
@@ -442,9 +463,9 @@ def predict(image):
             combined_confidence = detection_confidence * top1_prob
 
             # Decide the output format based on confidence
-            if combined_confidence < 0.
+            if combined_confidence < 0.15:
                 dogs_info += format_unknown_breed_message(color, i+1)
-            elif top1_prob >= 0.
+            elif top1_prob >= 0.4:
                 breed = topk_breeds[0]
                 description = get_dog_description(breed)
                 if description is None:
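
The thresholds in this hunk are 0.15 for the combined confidence and 0.4 for the top-1 probability. A small illustrative snippet with hypothetical numbers (not taken from app.py) shows how they gate the output:

detection_confidence, top1_prob = 0.5, 0.25
combined_confidence = detection_confidence * top1_prob   # 0.125

if combined_confidence < 0.15:
    print("unknown breed message")          # fires here, since 0.125 < 0.15
elif top1_prob >= 0.4:
    print("single confident breed output")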
@@ -555,7 +576,7 @@ def main():
         'Border_Collie.jpg',
         'Golden_Retriever.jpeg',
         'Saint_Bernard.jpeg',
-        'Samoyed.
+        'Samoyed.jpeg',
         'French_Bulldog.jpeg'
     ]
     detection_components = create_detection_tab(predict, example_images)