DawnC committed on
Commit
92ec9ab
1 Parent(s): aebc3f3

Delete smart_breed_matcher.py

Browse files
Files changed (1) hide show
  1. smart_breed_matcher.py +0 -423
smart_breed_matcher.py DELETED
@@ -1,423 +0,0 @@
1
- import torch
2
- import re
3
- import numpy as np
4
- from typing import List, Dict, Tuple, Optional
5
- from dataclasses import dataclass
6
- from breed_health_info import breed_health_info
7
- from breed_noise_info import breed_noise_info
8
- from dog_database import dog_data
9
- from scoring_calculation_system import UserPreferences
10
- from sentence_transformers import SentenceTransformer, util
11
-
12
class SmartBreedMatcher:
    """Recommend dog breeds from a free-text description.

    Each record in ``dog_data`` is read positionally. The indexes this class
    uses are: 1 = breed name, 2 = size, 4 = temperament, 7 = exercise needs,
    8 = grooming needs, 9 = description. Text similarity is computed with
    sentence-transformer embeddings; health and noise information comes from
    the module-level ``breed_health_info`` and ``breed_noise_info`` mappings.
    """

    def __init__(self, dog_data: List[Tuple]):
        """Store the breed records and load the sentence-embedding model.

        Args:
            dog_data: Breed records, indexed positionally as described on the
                class.
        """
        self.dog_data = dog_data
        self.model = SentenceTransformer('all-mpnet-base-v2')
        # text -> embedding, so each distinct string is encoded only once.
        self._embedding_cache = {}

    def _get_cached_embedding(self, text: str) -> torch.Tensor:
        """Return the embedding of ``text``, encoding it on first use.

        NOTE(review): the value is whatever ``self.model.encode(text)``
        returns; confirm it really is a ``torch.Tensor`` as annotated.
        """
        if text not in self._embedding_cache:
            self._embedding_cache[text] = self.model.encode(text)
        return self._embedding_cache[text]

    def _categorize_breeds(self) -> Dict:
        """Group breed names into coarse categories by description keywords.

        A breed lands in the first category (in the rule order below) that has
        a keyword occurring as a substring of its lower-cased description.
        Breeds matching no keyword are omitted.

        Returns:
            Dict mapping category name to a list of breed names.
        """
        categories = {
            'working_dogs': [],
            'herding_dogs': [],
            'hunting_dogs': [],
            'companion_dogs': [],
            'guard_dogs': []
        }

        # Order matters: the first matching rule wins.
        keyword_rules = (
            ('herding_dogs', ('herding', 'shepherd', 'cattle', 'flock')),
            ('hunting_dogs', ('hunting', 'hunt', 'retriever', 'pointer')),
            ('companion_dogs', ('companion', 'toy', 'family', 'lap')),
            ('guard_dogs', ('guard', 'protection', 'watchdog')),
            ('working_dogs', ('working', 'draft', 'cart')),
        )

        for breed_info in self.dog_data:
            description = breed_info[9].lower()
            for category, keywords in keyword_rules:
                if any(word in description for word in keywords):
                    categories[category].append(breed_info[1])
                    break

        return categories

    @staticmethod
    def _extract_features(breed: Tuple) -> Dict:
        """Build the feature dict used for similarity from one breed record."""
        return {
            'breed_name': breed[1],
            'size': breed[2],
            'temperament': breed[4],
            'exercise': breed[7],
            'description': breed[9]
        }

    def find_similar_breeds(self, breed_name: str, top_n: int = 5) -> List[Tuple[str, float]]:
        """Find the breeds most similar to ``breed_name``.

        Args:
            breed_name: Name of the reference breed (matched against index 1).
            top_n: Maximum number of results; non-positive values yield ``[]``.

        Returns:
            Up to ``top_n`` ``(breed_name, similarity)`` pairs, most similar
            first. Empty when the reference breed is not in ``dog_data``.
        """
        # A negative slice bound would otherwise return almost every breed.
        if top_n <= 0:
            return []

        target_breed = next((breed for breed in self.dog_data if breed[1] == breed_name), None)
        if not target_breed:
            return []

        target_features = self._extract_features(target_breed)

        similarities = [
            (
                breed[1],
                self._calculate_breed_similarity(target_features, self._extract_features(breed)),
            )
            for breed in self.dog_data
            if breed[1] != breed_name
        ]

        return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]

    def _calculate_breed_similarity(self, breed1_features: Dict, breed2_features: Dict) -> float:
        """Compute a weighted similarity between two breeds.

        Combines description and temperament embedding similarity, exact-match
        size and exercise similarity, and health and noise similarity.

        Args:
            breed1_features: Feature dict with keys ``breed_name``, ``size``,
                ``temperament``, ``exercise`` and ``description``.
            breed2_features: Feature dict of the same shape.

        Returns:
            The weighted sum of the six component similarities.
        """
        # Description text similarity.
        desc1_embedding = self._get_cached_embedding(breed1_features['description'])
        desc2_embedding = self._get_cached_embedding(breed2_features['description'])
        description_similarity = float(util.pytorch_cos_sim(desc1_embedding, desc2_embedding))

        # Categorical features: full credit on exact match, half otherwise.
        size_similarity = 1.0 if breed1_features['size'] == breed2_features['size'] else 0.5
        exercise_similarity = 1.0 if breed1_features['exercise'] == breed2_features['exercise'] else 0.5

        # Temperament text similarity.
        temp1_embedding = self._get_cached_embedding(breed1_features['temperament'])
        temp2_embedding = self._get_cached_embedding(breed2_features['temperament'])
        temperament_similarity = float(util.pytorch_cos_sim(temp1_embedding, temp2_embedding))

        # Health similarity: the closer the two health scores, the higher.
        health_score1 = self._calculate_health_score(breed1_features['breed_name'])
        health_score2 = self._calculate_health_score(breed2_features['breed_name'])
        health_similarity = 1.0 - abs(health_score1 - health_score2)

        # Noise-level similarity.
        noise_similarity = self._calculate_noise_similarity(
            breed1_features['breed_name'],
            breed2_features['breed_name']
        )

        weights = {
            'description': 0.25,
            'temperament': 0.20,
            'exercise': 0.2,
            'size': 0.05,
            'health': 0.15,
            'noise': 0.15
        }

        final_similarity = (
            description_similarity * weights['description'] +
            temperament_similarity * weights['temperament'] +
            exercise_similarity * weights['exercise'] +
            size_similarity * weights['size'] +
            health_similarity * weights['health'] +
            noise_similarity * weights['noise']
        )

        return final_similarity

    def _calculate_final_scores(self, breed_name: str, base_scores: Dict,
                                smart_score: float, is_preferred: bool,
                                similarity_score: float = 0.0) -> Dict:
        """Combine base, smart-match and bonus scores into a final score.

        Args:
            breed_name: Breed name (not used in the calculation).
            base_scores: Per-aspect base scores; ``'overall'`` is used as the
                base score and defaults to 0.7 when absent.
            smart_score: Smart-matching score.
            is_preferred: Whether this is the breed the user named.
            similarity_score: Similarity to the named breed (0-1).

        Returns:
            Dict with ``final_score``, ``base_score``, ``bonus_score`` and
            ``scores`` (the adjusted copy of ``base_scores``), all rounded to
            four decimals.
        """
        weights = {
            'base': 0.6,
            'smart': 0.25,
            'bonus': 0.15
        }

        base_score = base_scores.get('overall', 0.7)

        bonus_score = 0.0
        if is_preferred:
            # The breed the user asked for gets the maximum bonus.
            bonus_score = 0.95
        elif similarity_score > 0:
            # Similar breeds get a partial bonus, capped at 80% of the maximum.
            bonus_score = min(0.8, similarity_score) * 0.95

        final_score = (
            base_score * weights['base'] +
            smart_score * weights['smart'] +
            bonus_score * weights['bonus']
        )

        # Work on a copy so the caller's dict is not mutated.
        scores = base_scores.copy()

        if is_preferred:
            # Lift every non-overall score by up to 10%, capped at 1.0.
            for key in scores:
                if key != 'overall':
                    scores[key] = min(1.0, scores[key] * 1.1)
        elif similarity_score > 0:
            # Similar breeds get up to a 5% lift, capped at 0.95.
            boost_factor = 1.0 + (similarity_score * 0.05)
            for key in scores:
                if key != 'overall':
                    scores[key] = min(0.95, scores[key] * boost_factor)

        return {
            'final_score': round(final_score, 4),
            'base_score': round(base_score, 4),
            'bonus_score': round(bonus_score, 4),
            'scores': {k: round(v, 4) for k, v in scores.items()}
        }

    def _calculate_grooming_similarity(self, breed1: str, breed2: str) -> float:
        """Compute how similar two breeds' grooming needs are.

        Args:
            breed1: Name of the first breed.
            breed2: Name of the second breed.

        Returns:
            A value capped at 1.0; 0.5 when either breed is not in
            ``dog_data``.
        """
        grooming_map = {
            'Low': 1,
            'Moderate': 2,
            'High': 3
        }

        breed1_info = next((dog for dog in self.dog_data if dog[1] == breed1), None)
        breed2_info = next((dog for dog in self.dog_data if dog[1] == breed2), None)

        if not breed1_info or not breed2_info:
            return 0.5  # neutral default when a breed is unknown

        grooming1 = breed1_info[8]  # grooming-needs column
        grooming2 = breed2_info[8]

        # Unrecognised labels are treated as 'Moderate'.
        value1 = grooming_map.get(grooming1, 2)
        value2 = grooming_map.get(grooming2, 2)

        # Largest possible gap is 2 levels, hence the division by 2.
        base_similarity = 1.0 - abs(value1 - value2) / 2.0

        # Slightly favour moderate and penalise high grooming needs in breed2.
        if grooming2 == 'Moderate':
            base_similarity *= 1.1
        elif grooming2 == 'High':
            base_similarity *= 0.9

        return min(1.0, base_similarity)

    def _calculate_health_score(self, breed_name: str) -> float:
        """Score a breed's health from keywords in its health notes.

        Starts at 1.0, subtracts 0.1 per severe keyword and 0.05 per moderate
        keyword found, optionally applies user-preference penalties, and
        clamps the result to [0.3, 1.0].

        Args:
            breed_name: Key into ``breed_health_info``.

        Returns:
            The health score, or 0.5 when the breed has no health entry.
        """
        if breed_name not in breed_health_info:
            return 0.5

        health_notes = breed_health_info[breed_name]['health_notes'].lower()

        severe_conditions = [
            'cancer', 'cardiomyopathy', 'epilepsy', 'dysplasia',
            'bloat', 'progressive', 'syndrome'
        ]

        # NOTE(review): these are substring tests, so 'ear' also matches
        # words such as "heart" - confirm this is intended.
        moderate_conditions = [
            'allergies', 'infections', 'thyroid', 'luxation',
            'skin problems', 'ear'
        ]

        severe_count = sum(1 for condition in severe_conditions if condition in health_notes)
        moderate_count = sum(1 for condition in moderate_conditions if condition in health_notes)

        health_score = 1.0
        health_score -= (severe_count * 0.1)
        health_score -= (moderate_count * 0.05)

        # Optional adjustments; `user_preferences` is not set by __init__, so
        # it and its attributes are read defensively.
        preferences = getattr(self, 'user_preferences', None)
        if preferences is not None:
            if getattr(preferences, 'has_children', False):
                if 'requires frequent' in health_notes or 'regular monitoring' in health_notes:
                    health_score *= 0.9

            if getattr(preferences, 'health_sensitivity', None) == 'high':
                health_score *= 0.9

        return max(0.3, min(1.0, health_score))

    def _calculate_noise_similarity(self, breed1: str, breed2: str) -> float:
        """Compute how similar two breeds' noise levels are.

        Args:
            breed1: Name of the first breed.
            breed2: Name of the second breed.

        Returns:
            1.0 for equal levels, 0.5 for adjacent levels, 0.0 for Low vs High.
            Missing or unrecognised levels count as Moderate.
        """
        noise_levels = {
            'Low': 1,
            'Moderate': 2,
            'High': 3,
            'Unknown': 2  # treated as moderate
        }

        noise1 = breed_noise_info.get(breed1, {}).get('noise_level', 'Unknown')
        noise2 = breed_noise_info.get(breed2, {}).get('noise_level', 'Unknown')

        level1 = noise_levels.get(noise1, 2)
        level2 = noise_levels.get(noise2, 2)

        # Largest possible gap is 2 levels, so dividing by 2 normalises to 0-1.
        difference = abs(level1 - level2)
        similarity = 1.0 - (difference / 2)

        return similarity

    def _general_matching(self, description: str, top_n: int = 10) -> List[Dict]:
        """Rank all breeds against a description when no breed was named.

        Args:
            description: The user's free-text description.
            top_n: Maximum number of results; non-positive values yield ``[]``.

        Returns:
            Up to ``top_n`` match dicts (``breed``, ``score``,
            ``is_preferred``, ``similarity``, ``reason``), best score first.
        """
        if top_n <= 0:
            return []

        weights = {
            'description': 0.35,
            'temperament': 0.25,
            'noise': 0.2,
            'health': 0.2
        }

        matches = []
        desc_embedding = self._get_cached_embedding(description)

        for breed in self.dog_data:
            breed_name = breed[1]
            breed_description = breed[9]
            temperament = breed[4]

            breed_desc_embedding = self._get_cached_embedding(breed_description)
            breed_temp_embedding = self._get_cached_embedding(temperament)

            desc_similarity = float(util.pytorch_cos_sim(desc_embedding, breed_desc_embedding))
            temp_similarity = float(util.pytorch_cos_sim(desc_embedding, breed_temp_embedding))

            # NOTE(review): the breed is compared with itself, so this term is
            # always 1.0 and cannot affect the ranking. Kept as-is because no
            # target noise level is available here - confirm the intent.
            noise_similarity = self._calculate_noise_similarity(breed_name, breed_name)
            health_score = self._calculate_health_score(breed_name)
            # Closeness of the breed's health score to a fixed 0.8 reference.
            health_similarity = 1.0 - abs(health_score - 0.8)

            final_score = (
                desc_similarity * weights['description'] +
                temp_similarity * weights['temperament'] +
                noise_similarity * weights['noise'] +
                health_similarity * weights['health']
            )

            matches.append({
                'breed': breed_name,
                'score': final_score,
                'is_preferred': False,
                'similarity': final_score,
                'reason': "Matched based on description, temperament, noise level, and health score"
            })

        return sorted(matches, key=lambda x: -x['score'])[:top_n]

    def _detect_breed_preference(self, description: str) -> Optional[str]:
        """Detect whether the description names a specific breed.

        Breed names are lower-cased and underscores become spaces before a
        whole-word search. When several names occur, the longest one wins so
        that "french bulldog" is not resolved to "bulldog"; ties keep the
        breed that appears first in ``dog_data``.

        Args:
            description: The user's free-text description.

        Returns:
            The matching breed name as stored in ``dog_data``, or ``None``.
        """
        description_lower = description.lower()

        best_match = None
        best_length = 0
        for breed_info in self.dog_data:
            breed_name = breed_info[1]
            normalized_breed = breed_name.lower().replace('_', ' ')

            # Skip names that cannot beat the current best (also skips '').
            if len(normalized_breed) <= best_length:
                continue

            pattern = rf"\b{re.escape(normalized_breed)}\b"
            if re.search(pattern, description_lower):
                best_match = breed_name
                best_length = len(normalized_breed)

        return best_match

    @staticmethod
    def _rank_matches(matches: List[Dict], top_n: int) -> List[Dict]:
        """Order matches by priority, preference, score, then breed name.

        Lower ``priority`` values come first (1 is highest), preferred breeds
        precede others, higher scores precede lower ones, and the breed name
        breaks remaining ties alphabetically. ``final_score`` is used when
        present, otherwise ``score``.
        """
        def sort_key(match: Dict) -> Tuple:
            score = match.get('final_score', match.get('score', 0))
            return (
                match.get('priority', 3),
                not match.get('is_preferred', False),
                -float(score),
                match.get('breed', ''),
            )

        return sorted(matches, key=sort_key)[:max(0, top_n)]

    def match_user_preference(self, description: str, top_n: int = 10) -> List[Dict]:
        """Return the breeds that best fit the user's description.

        If the description names a breed, that breed is listed first, followed
        by the breeds most similar to it. Otherwise all breeds are ranked with
        the general matching logic.

        Args:
            description: The user's free-text description.
            top_n: Maximum number of results.

        Returns:
            Up to ``top_n`` match dicts, best first.
        """
        preferred_breed = self._detect_breed_preference(description)

        matches = []
        if preferred_breed:
            breed_info = next((breed for breed in self.dog_data if breed[1] == preferred_breed), None)
            if breed_info:
                # The named breed gets the maximum base and smart scores.
                base_scores = {'overall': 1.0}
                scores = self._calculate_final_scores(
                    preferred_breed,
                    base_scores,
                    smart_score=1.0,
                    is_preferred=True,
                    similarity_score=1.0
                )

                matches.append({
                    'breed': preferred_breed,
                    'score': 1.0,
                    'final_score': scores['final_score'],
                    'base_score': scores['base_score'],
                    'bonus_score': scores['bonus_score'],
                    'is_preferred': True,
                    'priority': 1,
                    'health_score': self._calculate_health_score(preferred_breed),
                    'noise_level': breed_noise_info.get(preferred_breed, {}).get('noise_level', 'Unknown'),
                    'reason': "Directly matched your preferred breed"
                })

                # Fill the remaining slots with similar breeds.
                similar_breeds = self.find_similar_breeds(preferred_breed, top_n=top_n - 1)
                for breed_name, similarity in similar_breeds:
                    if breed_name == preferred_breed:
                        continue

                    scores = self._calculate_final_scores(
                        breed_name,
                        {'overall': similarity * 0.9},  # slightly below the named breed
                        smart_score=similarity,
                        is_preferred=False,
                        similarity_score=similarity
                    )

                    matches.append({
                        'breed': breed_name,
                        'score': min(0.95, similarity),  # never above the named breed
                        'final_score': scores['final_score'],
                        'base_score': scores['base_score'],
                        'bonus_score': scores['bonus_score'],
                        'is_preferred': False,
                        'priority': 2,
                        'health_score': self._calculate_health_score(breed_name),
                        'noise_level': breed_noise_info.get(breed_name, {}).get('noise_level', 'Unknown'),
                        'reason': f"Similar to {preferred_breed}"
                    })
        else:
            matches = self._general_matching(description, top_n)
            for match in matches:
                match['priority'] = 3

        return self._rank_matches(matches, top_n)