# Page header captured with the source ("Spaces: Running on Zero"); not part of the module.
import torch | |
import re | |
import numpy as np | |
from typing import List, Dict, Tuple, Optional | |
from dataclasses import dataclass | |
from breed_health_info import breed_health_info | |
from breed_noise_info import breed_noise_info | |
from dog_database import dog_data | |
from scoring_calculation_system import UserPreferences | |
from sentence_transformers import SentenceTransformer, util | |
class SmartBreedMatcher:
    """Rank dog breeds against a free-text description or a named breed.

    Breed records in ``dog_data`` are read positionally: index 1 is the breed
    name, 2 the size, 4 the temperament text, 7 the exercise level, 8 the
    grooming needs and 9 the description. Text fields are compared through
    sentence embeddings; health and noise information comes from the
    module-level ``breed_health_info`` and ``breed_noise_info`` tables.
    """

    def __init__(self, dog_data: List[Tuple]):
        """Store the breed table and load the sentence-embedding model.

        Args:
            dog_data: Breed records laid out as described on the class.
        """
        self.dog_data = dog_data
        self.model = SentenceTransformer('all-mpnet-base-v2')
        # Memoizes text -> embedding so each distinct text is encoded once.
        self._embedding_cache = {}

    def _get_cached_embedding(self, text: str) -> np.ndarray:
        """Return the embedding of ``text``, encoding it on first use only.

        ``encode`` is called with its defaults, which return a NumPy array
        rather than a torch tensor; the cosine-similarity helper used by the
        callers accepts either.
        """
        if text not in self._embedding_cache:
            self._embedding_cache[text] = self.model.encode(text)
        return self._embedding_cache[text]

    def _categorize_breeds(self) -> Dict:
        """Group breed names into coarse categories by description keywords.

        Keywords are matched as plain substrings of the lower-cased
        description. Rules are tried in a fixed order and the first hit wins,
        so each breed lands in at most one category; breeds matching no rule
        are left out.

        Returns:
            Mapping of category name to the list of breed names assigned to it.
        """
        categories = {
            'working_dogs': [],
            'herding_dogs': [],
            'hunting_dogs': [],
            'companion_dogs': [],
            'guard_dogs': []
        }
        # Order matters: an earlier rule shadows every later one.
        keyword_rules = (
            ('herding_dogs', ('herding', 'shepherd', 'cattle', 'flock')),
            ('hunting_dogs', ('hunting', 'hunt', 'retriever', 'pointer')),
            ('companion_dogs', ('companion', 'toy', 'family', 'lap')),
            ('guard_dogs', ('guard', 'protection', 'watchdog')),
            ('working_dogs', ('working', 'draft', 'cart')),
        )
        for breed_info in self.dog_data:
            description = breed_info[9].lower()
            for category, keywords in keyword_rules:
                if any(word in description for word in keywords):
                    categories[category].append(breed_info[1])
                    break
        return categories

    @staticmethod
    def _extract_features(breed: Tuple) -> Dict:
        """Pick out the fields of a breed record used for similarity scoring."""
        return {
            'breed_name': breed[1],
            'size': breed[2],
            'temperament': breed[4],
            'exercise': breed[7],
            'description': breed[9]
        }

    def find_similar_breeds(self, breed_name: str, top_n: int = 5) -> List[Tuple[str, float]]:
        """Return the breeds most similar to ``breed_name``.

        Args:
            breed_name: Exact breed name as stored at index 1 of a record.
            top_n: Maximum number of results.

        Returns:
            ``(breed name, similarity)`` pairs, most similar first. Empty when
            the breed is unknown or ``top_n`` is not positive.
        """
        # A non-positive top_n would otherwise slice from the end of the list.
        if top_n <= 0:
            return []
        target_breed = next((breed for breed in self.dog_data if breed[1] == breed_name), None)
        if not target_breed:
            return []

        target_features = self._extract_features(target_breed)
        similarities = [
            (
                breed[1],
                self._calculate_breed_similarity(target_features, self._extract_features(breed)),
            )
            for breed in self.dog_data
            if breed[1] != breed_name
        ]
        return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]

    def _calculate_breed_similarity(self, breed1_features: Dict, breed2_features: Dict) -> float:
        """Compute the weighted similarity of two breeds.

        Combines embedding similarity of description and temperament, exact
        matches on size and exercise level (1.0 when equal, otherwise 0.5),
        closeness of the health scores and closeness of the noise levels.

        Args:
            breed1_features: Feature dict with the keys ``breed_name``,
                ``size``, ``temperament``, ``exercise`` and ``description``.
            breed2_features: Feature dict of the same shape.

        Returns:
            The weighted sum of the six components.
        """
        # Free-text description similarity.
        desc1_embedding = self._get_cached_embedding(breed1_features['description'])
        desc2_embedding = self._get_cached_embedding(breed2_features['description'])
        description_similarity = float(util.pytorch_cos_sim(desc1_embedding, desc2_embedding))

        # Categorical features: full credit when equal, half credit otherwise.
        size_similarity = 1.0 if breed1_features['size'] == breed2_features['size'] else 0.5
        exercise_similarity = 1.0 if breed1_features['exercise'] == breed2_features['exercise'] else 0.5

        # Temperament text similarity.
        temp1_embedding = self._get_cached_embedding(breed1_features['temperament'])
        temp2_embedding = self._get_cached_embedding(breed2_features['temperament'])
        temperament_similarity = float(util.pytorch_cos_sim(temp1_embedding, temp2_embedding))

        # Health: the closer the two health scores, the more similar.
        health_score1 = self._calculate_health_score(breed1_features['breed_name'])
        health_score2 = self._calculate_health_score(breed2_features['breed_name'])
        health_similarity = 1.0 - abs(health_score1 - health_score2)

        # Noise level closeness.
        noise_similarity = self._calculate_noise_similarity(
            breed1_features['breed_name'],
            breed2_features['breed_name']
        )

        # Component weights; they add up to 1.0.
        weights = {
            'description': 0.25,
            'temperament': 0.20,
            'exercise': 0.2,
            'size': 0.05,
            'health': 0.15,
            'noise': 0.15
        }
        final_similarity = (
            description_similarity * weights['description'] +
            temperament_similarity * weights['temperament'] +
            exercise_similarity * weights['exercise'] +
            size_similarity * weights['size'] +
            health_similarity * weights['health'] +
            noise_similarity * weights['noise']
        )
        return final_similarity

    def _calculate_final_scores(self, breed_name: str, base_scores: Dict,
                                smart_score: float, is_preferred: bool,
                                similarity_score: float = 0.0) -> Dict:
        """Blend base, smart-match and bonus scores into a final score.

        Args:
            breed_name: Breed the scores belong to (not used in the maths).
            base_scores: Per-aspect base scores; ``'overall'`` is used as the
                base score and defaults to 0.7 when absent.
            smart_score: Smart-matching score.
            is_preferred: Whether this is the breed the user named.
            similarity_score: Similarity to the named breed.

        Returns:
            Dict with ``final_score``, ``base_score``, ``bonus_score`` and the
            adjusted per-aspect ``scores``, each rounded to 4 decimals.
        """
        weights = {
            'base': 0.6,    # base score
            'smart': 0.25,  # smart-matching score
            'bonus': 0.15   # bonus score
        }
        base_score = base_scores.get('overall', 0.7)

        bonus_score = 0.0
        if is_preferred:
            # The breed the user asked for gets the top bonus.
            bonus_score = 0.95
        elif similarity_score > 0:
            # Similar breeds get a partial bonus; similarity is capped at 0.8.
            bonus_score = min(0.8, similarity_score) * 0.95

        final_score = (
            base_score * weights['base'] +
            smart_score * weights['smart'] +
            bonus_score * weights['bonus']
        )

        # Work on a copy so the caller's dict is left untouched.
        scores = base_scores.copy()
        if is_preferred:
            # Lift every aspect score by 10%, never above 1.0.
            for key in scores:
                if key != 'overall':
                    scores[key] = min(1.0, scores[key] * 1.1)
        elif similarity_score > 0:
            # Lift by up to 5%; the 0.95 cap also lowers any score above it.
            boost_factor = 1.0 + (similarity_score * 0.05)
            for key in scores:
                if key != 'overall':
                    scores[key] = min(0.95, scores[key] * boost_factor)

        return {
            'final_score': round(final_score, 4),
            'base_score': round(base_score, 4),
            'bonus_score': round(bonus_score, 4),
            'scores': {k: round(v, 4) for k, v in scores.items()}
        }

    def _calculate_grooming_similarity(self, breed1: str, breed2: str) -> float:
        """Score how close the grooming needs of two breeds are.

        Grooming levels map to 1 (Low), 2 (Moderate) and 3 (High); unknown
        labels count as Moderate. The distance-based score is then scaled by
        1.1 when ``breed2`` is Moderate and by 0.9 when it is High.

        Returns:
            A value no greater than 1.0, or 0.5 when either breed is not found.
        """
        grooming_map = {
            'Low': 1,
            'Moderate': 2,
            'High': 3
        }
        breed1_info = next((dog for dog in self.dog_data if dog[1] == breed1), None)
        breed2_info = next((dog for dog in self.dog_data if dog[1] == breed2), None)
        if not breed1_info or not breed2_info:
            return 0.5  # neutral similarity when a breed is unknown

        grooming1 = breed1_info[8]  # grooming needs
        grooming2 = breed2_info[8]
        value1 = grooming_map.get(grooming1, 2)
        value2 = grooming_map.get(grooming2, 2)

        # Largest possible gap is 2 (Low vs High), hence the division by 2.
        base_similarity = 1.0 - abs(value1 - value2) / 2.0

        if grooming2 == 'Moderate':
            base_similarity *= 1.1  # slightly favour moderate grooming needs
        elif grooming2 == 'High':
            base_similarity *= 0.9  # slightly penalise high grooming needs
        return min(1.0, base_similarity)

    def _calculate_health_score(self, breed_name: str) -> float:
        """Score a breed's health from keywords in its health notes.

        Starts from 1.0 and subtracts 0.1 per severe keyword and 0.05 per
        moderate keyword found as a substring of the lower-cased notes. When
        the instance carries a ``user_preferences`` attribute, the score is
        scaled down further for households with children (if the notes
        mention frequent care or regular monitoring) and for high health
        sensitivity.

        Returns:
            A value clamped to ``[0.3, 1.0]``, or 0.5 when the breed has no
            entry in ``breed_health_info``.
        """
        if breed_name not in breed_health_info:
            return 0.5
        health_notes = breed_health_info[breed_name]['health_notes'].lower()

        severe_conditions = [
            'cancer', 'cardiomyopathy', 'epilepsy', 'dysplasia',
            'bloat', 'progressive', 'syndrome'
        ]
        moderate_conditions = [
            'allergies', 'infections', 'thyroid', 'luxation',
            'skin problems', 'ear'
        ]
        severe_count = sum(1 for condition in severe_conditions if condition in health_notes)
        moderate_count = sum(1 for condition in moderate_conditions if condition in health_notes)

        health_score = 1.0
        health_score -= (severe_count * 0.1)
        health_score -= (moderate_count * 0.05)

        # Optional adjustments; __init__ does not set user_preferences, so
        # this only applies when something else attached it to the instance.
        if hasattr(self, 'user_preferences'):
            if self.user_preferences.has_children:
                if 'requires frequent' in health_notes or 'regular monitoring' in health_notes:
                    health_score *= 0.9
            if self.user_preferences.health_sensitivity == 'high':
                health_score *= 0.9
        return max(0.3, min(1.0, health_score))

    def _calculate_noise_similarity(self, breed1: str, breed2: str) -> float:
        """Score how close the noise levels of two breeds are.

        Levels map to 1 (Low), 2 (Moderate) and 3 (High); missing or unknown
        levels count as Moderate.

        Returns:
            1.0 for equal levels, 0.5 for adjacent levels, 0.0 for Low vs High.
        """
        noise_levels = {
            'Low': 1,
            'Moderate': 2,
            'High': 3,
            'Unknown': 2  # treated as moderate
        }
        noise1 = breed_noise_info.get(breed1, {}).get('noise_level', 'Unknown')
        noise2 = breed_noise_info.get(breed2, {}).get('noise_level', 'Unknown')
        level1 = noise_levels.get(noise1, 2)
        level2 = noise_levels.get(noise2, 2)

        # Largest possible gap is 2, so dividing by 2 normalises to [0, 1].
        difference = abs(level1 - level2)
        similarity = 1.0 - (difference / 2)
        return similarity

    def _general_matching(self, description: str, top_n: int = 10) -> List[Dict]:
        """Rank every breed against a free-text description.

        Args:
            description: The user's description.
            top_n: Maximum number of matches to return.

        Returns:
            Match dicts (``breed``, ``score``, ``is_preferred``,
            ``similarity``, ``reason``), highest score first.
        """
        matches = []
        desc_embedding = self._get_cached_embedding(description)
        # Component weights; they add up to 1.0.
        weights = {
            'description': 0.35,
            'temperament': 0.25,
            'noise': 0.2,
            'health': 0.2
        }
        for breed in self.dog_data:
            breed_name = breed[1]
            breed_description = breed[9]
            temperament = breed[4]

            breed_desc_embedding = self._get_cached_embedding(breed_description)
            breed_temp_embedding = self._get_cached_embedding(temperament)
            desc_similarity = float(util.pytorch_cos_sim(desc_embedding, breed_desc_embedding))
            temp_similarity = float(util.pytorch_cos_sim(desc_embedding, breed_temp_embedding))

            # NOTE(review): the breed is compared with itself, so this term is
            # always 1.0 and never affects the ranking. Kept as-is because the
            # intended reference noise level is not visible here.
            noise_similarity = self._calculate_noise_similarity(breed_name, breed_name)
            health_score = self._calculate_health_score(breed_name)
            # Closeness to a fixed reference health score of 0.8.
            health_similarity = 1.0 - abs(health_score - 0.8)

            final_score = (
                desc_similarity * weights['description'] +
                temp_similarity * weights['temperament'] +
                noise_similarity * weights['noise'] +
                health_similarity * weights['health']
            )
            matches.append({
                'breed': breed_name,
                'score': final_score,
                'is_preferred': False,
                'similarity': final_score,
                'reason': "Matched based on description, temperament, noise level, and health score"
            })
        return sorted(matches, key=lambda x: -x['score'])[:top_n]

    def _detect_breed_preference(self, description: str) -> Optional[str]:
        """Return the breed the user named in ``description``, if any.

        Breed names are matched case-insensitively as whole words, with
        underscores in the stored name read as spaces. When several names
        occur, the longest one wins, so "Staffordshire Bull Terrier" is not
        mistaken for "Bull Terrier"; ties go to the earlier record.

        Returns:
            The stored breed name, or ``None`` when no breed is mentioned.
        """
        description_lower = description.lower()
        best_match = None
        best_length = 0
        for breed_info in self.dog_data:
            breed_name = breed_info[1]
            normalized_breed = breed_name.lower().replace('_', ' ')
            # Only a strictly longer name can replace the current best; this
            # also skips empty names, which would match any text.
            if len(normalized_breed) <= best_length:
                continue
            pattern = rf"\b{re.escape(normalized_breed)}\b"
            if re.search(pattern, description_lower):
                best_match = breed_name
                best_length = len(normalized_breed)
        return best_match

    @staticmethod
    def _rank_matches(matches: List[Dict], top_n: int) -> List[Dict]:
        """Order matches for presentation and keep the first ``top_n``.

        Sort order: lower ``priority`` number first (1 is the highest
        priority), preferred breed first, higher score first, then breed name
        alphabetically. The score is ``final_score`` when present and falls
        back to ``score`` otherwise.
        """
        def sort_key(match: Dict):
            score = match.get('final_score', match.get('score', 0))
            return (
                match.get('priority', 3),
                not match.get('is_preferred', False),
                -float(score),
                match.get('breed', ''),
            )

        return sorted(matches, key=sort_key)[:max(top_n, 0)]

    def match_user_preference(self, description: str, top_n: int = 10) -> List[Dict]:
        """Return the breeds that best fit the user's description.

        If the description names a known breed, that breed is listed first
        (priority 1) followed by the breeds most similar to it (priority 2).
        Otherwise every breed is scored against the description (priority 3).

        Args:
            description: The user's description.
            top_n: Maximum number of matches to return.

        Returns:
            Match dicts in presentation order, at most ``top_n`` of them.
        """
        preferred_breed = self._detect_breed_preference(description)
        matches = []
        if preferred_breed:
            # The named breed itself gets the maximum base score.
            scores = self._calculate_final_scores(
                preferred_breed,
                {'overall': 1.0},
                smart_score=1.0,
                is_preferred=True,
                similarity_score=1.0
            )
            matches.append({
                'breed': preferred_breed,
                'score': 1.0,  # always the top score
                'final_score': scores['final_score'],
                'base_score': scores['base_score'],
                'bonus_score': scores['bonus_score'],
                'is_preferred': True,
                'priority': 1,
                'health_score': self._calculate_health_score(preferred_breed),
                'noise_level': breed_noise_info.get(preferred_breed, {}).get('noise_level', 'Unknown'),
                'reason': "Directly matched your preferred breed"
            })

            # Fill the remaining slots with similar breeds; the result never
            # contains the named breed itself.
            similar_breeds = self.find_similar_breeds(preferred_breed, top_n=max(top_n - 1, 0))
            for breed_name, similarity in similar_breeds:
                scores = self._calculate_final_scores(
                    breed_name,
                    {'overall': similarity * 0.9},  # slightly below the named breed
                    smart_score=similarity,
                    is_preferred=False,
                    similarity_score=similarity
                )
                matches.append({
                    'breed': breed_name,
                    'score': min(0.95, similarity),  # stays below the named breed
                    'final_score': scores['final_score'],
                    'base_score': scores['base_score'],
                    'bonus_score': scores['bonus_score'],
                    'is_preferred': False,
                    'priority': 2,
                    'health_score': self._calculate_health_score(breed_name),
                    'noise_level': breed_noise_info.get(breed_name, {}).get('noise_level', 'Unknown'),
                    'reason': f"Similar to {preferred_breed}"
                })
        else:
            matches = self._general_matching(description, top_n)
            for match in matches:
                match['priority'] = 3

        return self._rank_matches(matches, top_n)