sebastianalgharaballi
committed on
new scoring
Browse files- embeddings.py +543 -82
embeddings.py
CHANGED
@@ -58,6 +58,84 @@ class MatchResult:
|
|
58 |
explanation: str
|
59 |
status: str = "unseen"
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
class EmbeddingManager:
|
62 |
def __init__(self, job_encoder, seeker_encoder):
|
63 |
self.job_encoder = job_encoder
|
@@ -65,10 +143,7 @@ class EmbeddingManager:
|
|
65 |
|
66 |
def get_job_fields(self, job_posting: JobPosting) -> Dict[str, str]:
|
67 |
"""Extract relevant fields from job posting"""
|
68 |
-
# Convert primary skills list to string
|
69 |
primary_skills_str = ', '.join([skill.skill_name for skill in job_posting.primary_skills]) if job_posting.primary_skills else ''
|
70 |
-
|
71 |
-
# Convert secondary skills list to string
|
72 |
secondary_skills_str = ', '.join([skill.skill_name for skill in job_posting.secondary_skills]) if job_posting.secondary_skills else ''
|
73 |
|
74 |
return {
|
@@ -78,8 +153,7 @@ class EmbeddingManager:
|
|
78 |
'primary_skills': primary_skills_str,
|
79 |
'secondary_skills': secondary_skills_str
|
80 |
}
|
81 |
-
|
82 |
-
def get_seeker_fields(self, processed_seeker: IndependentJobSeekerAssessmentRDS,
|
83 |
unprocessed_seeker: JobseekerInfoRDS) -> Dict[str, str]:
|
84 |
"""Extract relevant fields from job seeker"""
|
85 |
return {
|
@@ -90,8 +164,9 @@ class EmbeddingManager:
|
|
90 |
'certifications': self._format_certifications(processed_seeker.certifications),
|
91 |
'summary': unprocessed_seeker.summary
|
92 |
}
|
93 |
-
|
94 |
def _format_experience(self, experiences: List[dict]) -> str:
|
|
|
95 |
exp_parts = []
|
96 |
for exp in experiences:
|
97 |
summaries = exp.get('experience_summaries', [])
|
@@ -99,8 +174,6 @@ class EmbeddingManager:
|
|
99 |
exp_parts.append(exp_str)
|
100 |
return ' | '.join(exp_parts)
|
101 |
|
102 |
-
|
103 |
-
|
104 |
def _format_education(self, educations: List[dict]) -> str:
|
105 |
"""Format education entries into a single string"""
|
106 |
edu_parts = []
|
@@ -111,17 +184,16 @@ class EmbeddingManager:
|
|
111 |
edu_str = f"{degree} in {field} from {institution}"
|
112 |
edu_parts.append(edu_str)
|
113 |
return ' | '.join(edu_parts)
|
114 |
-
|
115 |
def _format_certifications(self, certifications: List[dict]) -> str:
|
116 |
"""Format certification entries into a single string"""
|
117 |
cert_parts = []
|
118 |
for cert in certifications:
|
119 |
-
name = cert.get('name', '')
|
120 |
org = cert.get('organization', '')
|
121 |
start = cert.get('start_date', '')
|
122 |
end = cert.get('end_date', '')
|
123 |
|
124 |
-
# Build certification string
|
125 |
cert_str = name
|
126 |
if org:
|
127 |
cert_str += f" from {org}"
|
@@ -136,105 +208,494 @@ class EmbeddingManager:
|
|
136 |
cert_parts.append(cert_str)
|
137 |
|
138 |
return ' | '.join(cert_parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
def calculate_similarity(self, job_embeddings: Dict[str, np.ndarray],
|
154 |
-
seeker_embeddings: Dict[str, np.ndarray]
|
155 |
-
|
|
|
|
|
|
|
156 |
field_scores = {}
|
157 |
explanation_parts = []
|
158 |
|
159 |
-
#
|
160 |
for job_field, seeker_fields in FIELD_MAPPING.items():
|
161 |
if job_field not in job_embeddings:
|
162 |
continue
|
163 |
|
164 |
job_emb = job_embeddings[job_field]
|
165 |
|
166 |
-
# Handle multiple seeker fields for one job field
|
167 |
for seeker_field in seeker_fields:
|
168 |
if seeker_field not in seeker_embeddings:
|
169 |
continue
|
170 |
|
171 |
seeker_emb = seeker_embeddings[seeker_field]
|
172 |
-
|
173 |
-
# Calculate raw cosine similarity
|
174 |
similarity = np.dot(job_emb, seeker_emb) / (
|
175 |
np.linalg.norm(job_emb) * np.linalg.norm(seeker_emb) + 1e-9
|
176 |
)
|
177 |
|
178 |
-
|
179 |
-
|
180 |
-
field_score = max(0, min(1, (raw_score + 1) / 2)) # Rescale to [0,1]
|
181 |
-
|
182 |
-
# Apply non-linear transformation for more discrimination
|
183 |
-
if field_score > 0.9: # Only the very best get boosted
|
184 |
-
field_score = min(field_score * 1.1, 1.0)
|
185 |
-
elif field_score < 0.7: # More aggressive penalty for lower scores
|
186 |
-
field_score = field_score * 0.6
|
187 |
|
188 |
-
|
189 |
-
field_scores[
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
explanation_parts.append(
|
197 |
-
f"{match_quality.capitalize()} match on {job_field} to {seeker_field} "
|
198 |
-
f"(similarity: {field_score:.2f})"
|
199 |
-
)
|
200 |
-
|
201 |
-
# Calculate weighted average with critical field emphasis
|
202 |
-
final_score = 0.0
|
203 |
-
total_weight = 0.0
|
204 |
-
|
205 |
-
# Critical fields get extra weight (subject to change)
|
206 |
-
critical_fields = {
|
207 |
-
'primary_skills_primary_skills': 1,
|
208 |
-
'role_description_experience': 1,
|
209 |
-
'role_description_certifications': 1, # Added certifications with same weight
|
210 |
}
|
211 |
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
# Apply critical field multiplier
|
216 |
-
weight = base_weight * critical_fields.get(field_pair, 1.0)
|
217 |
-
|
218 |
-
final_score += score * weight
|
219 |
-
total_weight += weight
|
220 |
-
|
221 |
-
if total_weight > 0:
|
222 |
-
final_score = final_score / total_weight
|
223 |
-
|
224 |
-
# Final adjustments for extreme discrimination
|
225 |
-
if final_score > 0.9: # Only truly exceptional matches
|
226 |
-
final_score = min(final_score * 1.1, 1.0)
|
227 |
-
elif final_score < 0.7: # Really penalize poor matches
|
228 |
-
final_score = final_score * 0.6
|
229 |
|
230 |
-
|
231 |
-
|
232 |
-
|
|
|
233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
return MatchResult(
|
235 |
similarity_score=final_score,
|
236 |
-
field_scores=field_scores,
|
237 |
-
explanation=
|
238 |
)
|
239 |
|
240 |
def initialize_embedding_system(job_encoder, seeker_encoder):
|
|
|
58 |
explanation: str
|
59 |
status: str = "unseen"
|
60 |
|
61 |
+
from typing import Dict, List, Optional, Tuple, Set
|
62 |
+
import numpy as np
|
63 |
+
from dataclasses import dataclass
|
64 |
+
from datetime import datetime, timedelta
|
65 |
+
from collections import defaultdict
|
66 |
+
import re
|
67 |
+
class RiskLevel:
    """Raw score values assigned to risk criteria before weighting.

    ``NO_RISK`` is the only positive value; the remaining levels grow
    more negative as the risk increases.
    """

    # Listed from most to least severe.
    HIGH_RISK = -15
    MEDIUM_RISK = -10
    LOW_RISK = -5
    NO_RISK = 5
|
72 |
+
class BonusLevel:
    """Raw score values assigned to bonus criteria before weighting.

    The levels are the consecutive integers 0 (no bonus) through 3 (best).
    """

    NO_BONUS, GOOD, BETTER, BEST = range(4)
|
77 |
+
# Job-posting embedding field -> seeker embedding fields it is compared with.
FIELD_MAPPING = {
    'title': ['summary'],
    'primary_skills': ['primary_skills'],
    'secondary_skills': ['secondary_skills'],
    'role_description': ['experience', 'certifications'],
}

# Weight per scoring category; calculate_similarity looks these up by
# category name and multiplies them by the category's risk/bonus level.
FIELD_WEIGHTS = {
    'job_stability': 12,
    'job_duration': 12,
    'responsibility_tenure': 1.5,
    'employment_pedigree': 7.5,
    'primary_skills_experience': 5,
    'career_experience': 2.5,
    'role_impact': 5,
    'management_scope': 1.5,
    'primary_skills_occurrence': 12,
    'primary_skills_frequency': 2.5,
    'primary_skills_recency': 15,
    'soft_skills': 1,
    'employment_recency': 7.5,
    'location_match': 1,
    'certifications': 2,
    'job_title_experience': 5,
    'job_title_match': 7,
    # Keys named "<job field>_<seeker field>" for the embedding comparisons.
    'primary_skills_primary_skills': 0.5,
    'secondary_skills_secondary_skills': 0.1,
    'role_description_experience': 0.25,
    'role_description_certifications': 0.05,
    'title_summary': 0.1,
}

# Soft-skill category -> lowercase phrases searched for in experience text.
SOFT_SKILLS_KEYWORDS = {
    'communication': [
        'effectively communicated',
        'presented to stakeholders',
        'negotiated',
        'collaborated with',
        'mediated',
    ],
    'teamwork': [
        'worked in a team',
        'collaborated with',
        'partnered with',
        'contributed to a team effort',
    ],
    'leadership': [
        'led a team',
        'mentored',
        'coached',
        'managed',
        'guided',
    ],
    'problem_solving': [
        'resolved',
        'addressed challenges',
        'innovated',
        'strategized',
        'implemented solutions',
    ],
    'adaptability': [
        'adapted to',
        'quickly learned',
        'flexible in',
        'handled change',
    ],
    'emotional_intelligence': [
        'empathized with',
        'understood needs',
        'fostered relationships',
        'built trust',
        'managed conflict',
    ],
}

# Lowercase verbs that mark an experience entry as a leadership role.
LEADERSHIP_KEYWORDS = [
    'led',
    'managed',
    'directed',
    'architected',
    'innovated',
    'spearheaded',
    'strategized',
    'developed',
    'executed',
    'owned',
    'delivered',
    'implemented',
]

# Regular expressions for quantified achievements, keyed by impact type.
# NOTE(review): not referenced in the visible part of this file.
IMPACT_PATTERNS = {
    'revenue_growth': r'increased revenue by (\d+)%',
    'cost_savings': r'saved \$(\d+) million',
    'project_launch': r'launched .+ generated \$(\d+) million',
    'project_completion': r'completed .+ ahead of schedule',
    'budget_management': r'managed \$(\d+) million budget',
    'risk_mitigation': r'decreased .+ by (\d+)%',
    'client_retention': r'improved retention by (\d+)%',
    'satisfaction': r'satisfaction .+ (\d+)% to (\d+)%',
    'team_growth': r'grew team by (\d+)%',
}

# Seniority band -> title words associated with it.
# NOTE(review): not referenced in the visible part of this file.
EXPERIENCE_LEVELS = {
    'junior': ['Junior', 'Associate'],
    'mid': ['Staff', 'Senior', 'Sr.'],
    'senior': ['Principal', 'Lead', 'Supervisor', 'Manager'],
    'executive': ['Director', 'VP', 'CXO', 'President', 'Owner', 'Founder', 'Partner'],
}
|
139 |
class EmbeddingManager:
|
140 |
def __init__(self, job_encoder, seeker_encoder):
|
141 |
self.job_encoder = job_encoder
|
|
|
143 |
|
144 |
def get_job_fields(self, job_posting: JobPosting) -> Dict[str, str]:
|
145 |
"""Extract relevant fields from job posting"""
|
|
|
146 |
primary_skills_str = ', '.join([skill.skill_name for skill in job_posting.primary_skills]) if job_posting.primary_skills else ''
|
|
|
|
|
147 |
secondary_skills_str = ', '.join([skill.skill_name for skill in job_posting.secondary_skills]) if job_posting.secondary_skills else ''
|
148 |
|
149 |
return {
|
|
|
153 |
'primary_skills': primary_skills_str,
|
154 |
'secondary_skills': secondary_skills_str
|
155 |
}
|
156 |
+
def get_seeker_fields(self, processed_seeker: IndependentJobSeekerAssessmentRDS,
|
|
|
157 |
unprocessed_seeker: JobseekerInfoRDS) -> Dict[str, str]:
|
158 |
"""Extract relevant fields from job seeker"""
|
159 |
return {
|
|
|
164 |
'certifications': self._format_certifications(processed_seeker.certifications),
|
165 |
'summary': unprocessed_seeker.summary
|
166 |
}
|
167 |
+
|
168 |
def _format_experience(self, experiences: List[dict]) -> str:
|
169 |
+
"""Format experience entries into a single string"""
|
170 |
exp_parts = []
|
171 |
for exp in experiences:
|
172 |
summaries = exp.get('experience_summaries', [])
|
|
|
174 |
exp_parts.append(exp_str)
|
175 |
return ' | '.join(exp_parts)
|
176 |
|
|
|
|
|
177 |
def _format_education(self, educations: List[dict]) -> str:
|
178 |
"""Format education entries into a single string"""
|
179 |
edu_parts = []
|
|
|
184 |
edu_str = f"{degree} in {field} from {institution}"
|
185 |
edu_parts.append(edu_str)
|
186 |
return ' | '.join(edu_parts)
|
187 |
+
|
188 |
def _format_certifications(self, certifications: List[dict]) -> str:
|
189 |
"""Format certification entries into a single string"""
|
190 |
cert_parts = []
|
191 |
for cert in certifications:
|
192 |
+
name = cert.get('name', '')
|
193 |
org = cert.get('organization', '')
|
194 |
start = cert.get('start_date', '')
|
195 |
end = cert.get('end_date', '')
|
196 |
|
|
|
197 |
cert_str = name
|
198 |
if org:
|
199 |
cert_str += f" from {org}"
|
|
|
208 |
cert_parts.append(cert_str)
|
209 |
|
210 |
return ' | '.join(cert_parts)
|
211 |
+
def calculate_job_stability_risk(self, experiences: List[dict]) -> float:
    """Rate the longest employment gap preceding a recently started job.

    Experiences are ordered by ``start_date``. For every job that started
    within the last 730 days, the gap to the end of the previous job (in
    start order) is measured in whole 30-day months.

    Args:
        experiences: Dicts with an ISO-format ``start_date`` and an
            optional ISO-format ``end_date`` (missing/empty means ongoing).

    Returns:
        ``HIGH_RISK`` when there are no experiences or the longest gap
        exceeds 9 months, ``MEDIUM_RISK`` above 3 months, ``LOW_RISK`` for
        any shorter gap, and ``NO_RISK`` when no gap was found.
    """
    if not experiences:
        return RiskLevel.HIGH_RISK

    # Pair every entry with its parsed start date; sorting on the date
    # alone keeps the original relative order of equal start dates.
    timeline = sorted(
        ((datetime.fromisoformat(entry['start_date']), entry) for entry in experiences),
        key=lambda pair: pair[0],
    )
    cutoff = datetime.now() - timedelta(days=730)

    gap_lengths = []
    for (_, earlier), (began, _) in zip(timeline, timeline[1:]):
        if began < cutoff:
            # Jobs that started more than two years ago are ignored.
            continue

        raw_end = earlier.get('end_date')
        ended = datetime.fromisoformat(raw_end) if raw_end else datetime.now()
        if began > ended:
            gap_lengths.append(int((began - ended).days / 30))

    if not gap_lengths:
        return RiskLevel.NO_RISK

    longest = max(gap_lengths)
    if longest > 9:
        return RiskLevel.HIGH_RISK
    if longest > 3:
        return RiskLevel.MEDIUM_RISK
    return RiskLevel.LOW_RISK
|
243 |
+
def calculate_job_duration_risk(self, experiences: List[dict]) -> float:
    """Rate job-hopping from the number of jobs started recently.

    The count of jobs started in the last 365 days decides the result when
    it is at least one; otherwise the count for the last 730 days is used.

    Args:
        experiences: Dicts with an ISO-format ``start_date``.

    Returns:
        A ``RiskLevel`` value. One-year rule: 4+ jobs HIGH, 3 MEDIUM,
        2 LOW, 1 NO_RISK. Two-year rule: 5+ HIGH, 3-4 MEDIUM, else LOW.
    """
    now = datetime.now()
    year_cutoff = now - timedelta(days=365)
    two_year_cutoff = now - timedelta(days=730)

    start_dates = [datetime.fromisoformat(entry['start_date']) for entry in experiences]
    started_last_year = len([begun for begun in start_dates if begun >= year_cutoff])
    started_last_two_years = len([begun for begun in start_dates if begun >= two_year_cutoff])

    # One-year criteria
    if started_last_year >= 4:
        return RiskLevel.HIGH_RISK
    one_year_verdicts = {
        3: RiskLevel.MEDIUM_RISK,
        2: RiskLevel.LOW_RISK,
        1: RiskLevel.NO_RISK,
    }
    if started_last_year in one_year_verdicts:
        return one_year_verdicts[started_last_year]

    # Two-year criteria (only reached when nothing started in the last year).
    if started_last_two_years >= 5:
        return RiskLevel.HIGH_RISK
    if started_last_two_years >= 3:
        return RiskLevel.MEDIUM_RISK
    # NOTE(review): two or fewer recent jobs, including none at all, rate as
    # LOW_RISK here; the two-year rule never yields NO_RISK. Confirm intent.
    return RiskLevel.LOW_RISK
|
273 |
+
def calculate_employment_recency_risk(self, experiences: List[dict]) -> float:
    """Rate how long ago the candidate's latest job ended.

    Args:
        experiences: Dicts with an optional ISO-format ``end_date``; a
            missing or empty value is treated as ending now.

    Returns:
        ``HIGH_RISK`` for no experiences or more than 9 months (30-day
        units) since the latest end date, ``MEDIUM_RISK`` above 6,
        ``LOW_RISK`` above 3, otherwise ``NO_RISK``.
    """
    if not experiences:
        return RiskLevel.HIGH_RISK

    end_dates = []
    for entry in experiences:
        raw_end = entry.get('end_date')
        end_dates.append(datetime.fromisoformat(raw_end) if raw_end else datetime.now())

    idle_months = (datetime.now() - max(end_dates)).days / 30

    # Checked from the largest threshold down; first exceeded one wins.
    thresholds = (
        (9, RiskLevel.HIGH_RISK),
        (6, RiskLevel.MEDIUM_RISK),
        (3, RiskLevel.LOW_RISK),
    )
    for limit, verdict in thresholds:
        if idle_months > limit:
            return verdict
    return RiskLevel.NO_RISK
|
293 |
+
def calculate_responsibility_tenure(self, experiences: List[dict]) -> float:
    """Award a bonus for time spent in roles described with leadership verbs.

    An experience counts when its joined ``experience_summaries`` contain
    any entry of ``LEADERSHIP_KEYWORDS`` as a whole word (case-insensitive).
    The durations of all counting experiences are summed in 30-day months.

    Args:
        experiences: Dicts with ``experience_summaries`` (list of str), an
            ISO-format ``start_date`` and an optional ``end_date`` (missing
            or empty means ongoing).

    Returns:
        ``BEST`` for 24+ months, ``BETTER`` for 12+, ``GOOD`` for any
        positive amount, otherwise ``NO_BONUS``.
    """
    # Whole-word matching: a plain substring test would treat "handled" or
    # "scheduled" as containing the keyword "led".
    keyword_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(word) for word in LEADERSHIP_KEYWORDS) + r')\b',
        re.IGNORECASE,
    )

    leadership_months = 0
    for exp in experiences:
        description = ' '.join(exp.get('experience_summaries', []))
        if not keyword_pattern.search(description):
            continue

        start = datetime.fromisoformat(exp['start_date'])
        end = datetime.fromisoformat(exp['end_date']) if exp.get('end_date') else datetime.now()
        leadership_months += (end - start).days / 30

    if leadership_months >= 24:
        return BonusLevel.BEST
    if leadership_months >= 12:
        return BonusLevel.BETTER
    if leadership_months > 0:
        return BonusLevel.GOOD
    return BonusLevel.NO_BONUS
|
311 |
+
def evaluate_employment_pedigree(self, experiences: List[dict],
                                 job_posting: JobPosting) -> float:
    """Award a bonus for past employers that resemble the hiring company.

    Only the first three entries of ``experiences`` are inspected. Each
    entry earns one point per matching attribute: equal ``industry``, and
    ``company_size``, ``company_revenue`` and ``growth_rate`` within a
    fixed absolute tolerance of the posting's value.

    Args:
        experiences: Experience dicts; presumably ordered most recent
            first (verify against caller).
        job_posting: Posting providing ``industry``, ``company_size``,
            ``company_revenue`` and ``growth_rate``.

    Returns:
        ``BEST`` for 10+ points, ``BETTER`` for 6+, ``GOOD`` for 3+,
        otherwise ``NO_BONUS``.
    """
    points = 0
    for entry in experiences[:3]:
        # NOTE(review): the tolerances are absolute differences (0.2 / 0.1),
        # not percentages; confirm the scale of these fields.
        checks = (
            entry.get('industry') == job_posting.industry,
            abs(entry.get('company_size', 0) - job_posting.company_size) < 0.2,
            abs(entry.get('company_revenue', 0) - job_posting.company_revenue) < 0.2,
            abs(entry.get('growth_rate', 0) - job_posting.growth_rate) < 0.1,
        )
        points += sum(checks)

    # Map the point total onto the bonus scale.
    if points >= 10:
        return BonusLevel.BEST
    if points >= 6:
        return BonusLevel.BETTER
    if points >= 3:
        return BonusLevel.GOOD
    return BonusLevel.NO_BONUS
|
342 |
+
|
343 |
+
def evaluate_job_title_experience(self, experiences: List[dict], required_years: int) -> float:
    """Rate total experience duration against the required number of years.

    The durations of all experiences are summed (no filtering by title is
    performed here) and expressed as a percentage of ``required_years``.

    Args:
        experiences: Dicts with an ISO-format ``start_date`` and an
            optional ``end_date``; a missing, ``None`` or empty end date
            is treated as an ongoing job ending now.
        required_years: Years required; values <= 0 give a percentage of 0.

    Returns:
        ``NO_RISK`` at 100% or more, ``LOW_RISK`` from 71%, ``MEDIUM_RISK``
        from 61%, otherwise ``HIGH_RISK``.
    """
    actual_years = 0.0
    for exp in experiences:
        # Use a truthiness check rather than a dict default: a key that is
        # present but None/'' must also fall back to "now".
        raw_end = exp.get('end_date')
        end = datetime.fromisoformat(raw_end) if raw_end else datetime.now()
        start = datetime.fromisoformat(exp['start_date'])
        actual_years += (end - start).days / 365

    percentage = (actual_years / required_years) * 100 if required_years > 0 else 0

    if percentage >= 100:
        return RiskLevel.NO_RISK
    if percentage >= 71:
        return RiskLevel.LOW_RISK
    if percentage >= 61:
        return RiskLevel.MEDIUM_RISK
    return RiskLevel.HIGH_RISK
|
358 |
+
|
359 |
+
def evaluate_primary_skills_quality(self, experience_description: str) -> float:
    """Award a bonus for how richly a description talks about technical work.

    Counts three kinds of whole-word, case-insensitive matches and adds
    them up: technical terms, engineering action verbs, and context words.

    Args:
        experience_description: Free-text description to analyse.

    Returns:
        ``BEST`` for a total of 7+, ``BETTER`` for 5+, ``GOOD`` for 3+,
        otherwise ``NO_BONUS``.
    """
    tech_terms = len(re.findall(r'\b(?:API|REST|SDK|Framework|Platform|System|Database)\b',
                                experience_description, re.IGNORECASE))
    action_verbs = len(re.findall(r'\b(?:Designed|Implemented|Developed|Engineered|Architected)\b',
                                  experience_description, re.IGNORECASE))
    # Word boundaries keep "by" from matching inside "hobby" and "with"
    # from matching inside "within", consistent with the two counts above.
    context_richness = len(re.findall(r'\b(?:using|with|through|by|via)\b',
                                      experience_description, re.IGNORECASE))

    total_score = tech_terms + action_verbs + context_richness

    if total_score >= 7:  # Complex technical implementation with context
        return BonusLevel.BEST
    if total_score >= 5:  # Good technical description
        return BonusLevel.BETTER
    if total_score >= 3:  # Basic technical mention
        return BonusLevel.GOOD
    return BonusLevel.NO_BONUS
|
378 |
+
def evaluate_role_impact(self, experiences: List[dict]) -> float:
    """Award a bonus for quantified achievements in experience summaries.

    For each experience, the first match of each pattern contributes
    ``min(3, value // step)`` points, where ``value`` is the captured
    number.

    Args:
        experiences: Dicts with ``experience_summaries`` (list of str).

    Returns:
        ``BEST`` for 6+ points, ``BETTER`` for 4+, ``GOOD`` for 2+,
        otherwise ``NO_BONUS``.
    """
    # (pattern, step): every full ``step`` in the captured number is worth
    # one point, capped at three points per pattern and experience.
    scoring_rules = (
        # Performance improvements: 20% = 1, 40% = 2, 60%+ = 3
        (r'improv(?:ed|ing)\s+\w+\s+by\s+(\d+)%', 20),
        # Cost savings: $100K = 1, $200K = 2, $300K+ = 3
        (r'sav(?:ed|ing)\s+\$(\d+)K', 100),
        # Time/efficiency improvements: 25% = 1, 50% = 2, 75%+ = 3
        (r'reduc(?:ed|ing)\s+\w+\s+(?:time|duration)\s+by\s+(\d+)%', 25),
    )

    impact_score = 0
    for entry in experiences:
        text = ' '.join(entry.get('experience_summaries', []))
        for pattern, step in scoring_rules:
            hit = re.search(pattern, text, re.IGNORECASE)
            if hit is not None:
                impact_score += min(3, float(hit.group(1)) // step)

    if impact_score >= 6:
        return BonusLevel.BEST
    if impact_score >= 4:
        return BonusLevel.BETTER
    if impact_score >= 2:
        return BonusLevel.GOOD
    return BonusLevel.NO_BONUS
|
407 |
+
|
408 |
+
def evaluate_management_scope(self, experiences: List[dict]) -> float:
    """Award a bonus for the breadth of management duties described.

    Per experience, points come from the first stated team size, the first
    stated number of mentees, and any mention of leading a development,
    migration or implementation effort.

    Args:
        experiences: Dicts with ``experience_summaries`` (list of str).

    Returns:
        ``BEST`` for 5+ points, ``BETTER`` for 3+, ``GOOD`` for 1+,
        otherwise ``NO_BONUS``.
    """
    scope_score = 0

    for entry in experiences:
        text = ' '.join(entry.get('experience_summaries', []))

        # Team size: 3 people = 1 point, 6 = 2 points, 9+ = 3 points
        team = re.search(r'(?:manag|lead)(?:ed|ing)?\s+(?:a\s+)?team\s+of\s+(\d+)', text, re.IGNORECASE)
        if team is not None:
            scope_score += min(3, int(team.group(1)) // 3)

        # Mentoring: 2 mentees = 1 point, 4+ mentees = 2 points
        mentoring = re.search(r'mentor(?:ed|ing)?\s+(\d+)', text, re.IGNORECASE)
        if mentoring is not None:
            scope_score += min(2, int(mentoring.group(1)) // 2)

        # Project leadership: a flat single point
        if re.search(r'led\s+(?:development|migration|implementation)', text, re.IGNORECASE) is not None:
            scope_score += 1

    if scope_score >= 5:
        return BonusLevel.BEST
    if scope_score >= 3:
        return BonusLevel.BETTER
    if scope_score >= 1:
        return BonusLevel.GOOD
    return BonusLevel.NO_BONUS
|
436 |
+
|
437 |
+
def evaluate_soft_skills(self, experiences: List[dict]) -> float:
    """Award a bonus for the number of soft-skill categories evidenced.

    A category of ``SOFT_SKILLS_KEYWORDS`` counts as present when at least
    one of its phrases occurs in the lowercased summaries of any experience.

    Args:
        experiences: Dicts with ``experience_summaries`` (list of str).

    Returns:
        ``BEST`` for 5+ categories, ``BETTER`` for 3+, ``GOOD`` for 1+,
        otherwise ``NO_BONUS``.
    """
    categories_found = set()

    for entry in experiences:
        text = ' '.join(entry.get('experience_summaries', [])).lower()
        for category, phrases in SOFT_SKILLS_KEYWORDS.items():
            if any(phrase in text for phrase in phrases):
                categories_found.add(category)

    categories_present = len(categories_found)

    if categories_present >= 5:
        return BonusLevel.BEST
    if categories_present >= 3:
        return BonusLevel.BETTER
    if categories_present >= 1:
        return BonusLevel.GOOD
    return BonusLevel.NO_BONUS
|
458 |
+
|
459 |
+
def analyze_primary_skills(self, job_posting: JobPosting,
                           seeker: IndependentJobSeekerAssessmentRDS) -> Dict[str, float]:
    """Rate occurrence, frequency and recency of the posting's primary skills.

    Each required skill (lowercased) is searched for as a whole word in the
    lowercased ``experience_summaries`` of every seeker experience, using a
    small table of known spelling variations where one exists.

    Args:
        job_posting: Posting whose ``primary_skills`` items expose
            ``skill_name``.
        seeker: Seeker whose ``experiences`` is a list of dicts; the entry
            at index 0 is treated as the most recent one.

    Returns:
        Dict with ``primary_skills_occurrence`` (share of required skills
        mentioned anywhere), ``primary_skills_frequency`` (total mention
        count) and ``primary_skills_recency`` (share of required skills
        mentioned in the first experience), each a ``RiskLevel`` value.
        When the posting lists no primary skills, all three are
        ``NO_RISK`` because there is nothing the seeker could be missing.
    """
    required_skills = {
        skill.skill_name.lower() for skill in (job_posting.primary_skills or [])
    }
    if not required_skills:
        # Avoids dividing by zero in the percentage calculations below.
        return {
            'primary_skills_occurrence': RiskLevel.NO_RISK,
            'primary_skills_frequency': RiskLevel.NO_RISK,
            'primary_skills_recency': RiskLevel.NO_RISK,
        }

    # Known alternative spellings per skill
    skill_variations = {
        'python': ['python', 'py', 'python3'],
        'aws': ['aws', 'amazon web services', 'amazon aws', 'cloud'],
        'microservices': ['microservices', 'micro-services', 'microservice', 'micro service'],
        'rest apis': ['rest', 'restful', 'rest api', 'rest apis', 'restful api', 'restful apis'],
        'api': ['api', 'apis', 'restful api', 'web api'],
        'docker': ['docker', 'containerization', 'containers'],
        'kubernetes': ['kubernetes', 'k8s', 'kubectl'],
    }

    # Compile one pattern per variation up front instead of once per
    # experience. Variations are counted independently, so overlapping
    # ones (e.g. "rest" and "rest api") each contribute to the total.
    skill_patterns = {
        skill: [
            re.compile(rf'\b{re.escape(variation)}\b')
            for variation in skill_variations.get(skill, [skill])
        ]
        for skill in required_skills
    }

    skill_counts = defaultdict(int)
    recent_skills = set()

    for index, exp in enumerate(seeker.experiences):
        exp_text = ' '.join(exp.get('experience_summaries', [])).lower()

        for skill, patterns in skill_patterns.items():
            mentions = sum(len(pattern.findall(exp_text)) for pattern in patterns)
            skill_counts[skill] += mentions
            if mentions > 0 and index == 0:  # Most recent experience
                recent_skills.add(skill)

    # Occurrence: share of required skills mentioned at least once
    total_unique = sum(1 for count in skill_counts.values() if count > 0)
    occurrence_percentage = (total_unique / len(required_skills)) * 100

    if occurrence_percentage == 100:
        occurrence_risk = RiskLevel.NO_RISK
    elif occurrence_percentage >= 75:
        occurrence_risk = RiskLevel.LOW_RISK
    elif occurrence_percentage >= 50:
        occurrence_risk = RiskLevel.MEDIUM_RISK
    else:
        occurrence_risk = RiskLevel.HIGH_RISK

    # Frequency: total number of mentions across all experiences
    total_mentions = sum(skill_counts.values())
    if total_mentions >= 15:
        frequency_risk = RiskLevel.LOW_RISK
    elif total_mentions >= 8:
        frequency_risk = RiskLevel.MEDIUM_RISK
    else:
        frequency_risk = RiskLevel.HIGH_RISK

    # Recency: share of required skills used in the first experience
    recent_percentage = (len(recent_skills) / len(required_skills)) * 100
    if recent_percentage == 100:
        recency_risk = RiskLevel.NO_RISK
    elif recent_percentage >= 65:
        recency_risk = RiskLevel.LOW_RISK
    elif recent_percentage >= 50:
        recency_risk = RiskLevel.MEDIUM_RISK
    else:
        recency_risk = RiskLevel.HIGH_RISK

    return {
        'primary_skills_occurrence': occurrence_risk,
        'primary_skills_frequency': frequency_risk,
        'primary_skills_recency': recency_risk,
    }
|
536 |
+
def calculate_job_title_match(self, job_posting: JobPosting,
                              experiences: List[dict]) -> Tuple[float, float]:
    """Measure the share of career time spent under a matching job title.

    An experience matches when its lowercased ``title`` contains the
    lowercased posting title or is contained in it. Durations are measured
    in 30-day months.

    Args:
        job_posting: Posting providing ``title``.
        experiences: Dicts with an ISO-format ``start_date``, an optional
            ``end_date`` (missing or empty means ongoing) and an optional
            ``title``.

    Returns:
        ``(match_percentage, risk_level)``. The risk is ``NO_RISK`` from
        80%, ``LOW_RISK`` from 70%, ``MEDIUM_RISK`` from 60%, otherwise
        ``HIGH_RISK``. ``(0, HIGH_RISK)`` is returned when the total
        duration is zero.
    """
    target_title = job_posting.title.lower()
    total_months = 0
    matched_months = 0

    for exp in experiences:
        start_date = datetime.fromisoformat(exp['start_date'])
        end_date = datetime.fromisoformat(exp['end_date']) if exp.get('end_date') else datetime.now()
        months = (end_date - start_date).days / 30
        total_months += months

        # An empty string is a substring of every string, so both titles
        # must be non-empty before containment means anything; ``or ''``
        # also covers a title that is present but None.
        title = (exp.get('title') or '').lower()
        if title and target_title and (target_title in title or title in target_title):
            matched_months += months

    if total_months == 0:
        return 0, RiskLevel.HIGH_RISK

    match_percentage = (matched_months / total_months) * 100

    if match_percentage >= 80:
        risk_level = RiskLevel.NO_RISK
    elif match_percentage >= 70:
        risk_level = RiskLevel.LOW_RISK
    elif match_percentage >= 60:
        risk_level = RiskLevel.MEDIUM_RISK
    else:
        risk_level = RiskLevel.HIGH_RISK

    return match_percentage, risk_level
|
570 |
+
def evaluate_career_experience(self, experiences: List[dict],
                               required_years: int) -> float:
    """Rate overall career length, measured from the earliest start date.

    Args:
        experiences: Dicts with an ISO-format ``start_date``.
        required_years: Accepted but not used by the current thresholds.

    Returns:
        ``HIGH_RISK`` for no experiences or under 3 years, ``LOW_RISK``
        for 3 up to 7 years, ``NO_RISK`` for 7 years or more.
    """
    if not experiences:
        return RiskLevel.HIGH_RISK

    career_start = min(
        datetime.fromisoformat(entry['start_date']) for entry in experiences
    )
    career_years = (datetime.now() - career_start).days / 365

    if career_years < 3:
        return RiskLevel.HIGH_RISK
    if career_years < 7:
        return RiskLevel.LOW_RISK
    return RiskLevel.NO_RISK
|
584 |
+
def evaluate_certifications(self, certifications: List[dict],
                            job_posting: JobPosting) -> float:
    """Award a bonus for certifications whose name echoes the job title.

    A certification is relevant when any whitespace-separated word of the
    lowercased posting title occurs as a substring of its lowercased name.

    Args:
        certifications: Dicts with an optional ``name``.
        job_posting: Posting providing ``title``.

    Returns:
        ``BEST`` for 3+ relevant certifications, ``BETTER`` for 2,
        ``GOOD`` for 1, otherwise ``NO_BONUS``.
    """
    if not certifications:
        return BonusLevel.NO_BONUS

    title_words = set(job_posting.title.lower().split())

    relevant_total = 0
    for cert in certifications:
        lowered_name = cert.get('name', '').lower()
        if any(word in lowered_name for word in title_words):
            relevant_total += 1

    # Index by the relevant count, capped at three.
    bonus_by_count = (
        BonusLevel.NO_BONUS,
        BonusLevel.GOOD,
        BonusLevel.BETTER,
        BonusLevel.BEST,
    )
    return bonus_by_count[min(relevant_total, 3)]
|
605 |
+
def check_location_match(self, seeker_location: str, job_location: str) -> float:
    """Rate whether the seeker's location equals the job's location.

    Args:
        seeker_location: Seeker location string; may be empty or ``None``.
        job_location: Job location string; may be empty or ``None``.

    Returns:
        ``HIGH_RISK`` when either location is missing, ``NO_RISK`` when
        they are equal ignoring case, otherwise ``LOW_RISK``.
    """
    if not (seeker_location and job_location):
        return RiskLevel.HIGH_RISK

    same_place = seeker_location.lower() == job_location.lower()
    # A differing location only counts as low risk.
    return RiskLevel.NO_RISK if same_place else RiskLevel.LOW_RISK
|
613 |
def calculate_similarity(self, job_embeddings: Dict[str, np.ndarray],
                         seeker_embeddings: Dict[str, np.ndarray],
                         job_posting: JobPosting,
                         processed_seeker: IndependentJobSeekerAssessmentRDS) -> MatchResult:
    """Score a seeker against a job posting on a 0-100 scale.

    The total is ``sum(weight * risk level) + sum(weight * bonus level)``
    with weights from ``FIELD_WEIGHTS``, normalised and clamped to
    [0, 100]. Embedding similarities are computed per field pair and
    reported in ``field_scores`` but do not enter the total.

    Args:
        job_embeddings: Job field name -> embedding vector.
        seeker_embeddings: Seeker field name -> embedding vector.
        job_posting: Posting used by the rule-based criteria.
        processed_seeker: Seeker providing ``experiences`` and
            ``certifications``.

    Returns:
        ``MatchResult`` whose ``similarity_score`` is the normalised total,
        whose ``field_scores`` merges embedding scores (keyed
        ``"<job field>_<seeker field>"``), risk levels and bonus levels,
        and whose ``explanation`` lists every non-neutral criterion.
    """
    field_scores = {}
    explanation_parts = []

    # Embedding-based similarity per mapped field pair
    for job_field, seeker_fields in FIELD_MAPPING.items():
        if job_field not in job_embeddings:
            continue

        job_emb = job_embeddings[job_field]

        for seeker_field in seeker_fields:
            if seeker_field not in seeker_embeddings:
                continue

            seeker_emb = seeker_embeddings[seeker_field]

            # Cosine similarity; the epsilon guards against zero vectors.
            similarity = np.dot(job_emb, seeker_emb) / (
                np.linalg.norm(job_emb) * np.linalg.norm(seeker_emb) + 1e-9
            )

            # Damp, then rescale from [-1, 1] to [0, 1].
            raw_score = similarity * 0.8
            field_score = max(0, min(1, (raw_score + 1) / 2))

            # The underscore separator keeps these keys in line with the
            # pair names used in FIELD_WEIGHTS.
            field_pair = f"{job_field}_{seeker_field}"
            field_scores[field_pair] = field_score

    # Risk criteria
    risk_scores = {
        'job_stability': self.calculate_job_stability_risk(processed_seeker.experiences),
        'job_duration': self.calculate_job_duration_risk(processed_seeker.experiences),
        'employment_recency': self.calculate_employment_recency_risk(processed_seeker.experiences),
    }
    risk_scores.update(self.analyze_primary_skills(job_posting, processed_seeker))

    # Only the risk level of the title match is used; the percentage is not.
    _title_match_percentage, title_risk = self.calculate_job_title_match(
        job_posting, processed_seeker.experiences)
    risk_scores['job_title_experience'] = title_risk

    # Bonus criteria
    bonus_scores = {
        'responsibility_tenure': self.calculate_responsibility_tenure(processed_seeker.experiences),
        'employment_pedigree': self.evaluate_employment_pedigree(processed_seeker.experiences, job_posting),
        'role_impact': self.evaluate_role_impact(processed_seeker.experiences),
        'management_scope': self.evaluate_management_scope(processed_seeker.experiences),
        'soft_skills': self.evaluate_soft_skills(processed_seeker.experiences),
        'certifications': self.evaluate_certifications(processed_seeker.certifications, job_posting),
    }

    # Total Score = sum(Risk Weight x Risk Level) + sum(Bonus Weight x Bonus Level)
    total_score = 0.0

    for category, score in risk_scores.items():
        weight = FIELD_WEIGHTS.get(category, 0)
        total_score += weight * score
        if score != RiskLevel.NO_RISK:
            # replace('_', ' ') turns "job_stability" into "Job Stability".
            explanation_parts.append(
                f"{category.replace('_', ' ').title()}: "
                f"{'High' if score == RiskLevel.HIGH_RISK else 'Medium' if score == RiskLevel.MEDIUM_RISK else 'Low'} Risk"
            )

    for category, score in bonus_scores.items():
        weight = FIELD_WEIGHTS.get(category, 0)
        total_score += weight * score
        if score != BonusLevel.NO_BONUS:
            explanation_parts.append(
                f"{category.replace('_', ' ').title()}: "
                f"{'Best' if score == BonusLevel.BEST else 'Better' if score == BonusLevel.BETTER else 'Good'} Bonus"
            )

    # Normalize to a 0-100 scale.
    # NOTE(review): both bounds sum over every FIELD_WEIGHTS entry, including
    # categories that never enter total_score, and treat bonus categories as
    # ranging over the risk values; the bounds are therefore wider than the
    # attainable totals. Left unchanged so existing scores stay comparable.
    min_possible = sum(w * RiskLevel.HIGH_RISK for w in FIELD_WEIGHTS.values())
    max_possible = sum(w * max(RiskLevel.NO_RISK, BonusLevel.BEST) for w in FIELD_WEIGHTS.values())
    normalized_score = ((total_score - min_possible) / (max_possible - min_possible)) * 100
    final_score = max(0, min(100, normalized_score))

    return MatchResult(
        similarity_score=final_score,
        field_scores={**field_scores, **risk_scores, **bonus_scores},
        explanation=" | ".join(explanation_parts)
    )
|
700 |
|
701 |
def initialize_embedding_system(job_encoder, seeker_encoder):
|