sebastianalgharaballi committed on
Commit 66bc137 · verified · 1 Parent(s): 0516311

new scoring

Files changed (1)
  1. embeddings.py +543 -82
embeddings.py CHANGED
@@ -58,6 +58,84 @@ class MatchResult:
     explanation: str
     status: str = "unseen"

 class EmbeddingManager:
     def __init__(self, job_encoder, seeker_encoder):
         self.job_encoder = job_encoder
@@ -65,10 +143,7 @@ class EmbeddingManager:

     def get_job_fields(self, job_posting: JobPosting) -> Dict[str, str]:
         """Extract relevant fields from job posting"""
-        # Convert primary skills list to string
         primary_skills_str = ', '.join([skill.skill_name for skill in job_posting.primary_skills]) if job_posting.primary_skills else ''
-
-        # Convert secondary skills list to string
         secondary_skills_str = ', '.join([skill.skill_name for skill in job_posting.secondary_skills]) if job_posting.secondary_skills else ''

         return {
@@ -78,8 +153,7 @@ class EmbeddingManager:
             'primary_skills': primary_skills_str,
             'secondary_skills': secondary_skills_str
         }
-
-    def get_seeker_fields(self, processed_seeker: IndependentJobSeekerAssessmentRDS,
                           unprocessed_seeker: JobseekerInfoRDS) -> Dict[str, str]:
         """Extract relevant fields from job seeker"""
         return {
@@ -90,8 +164,9 @@ class EmbeddingManager:
             'certifications': self._format_certifications(processed_seeker.certifications),
             'summary': unprocessed_seeker.summary
         }
-
     def _format_experience(self, experiences: List[dict]) -> str:
         exp_parts = []
         for exp in experiences:
             summaries = exp.get('experience_summaries', [])
@@ -99,8 +174,6 @@ class EmbeddingManager:
             exp_parts.append(exp_str)
         return ' | '.join(exp_parts)

-
-
     def _format_education(self, educations: List[dict]) -> str:
         """Format education entries into a single string"""
         edu_parts = []
@@ -111,17 +184,16 @@ class EmbeddingManager:
             edu_str = f"{degree} in {field} from {institution}"
             edu_parts.append(edu_str)
         return ' | '.join(edu_parts)
-
     def _format_certifications(self, certifications: List[dict]) -> str:
         """Format certification entries into a single string"""
         cert_parts = []
         for cert in certifications:
-            name = cert.get('name', '')  # This is required as per schema
             org = cert.get('organization', '')
             start = cert.get('start_date', '')
             end = cert.get('end_date', '')

-            # Build certification string
             cert_str = name
             if org:
                 cert_str += f" from {org}"
@@ -136,105 +208,494 @@ class EmbeddingManager:
             cert_parts.append(cert_str)

         return ' | '.join(cert_parts)

-    def embed_jobposting(self, job_posting: JobPosting) -> Dict[str, np.ndarray]:
-        """Generate embeddings for job posting fields"""
-        fields = self.get_job_fields(job_posting)
-        return self.job_encoder.encode_fields(fields)
-
-    def embed_jobseeker(self, processed_seeker: IndependentJobSeekerAssessmentRDS,
-                        unprocessed_seeker: JobseekerInfoRDS) -> Dict[str, np.ndarray]:
-        """Generate embeddings for job seeker fields"""
-        fields = self.get_seeker_fields(processed_seeker, unprocessed_seeker)
-        print("DEBUG - Seeker fields:", fields)
-        return self.seeker_encoder.encode_fields(fields)

-    # list of job seeker ids with their scores (from metadata)
     def calculate_similarity(self, job_embeddings: Dict[str, np.ndarray],
-                             seeker_embeddings: Dict[str, np.ndarray]) -> MatchResult:
-        """Calculate similarity with strict thresholds"""
         field_scores = {}
         explanation_parts = []

-        # Calculate similarity for each field pair
         for job_field, seeker_fields in FIELD_MAPPING.items():
             if job_field not in job_embeddings:
                 continue

             job_emb = job_embeddings[job_field]

-            # Handle multiple seeker fields for one job field
             for seeker_field in seeker_fields:
                 if seeker_field not in seeker_embeddings:
                     continue

                 seeker_emb = seeker_embeddings[seeker_field]
-
-                # Calculate raw cosine similarity
                 similarity = np.dot(job_emb, seeker_emb) / (
                     np.linalg.norm(job_emb) * np.linalg.norm(seeker_emb) + 1e-9
                 )

-                # Scale to [0, 1] much more aggressively
-                raw_score = (similarity * 0.8)  # Compress range
-                field_score = max(0, min(1, (raw_score + 1) / 2))  # Rescale to [0,1]
-
-                # Apply non-linear transformation for more discrimination
-                if field_score > 0.9:  # Only the very best get boosted
-                    field_score = min(field_score * 1.1, 1.0)
-                elif field_score < 0.7:  # More aggressive penalty for lower scores
-                    field_score = field_score * 0.6

-                field_pair_name = f"{job_field}_{seeker_field}"
-                field_scores[field_pair_name] = field_score
-
-                # Much stricter thresholds for quality descriptions
-                match_quality = "strong" if field_score > 0.9 else \
-                                "good" if field_score > 0.8 else \
-                                "moderate" if field_score > 0.6 else "weak"
-
-                explanation_parts.append(
-                    f"{match_quality.capitalize()} match on {job_field} to {seeker_field} "
-                    f"(similarity: {field_score:.2f})"
-                )
-
-        # Calculate weighted average with critical field emphasis
-        final_score = 0.0
-        total_weight = 0.0
-
-        # Critical fields get extra weight (subject to change)
-        critical_fields = {
-            'primary_skills_primary_skills': 1,
-            'role_description_experience': 1,
-            'role_description_certifications': 1,  # Added certifications with same weight
         }

-        for field_pair, score in field_scores.items():
-            base_weight = FIELD_WEIGHTS.get(field_pair, 0.0)
-
-            # Apply critical field multiplier
-            weight = base_weight * critical_fields.get(field_pair, 1.0)
-
-            final_score += score * weight
-            total_weight += weight
-
-        if total_weight > 0:
-            final_score = final_score / total_weight
-
-        # Final adjustments for extreme discrimination
-        if final_score > 0.9:  # Only truly exceptional matches
-            final_score = min(final_score * 1.1, 1.0)
-        elif final_score < 0.7:  # Really penalize poor matches
-            final_score = final_score * 0.6

-        explanation = " | ".join(explanation_parts)
-
-        print("DEBUG - All field scores:", field_scores)

         return MatchResult(
             similarity_score=final_score,
-            field_scores=field_scores,
-            explanation=explanation
         )

 def initialize_embedding_system(job_encoder, seeker_encoder):
 
     explanation: str
     status: str = "unseen"

+from typing import Dict, List, Optional, Tuple, Set
+import numpy as np
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from collections import defaultdict
+import re
+class RiskLevel:
+    NO_RISK = 5
+    LOW_RISK = -5
+    MEDIUM_RISK = -10
+    HIGH_RISK = -15
+class BonusLevel:
+    NO_BONUS = 0
+    GOOD = 1
+    BETTER = 2
+    BEST = 3
+# Your existing field mappings
+FIELD_MAPPING = {
+    'title': ['summary'],
+    'primary_skills': ['primary_skills'],
+    'secondary_skills': ['secondary_skills'],
+    'role_description': ['experience', 'certifications']
+}
+# Updated field weights incorporating all criteria
+FIELD_WEIGHTS = {
+    'job_stability': 12,
+    'job_duration': 12,
+    'responsibility_tenure': 1.5,
+    'employment_pedigree': 7.5,
+    'primary_skills_experience': 5,
+    'career_experience': 2.5,
+    'role_impact': 5,
+    'management_scope': 1.5,
+    'primary_skills_occurrence': 12,
+    'primary_skills_frequency': 2.5,
+    'primary_skills_recency': 15,
+    'soft_skills': 1,
+    'employment_recency': 7.5,
+    'location_match': 1,
+    'certifications': 2,
+    'job_title_experience': 5,
+    'job_title_match': 7,
+    'primary_skills_primary_skills': 0.5,
+    'secondary_skills_secondary_skills': 0.1,
+    'role_description_experience': 0.25,
+    'role_description_certifications': 0.05,
+    'title_summary': 0.1
+}
+SOFT_SKILLS_KEYWORDS = {
+    'communication': ['effectively communicated', 'presented to stakeholders', 'negotiated', 'collaborated with', 'mediated'],
+    'teamwork': ['worked in a team', 'collaborated with', 'partnered with', 'contributed to a team effort'],
+    'leadership': ['led a team', 'mentored', 'coached', 'managed', 'guided'],
+    'problem_solving': ['resolved', 'addressed challenges', 'innovated', 'strategized', 'implemented solutions'],
+    'adaptability': ['adapted to', 'quickly learned', 'flexible in', 'handled change'],
+    'emotional_intelligence': ['empathized with', 'understood needs', 'fostered relationships', 'built trust', 'managed conflict']
+}
+LEADERSHIP_KEYWORDS = [
+    'led', 'managed', 'directed', 'architected', 'innovated',
+    'spearheaded', 'strategized', 'developed', 'executed',
+    'owned', 'delivered', 'implemented'
+]
+IMPACT_PATTERNS = {
+    'revenue_growth': r'increased revenue by (\d+)%',
+    'cost_savings': r'saved \$(\d+) million',
+    'project_launch': r'launched .+ generated \$(\d+) million',
+    'project_completion': r'completed .+ ahead of schedule',
+    'budget_management': r'managed \$(\d+) million budget',
+    'risk_mitigation': r'decreased .+ by (\d+)%',
+    'client_retention': r'improved retention by (\d+)%',
+    'satisfaction': r'satisfaction .+ (\d+)% to (\d+)%',
+    'team_growth': r'grew team by (\d+)%'
+}
+EXPERIENCE_LEVELS = {
+    'junior': ['Junior', 'Associate'],
+    'mid': ['Staff', 'Senior', 'Sr.'],
+    'senior': ['Principal', 'Lead', 'Supervisor', 'Manager'],
+    'executive': ['Director', 'VP', 'CXO', 'President', 'Owner', 'Founder', 'Partner']
+}
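A rough sketch of how these constants are meant to combine: the weights multiply the integer risk/bonus levels and the products are summed, matching the formula used later in calculate_similarity. The category names and level values below are made up for illustration only.

# Illustrative only; not part of the committed file.
example_risks = {'job_stability': -10, 'primary_skills_recency': 5}   # RiskLevel values
example_bonuses = {'role_impact': 2, 'soft_skills': 1}                # BonusLevel values
weights = {'job_stability': 12, 'primary_skills_recency': 15, 'role_impact': 5, 'soft_skills': 1}
total = sum(weights[k] * v for k, v in example_risks.items()) \
      + sum(weights[k] * v for k, v in example_bonuses.items())
print(total)  # 12*(-10) + 15*5 + 5*2 + 1*1 = -34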
 class EmbeddingManager:
     def __init__(self, job_encoder, seeker_encoder):
         self.job_encoder = job_encoder

     def get_job_fields(self, job_posting: JobPosting) -> Dict[str, str]:
         """Extract relevant fields from job posting"""
         primary_skills_str = ', '.join([skill.skill_name for skill in job_posting.primary_skills]) if job_posting.primary_skills else ''
         secondary_skills_str = ', '.join([skill.skill_name for skill in job_posting.secondary_skills]) if job_posting.secondary_skills else ''

         return {
             'primary_skills': primary_skills_str,
             'secondary_skills': secondary_skills_str
         }
+    def get_seeker_fields(self, processed_seeker: IndependentJobSeekerAssessmentRDS,
                           unprocessed_seeker: JobseekerInfoRDS) -> Dict[str, str]:
         """Extract relevant fields from job seeker"""
         return {
             'certifications': self._format_certifications(processed_seeker.certifications),
             'summary': unprocessed_seeker.summary
         }
+
     def _format_experience(self, experiences: List[dict]) -> str:
+        """Format experience entries into a single string"""
         exp_parts = []
         for exp in experiences:
             summaries = exp.get('experience_summaries', [])

             exp_parts.append(exp_str)
         return ' | '.join(exp_parts)

     def _format_education(self, educations: List[dict]) -> str:
         """Format education entries into a single string"""
         edu_parts = []

             edu_str = f"{degree} in {field} from {institution}"
             edu_parts.append(edu_str)
         return ' | '.join(edu_parts)
+
     def _format_certifications(self, certifications: List[dict]) -> str:
         """Format certification entries into a single string"""
         cert_parts = []
         for cert in certifications:
+            name = cert.get('name', '')
             org = cert.get('organization', '')
             start = cert.get('start_date', '')
             end = cert.get('end_date', '')

             cert_str = name
             if org:
                 cert_str += f" from {org}"

             cert_parts.append(cert_str)

         return ' | '.join(cert_parts)
+    def calculate_job_stability_risk(self, experiences: List[dict]) -> float:
+        """Calculate risk based on employment gaps within last 2 years"""
+        if not experiences:
+            return RiskLevel.HIGH_RISK
+        sorted_experiences = sorted(
+            experiences,
+            key=lambda x: datetime.fromisoformat(x['start_date'])
+        )
+
+        two_years_ago = datetime.now() - timedelta(days=730)
+        gaps = []
+
+        for i in range(1, len(sorted_experiences)):
+            current_start = datetime.fromisoformat(sorted_experiences[i]['start_date'])
+            if current_start < two_years_ago:
+                continue
+
+            prev_end = sorted_experiences[i-1].get('end_date')
+            prev_end = datetime.fromisoformat(prev_end) if prev_end else datetime.now()
+
+            if current_start > prev_end:
+                gap_months = (current_start - prev_end).days / 30
+                gaps.append(int(gap_months))
+        if not gaps:
+            return RiskLevel.NO_RISK
+
+        max_gap = max(gaps)
+        if max_gap > 9:
+            return RiskLevel.HIGH_RISK
+        elif max_gap > 3:
+            return RiskLevel.MEDIUM_RISK
+        return RiskLevel.LOW_RISK
+    def calculate_job_duration_risk(self, experiences: List[dict]) -> float:
+        """Calculate risk based on number of jobs in recent periods"""
+        now = datetime.now()
+        one_year_ago = now - timedelta(days=365)
+        two_years_ago = now - timedelta(days=730)
+
+        jobs_last_year = sum(1 for exp in experiences
+                             if datetime.fromisoformat(exp['start_date']) >= one_year_ago)
+        jobs_last_two_years = sum(1 for exp in experiences
+                                  if datetime.fromisoformat(exp['start_date']) >= two_years_ago)
+
+        # One year criteria
+        if jobs_last_year >= 4:
+            return RiskLevel.HIGH_RISK
+        elif jobs_last_year == 3:
+            return RiskLevel.MEDIUM_RISK
+        elif jobs_last_year == 2:
+            return RiskLevel.LOW_RISK
+        elif jobs_last_year == 1:
+            return RiskLevel.NO_RISK
+
+        # Two year criteria
+        if jobs_last_two_years >= 5:
+            return RiskLevel.HIGH_RISK
+        elif jobs_last_two_years in [3, 4]:
+            return RiskLevel.MEDIUM_RISK
+        elif jobs_last_two_years <= 2:
+            return RiskLevel.LOW_RISK
+
+        return RiskLevel.NO_RISK
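A quick usage sketch of the gap logic above, assuming experiences carry ISO-8601 date strings and that an EmbeddingManager can be built with placeholder encoders (the date-based checks never touch them); the data below is hypothetical and stated relative to today so the result stays stable:

# Illustrative only; not part of the committed file.
from datetime import datetime, timedelta
manager = EmbeddingManager(None, None)   # encoders unused by the date-based checks
today = datetime.now()
experiences = [
    {'start_date': (today - timedelta(days=900)).date().isoformat(),
     'end_date':   (today - timedelta(days=400)).date().isoformat()},
    {'start_date': (today - timedelta(days=200)).date().isoformat(),
     'end_date':   None},   # current job, started after a roughly 6-7 month gap
]
print(manager.calculate_job_stability_risk(experiences))   # -10: a 3-9 month gap maps to MEDIUM_RISK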
+    def calculate_employment_recency_risk(self, experiences: List[dict]) -> float:
+        """Calculate risk based on most recent employment end date"""
+        if not experiences:
+            return RiskLevel.HIGH_RISK
+
+        latest_end_date = max(
+            datetime.fromisoformat(exp['end_date']) if exp.get('end_date')
+            else datetime.now()
+            for exp in experiences
+        )
+
+        months_since_end = (datetime.now() - latest_end_date).days / 30
+
+        if months_since_end > 9:
+            return RiskLevel.HIGH_RISK
+        elif months_since_end > 6:
+            return RiskLevel.MEDIUM_RISK
+        elif months_since_end > 3:
+            return RiskLevel.LOW_RISK
+        return RiskLevel.NO_RISK
+    def calculate_responsibility_tenure(self, experiences: List[dict]) -> float:
+        """Calculate bonus based on leadership tenure"""
+        leadership_months = 0
+
+        for exp in experiences:
+            description = ' '.join(exp.get('experience_summaries', []))
+            if any(keyword in description.lower() for keyword in LEADERSHIP_KEYWORDS):
+                start = datetime.fromisoformat(exp['start_date'])
+                end = datetime.fromisoformat(exp['end_date']) if exp.get('end_date') else datetime.now()
+                leadership_months += (end - start).days / 30
+
+        if leadership_months >= 24:
+            return BonusLevel.BEST
+        elif leadership_months >= 12:
+            return BonusLevel.BETTER
+        elif leadership_months > 0:
+            return BonusLevel.GOOD
+        return BonusLevel.NO_BONUS
+    def evaluate_employment_pedigree(self, experiences: List[dict],
+                                     job_posting: JobPosting) -> float:
+        """Evaluate employment pedigree based on company comparisons"""
+        score = 0
+        recent_experiences = experiences[:3]  # Focus on most recent experiences
+
+        for exp in recent_experiences:
+            # Check industry match
+            if exp.get('industry') == job_posting.industry:
+                score += 1
+
+            # Check company size match
+            if abs(exp.get('company_size', 0) - job_posting.company_size) < 0.2:  # Within 20%
+                score += 1
+
+            # Check revenue match
+            if abs(exp.get('company_revenue', 0) - job_posting.company_revenue) < 0.2:
+                score += 1
+
+            # Check growth rate match
+            if abs(exp.get('growth_rate', 0) - job_posting.growth_rate) < 0.1:
+                score += 1
+
+        # Scale score to bonus levels
+        if score >= 10:  # High match across multiple factors
+            return BonusLevel.BEST
+        elif score >= 6:
+            return BonusLevel.BETTER
+        elif score >= 3:
+            return BonusLevel.GOOD
+        return BonusLevel.NO_BONUS
+
+    def evaluate_job_title_experience(self, experiences: List[dict], required_years: int) -> float:
+        """Evaluate job title experience against required years"""
+        actual_years = sum((datetime.fromisoformat(exp.get('end_date', datetime.now().isoformat())) -
+                            datetime.fromisoformat(exp['start_date'])).days / 365
+                           for exp in experiences)
+
+        percentage = (actual_years / required_years) * 100 if required_years > 0 else 0
+
+        if percentage >= 100:
+            return RiskLevel.NO_RISK
+        elif percentage >= 71:
+            return RiskLevel.LOW_RISK
+        elif percentage >= 61:
+            return RiskLevel.MEDIUM_RISK
+        return RiskLevel.HIGH_RISK
+
+    def evaluate_primary_skills_quality(self, experience_description: str) -> float:
+        """Evaluate the quality of primary skills usage description"""
+        # Count technical terms and action verbs
+        tech_terms = len(re.findall(r'\b(?:API|REST|SDK|Framework|Platform|System|Database)\b',
+                                    experience_description, re.IGNORECASE))
+        action_verbs = len(re.findall(r'\b(?:Designed|Implemented|Developed|Engineered|Architected)\b',
+                                      experience_description, re.IGNORECASE))
+        context_richness = len(re.findall(r'using|with|through|by|via',
+                                          experience_description, re.IGNORECASE))
+
+        total_score = tech_terms + action_verbs + context_richness
+
+        if total_score >= 7:  # Complex technical implementation with context
+            return BonusLevel.BEST
+        elif total_score >= 5:  # Good technical description
+            return BonusLevel.BETTER
+        elif total_score >= 3:  # Basic technical mention
+            return BonusLevel.GOOD
+        return BonusLevel.NO_BONUS
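For instance, evaluate_primary_skills_quality is a pure function of the description text: it only counts regex hits. With the hypothetical sentence below, the counts work out to 3 tech terms + 1 action verb + 2 context words = 6, which lands in the BETTER band:

# Illustrative only; not part of the committed file.
manager = EmbeddingManager(None, None)
desc = "Designed a REST API platform using Python with Docker"
print(manager.evaluate_primary_skills_quality(desc))
# tech terms: REST, API, platform (3); action verbs: Designed (1); context: using, with (2)
# total 6 -> BonusLevel.BETTER (2)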
+    def evaluate_role_impact(self, experiences: List[dict]) -> float:
+        """Evaluate quantifiable impact mentions in role descriptions"""
+        impact_score = 0
+
+        for exp in experiences:
+            description = ' '.join(exp.get('experience_summaries', []))
+
+            # Performance improvements
+            if perf_match := re.search(r'improv(?:ed|ing)\s+\w+\s+by\s+(\d+)%', description, re.IGNORECASE):
+                value = float(perf_match.group(1))
+                impact_score += min(3, value // 20)  # 20% = 1 point, 40% = 2 points, 60%+ = 3 points
+
+            # Cost savings
+            if cost_match := re.search(r'sav(?:ed|ing)\s+\$(\d+)K', description, re.IGNORECASE):
+                value = float(cost_match.group(1))
+                impact_score += min(3, value // 100)  # $100K = 1 point, $200K = 2 points, $300K+ = 3 points
+
+            # Time/efficiency improvements
+            if time_match := re.search(r'reduc(?:ed|ing)\s+\w+\s+(?:time|duration)\s+by\s+(\d+)%', description, re.IGNORECASE):
+                value = float(time_match.group(1))
+                impact_score += min(3, value // 25)  # 25% = 1 point, 50% = 2 points, 75%+ = 3 points

+        if impact_score >= 6:
+            return BonusLevel.BEST
+        elif impact_score >= 4:
+            return BonusLevel.BETTER
+        elif impact_score >= 2:
+            return BonusLevel.GOOD
+        return BonusLevel.NO_BONUS
+
+    def evaluate_management_scope(self, experiences: List[dict]) -> float:
+        """Evaluate scope of management responsibilities"""
+        scope_score = 0
+
+        for exp in experiences:
+            description = ' '.join(exp.get('experience_summaries', []))
+
+            # Team size
+            if team_match := re.search(r'(?:manag|lead)(?:ed|ing)?\s+(?:a\s+)?team\s+of\s+(\d+)', description, re.IGNORECASE):
+                team_size = int(team_match.group(1))
+                scope_score += min(3, team_size // 3)  # 3 people = 1 point, 6 people = 2 points, 9+ = 3 points
+
+            # Mentoring/training
+            if mentor_match := re.search(r'mentor(?:ed|ing)?\s+(\d+)', description, re.IGNORECASE):
+                mentees = int(mentor_match.group(1))
+                scope_score += min(2, mentees // 2)  # 2 mentees = 1 point, 4+ mentees = 2 points
+
+            # Project leadership
+            if re.search(r'led\s+(?:development|migration|implementation)', description, re.IGNORECASE):
+                scope_score += 1

+        if scope_score >= 5:
+            return BonusLevel.BEST
+        elif scope_score >= 3:
+            return BonusLevel.BETTER
+        elif scope_score >= 1:
+            return BonusLevel.GOOD
+        return BonusLevel.NO_BONUS
+
+    def evaluate_soft_skills(self, experiences: List[dict]) -> float:
+        """Evaluate presence of soft skills keywords"""
+        skill_matches = defaultdict(int)
+
+        for exp in experiences:
+            description = ' '.join(exp.get('experience_summaries', []))
+            for category, keywords in SOFT_SKILLS_KEYWORDS.items():
+                for keyword in keywords:
+                    if keyword in description.lower():
+                        skill_matches[category] += 1
+
+        # Count categories with significant matches
+        categories_present = sum(1 for matches in skill_matches.values() if matches > 0)
+
+        if categories_present >= 5:  # Strong soft skills across most categories
+            return BonusLevel.BEST
+        elif categories_present >= 3:
+            return BonusLevel.BETTER
+        elif categories_present >= 1:
+            return BonusLevel.GOOD
+        return BonusLevel.NO_BONUS
+
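A small sketch of how the impact regexes above turn phrases into points, using a made-up experience summary (the numbers in the comment follow directly from the // divisions in evaluate_role_impact):

# Illustrative only; not part of the committed file.
manager = EmbeddingManager(None, None)
exps = [{'experience_summaries': [
    "Improved throughput by 40% and saved $250K by consolidating services"
]}]
print(manager.evaluate_role_impact(exps))
# 40% improvement -> 2 points, $250K savings -> 2 points, total 4 -> BonusLevel.BETTER (2)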
+    def analyze_primary_skills(self, job_posting: JobPosting,
+                               seeker: IndependentJobSeekerAssessmentRDS) -> Dict[str, float]:
+        """Analyze occurrence, frequency, and recency of primary skills"""
+        required_skills = {skill.skill_name.lower() for skill in job_posting.primary_skills}
+        skill_counts = defaultdict(int)
+        recent_skills = set()
+
+        # Define skill variations
+        skill_variations = {
+            'python': ['python', 'py', 'python3'],
+            'aws': ['aws', 'amazon web services', 'amazon aws', 'cloud'],
+            'microservices': ['microservices', 'micro-services', 'microservice', 'micro service'],
+            'rest apis': ['rest', 'restful', 'rest api', 'rest apis', 'restful api', 'restful apis'],
+            'api': ['api', 'apis', 'restful api', 'web api'],
+            'docker': ['docker', 'containerization', 'containers'],
+            'kubernetes': ['kubernetes', 'k8s', 'kubectl'],
+        }
+
+        for i, exp in enumerate(seeker.experiences):
+            exp_text = ' '.join(exp.get('experience_summaries', []))
+            exp_text = exp_text.lower()
+            exp_skills = set()
+
+            for skill in required_skills:
+                # Get variations for this skill
+                variations = skill_variations.get(skill.lower(), [skill.lower()])
+
+                # Count all variations
+                skill_count = sum(
+                    len(re.findall(rf'\b{re.escape(var)}\b', exp_text))
+                    for var in variations
+                )
+
+                skill_counts[skill] += skill_count
+                if skill_count > 0:
+                    exp_skills.add(skill)
+                    if i == 0:  # Most recent experience
+                        recent_skills.add(skill)
+
+        # Calculate occurrence risk
+        total_unique = len(set(skill for skill, count in skill_counts.items() if count > 0))
+        occurrence_percentage = (total_unique / len(required_skills)) * 100
+
+        if occurrence_percentage == 100:
+            occurrence_risk = RiskLevel.NO_RISK
+        elif occurrence_percentage >= 75:
+            occurrence_risk = RiskLevel.LOW_RISK
+        elif occurrence_percentage >= 50:
+            occurrence_risk = RiskLevel.MEDIUM_RISK
+        else:
+            occurrence_risk = RiskLevel.HIGH_RISK
+
+        # Calculate frequency risk with more lenient thresholds
+        total_mentions = sum(skill_counts.values())
+        if total_mentions >= 15:  # Changed from 20
+            frequency_risk = RiskLevel.LOW_RISK
+        elif total_mentions >= 8:  # Changed from 10
+            frequency_risk = RiskLevel.MEDIUM_RISK
+        else:
+            frequency_risk = RiskLevel.HIGH_RISK
+
+        # Calculate recency score
+        recent_percentage = (len(recent_skills) / len(required_skills)) * 100
+        if recent_percentage == 100:
+            recency_risk = RiskLevel.NO_RISK
+        elif recent_percentage >= 65:
+            recency_risk = RiskLevel.LOW_RISK
+        elif recent_percentage >= 50:
+            recency_risk = RiskLevel.MEDIUM_RISK
+        else:
+            recency_risk = RiskLevel.HIGH_RISK
+
+        return {
+            'primary_skills_occurrence': occurrence_risk,
+            'primary_skills_frequency': frequency_risk,
+            'primary_skills_recency': recency_risk
+        }
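The heart of the occurrence counting is word-boundary matching over the variation lists. The full method needs JobPosting and seeker objects, so here is a standalone sketch of just that inner loop on toy text:

# Illustrative only; not part of the committed file.
import re
exp_text = "built restful apis in python3 and deployed containers with k8s".lower()
variations = {'python': ['python', 'py', 'python3'],
              'kubernetes': ['kubernetes', 'k8s', 'kubectl'],
              'aws': ['aws', 'amazon web services']}
counts = {skill: sum(len(re.findall(rf'\b{re.escape(v)}\b', exp_text)) for v in vs)
          for skill, vs in variations.items()}
print(counts)   # {'python': 1, 'kubernetes': 1, 'aws': 0} -> 2 of 3 required skills present (~67% occurrence)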
+    def calculate_job_title_match(self, job_posting: JobPosting,
+                                  experiences: List[dict]) -> Tuple[float, float]:
+        """Calculate job title match percentage and risk level"""
+        target_title = job_posting.title.lower()
+        total_months = 0
+        matched_months = 0
+
+        for exp in experiences:
+            start_date = datetime.fromisoformat(exp['start_date'])
+            end_date = datetime.fromisoformat(exp['end_date']) if exp.get('end_date') else datetime.now()
+            months = (end_date - start_date).days / 30
+            total_months += months
+
+            # Check for exact or related title match
+            title = exp.get('title', '').lower()
+            if target_title in title or title in target_title:
+                matched_months += months
+
+        if total_months == 0:
+            return 0, RiskLevel.HIGH_RISK
+
+        match_percentage = (matched_months / total_months) * 100
+
+        # Determine risk level based on percentage
+        if match_percentage >= 80:
+            risk_level = RiskLevel.NO_RISK
+        elif match_percentage >= 70:
+            risk_level = RiskLevel.LOW_RISK
+        elif match_percentage >= 60:
+            risk_level = RiskLevel.MEDIUM_RISK
+        else:
+            risk_level = RiskLevel.HIGH_RISK
+
+        return match_percentage, risk_level
+    def evaluate_career_experience(self, experiences: List[dict],
+                                   required_years: int) -> float:
+        """Evaluate career experience level"""
+        if not experiences:
+            return RiskLevel.HIGH_RISK
+
+        first_job_date = min(datetime.fromisoformat(exp['start_date']) for exp in experiences)
+        years_experience = (datetime.now() - first_job_date).days / 365
+
+        if years_experience >= 7:
+            return RiskLevel.NO_RISK
+        elif years_experience >= 3:
+            return RiskLevel.LOW_RISK
+        return RiskLevel.HIGH_RISK
+    def evaluate_certifications(self, certifications: List[dict],
+                                job_posting: JobPosting) -> float:
+        """Evaluate relevance of certifications to job title"""
+        if not certifications:
+            return BonusLevel.NO_BONUS
+
+        relevant_certs = 0
+        job_keywords = set(job_posting.title.lower().split())
+
+        for cert in certifications:
+            cert_name = cert.get('name', '').lower()
+            if any(keyword in cert_name for keyword in job_keywords):
+                relevant_certs += 1
+
+        if relevant_certs >= 3:
+            return BonusLevel.BEST
+        elif relevant_certs == 2:
+            return BonusLevel.BETTER
+        elif relevant_certs == 1:
+            return BonusLevel.GOOD
+        return BonusLevel.NO_BONUS
+    def check_location_match(self, seeker_location: str, job_location: str) -> float:
+        """Check if experience locations match job location"""
+        if not seeker_location or not job_location:
+            return RiskLevel.HIGH_RISK
+
+        if seeker_location.lower() != job_location.lower():
+            return RiskLevel.LOW_RISK  # Different country is low risk per requirements
+        return RiskLevel.NO_RISK
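evaluate_career_experience keys everything off the earliest start_date (required_years is accepted but not used in the thresholds). A hypothetical call, with the date stated relative to today so the result is stable:

# Illustrative only; not part of the committed file.
from datetime import datetime, timedelta
manager = EmbeddingManager(None, None)
exps = [{'start_date': (datetime.now() - timedelta(days=5 * 365)).date().isoformat()}]
print(manager.evaluate_career_experience(exps, required_years=10))
# ~5 years since the first job -> RiskLevel.LOW_RISK (-5); 7+ years would be NO_RISK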
     def calculate_similarity(self, job_embeddings: Dict[str, np.ndarray],
+                             seeker_embeddings: Dict[str, np.ndarray],
+                             job_posting: JobPosting,
+                             processed_seeker: IndependentJobSeekerAssessmentRDS) -> MatchResult:
+        """Calculate similarity with comprehensive scoring system"""
+        # Calculate embedding-based similarity scores
         field_scores = {}
         explanation_parts = []

+        # Your existing embedding similarity calculation
         for job_field, seeker_fields in FIELD_MAPPING.items():
             if job_field not in job_embeddings:
                 continue

             job_emb = job_embeddings[job_field]

             for seeker_field in seeker_fields:
                 if seeker_field not in seeker_embeddings:
                     continue

                 seeker_emb = seeker_embeddings[seeker_field]
                 similarity = np.dot(job_emb, seeker_emb) / (
                     np.linalg.norm(job_emb) * np.linalg.norm(seeker_emb) + 1e-9
                 )

+                raw_score = (similarity * 0.8)
+                field_score = max(0, min(1, (raw_score + 1) / 2))

+                field_pair = f"{job_field}_{seeker_field}"
+                field_scores[field_pair] = field_score
+        # Calculate all risk scores
+        risk_scores = {
+            'job_stability': self.calculate_job_stability_risk(processed_seeker.experiences),
+            'job_duration': self.calculate_job_duration_risk(processed_seeker.experiences),
+            'employment_recency': self.calculate_employment_recency_risk(processed_seeker.experiences)
         }

+        # Add primary skills analysis
+        risk_scores.update(self.analyze_primary_skills(job_posting, processed_seeker))

+        # Calculate job title match
+        title_match_percentage, title_risk = self.calculate_job_title_match(
+            job_posting, processed_seeker.experiences)
+        risk_scores['job_title_experience'] = title_risk

+        # Calculate all bonus scores
+        bonus_scores = {
+            'responsibility_tenure': self.calculate_responsibility_tenure(processed_seeker.experiences),
+            'employment_pedigree': self.evaluate_employment_pedigree(processed_seeker.experiences, job_posting),
+            'role_impact': self.evaluate_role_impact(processed_seeker.experiences),
+            'management_scope': self.evaluate_management_scope(processed_seeker.experiences),
+            'soft_skills': self.evaluate_soft_skills(processed_seeker.experiences),
+            'certifications': self.evaluate_certifications(processed_seeker.certifications, job_posting)
+        }
+        # Calculate final score using the formula:
+        # Total Score = ∑(Risk Weight × Risk Level) + ∑(Bonus Weight × Bonus Level)
+        total_score = 0.0
+
+        # Add weighted risk scores
+        for category, score in risk_scores.items():
+            weight = FIELD_WEIGHTS.get(category, 0)
+            total_score += weight * score
+            if score != RiskLevel.NO_RISK:
+                explanation_parts.append(
+                    f"{category.replace('_', ' ').title()}: "
+                    f"{'High' if score == RiskLevel.HIGH_RISK else 'Medium' if score == RiskLevel.MEDIUM_RISK else 'Low'} Risk"
+                )
+
+        # Add weighted bonus scores
+        for category, score in bonus_scores.items():
+            weight = FIELD_WEIGHTS.get(category, 0)
+            total_score += weight * score
+            if score != BonusLevel.NO_BONUS:
+                explanation_parts.append(
+                    f"{category.replace('_', ' ').title()}: "
+                    f"{'Best' if score == BonusLevel.BEST else 'Better' if score == BonusLevel.BETTER else 'Good'} Bonus"
+                )
+        # Normalize to 0-100 scale
+        min_possible = sum(w * RiskLevel.HIGH_RISK for w in FIELD_WEIGHTS.values())
+        max_possible = sum(w * max(RiskLevel.NO_RISK, BonusLevel.BEST) for w in FIELD_WEIGHTS.values())
+        normalized_score = ((total_score - min_possible) / (max_possible - min_possible)) * 100
+        final_score = max(0, min(100, normalized_score))
         return MatchResult(
             similarity_score=final_score,
+            field_scores={**field_scores, **risk_scores, **bonus_scores},
+            explanation=" | ".join(explanation_parts)
         )
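To see what the normalization step does numerically: the FIELD_WEIGHTS above sum to 101, so min_possible = 101 × (-15) = -1515 and max_possible = 101 × 5 = 505. A hypothetical raw weighted total of 200 then maps onto the 0-100 scale like this:

# Illustrative only; not part of the committed file.
min_possible = 101 * (-15)          # -1515: every category at HIGH_RISK
max_possible = 101 * 5              # 505: every category at the top level
total_score = 200                   # hypothetical weighted sum of risks and bonuses
normalized = ((total_score - min_possible) / (max_possible - min_possible)) * 100
print(round(normalized, 1))         # ~84.9 on the 0-100 scale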
 
  def initialize_embedding_system(job_encoder, seeker_encoder):