sebastianalgharaballi
committed on
Commit
•
0516311
1
Parent(s):
fb19dc4
new scoring
Browse files- encoder.py +105 -25
encoder.py
CHANGED
@@ -2,11 +2,26 @@ from sentence_transformers import SentenceTransformer
|
|
2 |
from transformers import AutoModel, AutoTokenizer
|
3 |
import torch
|
4 |
import numpy as np
|
5 |
-
from typing import Dict, List, Union, Optional
|
6 |
from abc import ABC, abstractmethod
|
7 |
from dataclasses import dataclass
|
8 |
-
from datetime import datetime
|
|
|
9 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
@dataclass
|
12 |
class Skill:
|
@@ -39,6 +54,11 @@ class JobPosting:
|
|
39 |
reason_for_hire: str = ""
|
40 |
application_of_skills: str = ""
|
41 |
company_id: str = "test_company"
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
@dataclass
|
44 |
class IndependentJobSeekerAssessmentRDS:
|
@@ -53,6 +73,49 @@ class IndependentJobSeekerAssessmentRDS:
|
|
53 |
class JobseekerInfoRDS:
|
54 |
summary: str
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
class BaseFieldEncoder(ABC):
|
57 |
"""Base class for field-specific encoding"""
|
58 |
def __init__(self, model_name: str = 'all-mpnet-base-v2'):
|
@@ -118,7 +181,7 @@ class JobSeekerEncoder(BaseFieldEncoder):
|
|
118 |
'primary_skills',
|
119 |
'secondary_skills',
|
120 |
'certifications',
|
121 |
-
'education'
|
122 |
}
|
123 |
|
124 |
def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
|
@@ -175,14 +238,13 @@ class LegacyFieldEncoder:
|
|
175 |
outputs = self.model(
|
176 |
input_ids=input_ids,
|
177 |
attention_mask=attention_mask,
|
178 |
-
output_hidden_states=True
|
179 |
)
|
180 |
|
181 |
# Get last hidden state
|
182 |
last_hidden_state = outputs.last_hidden_state
|
183 |
|
184 |
# Apply attention mask and mean pooling
|
185 |
-
# This is better than just taking CLS token
|
186 |
mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
|
187 |
sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
|
188 |
sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
|
@@ -196,11 +258,9 @@ class LegacyFieldEncoder:
|
|
196 |
|
197 |
def encode_jobposting(self, job_posting: JobPosting) -> np.ndarray:
|
198 |
"""Legacy job posting encoding using Qwen2"""
|
199 |
-
# Convert skills lists to strings
|
200 |
primary_skills_str = ', '.join(skill.skill_name for skill in job_posting.primary_skills) if job_posting.primary_skills else 'None'
|
201 |
secondary_skills_str = ', '.join(skill.skill_name for skill in job_posting.secondary_skills) if job_posting.secondary_skills else 'None'
|
202 |
|
203 |
-
# Concatenate all fields into one string, maintaining legacy format
|
204 |
text = f"""
|
205 |
Title: {job_posting.title}
|
206 |
|
@@ -221,7 +281,6 @@ class LegacyFieldEncoder:
|
|
221 |
def encode_jobseeker(self, processed_jobseeker: IndependentJobSeekerAssessmentRDS,
|
222 |
unprocessed_jobseeker: JobseekerInfoRDS) -> np.ndarray:
|
223 |
"""Legacy job seeker encoding using Qwen2"""
|
224 |
-
# Create a single string with all relevant information
|
225 |
text = f"""
|
226 |
Summary: {unprocessed_jobseeker.summary}
|
227 |
|
@@ -242,6 +301,40 @@ class LegacyFieldEncoder:
|
|
242 |
"""Not used in legacy approach"""
|
243 |
pass
|
244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
|
246 |
"""Create both job posting and seeker encoders using the same base model"""
|
247 |
print(f"Creating encoders using {model_name}...")
|
@@ -252,20 +345,7 @@ def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
|
|
252 |
print(f"Created encoders with embedding dimension: {job_encoder.embedding_dim}")
|
253 |
return job_encoder, seeker_encoder
|
254 |
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
'role_description': ['experience', 'certifications'] # Role maps to both experience and certs
|
260 |
-
}
|
261 |
-
|
262 |
-
FIELD_WEIGHTS = {
|
263 |
-
'primary_skills_primary_skills': 0.5, # Increased - exact skill matches are critical
|
264 |
-
'secondary_skills_secondary_skills': 0.1, # Decreased - nice to have but less critical
|
265 |
-
'role_description_experience': 0.25,
|
266 |
-
'role_description_certifications': 0.05,
|
267 |
-
'title_summary': 0.1
|
268 |
-
}
|
269 |
-
|
270 |
-
__all__ = ['JobPostingEncoder', 'JobSeekerEncoder', 'LegacyFieldEncoder',
|
271 |
-
'create_encoders', 'FIELD_MAPPING', 'FIELD_WEIGHTS']
|
|
|
2 |
from transformers import AutoModel, AutoTokenizer
|
3 |
import torch
|
4 |
import numpy as np
|
5 |
+
from typing import Dict, List, Union, Optional, Tuple, Set
|
6 |
from abc import ABC, abstractmethod
|
7 |
from dataclasses import dataclass
|
8 |
+
from datetime import datetime, timedelta
|
9 |
+
from collections import defaultdict
|
10 |
import json
|
11 |
+
import re
|
12 |
+
|
13 |
+
# Risk and Bonus Level Constants
|
14 |
+
class RiskLevel:
    """Named integer constants for the risk levels used by scoring.

    NOTE(review): only ``NO_RISK`` is positive; the others are negative and
    grow in magnitude with the risk.  Presumably these are point adjustments
    added to a score -- confirm against the scoring code, which is not
    visible in this file section.
    """

    NO_RISK = 5
    LOW_RISK = -5
    MEDIUM_RISK = -10
    HIGH_RISK = -15
|
19 |
+
|
20 |
+
class BonusLevel:
    """Named integer constants for bonus tiers, ordered from 0 (none) to 3.

    NOTE(review): how a tier is converted into score points is decided by
    the caller; that code is not visible in this file section.
    """

    NO_BONUS = 0
    GOOD = 1
    BETTER = 2
    BEST = 3
|
25 |
|
26 |
@dataclass
|
27 |
class Skill:
|
|
|
54 |
reason_for_hire: str = ""
|
55 |
application_of_skills: str = ""
|
56 |
company_id: str = "test_company"
|
57 |
+
# New fields for scoring
|
58 |
+
industry: str = ""
|
59 |
+
company_size: int = 0
|
60 |
+
company_revenue: float = 0.0
|
61 |
+
growth_rate: float = 0.0
|
62 |
|
63 |
@dataclass
|
64 |
class IndependentJobSeekerAssessmentRDS:
|
|
|
73 |
class JobseekerInfoRDS:
|
74 |
summary: str
|
75 |
|
76 |
+
@dataclass
class MatchResult:
    """Stores the result of a job-seeker match with explanation"""

    # Overall score for the match.
    similarity_score: float
    # Individual scores keyed by field name.
    # NOTE(review): keys presumably mirror those of FIELD_WEIGHTS -- confirm.
    field_scores: Dict[str, float]
    # Human-readable explanation accompanying the score.
    explanation: str
    # Defaults to "unseen"; other valid values are not defined in this section.
    status: str = "unseen"
|
83 |
+
|
84 |
+
# Constants for skill evaluation
|
85 |
+
# Maps each soft-skill category to the phrases that signal it.
# Note that 'collaborated with' is listed under both 'communication' and
# 'teamwork', so a single phrase can count towards more than one category.
# NOTE(review): presumably matched against free-text experience
# descriptions; the matching code is not visible in this section.
SOFT_SKILLS_KEYWORDS = {
    'communication': [
        'effectively communicated',
        'presented to stakeholders',
        'negotiated',
        'collaborated with',
        'mediated',
    ],
    'teamwork': [
        'worked in a team',
        'collaborated with',
        'partnered with',
        'contributed to a team effort',
    ],
    'leadership': [
        'led a team',
        'mentored',
        'coached',
        'managed',
        'guided',
    ],
    'problem_solving': [
        'resolved',
        'addressed challenges',
        'innovated',
        'strategized',
        'implemented solutions',
    ],
    'adaptability': [
        'adapted to',
        'quickly learned',
        'flexible in',
        'handled change',
    ],
    'emotional_intelligence': [
        'empathized with',
        'understood needs',
        'fostered relationships',
        'built trust',
        'managed conflict',
    ],
}
|
93 |
+
|
94 |
+
# Verbs treated as leadership/ownership signals.
# Several ('managed', 'innovated', 'strategized') also appear inside
# SOFT_SKILLS_KEYWORDS phrases, so the two lists are not disjoint.
LEADERSHIP_KEYWORDS = [
    'led',
    'managed',
    'directed',
    'architected',
    'innovated',
    'spearheaded',
    'strategized',
    'developed',
    'executed',
    'owned',
    'delivered',
    'implemented',
]
|
99 |
+
|
100 |
+
# Regular expressions (raw strings, not pre-compiled) that recognise
# quantified impact statements, keyed by impact category.
# Capture groups: most patterns capture one number; 'satisfaction' captures
# two (from/to percentages); 'project_completion' captures none.
# The literals are all lower-case, so whether mixed-case text matches depends
# on the flags / normalisation the caller applies -- NOTE(review): confirm.
IMPACT_PATTERNS = {
    'revenue_growth': r'increased revenue by (\d+)%',
    'cost_savings': r'saved \$(\d+) million',
    'project_launch': r'launched .+ generated \$(\d+) million',
    'project_completion': r'completed .+ ahead of schedule',
    'budget_management': r'managed \$(\d+) million budget',
    'risk_mitigation': r'decreased .+ by (\d+)%',
    'client_retention': r'improved retention by (\d+)%',
    'satisfaction': r'satisfaction .+ (\d+)% to (\d+)%',
    'team_growth': r'grew team by (\d+)%',
}
|
111 |
+
|
112 |
+
# Maps a seniority bucket to the job-title keywords that indicate it.
# Note that 'Senior' / 'Sr.' titles fall in the 'mid' bucket, while the
# 'senior' bucket holds Principal / Lead / Supervisor / Manager titles.
# NOTE(review): keywords are capitalised; whether matching is
# case-sensitive is decided by the caller, which is not visible here.
EXPERIENCE_LEVELS = {
    'junior': ['Junior', 'Associate'],
    'mid': ['Staff', 'Senior', 'Sr.'],
    'senior': ['Principal', 'Lead', 'Supervisor', 'Manager'],
    'executive': [
        'Director',
        'VP',
        'CXO',
        'President',
        'Owner',
        'Founder',
        'Partner',
    ],
}
|
118 |
+
|
119 |
class BaseFieldEncoder(ABC):
|
120 |
"""Base class for field-specific encoding"""
|
121 |
def __init__(self, model_name: str = 'all-mpnet-base-v2'):
|
|
|
181 |
'primary_skills',
|
182 |
'secondary_skills',
|
183 |
'certifications',
|
184 |
+
'education'
|
185 |
}
|
186 |
|
187 |
def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
|
|
|
238 |
outputs = self.model(
|
239 |
input_ids=input_ids,
|
240 |
attention_mask=attention_mask,
|
241 |
+
output_hidden_states=True
|
242 |
)
|
243 |
|
244 |
# Get last hidden state
|
245 |
last_hidden_state = outputs.last_hidden_state
|
246 |
|
247 |
# Apply attention mask and mean pooling
|
|
|
248 |
mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
|
249 |
sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
|
250 |
sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
|
|
|
258 |
|
259 |
def encode_jobposting(self, job_posting: JobPosting) -> np.ndarray:
|
260 |
"""Legacy job posting encoding using Qwen2"""
|
|
|
261 |
primary_skills_str = ', '.join(skill.skill_name for skill in job_posting.primary_skills) if job_posting.primary_skills else 'None'
|
262 |
secondary_skills_str = ', '.join(skill.skill_name for skill in job_posting.secondary_skills) if job_posting.secondary_skills else 'None'
|
263 |
|
|
|
264 |
text = f"""
|
265 |
Title: {job_posting.title}
|
266 |
|
|
|
281 |
def encode_jobseeker(self, processed_jobseeker: IndependentJobSeekerAssessmentRDS,
|
282 |
unprocessed_jobseeker: JobseekerInfoRDS) -> np.ndarray:
|
283 |
"""Legacy job seeker encoding using Qwen2"""
|
|
|
284 |
text = f"""
|
285 |
Summary: {unprocessed_jobseeker.summary}
|
286 |
|
|
|
301 |
"""Not used in legacy approach"""
|
302 |
pass
|
303 |
|
304 |
+
# Field mappings and weights
|
305 |
+
# Maps a field name to the list of field names it is compared against.
# NOTE(review): keys look like job-posting fields and values like job-seeker
# fields; each (key, value) pair lines up with a '<key>_<value>' entry in
# FIELD_WEIGHTS (e.g. 'title' + 'summary' -> 'title_summary') -- confirm
# against the matching code.
FIELD_MAPPING = {
    'title': ['summary'],
    'primary_skills': ['primary_skills'],
    'secondary_skills': ['secondary_skills'],
    # The role description is compared with both experience and certifications.
    'role_description': ['experience', 'certifications'],
}
|
311 |
+
|
312 |
+
# Weights for every scoring component, in two groups on different scales:
#   * the first 17 entries sum to 100;
#   * the last 5 entries ("original embedding weights") sum to 1.0 and are
#     keyed '<field>_<field>'.
# Because the scales differ, the two groups should not be summed together
# without normalisation.
# NOTE(review): how each group is consumed is not visible in this section.
FIELD_WEIGHTS = {
    'job_stability': 12,
    'job_duration': 12,
    'responsibility_tenure': 1.5,
    'employment_pedigree': 7.5,
    'primary_skills_experience': 5,
    'career_experience': 2.5,
    'role_impact': 5,
    'management_scope': 1.5,
    'primary_skills_occurrence': 12,
    'primary_skills_frequency': 2.5,
    'primary_skills_recency': 15,
    'soft_skills': 1,
    'employment_recency': 7.5,
    'location_match': 1,
    'certifications': 2,
    'job_title_experience': 5,
    'job_title_match': 7,
    # Original embedding weights
    'primary_skills_primary_skills': 0.5,
    'secondary_skills_secondary_skills': 0.1,
    'role_description_experience': 0.25,
    'role_description_certifications': 0.05,
    'title_summary': 0.1,
}
|
337 |
+
|
338 |
def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
|
339 |
"""Create both job posting and seeker encoders using the same base model"""
|
340 |
print(f"Creating encoders using {model_name}...")
|
|
|
345 |
print(f"Created encoders with embedding dimension: {job_encoder.embedding_dim}")
|
346 |
return job_encoder, seeker_encoder
|
347 |
|
348 |
+
# Names exported by a star-import of this module: the encoders, the data
# classes they consume or produce, the factory function and the
# mapping/weight tables.
__all__ = [
    'JobPostingEncoder',
    'JobSeekerEncoder',
    'LegacyFieldEncoder',
    'JobPosting',
    'IndependentJobSeekerAssessmentRDS',
    'JobseekerInfoRDS',
    'MatchResult',
    'create_encoders',
    'Skill',
    'FIELD_MAPPING',
    'FIELD_WEIGHTS',
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|