sebastianalgharaballi commited on
Commit
0516311
1 Parent(s): fb19dc4

new scoring

Browse files
Files changed (1) hide show
  1. encoder.py +105 -25
encoder.py CHANGED
@@ -2,11 +2,26 @@ from sentence_transformers import SentenceTransformer
2
  from transformers import AutoModel, AutoTokenizer
3
  import torch
4
  import numpy as np
5
- from typing import Dict, List, Union, Optional
6
  from abc import ABC, abstractmethod
7
  from dataclasses import dataclass
8
- from datetime import datetime
 
9
  import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  @dataclass
12
  class Skill:
@@ -39,6 +54,11 @@ class JobPosting:
39
  reason_for_hire: str = ""
40
  application_of_skills: str = ""
41
  company_id: str = "test_company"
 
 
 
 
 
42
 
43
  @dataclass
44
  class IndependentJobSeekerAssessmentRDS:
@@ -53,6 +73,49 @@ class IndependentJobSeekerAssessmentRDS:
53
  class JobseekerInfoRDS:
54
  summary: str
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  class BaseFieldEncoder(ABC):
57
  """Base class for field-specific encoding"""
58
  def __init__(self, model_name: str = 'all-mpnet-base-v2'):
@@ -118,7 +181,7 @@ class JobSeekerEncoder(BaseFieldEncoder):
118
  'primary_skills',
119
  'secondary_skills',
120
  'certifications',
121
- 'education' # Add this line
122
  }
123
 
124
  def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
@@ -175,14 +238,13 @@ class LegacyFieldEncoder:
175
  outputs = self.model(
176
  input_ids=input_ids,
177
  attention_mask=attention_mask,
178
- output_hidden_states=True # Get all hidden states
179
  )
180
 
181
  # Get last hidden state
182
  last_hidden_state = outputs.last_hidden_state
183
 
184
  # Apply attention mask and mean pooling
185
- # This is better than just taking CLS token
186
  mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
187
  sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
188
  sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
@@ -196,11 +258,9 @@ class LegacyFieldEncoder:
196
 
197
  def encode_jobposting(self, job_posting: JobPosting) -> np.ndarray:
198
  """Legacy job posting encoding using Qwen2"""
199
- # Convert skills lists to strings
200
  primary_skills_str = ', '.join(skill.skill_name for skill in job_posting.primary_skills) if job_posting.primary_skills else 'None'
201
  secondary_skills_str = ', '.join(skill.skill_name for skill in job_posting.secondary_skills) if job_posting.secondary_skills else 'None'
202
 
203
- # Concatenate all fields into one string, maintaining legacy format
204
  text = f"""
205
  Title: {job_posting.title}
206
 
@@ -221,7 +281,6 @@ class LegacyFieldEncoder:
221
  def encode_jobseeker(self, processed_jobseeker: IndependentJobSeekerAssessmentRDS,
222
  unprocessed_jobseeker: JobseekerInfoRDS) -> np.ndarray:
223
  """Legacy job seeker encoding using Qwen2"""
224
- # Create a single string with all relevant information
225
  text = f"""
226
  Summary: {unprocessed_jobseeker.summary}
227
 
@@ -242,6 +301,40 @@ class LegacyFieldEncoder:
242
  """Not used in legacy approach"""
243
  pass
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
246
  """Create both job posting and seeker encoders using the same base model"""
247
  print(f"Creating encoders using {model_name}...")
@@ -252,20 +345,7 @@ def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
252
  print(f"Created encoders with embedding dimension: {job_encoder.embedding_dim}")
253
  return job_encoder, seeker_encoder
254
 
255
- FIELD_MAPPING = {
256
- 'title': ['summary'], # Job title maps to seeker summary
257
- 'primary_skills': ['primary_skills'], # Primary skills to primary skills
258
- 'secondary_skills': ['secondary_skills'], # Secondary skills to secondary skills
259
- 'role_description': ['experience', 'certifications'] # Role maps to both experience and certs
260
- }
261
-
262
- FIELD_WEIGHTS = {
263
- 'primary_skills_primary_skills': 0.5, # Increased - exact skill matches are critical
264
- 'secondary_skills_secondary_skills': 0.1, # Decreased - nice to have but less critical
265
- 'role_description_experience': 0.25,
266
- 'role_description_certifications': 0.05,
267
- 'title_summary': 0.1
268
- }
269
-
270
- __all__ = ['JobPostingEncoder', 'JobSeekerEncoder', 'LegacyFieldEncoder',
271
- 'create_encoders', 'FIELD_MAPPING', 'FIELD_WEIGHTS']
 
2
  from transformers import AutoModel, AutoTokenizer
3
  import torch
4
  import numpy as np
5
+ from typing import Dict, List, Union, Optional, Tuple, Set
6
  from abc import ABC, abstractmethod
7
  from dataclasses import dataclass
8
+ from datetime import datetime, timedelta
9
+ from collections import defaultdict
10
  import json
11
+ import re
12
+
13
+ # Risk and Bonus Level Constants
14
+ class RiskLevel:
15
+ NO_RISK = 5
16
+ LOW_RISK = -5
17
+ MEDIUM_RISK = -10
18
+ HIGH_RISK = -15
19
+
20
+ class BonusLevel:
21
+ NO_BONUS = 0
22
+ GOOD = 1
23
+ BETTER = 2
24
+ BEST = 3
25
 
26
  @dataclass
27
  class Skill:
 
54
  reason_for_hire: str = ""
55
  application_of_skills: str = ""
56
  company_id: str = "test_company"
57
+ # New fields for scoring
58
+ industry: str = ""
59
+ company_size: int = 0
60
+ company_revenue: float = 0.0
61
+ growth_rate: float = 0.0
62
 
63
  @dataclass
64
  class IndependentJobSeekerAssessmentRDS:
 
73
  class JobseekerInfoRDS:
74
  summary: str
75
 
76
+ @dataclass
77
+ class MatchResult:
78
+ """Stores the result of a job-seeker match with explanation"""
79
+ similarity_score: float
80
+ field_scores: Dict[str, float]
81
+ explanation: str
82
+ status: str = "unseen"
83
+
84
+ # Constants for skill evaluation
85
+ SOFT_SKILLS_KEYWORDS = {
86
+ 'communication': ['effectively communicated', 'presented to stakeholders', 'negotiated', 'collaborated with', 'mediated'],
87
+ 'teamwork': ['worked in a team', 'collaborated with', 'partnered with', 'contributed to a team effort'],
88
+ 'leadership': ['led a team', 'mentored', 'coached', 'managed', 'guided'],
89
+ 'problem_solving': ['resolved', 'addressed challenges', 'innovated', 'strategized', 'implemented solutions'],
90
+ 'adaptability': ['adapted to', 'quickly learned', 'flexible in', 'handled change'],
91
+ 'emotional_intelligence': ['empathized with', 'understood needs', 'fostered relationships', 'built trust', 'managed conflict']
92
+ }
93
+
94
+ LEADERSHIP_KEYWORDS = [
95
+ 'led', 'managed', 'directed', 'architected', 'innovated',
96
+ 'spearheaded', 'strategized', 'developed', 'executed',
97
+ 'owned', 'delivered', 'implemented'
98
+ ]
99
+
100
+ IMPACT_PATTERNS = {
101
+ 'revenue_growth': r'increased revenue by (\d+)%',
102
+ 'cost_savings': r'saved \$(\d+) million',
103
+ 'project_launch': r'launched .+ generated \$(\d+) million',
104
+ 'project_completion': r'completed .+ ahead of schedule',
105
+ 'budget_management': r'managed \$(\d+) million budget',
106
+ 'risk_mitigation': r'decreased .+ by (\d+)%',
107
+ 'client_retention': r'improved retention by (\d+)%',
108
+ 'satisfaction': r'satisfaction .+ (\d+)% to (\d+)%',
109
+ 'team_growth': r'grew team by (\d+)%'
110
+ }
111
+
112
+ EXPERIENCE_LEVELS = {
113
+ 'junior': ['Junior', 'Associate'],
114
+ 'mid': ['Staff', 'Senior', 'Sr.'],
115
+ 'senior': ['Principal', 'Lead', 'Supervisor', 'Manager'],
116
+ 'executive': ['Director', 'VP', 'CXO', 'President', 'Owner', 'Founder', 'Partner']
117
+ }
118
+
119
  class BaseFieldEncoder(ABC):
120
  """Base class for field-specific encoding"""
121
  def __init__(self, model_name: str = 'all-mpnet-base-v2'):
 
181
  'primary_skills',
182
  'secondary_skills',
183
  'certifications',
184
+ 'education'
185
  }
186
 
187
  def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
 
238
  outputs = self.model(
239
  input_ids=input_ids,
240
  attention_mask=attention_mask,
241
+ output_hidden_states=True
242
  )
243
 
244
  # Get last hidden state
245
  last_hidden_state = outputs.last_hidden_state
246
 
247
  # Apply attention mask and mean pooling
 
248
  mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
249
  sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
250
  sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
 
258
 
259
  def encode_jobposting(self, job_posting: JobPosting) -> np.ndarray:
260
  """Legacy job posting encoding using Qwen2"""
 
261
  primary_skills_str = ', '.join(skill.skill_name for skill in job_posting.primary_skills) if job_posting.primary_skills else 'None'
262
  secondary_skills_str = ', '.join(skill.skill_name for skill in job_posting.secondary_skills) if job_posting.secondary_skills else 'None'
263
 
 
264
  text = f"""
265
  Title: {job_posting.title}
266
 
 
281
  def encode_jobseeker(self, processed_jobseeker: IndependentJobSeekerAssessmentRDS,
282
  unprocessed_jobseeker: JobseekerInfoRDS) -> np.ndarray:
283
  """Legacy job seeker encoding using Qwen2"""
 
284
  text = f"""
285
  Summary: {unprocessed_jobseeker.summary}
286
 
 
301
  """Not used in legacy approach"""
302
  pass
303
 
304
+ # Field mappings and weights
305
+ FIELD_MAPPING = {
306
+ 'title': ['summary'],
307
+ 'primary_skills': ['primary_skills'],
308
+ 'secondary_skills': ['secondary_skills'],
309
+ 'role_description': ['experience', 'certifications']
310
+ }
311
+
312
+ FIELD_WEIGHTS = {
313
+ 'job_stability': 12,
314
+ 'job_duration': 12,
315
+ 'responsibility_tenure': 1.5,
316
+ 'employment_pedigree': 7.5,
317
+ 'primary_skills_experience': 5,
318
+ 'career_experience': 2.5,
319
+ 'role_impact': 5,
320
+ 'management_scope': 1.5,
321
+ 'primary_skills_occurrence': 12,
322
+ 'primary_skills_frequency': 2.5,
323
+ 'primary_skills_recency': 15,
324
+ 'soft_skills': 1,
325
+ 'employment_recency': 7.5,
326
+ 'location_match': 1,
327
+ 'certifications': 2,
328
+ 'job_title_experience': 5,
329
+ 'job_title_match': 7,
330
+ # Original embedding weights
331
+ 'primary_skills_primary_skills': 0.5,
332
+ 'secondary_skills_secondary_skills': 0.1,
333
+ 'role_description_experience': 0.25,
334
+ 'role_description_certifications': 0.05,
335
+ 'title_summary': 0.1
336
+ }
337
+
338
  def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
339
  """Create both job posting and seeker encoders using the same base model"""
340
  print(f"Creating encoders using {model_name}...")
 
345
  print(f"Created encoders with embedding dimension: {job_encoder.embedding_dim}")
346
  return job_encoder, seeker_encoder
347
 
348
+ __all__ = ['JobPostingEncoder', 'JobSeekerEncoder', 'LegacyFieldEncoder',
349
+ 'JobPosting', 'IndependentJobSeekerAssessmentRDS', 'JobseekerInfoRDS',
350
+ 'MatchResult', 'create_encoders', 'Skill',
351
+ 'FIELD_MAPPING', 'FIELD_WEIGHTS']