from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from typing import Dict, List, Union, Optional, Tuple, Set
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from collections import defaultdict
import json
import re


class RiskLevel:
    """Score adjustments for risk signals; NO_RISK grants a small bonus, the rest are penalties."""
    NO_RISK = 5
    LOW_RISK = -5
    MEDIUM_RISK = -10
    HIGH_RISK = -15


class BonusLevel:
    """Score bonus tiers, from no bonus up to the strongest positive signal."""
    NO_BONUS = 0
    GOOD = 1
    BETTER = 2
    BEST = 3


@dataclass
class Skill:
    skill_name: str


@dataclass
class JobPosting:
    """A job posting: required fields first, then defaulted metadata."""
    title: str
    role_description: str
    company_description: str
    primary_skills: List[Skill]
    secondary_skills: List[Skill]

    job_posting_id: str = "test_id"
    status: str = "active"
    location: str = "Test Location"
    workplace_model: str = "hybrid"
    job_engagement: str = "contract-to-hire"
    min_years_of_experience: int = 0
    max_years_of_experience: int = 0
    # A bare datetime.now() default would be evaluated once at import time;
    # default_factory evaluates it per instance instead.
    project_duration_from: datetime = field(default_factory=datetime.now)
    project_duration_to: datetime = field(default_factory=datetime.now)
    hourly_bill_rate_min: float = 50.0
    hourly_bill_rate_max: float = 100.0
    annual_salary_min: float = 100000.0
    annual_salary_max: float = 150000.0
    day_to_day_job_responsibilities: str = ""
    reason_for_hire: str = ""
    application_of_skills: str = ""
    company_id: str = "test_company"

    industry: str = ""
    company_size: int = 0
    company_revenue: float = 0.0
    growth_rate: float = 0.0


@dataclass
class IndependentJobSeekerAssessmentRDS:
    """Processed job-seeker profile fields."""
    primary_skills: List[str]
    secondary_skills: List[str]
    experiences: List[dict]
    educations: List[dict]
    certifications: List[dict]


@dataclass
class JobseekerInfoRDS:
    """Unprocessed job-seeker record; currently just a free-text summary."""
    summary: str


@dataclass
class MatchResult:
    """Stores the result of a job-seeker match with explanation."""
    similarity_score: float
    field_scores: Dict[str, float]
    explanation: str
    status: str = "unseen"


SOFT_SKILLS_KEYWORDS = {
    'communication': ['effectively communicated', 'presented to stakeholders', 'negotiated', 'collaborated with', 'mediated'],
    'teamwork': ['worked in a team', 'collaborated with', 'partnered with', 'contributed to a team effort'],
    'leadership': ['led a team', 'mentored', 'coached', 'managed', 'guided'],
    'problem_solving': ['resolved', 'addressed challenges', 'innovated', 'strategized', 'implemented solutions'],
    'adaptability': ['adapted to', 'quickly learned', 'flexible in', 'handled change'],
    'emotional_intelligence': ['empathized with', 'understood needs', 'fostered relationships', 'built trust', 'managed conflict']
}
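

# A minimal sketch of how SOFT_SKILLS_KEYWORDS might be applied: count keyword
# hits per category in free-form experience text. The helper name and the
# case-insensitive substring matching are assumptions, not part of the
# original pipeline.
def count_soft_skill_mentions(text: str) -> Dict[str, int]:
    lowered = text.lower()
    return {
        category: sum(1 for phrase in phrases if phrase in lowered)
        for category, phrases in SOFT_SKILLS_KEYWORDS.items()
    }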


LEADERSHIP_KEYWORDS = [
    'led', 'managed', 'directed', 'architected', 'innovated',
    'spearheaded', 'strategized', 'developed', 'executed',
    'owned', 'delivered', 'implemented'
]


IMPACT_PATTERNS = {
    'revenue_growth': r'increased revenue by (\d+)%',
    'cost_savings': r'saved \$(\d+) million',
    'project_launch': r'launched .+ generated \$(\d+) million',
    'project_completion': r'completed .+ ahead of schedule',
    'budget_management': r'managed \$(\d+) million budget',
    'risk_mitigation': r'decreased .+ by (\d+)%',
    'client_retention': r'improved retention by (\d+)%',
    'satisfaction': r'satisfaction .+ (\d+)% to (\d+)%',
    'team_growth': r'grew team by (\d+)%'
}
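

# A minimal sketch of scanning IMPACT_PATTERNS against resume text with the
# `re` module imported above. The function name and the case-insensitive
# search are illustrative assumptions.
def find_impact_statements(text: str) -> List[str]:
    """Return the impact categories whose pattern matches the text."""
    return [
        impact_type
        for impact_type, pattern in IMPACT_PATTERNS.items()
        if re.search(pattern, text, flags=re.IGNORECASE)
    ]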


EXPERIENCE_LEVELS = {
    'junior': ['Junior', 'Associate'],
    'mid': ['Staff', 'Senior', 'Sr.'],
    'senior': ['Principal', 'Lead', 'Supervisor', 'Manager'],
    'executive': ['Director', 'VP', 'CXO', 'President', 'Owner', 'Founder', 'Partner']
}
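

# A minimal sketch of mapping a job title onto EXPERIENCE_LEVELS by marker
# match. The function name and the 'junior' fallback for unrecognized titles
# are assumptions for illustration.
def classify_experience_level(title: str) -> str:
    lowered = title.lower()
    for level, markers in EXPERIENCE_LEVELS.items():
        if any(marker.lower() in lowered for marker in markers):
            return level
    return 'junior'  # assumed default when no marker is found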


class BaseFieldEncoder(ABC):
    """Base class for field-specific encoding"""

    def __init__(self, model_name: str = 'all-mpnet-base-v2'):
        self.model = SentenceTransformer(model_name)
        self.embedding_dim = self.model.get_sentence_embedding_dimension()

        # Prefer CUDA, then Apple Silicon (MPS), then CPU.
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        elif torch.backends.mps.is_available():
            self.device = torch.device('mps')
        else:
            self.device = torch.device('cpu')

        self.model.to(self.device)

    @abstractmethod
    def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
        """Encode each field separately"""
        pass

    def encode_field(self, text: str) -> np.ndarray:
        """Encode a single field, returning a zero vector for empty text"""
        if not text:
            return np.zeros(self.embedding_dim)
        return self.model.encode(text, convert_to_numpy=True)


class JobPostingEncoder(BaseFieldEncoder):
    """Encoder specifically for job postings"""

    def __init__(self, model_name: str = 'all-mpnet-base-v2'):
        super().__init__(model_name)
        self.required_fields = {
            'title',
            'role_description',
            'company_description',
            'primary_skills',
            'secondary_skills'
        }

    def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
        """Encode all job posting fields"""
        missing_fields = self.required_fields - set(fields.keys())
        if missing_fields:
            raise ValueError(f"Missing required fields: {missing_fields}")

        field_embeddings = {}
        # Only encode the known fields; extra keys are ignored.
        for field_name, content in fields.items():
            if field_name in self.required_fields:
                field_embeddings[field_name] = self.encode_field(content)

        return field_embeddings


class JobSeekerEncoder(BaseFieldEncoder):
    """Encoder specifically for job seekers"""

    def __init__(self, model_name: str = 'all-mpnet-base-v2'):
        super().__init__(model_name)
        self.required_fields = {
            'summary',
            'experience',
            'primary_skills',
            'secondary_skills',
            'certifications',
            'education'
        }

    def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
        """Encode all job seeker fields"""
        missing_fields = self.required_fields - set(fields.keys())
        if missing_fields:
            raise ValueError(f"Missing required fields: {missing_fields}")

        field_embeddings = {}
        # Only encode the known fields; extra keys are ignored.
        for field_name, content in fields.items():
            if field_name in self.required_fields:
                field_embeddings[field_name] = self.encode_field(content)

        return field_embeddings


class LegacyFieldEncoder:
    """Legacy encoder that uses the original Qwen2 approach"""

    def __init__(self, model_path: str = "/Users/sebastian_a/jobposting-embedding"):
        self.model = AutoModel.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Prefer CUDA, then Apple Silicon (MPS), then CPU.
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        elif torch.backends.mps.is_available():
            self.device = torch.device('mps')
        else:
            self.device = torch.device('cpu')

        self.model.to(self.device)
        self.model.eval()

    def _get_embedding(self, text: str) -> np.ndarray:
        """Helper to get embeddings with masked mean pooling and L2 normalization"""
        with torch.no_grad():
            encoding = self.tokenizer(
                text,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=512,
                return_attention_mask=True
            )

            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True
            )

            last_hidden_state = outputs.last_hidden_state

            # Mean-pool token embeddings, masking out padding positions.
            mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
            sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
            embedding = (sum_embeddings / sum_mask).squeeze()

            # L2-normalize so downstream cosine similarity reduces to a dot product.
            embedding = embedding.cpu().numpy()
            embedding = embedding / (np.linalg.norm(embedding) + 1e-9)

        return embedding

    def encode_jobposting(self, job_posting: JobPosting) -> np.ndarray:
        """Legacy job posting encoding using Qwen2"""
        primary_skills_str = ', '.join(skill.skill_name for skill in job_posting.primary_skills) if job_posting.primary_skills else 'None'
        secondary_skills_str = ', '.join(skill.skill_name for skill in job_posting.secondary_skills) if job_posting.secondary_skills else 'None'

        text = f"""
        Title: {job_posting.title}

        About the Role:
        {job_posting.role_description}

        Company Description:
        {job_posting.company_description}

        Primary Skills Required:
        {primary_skills_str}

        Secondary Skills Preferred:
        {secondary_skills_str}
        """
        return self._get_embedding(text)

    def encode_jobseeker(self, processed_jobseeker: IndependentJobSeekerAssessmentRDS,
                         unprocessed_jobseeker: JobseekerInfoRDS) -> np.ndarray:
        """Legacy job seeker encoding using Qwen2"""
        # IndependentJobSeekerAssessmentRDS has no `skills` attribute, so the
        # primary and secondary skill lists are combined here.
        skills_str = ', '.join(processed_jobseeker.primary_skills + processed_jobseeker.secondary_skills)
        text = f"""
        Summary: {unprocessed_jobseeker.summary}

        Skills: {skills_str}

        Experience:
        {json.dumps(processed_jobseeker.experiences, indent=2)}

        Education:
        {json.dumps(processed_jobseeker.educations, indent=2)}

        Certifications:
        {json.dumps(processed_jobseeker.certifications, indent=2)}
        """
        return self._get_embedding(text)

    def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
        """Not used in the legacy approach"""
        raise NotImplementedError("LegacyFieldEncoder encodes whole documents, not individual fields")


# Maps each job posting field to the job seeker fields it is compared against.
FIELD_MAPPING = {
    'title': ['summary'],
    'primary_skills': ['primary_skills'],
    'secondary_skills': ['secondary_skills'],
    'role_description': ['experience', 'certifications']
}

FIELD_WEIGHTS = {
    'job_stability': 12,
    'job_duration': 12,
    'responsibility_tenure': 1.5,
    'employment_pedigree': 7.5,
    'primary_skills_experience': 5,
    'career_experience': 2.5,
    'role_impact': 5,
    'management_scope': 1.5,
    'primary_skills_occurrence': 12,
    'primary_skills_frequency': 2.5,
    'primary_skills_recency': 15,
    'soft_skills': 1,
    'employment_recency': 7.5,
    'location_match': 1,
    'certifications': 2,
    'job_title_experience': 5,
    'job_title_match': 7,

    # Embedding-similarity weights, keyed as '<posting_field>_<seeker_field>'
    # following FIELD_MAPPING above.
    'primary_skills_primary_skills': 0.5,
    'secondary_skills_secondary_skills': 0.1,
    'role_description_experience': 0.25,
    'role_description_certifications': 0.05,
    'title_summary': 0.1
}
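

# A minimal sketch of how FIELD_MAPPING and FIELD_WEIGHTS might combine
# per-field embeddings into one weighted similarity score. The helper name,
# the cosine-similarity choice, and normalizing by total weight are
# assumptions; only the '<posting_field>_<seeker_field>' weight keys come
# from the definitions above.
def weighted_field_similarity(job_embeddings: Dict[str, np.ndarray],
                              seeker_embeddings: Dict[str, np.ndarray]) -> float:
    total_score = 0.0
    total_weight = 0.0
    for job_field, seeker_fields in FIELD_MAPPING.items():
        for seeker_field in seeker_fields:
            weight = FIELD_WEIGHTS.get(f"{job_field}_{seeker_field}", 0.0)
            a, b = job_embeddings[job_field], seeker_embeddings[seeker_field]
            denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-9
            total_score += weight * float(np.dot(a, b) / denom)
            total_weight += weight
    return total_score / total_weight if total_weight else 0.0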


def create_encoders(model_name: str = 'all-mpnet-base-v2') -> Tuple[JobPostingEncoder, JobSeekerEncoder]:
    """Create both job posting and seeker encoders using the same base model"""
    print(f"Creating encoders using {model_name}...")

    job_encoder = JobPostingEncoder(model_name)
    seeker_encoder = JobSeekerEncoder(model_name)

    print(f"Created encoders with embedding dimension: {job_encoder.embedding_dim}")
    return job_encoder, seeker_encoder


__all__ = ['JobPostingEncoder', 'JobSeekerEncoder', 'LegacyFieldEncoder',
           'JobPosting', 'IndependentJobSeekerAssessmentRDS', 'JobseekerInfoRDS',
           'MatchResult', 'create_encoders', 'Skill',
           'FIELD_MAPPING', 'FIELD_WEIGHTS']
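

# A minimal usage sketch, assuming the sentence-transformers model can be
# downloaded or is cached locally. The sample posting values are illustrative
# only.
if __name__ == "__main__":
    job_encoder, seeker_encoder = create_encoders()
    posting_fields = {
        'title': 'Senior Backend Engineer',
        'role_description': 'Design and operate distributed services.',
        'company_description': 'A mid-size SaaS company.',
        'primary_skills': 'Python, PostgreSQL, Kubernetes',
        'secondary_skills': 'Terraform, Go',
    }
    embeddings = job_encoder.encode_fields(posting_fields)
    print({name: vec.shape for name, vec in embeddings.items()})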