|
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Union, Optional

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
|
|
|
@dataclass
class Skill:
    """A single named skill attached to a job posting."""

    # Human-readable skill label; joined with ", " when a posting is rendered to text.
    skill_name: str
|
|
|
@dataclass
class JobPosting:
    """Job posting record consumed by the encoders.

    The first five fields are required. Every other field has a placeholder
    default so a posting can be built from just the text that gets embedded.
    """

    # --- Required: the text fields that are actually encoded ---
    title: str
    role_description: str
    company_description: str
    primary_skills: List[Skill]
    secondary_skills: List[Skill]

    # --- Optional metadata with placeholder defaults ---
    job_posting_id: str = "test_id"
    status: str = "active"
    location: str = "Test Location"
    workplace_model: str = "hybrid"
    job_engagement: str = "contract-to-hire"
    min_years_of_experience: int = 0
    max_years_of_experience: int = 0
    # default_factory evaluates datetime.now() for each new instance. A plain
    # `= datetime.now()` default is evaluated once at class-definition time,
    # so every posting would share the module-import timestamp.
    project_duration_from: datetime = field(default_factory=datetime.now)
    project_duration_to: datetime = field(default_factory=datetime.now)
    hourly_bill_rate_min: float = 50.0
    hourly_bill_rate_max: float = 100.0
    annual_salary_min: float = 100000.0
    annual_salary_max: float = 150000.0
    day_to_day_job_responsibilities: str = ""
    reason_for_hire: str = ""
    application_of_skills: str = ""
    company_id: str = "test_company"
|
|
|
@dataclass
class IndependentJobSeekerAssessmentRDS:
    """Structured job-seeker data: skill lists plus history records.

    All five fields are required and have no defaults.
    """

    # Skill names as plain strings.
    primary_skills: List[str]
    secondary_skills: List[str]

    # History records kept as free-form dicts; their schema is not defined here.
    experiences: List[dict]
    educations: List[dict]
    certifications: List[dict]
|
|
|
@dataclass
class JobseekerInfoRDS:
    """Unprocessed job-seeker information; currently only the summary text."""

    # Free-text summary, inserted verbatim into the legacy encoding prompt.
    summary: str
|
|
|
class BaseFieldEncoder(ABC):
    """Base class for field-specific encoding.

    Loads a SentenceTransformer model, moves it to the best available device
    (CUDA, then Apple MPS, then CPU) and provides ``encode_field`` for
    embedding one piece of text. Subclasses implement ``encode_fields``.
    """

    def __init__(self, model_name: str = 'all-mpnet-base-v2'):
        """Load ``model_name`` and place it on the selected device.

        Args:
            model_name: Name or path passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        self.embedding_dim = self.model.get_sentence_embedding_dimension()

        self.device = self._select_device()
        self.model.to(self.device)

    @staticmethod
    def _select_device() -> "torch.device":
        """Return the preferred torch device: CUDA, then MPS, then CPU."""
        if torch.cuda.is_available():
            return torch.device('cuda')
        # torch.backends.mps does not exist on older torch builds; treat a
        # missing attribute as "MPS unavailable" instead of raising.
        mps_backend = getattr(torch.backends, 'mps', None)
        if mps_backend is not None and mps_backend.is_available():
            return torch.device('mps')
        return torch.device('cpu')

    @abstractmethod
    def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
        """Encode each field separately.

        Args:
            fields: Mapping of field name to field text.

        Returns:
            Mapping of field name to its embedding vector.
        """

    def encode_field(self, text: str) -> np.ndarray:
        """Encode a single field.

        Args:
            text: Field text. Any falsy value (``""`` or ``None``) is treated
                as "no content".

        Returns:
            The model embedding for ``text``, or a zero vector of length
            ``embedding_dim`` when ``text`` is falsy.
        """
        if not text:
            # float32 matches the default output dtype of
            # SentenceTransformer.encode, so empty and non-empty fields can be
            # stacked or compared without an implicit upcast to float64.
            return np.zeros(self.embedding_dim, dtype=np.float32)
        return self.model.encode(text, convert_to_numpy=True)
|
|
|
class JobPostingEncoder(BaseFieldEncoder):
    """Encoder specifically for job postings.

    Embeds each required job-posting field on its own using the shared
    ``encode_field`` helper from the base class.
    """

    def __init__(self, model_name: str = 'all-mpnet-base-v2'):
        """Load the model and record which fields a posting must supply."""
        super().__init__(model_name)
        self.required_fields = {
            'title',
            'role_description',
            'company_description',
            'primary_skills',
            'secondary_skills'
        }

    def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
        """Encode all job posting fields.

        Args:
            fields: Mapping of field name to text. Keys outside
                ``required_fields`` are ignored.

        Returns:
            One embedding per required field, in the iteration order of
            ``fields``.

        Raises:
            ValueError: If any required field is absent from ``fields``.
        """
        absent = self.required_fields - set(fields)
        if absent:
            raise ValueError(f"Missing required fields: {absent}")

        # Only required fields are embedded; extra keys are skipped silently.
        return {
            name: self.encode_field(value)
            for name, value in fields.items()
            if name in self.required_fields
        }
|
|
|
class JobSeekerEncoder(BaseFieldEncoder):
    """Encoder specifically for job seekers.

    Embeds each required job-seeker field on its own using the shared
    ``encode_field`` helper from the base class.
    """

    def __init__(self, model_name: str = 'all-mpnet-base-v2'):
        """Load the model and record which fields a seeker profile must supply."""
        super().__init__(model_name)
        self.required_fields = {
            'summary',
            'experience',
            'primary_skills',
            'secondary_skills',
            'certifications',
            'education'
        }

    def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
        """Encode all job seeker fields.

        Args:
            fields: Mapping of field name to text. Keys outside
                ``required_fields`` are ignored.

        Returns:
            One embedding per required field, in the iteration order of
            ``fields``.

        Raises:
            ValueError: If any required field is absent from ``fields``.
        """
        absent = self.required_fields - set(fields)
        if absent:
            raise ValueError(f"Missing required fields: {absent}")

        # Only required fields are embedded; extra keys are skipped silently.
        return {
            name: self.encode_field(value)
            for name, value in fields.items()
            if name in self.required_fields
        }
|
|
|
class LegacyFieldEncoder:
    """Legacy encoder that uses the original Qwen2 approach.

    Renders a whole job posting or job seeker into one text prompt and embeds
    it with a Hugging Face ``AutoModel``: attention-masked mean pooling over
    the last hidden state, followed by L2 normalisation.
    """

    def __init__(self, model_path: str = "/Users/sebastian_a/jobposting-embedding"):
        """Load model and tokenizer from ``model_path`` and move the model to a device.

        Args:
            model_path: Location passed to ``from_pretrained``.
                NOTE(review): the default is a machine-specific absolute path;
                callers on any other machine must pass ``model_path``.
        """
        self.model = AutoModel.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        self.device = self._select_device()
        self.model.to(self.device)
        self.model.eval()

    @staticmethod
    def _select_device() -> "torch.device":
        """Return the preferred torch device: CUDA, then MPS, then CPU."""
        if torch.cuda.is_available():
            return torch.device('cuda')
        # torch.backends.mps does not exist on older torch builds; treat a
        # missing attribute as "MPS unavailable" instead of raising.
        mps_backend = getattr(torch.backends, 'mps', None)
        if mps_backend is not None and mps_backend.is_available():
            return torch.device('mps')
        return torch.device('cpu')

    def _get_embedding(self, text: str) -> np.ndarray:
        """Helper to get embeddings with proper handling.

        Args:
            text: Prompt to embed; truncated to 512 tokens by the tokenizer.

        Returns:
            L2-normalised, mask-weighted mean of the last hidden state as a
            1-D NumPy array.
        """
        with torch.no_grad():
            encoding = self.tokenizer(
                text,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=512,
                return_attention_mask=True
            )

            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            # Only last_hidden_state is consumed, so the per-layer hidden
            # states are not requested (avoids holding every layer in memory).
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            last_hidden_state = outputs.last_hidden_state

            # Mean pooling that ignores padded positions.
            mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
            # Clamp guards against division by zero when the mask is all zeros.
            sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
            # Drop only the batch axis (a single text is encoded per call).
            embedding = (sum_embeddings / sum_mask).squeeze(0)

            embedding = embedding.cpu().numpy()
            # Epsilon keeps an all-zero embedding from producing NaNs.
            embedding = embedding / (np.linalg.norm(embedding) + 1e-9)

            return embedding

    def encode_jobposting(self, job_posting: "JobPosting") -> np.ndarray:
        """Legacy job posting encoding using Qwen2.

        Args:
            job_posting: Posting whose title, descriptions and skills are
                rendered into the prompt. Empty skill lists render as 'None'.

        Returns:
            Normalised embedding of the rendered prompt.
        """
        primary_skills_str = ', '.join(skill.skill_name for skill in job_posting.primary_skills) if job_posting.primary_skills else 'None'
        secondary_skills_str = ', '.join(skill.skill_name for skill in job_posting.secondary_skills) if job_posting.secondary_skills else 'None'

        text = f"""
        Title: {job_posting.title}

        About the Role:
        {job_posting.role_description}

        Company Description:
        {job_posting.company_description}

        Primary Skills Required:
        {primary_skills_str}

        Secondary Skills Preferred:
        {secondary_skills_str}
        """
        return self._get_embedding(text)

    def encode_jobseeker(self, processed_jobseeker: "IndependentJobSeekerAssessmentRDS",
                         unprocessed_jobseeker: "JobseekerInfoRDS") -> np.ndarray:
        """Legacy job seeker encoding using Qwen2.

        Args:
            processed_jobseeker: Structured skills and history records.
            unprocessed_jobseeker: Source of the free-text summary.

        Returns:
            Normalised embedding of the rendered prompt.
        """
        # IndependentJobSeekerAssessmentRDS declares primary_skills and
        # secondary_skills but no `skills` attribute, so reading `.skills`
        # directly raises AttributeError. Honour `skills` when an object
        # provides it, otherwise combine the two declared lists.
        skills = getattr(processed_jobseeker, 'skills', None)
        if skills is None:
            skills = [
                *(processed_jobseeker.primary_skills or []),
                *(processed_jobseeker.secondary_skills or []),
            ]
        skills_str = ', '.join(skills)

        text = f"""
        Summary: {unprocessed_jobseeker.summary}

        Skills: {skills_str}

        Experience:
        {json.dumps(processed_jobseeker.experiences, indent=2)}

        Education:
        {json.dumps(processed_jobseeker.educations, indent=2)}

        Certifications:
        {json.dumps(processed_jobseeker.certifications, indent=2)}
        """
        return self._get_embedding(text)

    def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
        """Not used in legacy approach.

        Deliberately a no-op: it ignores ``fields`` and returns ``None``
        despite the declared return type.
        """
        return None
|
|
|
def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
    """Create both job posting and seeker encoders using the same base model.

    Args:
        model_name: Model name handed to both encoder constructors.

    Returns:
        ``(job_encoder, seeker_encoder)``. Each encoder is constructed
        separately with ``model_name``.
    """
    print(f"Creating encoders using {model_name}...")

    encoders = (JobPostingEncoder(model_name), JobSeekerEncoder(model_name))

    # The dimension is reported from the job posting encoder.
    print(f"Created encoders with embedding dimension: {encoders[0].embedding_dim}")
    return encoders
|
|
|
# Job-posting field -> list of job-seeker fields it is paired with.
# Keys are JobPostingEncoder field names, values are JobSeekerEncoder field
# names. Presumably each pair is compared by the matching code — confirm
# against the consumer of this table.
FIELD_MAPPING = {
    "title": ["summary"],
    "primary_skills": ["primary_skills"],
    "secondary_skills": ["secondary_skills"],
    "role_description": ["experience", "certifications"],
}
|
|
|
# Weight per "<job posting field>_<job seeker field>" pair.
# The five weights add up to 1.0.
FIELD_WEIGHTS = {
    "primary_skills_primary_skills": 0.5,
    "secondary_skills_secondary_skills": 0.1,
    "role_description_experience": 0.25,
    "role_description_certifications": 0.05,
    "title_summary": 0.1,
}
|
|
|
# Names exported by `from <module> import *`.
__all__ = [
    'JobPostingEncoder',
    'JobSeekerEncoder',
    'LegacyFieldEncoder',
    'create_encoders',
    'FIELD_MAPPING',
    'FIELD_WEIGHTS',
]