|
import logging
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Set

import numpy as np

from encoder import FIELD_MAPPING, FIELD_WEIGHTS
|
|
|
@dataclass
class Skill:
    """A single named skill referenced by a job posting."""

    # Display name of the skill; this is the text that gets joined into the
    # comma-separated skill strings built for encoding.
    skill_name: str
|
|
|
@dataclass
class JobPosting:
    """Job posting record whose text fields are embedded for matching.

    The first five fields are required. Every other field has a default;
    several of those defaults are placeholder values ("test_id",
    "Test Location", "test_company").

    The two ``project_duration_*`` defaults are produced per instance via
    ``default_factory``. A plain ``= datetime.now()`` default would be
    evaluated once, when the class is defined, and silently shared by every
    posting created afterwards.
    """

    # Required descriptive fields.
    title: str
    role_description: str
    company_description: str
    primary_skills: List["Skill"]
    secondary_skills: List["Skill"]

    # Identification and status.
    job_posting_id: str = "test_id"
    status: str = "active"

    # Location and engagement details.
    location: str = "Test Location"
    workplace_model: str = "hybrid"
    job_engagement: str = "contract-to-hire"

    # Experience range.
    min_years_of_experience: int = 0
    max_years_of_experience: int = 0

    # Project window; each defaults to the moment the instance is created.
    project_duration_from: datetime = field(default_factory=datetime.now)
    project_duration_to: datetime = field(default_factory=datetime.now)

    # Compensation ranges.
    hourly_bill_rate_min: float = 50.0
    hourly_bill_rate_max: float = 100.0
    annual_salary_min: float = 100000.0
    annual_salary_max: float = 150000.0

    # Free-text details.
    day_to_day_job_responsibilities: str = ""
    reason_for_hire: str = ""
    application_of_skills: str = ""

    company_id: str = "test_company"
|
|
|
@dataclass
class IndependentJobSeekerAssessmentRDS:
    """Processed job-seeker assessment data used to build seeker text fields."""

    # Skill names; joined with ', ' when building the seeker's text fields.
    primary_skills: List[str]
    secondary_skills: List[str]

    # Each entry is read through its 'experience_summaries' key.
    experiences: List[dict]

    # Each entry is read through its 'degree', 'field' and 'institution' keys.
    educations: List[dict]

    # Each entry is read through its 'name', 'organization', 'start_date'
    # and 'end_date' keys.
    certifications: List[dict]
|
|
|
@dataclass
class JobseekerInfoRDS:
    """Unprocessed job-seeker information."""

    # Free-text summary, passed through unchanged as the seeker's 'summary' field.
    summary: str
|
|
|
@dataclass
class MatchResult:
    """Stores the result of a job-seeker match with explanation"""

    # Overall weighted score for the match.
    similarity_score: float

    # Per-pair scores keyed by "<job_field>_<seeker_field>".
    field_scores: Dict[str, float]

    # Human-readable summary of the per-pair match qualities.
    explanation: str

    # Lifecycle marker for the match; new results start as "unseen".
    status: str = "unseen"
|
|
|
class EmbeddingManager:
    """Build per-field embeddings for job postings and job seekers, and score matches.

    Args:
        job_encoder: Encoder for job-posting fields; must provide
            ``encode_fields(fields)`` accepting a ``{field_name: text}`` dict.
        seeker_encoder: Encoder for job-seeker fields, with the same
            ``encode_fields(fields)`` method.
    """

    _logger = logging.getLogger(__name__)

    # Separator placed between formatted experience/education/certification entries.
    _ENTRY_SEPARATOR = ' | '

    # Cosine similarity is scaled by this factor before being mapped to [0, 1].
    _SIMILARITY_SCALE = 0.8

    # Scores above the boost threshold are multiplied by the boost factor
    # (capped at 1.0); scores below the penalty threshold are multiplied by
    # the penalty factor.
    _BOOST_THRESHOLD = 0.9
    _BOOST_FACTOR = 1.1
    _PENALTY_THRESHOLD = 0.7
    _PENALTY_FACTOR = 0.6

    # Per-pair multipliers applied on top of FIELD_WEIGHTS. All are currently
    # 1, so they do not alter the weighting.
    _CRITICAL_FIELD_MULTIPLIERS = {
        'primary_skills_primary_skills': 1,
        'role_description_experience': 1,
        'role_description_certifications': 1,
    }

    def __init__(self, job_encoder, seeker_encoder):
        self.job_encoder = job_encoder
        self.seeker_encoder = seeker_encoder

    @staticmethod
    def _join_skill_names(skills) -> str:
        """Join the ``skill_name`` of each skill with ', '; empty/None gives ''."""
        return ', '.join(skill.skill_name for skill in skills or [])

    def get_job_fields(self, job_posting: "JobPosting") -> Dict[str, str]:
        """Extract the text fields of a job posting that get encoded.

        Returns:
            Dict with keys 'title', 'role_description', 'company_description',
            'primary_skills' and 'secondary_skills'. The skill entries are
            comma-separated skill names ('' when the posting has none).
        """
        return {
            'title': job_posting.title,
            'role_description': job_posting.role_description,
            'company_description': job_posting.company_description,
            'primary_skills': self._join_skill_names(job_posting.primary_skills),
            'secondary_skills': self._join_skill_names(job_posting.secondary_skills),
        }

    def get_seeker_fields(self, processed_seeker: "IndependentJobSeekerAssessmentRDS",
                          unprocessed_seeker: "JobseekerInfoRDS") -> Dict[str, str]:
        """Extract the text fields of a job seeker that get encoded.

        Missing (``None``) skill lists are treated as empty, mirroring how
        job-posting skills are handled.

        Returns:
            Dict with keys 'primary_skills', 'secondary_skills', 'experience',
            'education', 'certifications' and 'summary'.
        """
        return {
            'primary_skills': ', '.join(processed_seeker.primary_skills or []),
            'secondary_skills': ', '.join(processed_seeker.secondary_skills or []),
            'experience': self._format_experience(processed_seeker.experiences),
            'education': self._format_education(processed_seeker.educations),
            'certifications': self._format_certifications(processed_seeker.certifications),
            'summary': unprocessed_seeker.summary,
        }

    def _format_experience(self, experiences: List[dict]) -> str:
        """Format experience entries into a single string.

        Each entry's 'experience_summaries' strings are joined with spaces;
        entries are joined with ' | '. Entries without any summary text are
        skipped so the result carries no dangling separators.
        """
        parts = []
        for exp in experiences or []:
            summaries = exp.get('experience_summaries') or []
            text = ' '.join(summaries)
            if text:
                parts.append(text)
        return self._ENTRY_SEPARATOR.join(parts)

    def _format_education(self, educations: List[dict]) -> str:
        """Format education entries into a single string.

        A complete entry renders as "<degree> in <field> from <institution>".
        Missing or empty components are left out together with their
        connecting word, and entries with no content are skipped.
        """
        parts = []
        for edu in educations or []:
            degree = edu.get('degree') or ''
            field_of_study = edu.get('field') or ''
            institution = edu.get('institution') or ''

            text = degree
            if field_of_study:
                text = f"{text} in {field_of_study}" if text else field_of_study
            if institution:
                text = f"{text} from {institution}" if text else institution
            if text:
                parts.append(text)
        return self._ENTRY_SEPARATOR.join(parts)

    def _format_certifications(self, certifications: List[dict]) -> str:
        """Format certification entries into a single string.

        A complete entry renders as "<name> from <organization> (<start> - <end>)".
        Missing components are omitted, dates are converted with ``str`` so
        non-string date values do not break the join, and entries with no
        content are skipped.
        """
        parts = []
        for cert in certifications or []:
            text = cert.get('name') or ''

            org = cert.get('organization') or ''
            if org:
                text = f"{text} from {org}" if text else f"from {org}"

            dates = [
                str(value)
                for value in (cert.get('start_date'), cert.get('end_date'))
                if value
            ]
            if dates:
                date_range = f"({' - '.join(dates)})"
                text = f"{text} {date_range}" if text else date_range

            if text:
                parts.append(text)
        return self._ENTRY_SEPARATOR.join(parts)

    def embed_jobposting(self, job_posting: "JobPosting") -> Dict[str, np.ndarray]:
        """Generate embeddings for job posting fields"""
        return self.job_encoder.encode_fields(self.get_job_fields(job_posting))

    def embed_jobseeker(self, processed_seeker: "IndependentJobSeekerAssessmentRDS",
                        unprocessed_seeker: "JobseekerInfoRDS") -> Dict[str, np.ndarray]:
        """Generate embeddings for job seeker fields"""
        fields = self.get_seeker_fields(processed_seeker, unprocessed_seeker)
        # Debug-level logging instead of print: the fields contain seeker
        # data that should not be written to stdout unconditionally.
        self._logger.debug("Seeker fields: %s", fields)
        return self.seeker_encoder.encode_fields(fields)

    @classmethod
    def _apply_thresholds(cls, score: float) -> float:
        """Boost scores above the boost threshold and damp those below the penalty threshold."""
        if score > cls._BOOST_THRESHOLD:
            return min(score * cls._BOOST_FACTOR, 1.0)
        if score < cls._PENALTY_THRESHOLD:
            return score * cls._PENALTY_FACTOR
        return score

    @classmethod
    def _score_pair(cls, job_emb: np.ndarray, seeker_emb: np.ndarray) -> float:
        """Return the thresholded score for one job/seeker embedding pair.

        The cosine similarity is scaled, mapped into [0, 1] and then passed
        through the boost/penalty thresholds.
        """
        # The small epsilon keeps the division defined for zero-norm vectors.
        similarity = np.dot(job_emb, seeker_emb) / (
            np.linalg.norm(job_emb) * np.linalg.norm(seeker_emb) + 1e-9
        )
        raw_score = similarity * cls._SIMILARITY_SCALE
        mapped = max(0.0, min(1.0, float((raw_score + 1) / 2)))
        # NOTE(review): with cosine similarity in [-1, 1] and a 0.8 scale,
        # `mapped` tops out at about 0.9, so the boost branch (and the
        # "strong" label) looks effectively unreachable barring rounding.
        # Confirm whether that is intended.
        return cls._apply_thresholds(mapped)

    @staticmethod
    def _match_quality(score: float) -> str:
        """Map a field score to its qualitative label."""
        if score > 0.9:
            return "strong"
        if score > 0.8:
            return "good"
        if score > 0.6:
            return "moderate"
        return "weak"

    def calculate_similarity(self, job_embeddings: Dict[str, np.ndarray],
                             seeker_embeddings: Dict[str, np.ndarray]) -> "MatchResult":
        """Calculate similarity with strict thresholds.

        Every (job field, seeker field) pair listed in ``FIELD_MAPPING`` for
        which both embeddings are present is scored. The per-pair scores are
        combined into a weighted average using ``FIELD_WEIGHTS`` (pairs
        without a weight contribute nothing), and the average is passed
        through the same boost/penalty thresholds as the individual scores.

        Args:
            job_embeddings: Embeddings keyed by job field name.
            seeker_embeddings: Embeddings keyed by seeker field name.

        Returns:
            MatchResult holding the final score, the per-pair scores keyed
            "<job_field>_<seeker_field>", and a ' | '-joined explanation.
        """
        field_scores: Dict[str, float] = {}
        explanation_parts: List[str] = []

        for job_field, seeker_fields in FIELD_MAPPING.items():
            if job_field not in job_embeddings:
                continue
            job_emb = job_embeddings[job_field]

            for seeker_field in seeker_fields:
                if seeker_field not in seeker_embeddings:
                    continue

                field_score = self._score_pair(job_emb, seeker_embeddings[seeker_field])
                field_scores[f"{job_field}_{seeker_field}"] = field_score

                match_quality = self._match_quality(field_score)
                explanation_parts.append(
                    f"{match_quality.capitalize()} match on {job_field} to {seeker_field} "
                    f"(similarity: {field_score:.2f})"
                )

        weighted_sum = 0.0
        total_weight = 0.0
        for field_pair, score in field_scores.items():
            weight = (
                FIELD_WEIGHTS.get(field_pair, 0.0)
                * self._CRITICAL_FIELD_MULTIPLIERS.get(field_pair, 1.0)
            )
            weighted_sum += score * weight
            total_weight += weight

        # Without any positive total weight the sum is left unnormalised.
        final_score = weighted_sum / total_weight if total_weight > 0 else weighted_sum
        final_score = self._apply_thresholds(final_score)

        self._logger.debug("All field scores: %s", field_scores)

        return MatchResult(
            similarity_score=final_score,
            field_scores=field_scores,
            explanation=" | ".join(explanation_parts),
        )
|
|
|
def initialize_embedding_system(job_encoder, seeker_encoder):
    """Initialize the embedding system.

    Args:
        job_encoder: Encoder used for job-posting fields.
        seeker_encoder: Encoder used for job-seeker fields.

    Returns:
        An ``EmbeddingManager`` wired to the two encoders.
    """
    return EmbeddingManager(job_encoder, seeker_encoder)
|
|
|
# Names exported by ``from <module> import *``.
__all__ = ['EmbeddingManager', 'MatchResult', 'initialize_embedding_system']