|
import json
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional, Dict

import numpy as np
from termcolor import colored

from encoder import create_encoders, FIELD_MAPPING, LegacyFieldEncoder
from embeddings import EmbeddingManager, MatchResult, Skill
|
|
|
@dataclass
class Skill:
    """A single skill, identified only by its name."""

    # NOTE(review): this class rebinds the name `Skill` that the file also
    # imports from `embeddings`; confirm which definition callers expect.
    skill_name: str  # display name of the skill, e.g. "Python"
|
|
|
@dataclass
class JobPosting:
    """Job posting record used as input for the embedding comparison tests.

    The first five fields are required. Every other field has a placeholder
    default so a test only has to supply the text and skills it cares about.
    """

    # --- Required descriptive content ---
    title: str
    role_description: str
    company_description: str
    # Forward references: the annotation is not evaluated when the class is
    # created, so the definition does not depend on which `Skill` is bound.
    primary_skills: List["Skill"]
    secondary_skills: List["Skill"]

    # --- Placeholder metadata ---
    job_posting_id: str = "test_id"
    status: str = "active"
    location: str = "Test Location"
    workplace_model: str = "hybrid"
    job_engagement: str = "contract-to-hire"
    min_years_of_experience: int = 0
    max_years_of_experience: int = 0
    # A plain `= datetime.now()` default is evaluated once, when the class is
    # defined, so every instance would share that stale timestamp. The factory
    # takes a fresh timestamp for each new instance instead.
    project_duration_from: datetime = field(default_factory=datetime.now)
    project_duration_to: datetime = field(default_factory=datetime.now)
    hourly_bill_rate_min: float = 50.0
    hourly_bill_rate_max: float = 100.0
    annual_salary_min: float = 100000.0
    annual_salary_max: float = 150000.0
    day_to_day_job_responsibilities: str = ""
    reason_for_hire: str = ""
    application_of_skills: str = ""
    company_id: str = "test_company"
|
|
|
@dataclass
class IndependentJobSeekerAssessmentRDS:
    """Assessment record for one jobseeker: skill names plus background entries.

    All five fields are required and are accepted positionally in the order
    declared below.
    """

    # Skill names as plain strings.
    primary_skills: List[str]
    secondary_skills: List[str]

    # Background entries; each item is an untyped dict.
    experiences: List[dict]
    educations: List[dict]
    certifications: List[dict]
|
|
|
@dataclass
class JobseekerInfoRDS:
    """Free-text profile information for one jobseeker."""

    summary: str  # short self-description of the jobseeker
|
|
|
def transform_jobseeker_to_opensearch(jobseeker: IndependentJobSeekerAssessmentRDS, jobseeker_id: str) -> Dict:
    """Build the OpenSearch document for a jobseeker.

    Args:
        jobseeker: Assessment record whose skill and background fields are copied.
        jobseeker_id: Identifier stored under the ``"jobseeker_id"`` key.

    Returns:
        A dict holding ``jobseeker_id`` followed by the five assessment fields.
        The values are the same list objects held by ``jobseeker``, not copies.
    """
    copied_fields = (
        "primary_skills",
        "secondary_skills",
        "experiences",
        "educations",
        "certifications",
    )
    document = {"jobseeker_id": jobseeker_id}
    for attr in copied_fields:
        document[attr] = getattr(jobseeker, attr)
    return document
|
|
|
def create_test_data():
    """Build the job posting and the three jobseeker fixtures used by the tests.

    Prints a progress line, then constructs one job posting plus a strongly
    matching, a partially matching and a non-matching jobseeker, each paired
    with a summary record.

    Returns:
        A 7-tuple ``(job_posting, matching_seeker, matching_info,
        partial_matching_seeker, partial_matching_info,
        non_matching_seeker, non_matching_info)``.
    """
    print("\nCreating test data...")

    # ----- Job posting -----
    required_skill_names = ("Python", "AWS", "Kubernetes", "TensorFlow", "PyTorch")
    preferred_skill_names = ("Docker", "CI/CD", "Agile", "Team Leadership")

    job_posting = JobPosting(
        title="Senior Software Engineer - ML/Cloud",
        role_description=(
            "Leading backend development team in cloud infrastructure projects. "
            "Focus on machine learning systems and scalable architectures. "
            "Responsible for ML pipeline optimization and team mentorship."
        ),
        company_description="Tech company specializing in AI solutions",
        primary_skills=[Skill(name) for name in required_skill_names],
        secondary_skills=[Skill(name) for name in preferred_skill_names],
    )

    # ----- Strong match: same skills as the posting, senior ML background -----
    strong_experience = {
        "title": "Senior Software Engineer",
        "company": "AI Tech Corp",
        "duration": "4 years",
        "description": (
            "Led machine learning infrastructure team, developed scalable ML pipelines, "
            "optimized cloud resources, mentored junior engineers"
        ),
    }
    strong_education = {
        "degree": "Master's",
        "field": "Computer Science",
        "institution": "Tech University",
    }
    strong_certification = {
        "name": "AWS Solutions Architect Professional",
        "organization": "AWS",
        "start_date": "2023-01",
        "end_date": "2026-01",
    }
    matching_seeker = IndependentJobSeekerAssessmentRDS(
        primary_skills=list(required_skill_names),
        secondary_skills=list(preferred_skill_names),
        experiences=[strong_experience],
        educations=[strong_education],
        certifications=[strong_certification],
    )
    matching_info = JobseekerInfoRDS(
        summary="Senior ML engineer specialized in building scalable AI systems and leading engineering teams"
    )

    # ----- Partial match: some overlapping skills, more junior background -----
    partial_experience = {
        "title": "Data Analyst",
        "company": "Tech Solutions Inc",
        "duration": "2 years",
        "description": (
            "Worked on machine learning projects using TensorFlow. "
            "Maintained AWS infrastructure and helped with basic Kubernetes deployments. "
            "Member of an agile team developing ML-powered features."
        ),
    }
    partial_education = {
        "degree": "Bachelor's",
        "field": "Computer Science",
        "institution": "Tech University",
    }
    partial_certification = {
        "name": "AWS Cloud Practitioner",
        "organization": "AWS",
        "start_date": "2022-01",
        "end_date": "2025-01",
    }
    partial_matching_seeker = IndependentJobSeekerAssessmentRDS(
        primary_skills=["Python", "AWS", "Java", "TensorFlow"],
        secondary_skills=["Docker", "Git", "Scrum"],
        experiences=[partial_experience],
        educations=[partial_education],
        certifications=[partial_certification],
    )
    partial_matching_info = JobseekerInfoRDS(
        summary="Data analyst working on graphical analysis and budget forecasting."
    )

    # ----- Non-match: unrelated enterprise Java stack -----
    unrelated_experience = {
        "title": "Java Developer",
        "company": "Enterprise Corp",
        "duration": "5 years",
        "description": (
            "Built enterprise banking applications using Java stack, "
            "implemented transaction processing systems"
        ),
    }
    unrelated_education = {
        "degree": "Bachelor's",
        "field": "Information Systems",
        "institution": "Business School",
    }
    unrelated_certification = {
        "name": "Oracle Certified Professional",
        "organization": "Oracle",
        "start_date": "2022-01",
        "end_date": "2025-01",
    }
    non_matching_seeker = IndependentJobSeekerAssessmentRDS(
        primary_skills=["Java", "Spring", "Oracle"],
        secondary_skills=["Hibernate", "JSP", "Struts"],
        experiences=[unrelated_experience],
        educations=[unrelated_education],
        certifications=[unrelated_certification],
    )
    non_matching_info = JobseekerInfoRDS(
        summary="Experienced Java developer specialized in enterprise banking applications"
    )

    return (
        job_posting,
        matching_seeker,
        matching_info,
        partial_matching_seeker,
        partial_matching_info,
        non_matching_seeker,
        non_matching_info,
    )
|
|
|
def analyze_match_result(match_result: MatchResult, candidate_type: str = "matching"):
    """Print a per-field and overall report for one match result.

    Args:
        match_result: Object exposing ``field_scores`` (a mapping from
            field-pair name to score), ``similarity_score`` and ``explanation``.
        candidate_type: Label inserted into the report header.

    Returns:
        ``match_result.similarity_score``, unchanged.
    """

    def tier_color(value, high, mid):
        # green above `high`, yellow above `mid`, red otherwise
        if value > high:
            return "green"
        if value > mid:
            return "yellow"
        return "red"

    print(f"\nAnalyzing match results for {candidate_type} candidate:")
    print("=" * 60)

    print("\nField-by-Field Analysis:")
    print("-" * 40)

    # Fixed display order; pairs missing from the result are skipped.
    reported_pairs = (
        'title_summary',
        'primary_skills_primary_skills',
        'secondary_skills_secondary_skills',
        'role_description_experience',
        'role_description_certifications',
    )
    per_field = match_result.field_scores
    for pair_name in reported_pairs:
        if pair_name not in per_field:
            continue
        value = per_field[pair_name]
        shade = tier_color(value, 0.85, 0.7)
        print(f"{pair_name:35} | {colored(f'{value:.3f}', shade)}")

    overall = match_result.similarity_score

    print("\nOverall Match Analysis:")
    print("-" * 40)
    overall_shade = tier_color(overall, 0.8, 0.65)
    print(f"Match Score: {colored(f'{overall:.3f}', overall_shade)}")

    # The verdict uses the same thresholds as the overall score colour.
    print("\nMatch Interpretation:")
    if overall > 0.8:
        verdict, remark, heading = "Strong Match", "- Highly relevant candidate", "Key Strengths:"
    elif overall > 0.65:
        verdict, remark, heading = "Moderate Match", "- Potentially suitable candidate", "Analysis:"
    else:
        verdict, remark, heading = "Weak Match", "- May not be suitable", "Gaps:"
    print(colored(verdict, overall_shade), remark)
    print(heading)
    print(match_result.explanation)

    return overall
|
|
|
def run_model_comparison_tests(manager: EmbeddingManager):
    """Score the three fixture candidates against the job posting and report.

    Embeds the job posting once, embeds each jobseeker fixture, scores each
    pair through ``manager.calculate_similarity`` and prints a per-candidate
    report followed by differences, ratios and a discrimination score.

    Args:
        manager: Provides ``embed_jobposting``, ``embed_jobseeker`` and
            ``calculate_similarity``.
    """

    def banner(message):
        print("\n" + "="*80)
        print(message)
        print("="*80)

    def tier_color(value, high, mid):
        # green above `high`, yellow above `mid`, red otherwise
        if value > high:
            return "green"
        if value > mid:
            return "yellow"
        return "red"

    print("\nInitializing embedding manager...")

    (posting, strong_seeker, strong_info,
     partial_seeker, partial_info,
     weak_seeker, weak_info) = create_test_data()

    # --- Matching candidate ---
    banner("Testing with matching candidate (should show high similarity)")
    job_embeddings = manager.embed_jobposting(posting)
    strong_embeddings = manager.embed_jobseeker(strong_seeker, strong_info)
    matching_similarity = analyze_match_result(
        manager.calculate_similarity(job_embeddings, strong_embeddings),
        "matching",
    )

    # --- Partially matching candidate ---
    banner("Testing with partially matching candidate (should show moderate similarity)")
    partial_embeddings = manager.embed_jobseeker(partial_seeker, partial_info)
    partial_similarity = analyze_match_result(
        manager.calculate_similarity(job_embeddings, partial_embeddings),
        "partial matching",
    )

    # --- Non-matching candidate ---
    banner("Testing with non-matching candidate (should show low similarity)")
    weak_embeddings = manager.embed_jobseeker(weak_seeker, weak_info)
    non_matching_similarity = analyze_match_result(
        manager.calculate_similarity(job_embeddings, weak_embeddings),
        "non-matching",
    )

    print("\nComparative Analysis:")
    print("="*40)

    # Absolute gaps between the three scores.
    print("\nSimilarity Differences:")
    match_vs_partial = matching_similarity - partial_similarity
    match_vs_non = matching_similarity - non_matching_similarity
    partial_vs_non = partial_similarity - non_matching_similarity
    for caption, gap in (
        ("Matching vs Partial: ", match_vs_partial),
        ("Matching vs Non-Match: ", match_vs_non),
        ("Partial vs Non-Match: ", partial_vs_non),
    ):
        print(f"{caption}{colored(f'{gap:>8.3f}', 'blue')}")

    # Relative gaps; denominators are floored at 0.001 to avoid dividing by zero.
    print("\nDiscrimination Ratios:")
    match_partial_ratio = matching_similarity / max(partial_similarity, 0.001)
    match_non_ratio = matching_similarity / max(non_matching_similarity, 0.001)

    partial_ratio_color = tier_color(match_partial_ratio, 1.5, 1.2)
    print(f"Matching/Partial Ratio: {colored(f'{match_partial_ratio:>8.2f}x', partial_ratio_color)}")

    non_ratio_color = tier_color(match_non_ratio, 2.0, 1.5)
    print(f"Matching/Non-Match Ratio:{colored(f'{match_non_ratio:>8.2f}x', non_ratio_color)}")

    # Mean of the two gaps measured from the matching candidate.
    print("\nModel Quality Assessment:")
    print("-" * 40)
    discrimination_score = (match_vs_partial + match_vs_non) / 2
    discrimination_color = tier_color(discrimination_score, 0.3, 0.2)
    print(f"Discrimination Score: {colored(f'{discrimination_score:.3f}', discrimination_color)}")

    if discrimination_score > 0.3:
        verdict = "Model shows good discrimination between candidate types"
    elif discrimination_score > 0.2:
        verdict = "Model shows moderate discrimination - may need tuning"
    else:
        verdict = "Model shows poor discrimination - consider adjusting weights or thresholds"
    print(verdict)
|
|
|
def run_comparison_tests(job_encoder, seeker_encoder, legacy_encoder):
    """Compare the field-specific approach against the legacy approach.

    First wraps the two field encoders in an ``EmbeddingManager`` and runs
    ``run_model_comparison_tests`` on it. Then encodes the same fixtures with
    ``legacy_encoder`` and prints cosine-based similarities, discrimination
    figures and basic embedding statistics.

    Args:
        job_encoder: Job-side encoder passed to ``EmbeddingManager``.
        seeker_encoder: Jobseeker-side encoder passed to ``EmbeddingManager``.
        legacy_encoder: Provides ``encode_jobposting`` and ``encode_jobseeker``.
            Their results are used as numpy arrays (``.shape``, ``np.dot``).
    """

    def calc_legacy_sim(emb1, emb2):
        """Calculate cosine similarity between two embeddings."""
        # Each norm is computed once and reused for scaling and the debug line.
        norm1 = np.linalg.norm(emb1)
        norm2 = np.linalg.norm(emb2)

        # The small epsilon keeps an all-zero embedding from dividing by zero.
        sim = np.dot(emb1 / (norm1 + 1e-9), emb2 / (norm2 + 1e-9))

        print(f"DEBUG: Embedding norms: {norm1:.3f}, {norm2:.3f}")
        print(f"DEBUG: Raw similarity: {sim:.3f}")

        return sim

    print("\nRunning comparison tests between field-specific and legacy approaches...")

    (job_posting, matching_seeker, matching_info,
     partial_matching_seeker, partial_matching_info,
     non_matching_seeker, non_matching_info) = create_test_data()

    # ----- Field-specific approach -----
    print("\n" + "="*80)
    print("TESTING FIELD-SPECIFIC APPROACH")
    print("="*80)
    manager = EmbeddingManager(job_encoder, seeker_encoder)
    run_model_comparison_tests(manager)

    # ----- Legacy approach -----
    print("\n" + "="*80)
    print("TESTING LEGACY APPROACH")
    print("="*80)

    print("\nGenerating legacy embeddings...")
    job_emb = legacy_encoder.encode_jobposting(job_posting)
    match_emb = legacy_encoder.encode_jobseeker(matching_seeker, matching_info)
    partial_emb = legacy_encoder.encode_jobseeker(partial_matching_seeker, partial_matching_info)
    non_match_emb = legacy_encoder.encode_jobseeker(non_matching_seeker, non_matching_info)

    print("\nCalculating legacy similarities...")

    # (cos + 1) / 2 rescales cosine similarity from [-1, 1] to [0, 1].
    print("\nMatching candidate:")
    match_sim = (calc_legacy_sim(job_emb, match_emb) + 1) / 2
    print("\nPartial matching candidate:")
    partial_sim = (calc_legacy_sim(job_emb, partial_emb) + 1) / 2
    print("\nNon-matching candidate:")
    non_match_sim = (calc_legacy_sim(job_emb, non_match_emb) + 1) / 2

    print("\nLegacy Approach Results:")
    print(f"Job embedding shape: {job_emb.shape}")
    print(f"Matching embedding shape: {match_emb.shape}")
    print(f"Matching candidate similarity: {match_sim:.3f}")
    print(f"Partial matching similarity: {partial_sim:.3f}")
    print(f"Non-matching similarity: {non_match_sim:.3f}")

    print("\nLegacy Discrimination Analysis:")
    print(f"Match vs Partial diff: {(match_sim - partial_sim):.3f}")
    print(f"Match vs Non-match diff: {(match_sim - non_match_sim):.3f}")
    # Floor the denominator the same way run_model_comparison_tests does, so a
    # rescaled similarity of 0 (cosine of -1) cannot divide by zero.
    print(f"Match/Non-match ratio: {(match_sim / max(non_match_sim, 0.001)):.2f}x")

    print("\nEmbedding Statistics:")
    print(f"Job embedding mean/std: {np.mean(job_emb):.3f}/{np.std(job_emb):.3f}")
    print(f"Match embedding mean/std: {np.mean(match_emb):.3f}/{np.std(match_emb):.3f}")
    print(f"Partial embedding mean/std: {np.mean(partial_emb):.3f}/{np.std(partial_emb):.3f}")
    print(f"Non-match embedding mean/std: {np.mean(non_match_emb):.3f}/{np.std(non_match_emb):.3f}")
|
|
|
|
|
def main(model_name: str = 'all-mpnet-base-v2',
         legacy_model_path: str = "/Users/sebastian_a/jobposting-embedding"):
    """Create the encoders and run both comparison approaches.

    Both settings used to be hard-coded in the body. They are now parameters
    whose defaults are the original values, so calling ``main()`` with no
    arguments behaves as before, while other machines can pass their own
    values instead of editing the source.

    Args:
        model_name: Passed unchanged to ``create_encoders``.
        legacy_model_path: Passed unchanged to ``LegacyFieldEncoder``.
            NOTE(review): the default looks like a user-specific local
            directory; confirm what the encoder expects here.
    """
    print("Creating encoders...")

    # Field-specific encoders for the job side and the jobseeker side.
    field_encoder, seeker_encoder = create_encoders(model_name)

    # Legacy encoder used as the comparison baseline.
    legacy_encoder = LegacyFieldEncoder(legacy_model_path)

    run_comparison_tests(field_encoder, seeker_encoder, legacy_encoder)
|
|
|
# Run the comparison suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()