stealth-talent
/

embeddings-sebastian

Model card Files Files and versions Community

sebastianalgharaballi committed 26 days ago

Commit

0516311

•

1 Parent(s): fb19dc4

new scoring

Browse files

Files changed (1) hide show

encoder.py +105 -25

encoder.py CHANGED Viewed

@@ -2,11 +2,26 @@ from sentence_transformers import SentenceTransformer
 from transformers import AutoModel, AutoTokenizer
 import torch
 import numpy as np
-from typing import Dict, List, Union, Optional
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from datetime import datetime
 import json
 @dataclass
 class Skill:
@@ -39,6 +54,11 @@ class JobPosting:
     reason_for_hire: str = ""
     application_of_skills: str = ""
     company_id: str = "test_company"
 @dataclass
 class IndependentJobSeekerAssessmentRDS:
@@ -53,6 +73,49 @@ class IndependentJobSeekerAssessmentRDS:
 class JobseekerInfoRDS:
     summary: str
 class BaseFieldEncoder(ABC):
     """Base class for field-specific encoding"""
     def __init__(self, model_name: str = 'all-mpnet-base-v2'):
@@ -118,7 +181,7 @@ class JobSeekerEncoder(BaseFieldEncoder):
             'primary_skills',
             'secondary_skills',
             'certifications',
-            'education'  # Add this line
         }
     def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
@@ -175,14 +238,13 @@ class LegacyFieldEncoder:
             outputs = self.model(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
-                output_hidden_states=True  # Get all hidden states
             )
             # Get last hidden state
             last_hidden_state = outputs.last_hidden_state
             # Apply attention mask and mean pooling
-            # This is better than just taking CLS token
             mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
             sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
             sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
@@ -196,11 +258,9 @@ class LegacyFieldEncoder:
     def encode_jobposting(self, job_posting: JobPosting) -> np.ndarray:
         """Legacy job posting encoding using Qwen2"""
-        # Convert skills lists to strings
         primary_skills_str = ', '.join(skill.skill_name for skill in job_posting.primary_skills) if job_posting.primary_skills else 'None'
         secondary_skills_str = ', '.join(skill.skill_name for skill in job_posting.secondary_skills) if job_posting.secondary_skills else 'None'
-        # Concatenate all fields into one string, maintaining legacy format
         text = f"""
         Title: {job_posting.title}
@@ -221,7 +281,6 @@ class LegacyFieldEncoder:
     def encode_jobseeker(self, processed_jobseeker: IndependentJobSeekerAssessmentRDS,
                         unprocessed_jobseeker: JobseekerInfoRDS) -> np.ndarray:
         """Legacy job seeker encoding using Qwen2"""
-        # Create a single string with all relevant information
         text = f"""
         Summary: {unprocessed_jobseeker.summary}
@@ -242,6 +301,40 @@ class LegacyFieldEncoder:
         """Not used in legacy approach"""
         pass
 def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
     """Create both job posting and seeker encoders using the same base model"""
     print(f"Creating encoders using {model_name}...")
@@ -252,20 +345,7 @@ def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
     print(f"Created encoders with embedding dimension: {job_encoder.embedding_dim}")
     return job_encoder, seeker_encoder
-FIELD_MAPPING = {
-    'title': ['summary'],                    # Job title maps to seeker summary
-    'primary_skills': ['primary_skills'],    # Primary skills to primary skills
-    'secondary_skills': ['secondary_skills'], # Secondary skills to secondary skills
-    'role_description': ['experience', 'certifications']  # Role maps to both experience and certs
-}
-FIELD_WEIGHTS = {
-    'primary_skills_primary_skills': 0.5,        # Increased - exact skill matches are critical
-    'secondary_skills_secondary_skills': 0.1,    # Decreased - nice to have but less critical
-    'role_description_experience': 0.25,
-    'role_description_certifications': 0.05,
-    'title_summary': 0.1
-}
-__all__ = ['JobPostingEncoder', 'JobSeekerEncoder', 'LegacyFieldEncoder',
-           'create_encoders', 'FIELD_MAPPING', 'FIELD_WEIGHTS']

 from transformers import AutoModel, AutoTokenizer
 import torch
 import numpy as np
+from typing import Dict, List, Union, Optional, Tuple, Set
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+from datetime import datetime, timedelta
+from collections import defaultdict
 import json
+import re
+# Risk and Bonus Level Constants
+class RiskLevel:
+    NO_RISK = 5
+    LOW_RISK = -5
+    MEDIUM_RISK = -10
+    HIGH_RISK = -15
+class BonusLevel:
+    NO_BONUS = 0
+    GOOD = 1
+    BETTER = 2
+    BEST = 3
 @dataclass
 class Skill:
     reason_for_hire: str = ""
     application_of_skills: str = ""
     company_id: str = "test_company"
+    # New fields for scoring
+    industry: str = ""
+    company_size: int = 0
+    company_revenue: float = 0.0
+    growth_rate: float = 0.0
 @dataclass
 class IndependentJobSeekerAssessmentRDS:
 class JobseekerInfoRDS:
     summary: str
+@dataclass
+class MatchResult:
+    """Stores the result of a job-seeker match with explanation"""
+    similarity_score: float
+    field_scores: Dict[str, float]
+    explanation: str
+    status: str = "unseen"
+# Constants for skill evaluation
+SOFT_SKILLS_KEYWORDS = {
+    'communication': ['effectively communicated', 'presented to stakeholders', 'negotiated', 'collaborated with', 'mediated'],
+    'teamwork': ['worked in a team', 'collaborated with', 'partnered with', 'contributed to a team effort'],
+    'leadership': ['led a team', 'mentored', 'coached', 'managed', 'guided'],
+    'problem_solving': ['resolved', 'addressed challenges', 'innovated', 'strategized', 'implemented solutions'],
+    'adaptability': ['adapted to', 'quickly learned', 'flexible in', 'handled change'],
+    'emotional_intelligence': ['empathized with', 'understood needs', 'fostered relationships', 'built trust', 'managed conflict']
+}
+LEADERSHIP_KEYWORDS = [
+    'led', 'managed', 'directed', 'architected', 'innovated',
+    'spearheaded', 'strategized', 'developed', 'executed',
+    'owned', 'delivered', 'implemented'
+]
+IMPACT_PATTERNS = {
+    'revenue_growth': r'increased revenue by (\d+)%',
+    'cost_savings': r'saved \$(\d+) million',
+    'project_launch': r'launched .+ generated \$(\d+) million',
+    'project_completion': r'completed .+ ahead of schedule',
+    'budget_management': r'managed \$(\d+) million budget',
+    'risk_mitigation': r'decreased .+ by (\d+)%',
+    'client_retention': r'improved retention by (\d+)%',
+    'satisfaction': r'satisfaction .+ (\d+)% to (\d+)%',
+    'team_growth': r'grew team by (\d+)%'
+}
+EXPERIENCE_LEVELS = {
+    'junior': ['Junior', 'Associate'],
+    'mid': ['Staff', 'Senior', 'Sr.'],
+    'senior': ['Principal', 'Lead', 'Supervisor', 'Manager'],
+    'executive': ['Director', 'VP', 'CXO', 'President', 'Owner', 'Founder', 'Partner']
+}
 class BaseFieldEncoder(ABC):
     """Base class for field-specific encoding"""
     def __init__(self, model_name: str = 'all-mpnet-base-v2'):
             'primary_skills',
             'secondary_skills',
             'certifications',
+            'education'
         }
     def encode_fields(self, fields: Dict[str, str]) -> Dict[str, np.ndarray]:
             outputs = self.model(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                output_hidden_states=True
             )
             # Get last hidden state
             last_hidden_state = outputs.last_hidden_state
             # Apply attention mask and mean pooling
             mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
             sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
             sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
     def encode_jobposting(self, job_posting: JobPosting) -> np.ndarray:
         """Legacy job posting encoding using Qwen2"""
         primary_skills_str = ', '.join(skill.skill_name for skill in job_posting.primary_skills) if job_posting.primary_skills else 'None'
         secondary_skills_str = ', '.join(skill.skill_name for skill in job_posting.secondary_skills) if job_posting.secondary_skills else 'None'
         text = f"""
         Title: {job_posting.title}
     def encode_jobseeker(self, processed_jobseeker: IndependentJobSeekerAssessmentRDS,
                         unprocessed_jobseeker: JobseekerInfoRDS) -> np.ndarray:
         """Legacy job seeker encoding using Qwen2"""
         text = f"""
         Summary: {unprocessed_jobseeker.summary}
         """Not used in legacy approach"""
         pass
+# Field mappings and weights
+FIELD_MAPPING = {
+    'title': ['summary'],
+    'primary_skills': ['primary_skills'],
+    'secondary_skills': ['secondary_skills'],
+    'role_description': ['experience', 'certifications']
+}
+FIELD_WEIGHTS = {
+    'job_stability': 12,
+    'job_duration': 12,
+    'responsibility_tenure': 1.5,
+    'employment_pedigree': 7.5,
+    'primary_skills_experience': 5,
+    'career_experience': 2.5,
+    'role_impact': 5,
+    'management_scope': 1.5,
+    'primary_skills_occurrence': 12,
+    'primary_skills_frequency': 2.5,
+    'primary_skills_recency': 15,
+    'soft_skills': 1,
+    'employment_recency': 7.5,
+    'location_match': 1,
+    'certifications': 2,
+    'job_title_experience': 5,
+    'job_title_match': 7,
+    # Original embedding weights
+    'primary_skills_primary_skills': 0.5,
+    'secondary_skills_secondary_skills': 0.1,
+    'role_description_experience': 0.25,
+    'role_description_certifications': 0.05,
+    'title_summary': 0.1
+}
 def create_encoders(model_name: str = 'all-mpnet-base-v2') -> tuple:
     """Create both job posting and seeker encoders using the same base model"""
     print(f"Creating encoders using {model_name}...")
     print(f"Created encoders with embedding dimension: {job_encoder.embedding_dim}")
     return job_encoder, seeker_encoder
+__all__ = ['JobPostingEncoder', 'JobSeekerEncoder', 'LegacyFieldEncoder',
+           'JobPosting', 'IndependentJobSeekerAssessmentRDS', 'JobseekerInfoRDS',
+           'MatchResult', 'create_encoders', 'Skill',
+           'FIELD_MAPPING', 'FIELD_WEIGHTS']