RECOMMENDATION MODEL

In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Mock data creation
def create_mock_data():
    users_data = "rematch_train_candidate_field.csv"
    applicants = pd.read_csv(users_data)

    jobs_data = "jobs_data.csv"
    companies = pd.read_csv(jobs_data)

    train_applicants = applicants
    test_data = "1st_test.csv"
    # "/content/sample_data/test_train.csv"
    test_applicants = pd.read_csv(test_data)

    return train_applicants, test_applicants, companies

In [9]:
train_user, test_user, jobs = create_mock_data()

In [10]:
print(type(train_user))

<class 'pandas.core.frame.DataFrame'>


In [11]:
print("Training data size:", train_user.shape[0])
print("Test data size:", test_user.shape[0])

Training data size: 23724
Test data size: 4745


In [12]:
list_hard_skill = [test_user["hard_skill"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(test_user))]
list_soft_skill = [test_user["soft_skill"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(test_user))]

In [13]:
print(type(list_hard_skill))

<class 'list'>


In [14]:
test_user["final_hard_skill"] = pd.DataFrame(list_hard_skill)
test_user["final_soft_skill"] = pd.DataFrame(list_soft_skill)
test_user.head(3)

Unnamed: 0,User ID,candidate_field,label,hard_skill,soft_skill,final_hard_skill,final_soft_skill
0,14649,it jobs,1,"['act', 'advertising sales', 'algorithms', 'bu...","['collaboration', 'decision making', 'operatio...","act, advertising sales, algorithms, business, ...","collaboration, decision making, operations, wr..."
1,801,marketing,0,"['act', 'brand communication', 'business', 'bu...","['collaboration', 'customer service', 'managem...","act, brand communication, business, business d...","collaboration, customer service, management"
2,4393,accounting,0,"['application', 'balance sheet', 'finance', 'p...","['filing', 'management']","application, balance sheet, finance, property ...","filing, management"


In [15]:
list_hard_skill = [train_user["hard_skill"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(train_user))]
list_soft_skill = [train_user["soft_skill"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(train_user))]

In [16]:
train_user["final_hard_skill"] = pd.DataFrame(list_hard_skill)
train_user["final_soft_skill"] = pd.DataFrame(list_soft_skill)
train_user.head(3)

Unnamed: 0,User ID,candidate_field,label,hard_skill,soft_skill,final_hard_skill,final_soft_skill
0,1,retail & consumer products,0,"['business', 'merchandising', 'sales', 'service']",['customer service'],"business, merchandising, sales, service",customer service
1,2,sales,0,"['application', 'business', 'business requirem...","['accountability', 'collaboration', 'innovatio...","application, business, business requirements, ...","accountability, collaboration, innovation, man..."
2,3,healthcare & medical,0,"['application', 'cancer', 'endocrinology', 'hy...","['research', 'training and development']","application, cancer, endocrinology, hydrothera...","research, training and development"


In [17]:
list_hard_skill = [jobs["Hard Skills"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(jobs))]
list_soft_skill = [jobs["Soft Skills"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(jobs))]

In [18]:
jobs["final_hard_skill"] = pd.DataFrame(list_hard_skill)
jobs["final_soft_skill"] = pd.DataFrame(list_soft_skill)
jobs.head(3)

Unnamed: 0,Job ID,Major,Hard Skills,Soft Skills,final_hard_skill,final_soft_skill
0,1,accounting,"['business', 'finance', 'excel', 'tax', 'servi...","['management', 'planning', 'operations', 'lead...","business, finance, excel, tax, service, data, ...","management, planning, operations, leadership, ..."
1,2,administration & office support,"['service', 'business', 'data', 'excel', 'appl...","['management', 'customer service', 'microsoft ...","service, business, data, excel, application, s...","management, customer service, microsoft office..."
2,3,"advertising, arts & media","['business', 'digital', 'sales', 'service', 'a...","['management', 'social media', 'writing', 'com...","business, digital, sales, service, application...","management, social media, writing, communicati..."


In [19]:
# Feature Engineering
def feature_engineering(applicants, companies):
    # Vectorize skills and majors
    tfidf_vectorizer_skills = TfidfVectorizer()
    tfidf_vectorizer_majors = TfidfVectorizer()

    all_skills = pd.concat([applicants['final_hard_skill'], applicants['final_soft_skill'],
                            companies['final_hard_skill'], companies['final_soft_skill']])
    all_majors = pd.concat([applicants['candidate_field'], companies['Major']])

    all_skills_vectorized = tfidf_vectorizer_skills.fit_transform(all_skills)
    all_majors_vectorized = tfidf_vectorizer_majors.fit_transform(all_majors)

    num_applicants = len(applicants)
    num_companies = len(companies)

    # Split the TF-IDF vectors back into applicants and companies
    applicants_skills_vectorized = all_skills_vectorized[:num_applicants*2]  # because each applicant has 2 skill entries
    companies_skills_vectorized = all_skills_vectorized[num_applicants*2:]

    applicants_majors_vectorized = all_majors_vectorized[:num_applicants]
    companies_majors_vectorized = all_majors_vectorized[num_applicants:]

    return (applicants_skills_vectorized, applicants_majors_vectorized,
            companies_skills_vectorized, companies_majors_vectorized, tfidf_vectorizer_skills, tfidf_vectorizer_majors)

In [20]:
def compute_similarity(applicants_skills_vectorized, applicants_majors_vectorized,
                       companies_skills_vectorized, companies_majors_vectorized):
    # Calculate similarity based on skills (averaging hard and soft skills similarities)
    applicants_skills = (applicants_skills_vectorized[0::2] + applicants_skills_vectorized[1::2]) / 2
    companies_skills = (companies_skills_vectorized[0::2] + companies_skills_vectorized[1::2]) / 2

    skills_similarity = cosine_similarity(applicants_skills, companies_skills)

    # Calculate similarity based on majors
    majors_similarity = cosine_similarity(applicants_majors_vectorized, companies_majors_vectorized)

    # Ensure the number of companies in both similarities is aligned
    if skills_similarity.shape[1] != majors_similarity.shape[1]:
        min_dim = min(skills_similarity.shape[1], majors_similarity.shape[1])
        skills_similarity = skills_similarity[:, :min_dim]
        majors_similarity = majors_similarity[:, :min_dim]

    # Combine these similarities (simple average for this example)
    combined_similarity = (skills_similarity + majors_similarity) / 2
    return combined_similarity

In [21]:
# Recommendation Function
def recommend_jobs(applicants, companies, similarity_scores):
    recommendations = {}
    for i, applicant in enumerate(applicants['User ID']):
        if i < len(similarity_scores):
            sorted_company_indices = np.argsort(-similarity_scores[i])  # Descending sort of scores
            recommended_companies = companies.iloc[sorted_company_indices]['Major'].values[:3]  # Top 3 recommendations
            recommendations[applicant] = recommended_companies
    return recommendations

# Testing and Evaluation Function
def print_recommendations(applicants, companies, recommendations):
    # This is a mock function since we don't have ground truth to compare to.
    # In a real scenario, we would compare against actual matches or use some form of feedback.
    print("Recommendations for each applicant:")
    for applicant in recommendations:
        print(f"{applicant}: {recommendations[applicant]}")

In [None]:
# Let's create and process the data, and compute recommendations
# train_applicants, test_applicants, companies = create_mock_data()
applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec, tfidf_vectorizer_skills, tfidf_vectorizer_majors = feature_engineering(train_user, jobs)

similarity_scores = compute_similarity(applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec)
recommendations = recommend_jobs(test_user, jobs, similarity_scores)

# Output the recommendations to observe the results
print_recommendations(test_user, jobs, recommendations)

In [23]:
# Process input skills and recommend jobs
def recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):
    input_hard_skills_vec = tfidf_vectorizer_skills.transform([input_hard_skills])
    input_soft_skills_vec = tfidf_vectorizer_skills.transform([input_soft_skills])
    input_major_vec = tfidf_vectorizer_majors.transform([input_major])

    # Average the vectorized hard and soft skills
    input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2

    # Compute similarities
    skills_similarity = cosine_similarity(input_skills_vec, companies_skills_vec)
    major_similarity = cosine_similarity(input_major_vec, companies_majors_vec)

    # Ensure the number of companies in both similarities is aligned
    if skills_similarity.shape[1] != major_similarity.shape[1]:
        min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])
        skills_similarity = skills_similarity[:, :min_dim]
        major_similarity = major_similarity[:, :min_dim]

    # Combine similarities
    combined_similarity = (skills_similarity + major_similarity) / 2

    # Get top 3 job recommendations
    sorted_company_indices = np.argsort(-combined_similarity[0])
    recommended_companies = jobs.iloc[sorted_company_indices]['Major'].values[:3]

    return recommended_companies

TEST RECOMMENDED SYSTEM

In [24]:
input_hard_skills = "Java, Excel, Python"
input_soft_skills = "Communication, Teamwork"
input_major = "Economy"

recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)
print("Recommended Jobs based on input skills and major:")
print(recommended_jobs)

Recommended Jobs based on input skills and major:
['it jobs' 'sales' 'administration & office support']


Evaluating (PENDING)

In [19]:
def create_ground_truth(csv_file_path):
    data = pd.read_csv(csv_file_path)

    # Tạo dictionary `ground_truth`
    ground_truth = {}
    for index, row in data.iterrows():
        user_id = row['User ID']
        actual_major = row['candidate_field']

        # Thêm vào dictionary, giả sử mỗi ứng viên chỉ chọn một công việc
        ground_truth[user_id] = [actual_major]

    return ground_truth

# Sử dụng hàm trên để tạo `ground_truth`
csv_file_path = '1st_test.csv'
ground_truth = create_ground_truth(csv_file_path)

In [None]:
display(ground_truth)

In [40]:
def precision_at_k(recommendations, ground_truth, k=3):
    """
    Calculate the precision at k for recommendation system.

    Parameters:
    - recommendations (dict): Dictionary where keys are user IDs and values are lists of recommended majors.
    - ground_truth (dict): Dictionary where keys are user IDs and values are lists of truly suitable majors.
    - k (int): The number of top recommendations to consider for calculating precision.

    Returns:
    - float: The average precision at k for all users.
    """
    precision_scores = []

    for applicant, recommended_major in recommendations.items():
        if applicant in ground_truth:
            # Get top k recommendations
            top_k_recs = recommended_major[:k]
            # Calculate the number of relevant recommendations
            relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[applicant])
            # Precision at k for this user
            precision = relevant_recs / k
            precision_scores.append(precision)

    # Average precision at k over all users
    average_precision = np.mean(precision_scores) if precision_scores else 0
    return average_precision

avg_precision = precision_at_k(recommendations, ground_truth)
print("Average Precision@3 with 18979 trains and 4745 tests:", avg_precision)

Average Precision@3 with 18979 trains and 4745 tests: 0.12764313312258516


In [41]:
def recall_at_k(recommendations, ground_truth, k=3):
    recall_scores = []

    for user_id, recommended_majors in recommendations.items():
        if user_id in ground_truth:
            # Get top k recommendations
            top_k_recs = recommended_majors[:k]
            # Calculate the number of relevant recommendations
            relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[user_id])
            # Calculate the total number of relevant items
            total_relevant = len(ground_truth[user_id])
            # Recall at k for this user
            recall = relevant_recs / total_relevant if total_relevant else 0
            recall_scores.append(recall)

    # Average recall at k over all users
    average_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0
    return average_recall

# Example usage:
avg_recall = recall_at_k(recommendations, ground_truth)
print("Average Recall@3 with 18979 trains and 4745 tests:", avg_recall)


Average Recall@3 with 18979 trains and 4745 tests: 0.38292939936775555


In [42]:
def f1_score_at_k(recommendations, ground_truth, k=3):
    precision = precision_at_k(recommendations, ground_truth, k)
    recall = recall_at_k(recommendations, ground_truth, k)

    if precision + recall == 0:
        return 0

    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

avg_f1_score = f1_score_at_k(recommendations, ground_truth)

print("Average F1 Score@3:", avg_f1_score)

Average F1 Score@3: 0.19146469968387775


Create pipline

In [25]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [26]:
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tfidf_vectorizer_skills = TfidfVectorizer()
        self.tfidf_vectorizer_majors = TfidfVectorizer()

    def fit(self, X, y=None):
        all_skills = pd.concat([X['final_hard_skill'], X['final_soft_skill']])
        all_majors = X['candidate_field']
        
        self.tfidf_vectorizer_skills.fit(all_skills)
        self.tfidf_vectorizer_majors.fit(all_majors)
        return self
    
    def transform(self, X):
        all_skills = pd.concat([X['final_hard_skill'], X['final_soft_skill']])
        all_majors = X['candidate_field']
        
        applicants_skills_vec = self.tfidf_vectorizer_skills.transform(all_skills)
        applicants_majors_vec = self.tfidf_vectorizer_majors.transform(all_majors)
        
        return applicants_skills_vec, applicants_majors_vec

In [27]:
class JobRecommender(BaseEstimator, TransformerMixin):
    def __init__(self, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):
        self.jobs = jobs
        self.tfidf_vectorizer_skills = tfidf_vectorizer_skills
        self.tfidf_vectorizer_majors = tfidf_vectorizer_majors
        self.companies_skills_vec = companies_skills_vec
        self.companies_majors_vec = companies_majors_vec

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        input_hard_skills_vec = self.tfidf_vectorizer_skills.transform(X['final_hard_skill'])
        input_soft_skills_vec = self.tfidf_vectorizer_skills.transform(X['final_soft_skill'])
        input_major_vec = self.tfidf_vectorizer_majors.transform(X['candidate_field'])

        input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2

        skills_similarity = cosine_similarity(input_skills_vec, self.companies_skills_vec)
        major_similarity = cosine_similarity(input_major_vec, self.companies_majors_vec)

        if skills_similarity.shape[1] != major_similarity.shape[1]:
            min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])
            skills_similarity = skills_similarity[:, :min_dim]
            major_similarity = major_similarity[:, :min_dim]

        combined_similarity = (skills_similarity + major_similarity) / 2

        recommendations = []
        for i in range(combined_similarity.shape[0]):
            sorted_company_indices = np.argsort(-combined_similarity[i])
            recommended_companies = self.jobs.iloc[sorted_company_indices]['Major'].values[:3]
            recommendations.append(recommended_companies)

        return recommendations

In [28]:
def create_recommendation_pipeline():
    # Instantiate the feature engineering transformer
    feature_engineering = FeatureEngineeringTransformer()

    # Define the recommendation function as a callable estimator
    def recommend_jobs_function(X, y=None):
        applicants_skills_vec, applicants_majors_vec = feature_engineering.fit_transform(X)
        companies_skills_vec, companies_majors_vec = feature_engineering.tfidf_vectorizer_skills.transform(jobs['final_hard_skill']), feature_engineering.tfidf_vectorizer_majors.transform(jobs['Major'])
        
        return recommend_jobs_for_input_skills(X['final_hard_skill'], X['final_soft_skill'], X['candidate_field'], jobs, feature_engineering.tfidf_vectorizer_skills, feature_engineering.tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)

    pipeline = Pipeline([
        ('feature_engineering', feature_engineering),
        ('recommendation', recommend_jobs_function)
    ])
    
    return pipeline
recommendation_pipeline = create_recommendation_pipeline()

In [29]:
import pickle
def create_recommendation_pipeline(jobs):
    feature_engineering = FeatureEngineeringTransformer()

    # Fit feature engineering transformer to get the vectorizers and company vectors
    applicants_skills_vec, applicants_majors_vec = feature_engineering.fit_transform(train_user)
    companies_skills_vec = feature_engineering.tfidf_vectorizer_skills.transform(jobs['final_hard_skill'])
    companies_majors_vec = feature_engineering.tfidf_vectorizer_majors.transform(jobs['Major'])

    recommender = JobRecommender(jobs, feature_engineering.tfidf_vectorizer_skills, feature_engineering.tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)

    pipeline = Pipeline([
        ('feature_engineering', feature_engineering),
        ('recommendation', recommender)
    ])
    
    return pipeline

# Create the pipeline
recommendation_pipeline = create_recommendation_pipeline(jobs)

# Save the pipeline using pickle
model_path = "recommendation_pipeline.pkl"
with open(model_path, mode="bw") as f:
    pickle.dump(recommendation_pipeline, f)
print("Model components saved successfully!")


Model components saved successfully!


Push to Hugging face

In [48]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [50]:
import shutil
import os
from skops import card, hub_utils
from pathlib import Path

model_path = "recommendation_pipeline.pkl"
local_repo = "job-recommendation-model"
# Clear the existing directory if it exists
if os.path.exists(local_repo):
    shutil.rmtree(local_repo)

sample_data = pd.DataFrame({
    'final_hard_skill': ["Python, Java, Finance, Excel"],
    'final_soft_skill': ["Communication, Teamwork"],
    'candidate_field': [""]
})

# Initialize the local repository
hub_utils.init(
    model=model_path,
    requirements=["scikit-learn", "pandas", "numpy"],
    dst=local_repo,
    task="tabular-classification",
    data=sample_data,
)

# # Create model card metadata manually
# metadata = {
#     "model_type": "Custom Recommendation Model",
#     "model_description": "This is a recommendation model for job applicants based on their skills and majors.",
#     "author": "trangannh",
#     "license": "mit",
#     "citation": """
# @misc{example2024recommendation,
#     author = {trangannh},
#     title = {Job Recommendation Model},
#     year = {2024},
#     howpublished = {\\url{https://huggingface.co/job-recommendation-model}},
# }
# """,
#     "limitations": "This model is not ready to be used in production.",
# }

# # Create and save the model card
# model_card = card.Card(model=model_path, metadata=metadata)

# # Add the get started code
# get_started_code = """
# import pickle
# import pandas as pd

# with open('recommendation_model.pkl', 'rb') as file:
#     tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec = pickle.load(file)

# input_hard_skills = "Python, Java, Finance, Excel"
# input_soft_skills = "Communication, Teamwork"
# input_major = ""
# jobs_data = pd.read_csv("/content/sample_data/jobs_data.csv")

# recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs_data, 'recommendation_model.pkl')
# print("Recommended Jobs based on input skills and major:")
# print(recommended_jobs)
# """

# model_card.add(
#     get_started_code=get_started_code,
#     model_card_authors="trangannh",
#     model_description="This is a recommendation model for job applicants based on their skills and majors.",
#     limitations="This model is not ready to be used in production."
# )

# # Save the model card
# model_card.save(Path(local_repo) / "README.md")

# Push the repository to Hugging Face Hub
repo_id = "trangannh/job-recommendation-model"
token = ""

hub_utils.push(
    repo_id=repo_id,
    source=local_repo,
    token=token,
    commit_message="Initial commit of the job recommendation model",
    create_remote=True,
)




recommendation_pipeline.pkl:   0%|          | 0.00/163k [00:00<?, ?B/s]

In [30]:
import pickle
import pandas as pd

# Load the model (pipeline)
with open('recommendation_pipeline.pkl', 'rb') as file:
    recommendation_pipeline = pickle.load(file)

# Example input data
input_hard_skills = "Python, Java, Finance, Excel"
input_soft_skills = "Communication, Teamwork"
input_major = "Data Science"
recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)
print("Recommended Jobs based on input skills and major:")
print(recommended_jobs)

Recommended Jobs based on input skills and major:
['sales' 'it jobs' 'administration & office support']


Test API

In [31]:
import requests

# Set up the endpoint URL and token
endpoint = "https://api-inference.huggingface.co/models/trangannh/job-recommendation-model"
token = ""

# Prepare data
data = {
    "inputs": {
        "input_hard_skills": "Python, Java, Finance, Excel",
        "input_soft_skills": "Communication, Teamwork",
        "input_major": "Data Science"
    }
}

# Send POST request
headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}
response = requests.post(endpoint, headers=headers, json=data)

# Print the response
if response.status_code == 200:
    print(response.json())
else:
    print(f"Error: {response.status_code}")
    print(response.json())


Error: 503
{'error': 'Model trangannh/job-recommendation-model is currently loading', 'estimated_time': 20.0}


In [32]:
import pickle
import pandas as pd

# Load the model (pipeline)
with open('recommendation_pipeline.pkl', 'rb') as file:
    recommendation_pipeline = pickle.load(file)

# Example input data, converting list to string
input_data = pd.DataFrame({
    'final_hard_skill': ["Python, Java, Finance, Excel"],
    'final_soft_skill': ["Communication, Teamwork"],
    'candidate_field': ["Data Science"]
})

# Make recommendations
recommended_jobs = recommendation_pipeline.transform(input_data)

print("Recommended Jobs based on input skills and major:")
for rec in recommended_jobs:
    print(rec)


TypeError: tuple indices must be integers or slices, not str