File size: 3,732 Bytes
d0c6111 0858261 214455e d0c6111 3d769d6 d0c6111 16e8b0c d0c6111 3d769d6 d0c6111 f390ffd d0c6111 3d769d6 d0c6111 3d769d6 d0c6111 3d769d6 214455e d0c6111 214455e d0c6111 3d769d6 d0c6111 3d769d6 d0c6111 3d769d6 d0c6111 3d769d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import os
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from typing import List, Dict, Any
import gradio as gr
# Constants for directories and file names
MODEL_DIR = 'models'
DATA_DIR = 'datasets'
DATA_FILE = 'cleaned_survey_results_public.csv'
MODEL_NAMES = [
'CatBoost Regressor',
'LGBM Regressor',
]
def load_models(model_names: List[str]) -> Dict[str, Any]:
"""Load machine learning models from disk."""
models = {}
for name in model_names:
path = os.path.join(MODEL_DIR, f"{name.replace(' ', '')}.joblib")
try:
models[name] = joblib.load(path)
except Exception as e:
print(f"Error loading model {name}: {str(e)}") # Use print for logging in Gradio
return models
# Load models
models = load_models(MODEL_NAMES)
# Load dataset
data_path = os.path.join(DATA_DIR, DATA_FILE)
df = pd.read_csv(data_path)
# Prepare features and target
X = df.drop(columns=['Salary'])
y = df['Salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)
# Pre-defined input choices
input_choices = {
'MainBranch': df.MainBranch.unique().tolist(),
'Country': X.Country.unique().tolist(),
'EducationLevel': X.EducationLevel.unique().tolist(),
'RemoteWork': df.RemoteWork.unique().tolist(),
}
# Pre-computed statistics for default values
default_comp = float(df.CompTotal.mean()) # Default CompTotal
max_comp = float(df.CompTotal.max() * 1.5)
default_years = 3.0 # Default years of experience
max_years = float(df.YearsOfExperience.max() * 1.5)
def load_and_predict(mainbranch, country, educationlevel, remotework, comptotal, yearsofexperience):
"""Predict salary using loaded models and evaluate statistics."""
input_data = pd.DataFrame(
[[mainbranch, country, educationlevel, remotework, comptotal, yearsofexperience]],
columns=['MainBranch', 'Country', 'EducationLevel', 'RemoteWork', 'CompTotal', 'YearsOfExperience']
)
results = []
for name, model in models.items():
try:
salary_pred = model.predict(input_data)[0]
y_train_pred = model.predict(X_train)
results.append({
'Model': name,
'Predicted Salary': salary_pred,
'R2 Score (%)': r2_score(y_train, y_train_pred) * 100,
'Mean Absolute Error': mean_absolute_error(y_train, y_train_pred),
'Mean Squared Error': mean_squared_error(y_train, y_train_pred),
})
except Exception as e:
print(f"Error during prediction with model {name}: {str(e)}") # Logging
return pd.DataFrame(results).sort_values(by='R2 Score (%)', ascending=False).reset_index(drop=True)
# Gradio interface
inputs = [
gr.Dropdown(choices=input_choices['MainBranch'], label="Main Branch"),
gr.Dropdown(choices=input_choices['Country'], label="Country"),
gr.Dropdown(choices=input_choices['EducationLevel'], label="Education Level"),
gr.Dropdown(choices=input_choices['RemoteWork'], label="Remote Work"),
gr.Number(minimum=0.0, maximum=max_comp, value=default_comp, step=0.5, label="CompTotal"),
gr.Number(minimum=0.0, maximum=50, value=default_years, step=0.5, label="Years of Experience"),
]
output = gr.Dataframe(label="Prediction Results")
gr.Interface(
fn=load_and_predict,
inputs=inputs,
outputs=output,
title="Developer Salary Prediction App",
description="This application predicts developer salaries using multiple machine learning models. Provide your details to get salary predictions.",
).launch() |