|
import pandas as pd |
|
import re |
|
|
|
|
|
# Lowercase skill keywords searched for in job descriptions.
# Order matters: matched skills are reported in this order.
COMMON_SKILLS = [
    'python',
    'machine learning',
    'data analysis',
    'sql',
    'deep learning',
    'statistics',
    'data visualization',
    'nlp',
    'big data',
    'java',
    'c++',
    'excel',
    'r',
    'cloud computing',
    'data mining',
    'tableau',
    'power bi',
    'data engineering',
    'pandas',
    'numpy',
    'tensorflow',
    'keras',
]
|
|
|
def load_data(file_path):
    """
    Loads the job data from a CSV file and performs initial preprocessing.

    The CSV is read with pandas, reduced to the 'job_title',
    'description_text' and 'salary_formatted' columns, and rows with a
    missing value in any of those columns are dropped. A 'skills' column is
    then added by applying extract_skills to each description.

    Args:
        file_path: Path (or any source accepted by pandas.read_csv) of the
            CSV file to load.

    Returns:
        A DataFrame with the three source columns plus 'skills'. Surviving
        rows keep their original index labels.

    Raises:
        KeyError: If any of the three required columns is absent.
    """
    required_columns = ['job_title', 'description_text', 'salary_formatted']

    raw_df = pd.read_csv(file_path)

    # Build an independent frame instead of mutating a column slice in
    # place; adding a column to a slice triggers SettingWithCopyWarning on
    # pandas versions without Copy-on-Write.
    jobs_df = raw_df[required_columns].dropna().copy()

    jobs_df['skills'] = jobs_df['description_text'].apply(extract_skills)
    return jobs_df
|
|
|
def extract_skills(description, skills=None):
    """
    Extracts a list of common skills found in the Job Description.

    Matching is case-insensitive on the description side (the text is
    lowercased) and requires the skill to appear as a whole token, i.e. not
    directly preceded or followed by a word character.

    Args:
        description: Free-text job description. Non-string values (for
            example NaN from an empty CSV cell) produce an empty result.
        skills: Optional iterable of skill names to look for; they are
            expected to be lowercase. Defaults to COMMON_SKILLS.

    Returns:
        The matched skills joined by ', ' in the order they occur in
        ``skills``, or '' when nothing matches.
    """
    if not isinstance(description, str):
        return ''
    if skills is None:
        skills = COMMON_SKILLS

    text = description.lower()
    found = []
    for skill in skills:
        # re.escape keeps metacharacters literal ('c++' would otherwise be
        # parsed as quantifiers). Lookarounds replace \b because \b cannot
        # match between '+' and a following space or punctuation mark.
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        if re.search(pattern, text):
            found.append(skill)
    return ', '.join(found)
|
|