cmrit / cmrithackathon-master /data_loader.py
Bhuvaneshvar's picture
Upload 2116 files
6370773 verified
raw
history blame
1.17 kB
import pandas as pd
import re
# Define a list of common skills to match within Job Descriptions
COMMON_SKILLS = [
'python', 'machine learning', 'data analysis', 'sql', 'deep learning',
'statistics', 'data visualization', 'nlp', 'big data', 'java', 'c++',
'excel', 'r', 'cloud computing', 'data mining', 'tableau', 'power bi',
'data engineering', 'pandas', 'numpy', 'tensorflow', 'keras'
]
def load_data(file_path):
"""
Loads the job data from a CSV file and performs initial preprocessing.
"""
jobs_df = pd.read_csv(file_path)
jobs_df = jobs_df[['job_title', 'description_text', 'salary_formatted']]
jobs_df.dropna(inplace=True) # Drop rows with missing values
# Extract skills from the Job Descriptions
jobs_df['skills'] = jobs_df['description_text'].apply(extract_skills)
return jobs_df
def extract_skills(description):
"""
Extracts a list of common skills found in the Job Description.
"""
description = description.lower()
skills_found = [skill for skill in COMMON_SKILLS if re.search(r'\b' + skill + r'\b', description)]
return ', '.join(skills_found) if skills_found else ''