import pandas as pd
import re

# Lowercase skill keywords searched for in each job description.
# The order here is the order in which matched skills are reported.
COMMON_SKILLS = [
    'python',
    'machine learning',
    'data analysis',
    'sql',
    'deep learning',
    'statistics',
    'data visualization',
    'nlp',
    'big data',
    'java',
    'c++',
    'excel',
    'r',
    'cloud computing',
    'data mining',
    'tableau',
    'power bi',
    'data engineering',
    'pandas',
    'numpy',
    'tensorflow',
    'keras',
]

def load_data(file_path) -> pd.DataFrame:
    """
    Loads the job data from a CSV file and performs initial preprocessing.

    The CSV is read with ``pd.read_csv`` and reduced to the columns
    'job_title', 'description_text' and 'salary_formatted' (pandas raises
    ``KeyError`` if any of them is missing). Rows with a missing value in
    any of those columns are dropped, and a 'skills' column is added that
    holds the result of ``extract_skills`` for each description.

    Args:
        file_path: Anything ``pd.read_csv`` accepts as its first argument.

    Returns:
        The preprocessed DataFrame. The original row index is kept, so it
        may have gaps where rows were dropped.
    """
    wanted_columns = ['job_title', 'description_text', 'salary_formatted']

    raw_df = pd.read_csv(file_path)
    # Build an independent frame instead of mutating a column selection in
    # place; in-place dropna / column assignment on a selection can trigger
    # pandas' SettingWithCopyWarning.
    jobs_df = raw_df[wanted_columns].dropna().copy()

    # Extract skills from the Job Descriptions
    jobs_df['skills'] = jobs_df['description_text'].apply(extract_skills)
    return jobs_df

def extract_skills(description, skills=None) -> str:
    """
    Extracts the known skills mentioned in a Job Description.

    Matching is case-insensitive on the description side (the text is
    lowercased first) and only counts whole-token occurrences: a skill
    must not be directly preceded or followed by a word character, so
    'java' does not match inside 'javascript'.

    Args:
        description: Job description text. Non-string values (e.g. None
            or NaN) are treated as having no skills.
        skills: Optional iterable of skill strings to look for. They are
            compared against the lowercased description, so they should
            be lowercase. Defaults to the module-level COMMON_SKILLS.

    Returns:
        The matched skills joined with ', ' in the order they appear in
        ``skills``, or '' when nothing matches.
    """
    if not isinstance(description, str):
        return ''
    if skills is None:
        skills = COMMON_SKILLS

    text = description.lower()
    found = [
        skill for skill in skills
        # re.escape keeps regex metacharacters in a skill literal ('c++'
        # would otherwise be parsed as a quantifier). Lookarounds replace
        # \b because \b cannot match between '+' and a following space.
        if re.search(r'(?<!\w)' + re.escape(skill) + r'(?!\w)', text)
    ]
    return ', '.join(found)