{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "08cf1c6f-0895-4e7b-9279-109c55dd6596", "metadata": {}, "outputs": [], "source": [
 "# Standard-library modules first, then third-party packages.\n",
 "import re\n",
 "import ssl\n",
 "\n",
 "import nltk\n",
 "import numpy as np\n",
 "import pandas as pd\n",
 "import spacy"
 ] }, { "cell_type": "code", "execution_count": 52, "id": "e3a83c6d-bfb4-4aa2-a9dd-a4fd7ffe6d03", "metadata": {}, "outputs": [], "source": [
 "# Load the SOC 2018 direct-match title file; the relative path is resolved against the working directory.\n",
 "df = pd.read_csv(filepath_or_buffer=\"soc_2018_direct_match_title_file.csv\")"
 ] }, { "cell_type": "code", "execution_count": 53, "id": "afa91f8f-d7f6-47a0-adc3-b21866acc2fa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
2018 SOC Code2018 SOC Title2018 SOC Direct Match TitleIllustrative Example
011-1011Chief ExecutivesAdmiralx
111-1011Chief ExecutivesCEONaN
211-1011Chief ExecutivesChief Executive OfficerNaN
311-1011Chief ExecutivesChief Financial Officerx
411-1011Chief ExecutivesChief Operating Officerx
\n", "
" ], "text/plain": [ " 2018 SOC Code 2018 SOC Title 2018 SOC Direct Match Title \\\n", "0 11-1011 Chief Executives Admiral \n", "1 11-1011 Chief Executives CEO \n", "2 11-1011 Chief Executives Chief Executive Officer \n", "3 11-1011 Chief Executives Chief Financial Officer \n", "4 11-1011 Chief Executives Chief Operating Officer \n", "\n", " Illustrative Example \n", "0 x \n", "1 NaN \n", "2 NaN \n", "3 x \n", "4 x " ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 54, "id": "c2cc8198-f1ba-4318-b4f0-ae2d525290ff", "metadata": {}, "outputs": [], "source": [
 "# Drop the \"Illustrative Example\" column; none of the later cells read it.\n",
 "df = df.drop(columns=[\"Illustrative Example\"])"
 ] }, { "cell_type": "code", "execution_count": 55, "id": "020c3356-8263-47af-b6e3-bf6d27bfee78", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
2018 SOC Code2018 SOC Title2018 SOC Direct Match Title
011-1011Chief ExecutivesAdmiral
111-1011Chief ExecutivesCEO
211-1011Chief ExecutivesChief Executive Officer
311-1011Chief ExecutivesChief Financial Officer
411-1011Chief ExecutivesChief Operating Officer
\n", "
" ], "text/plain": [ " 2018 SOC Code 2018 SOC Title 2018 SOC Direct Match Title\n", "0 11-1011 Chief Executives Admiral\n", "1 11-1011 Chief Executives CEO\n", "2 11-1011 Chief Executives Chief Executive Officer\n", "3 11-1011 Chief Executives Chief Financial Officer\n", "4 11-1011 Chief Executives Chief Operating Officer" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 56, "id": "538a8047-9de8-4d29-961c-6b008c298e67", "metadata": {}, "outputs": [], "source": [
 "# The major group is the two digits before the hyphen of the SOC code (e.g. \"11-1011\" -> 11).\n",
 "df[\"Major\"] = df[\"2018 SOC Code\"].str[:2].astype(int)"
 ] }, { "cell_type": "code", "execution_count": 57, "id": "5969d5bc-69a5-42f6-a774-73a28e85b019", "metadata": {}, "outputs": [], "source": [
 "# https://www.bls.gov/soc/2018/soc_2018_class_and_coding_structure.pdf determines the categorization.\n",
 "def high_level_agg(number):\n",
 "    \"\"\"Return the high-level aggregation title for a 2018 SOC major group.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    number : int\n",
 "        Two-digit SOC major group number.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    str\n",
 "        Title of the high-level aggregation that contains the major group.\n",
 "\n",
 "    Raises\n",
 "    ------\n",
 "    ValueError\n",
 "        If the number is outside every known range, so that a malformed\n",
 "        code is reported instead of being silently labelled as military.\n",
 "    \"\"\"\n",
 "    if 11 <= number <= 29:\n",
 "        return \"Management, Business, Science, and Arts Occupations\"\n",
 "    if 31 <= number <= 39:\n",
 "        return \"Service Occupations\"\n",
 "    if 41 <= number <= 43:\n",
 "        return \"Sales and Office Occupations\"\n",
 "    if 45 <= number <= 49:\n",
 "        return \"Natural Resources, Construction, and Maintenance Occupations\"\n",
 "    if 51 <= number <= 53:\n",
 "        return \"Production, Transportation, and Material Moving Occupations\"\n",
 "    # Military is the single major group 55; it must not double as a catch-all.\n",
 "    if number == 55:\n",
 "        return \"Military Specific Occupations\"\n",
 "    raise ValueError(f\"Unrecognized 2018 SOC major group: {number}\")"
 ] }, { "cell_type": "code", "execution_count": 58, "id": "ebd35a6d-e0cd-497f-9c0b-9acf24de25dc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43,\n", " 45, 47, 49, 51, 53, 55])" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Major.unique()" ] }, { "cell_type": "code", "execution_count": 59, "id": "729a6707-e442-4ad4-ad50-c6f701e00757", "metadata": {}, "outputs": [], "source": 
[
 "# Label every row with the high-level aggregation of its major group.\n",
 "df[\"high_level\"] = df[\"Major\"].map(high_level_agg)"
 ] }, { "cell_type": "code", "execution_count": 60, "id": "8017e2e0-5635-47fc-bef6-be13e6988177", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
2018 SOC Code2018 SOC Title2018 SOC Direct Match TitleMajorhigh_level
011-1011Chief ExecutivesAdmiral11Management, Business, Science, and Arts Occupa...
111-1011Chief ExecutivesCEO11Management, Business, Science, and Arts Occupa...
211-1011Chief ExecutivesChief Executive Officer11Management, Business, Science, and Arts Occupa...
311-1011Chief ExecutivesChief Financial Officer11Management, Business, Science, and Arts Occupa...
411-1011Chief ExecutivesChief Operating Officer11Management, Business, Science, and Arts Occupa...
\n", "
" ], "text/plain": [ " 2018 SOC Code 2018 SOC Title 2018 SOC Direct Match Title Major \\\n", "0 11-1011 Chief Executives Admiral 11 \n", "1 11-1011 Chief Executives CEO 11 \n", "2 11-1011 Chief Executives Chief Executive Officer 11 \n", "3 11-1011 Chief Executives Chief Financial Officer 11 \n", "4 11-1011 Chief Executives Chief Operating Officer 11 \n", "\n", " high_level \n", "0 Management, Business, Science, and Arts Occupa... \n", "1 Management, Business, Science, and Arts Occupa... \n", "2 Management, Business, Science, and Arts Occupa... \n", "3 Management, Business, Science, and Arts Occupa... \n", "4 Management, Business, Science, and Arts Occupa... " ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 61, "id": "885a1379-3795-4e52-a6a6-b1f03476101e", "metadata": {}, "outputs": [], "source": [
 "# Original column headers mapped to the short names used from here on.\n",
 "names = {\n",
 "    \"2018 SOC Code\": \"SOC_code\",\n",
 "    \"2018 SOC Title\": \"Category\",\n",
 "    \"2018 SOC Direct Match Title\": \"Words\",\n",
 "}"
 ] }, { "cell_type": "code", "execution_count": 62, "id": "b77202c7-8e4a-4bed-bc89-e7f146e857ba", "metadata": {}, "outputs": [], "source": [
 "# Columns missing from the mapping (Major, high_level) keep their names.\n",
 "df = df.rename(names, axis=\"columns\")"
 ] }, { "cell_type": "code", "execution_count": 63, "id": "7035d6dc-0638-4069-8a17-074b7bab5366", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SOC_codeCategoryWordsMajorhigh_level
011-1011Chief ExecutivesAdmiral11Management, Business, Science, and Arts Occupa...
111-1011Chief ExecutivesCEO11Management, Business, Science, and Arts Occupa...
211-1011Chief ExecutivesChief Executive Officer11Management, Business, Science, and Arts Occupa...
311-1011Chief ExecutivesChief Financial Officer11Management, Business, Science, and Arts Occupa...
411-1011Chief ExecutivesChief Operating Officer11Management, Business, Science, and Arts Occupa...
\n", "
" ], "text/plain": [ " SOC_code Category Words Major \\\n", "0 11-1011 Chief Executives Admiral 11 \n", "1 11-1011 Chief Executives CEO 11 \n", "2 11-1011 Chief Executives Chief Executive Officer 11 \n", "3 11-1011 Chief Executives Chief Financial Officer 11 \n", "4 11-1011 Chief Executives Chief Operating Officer 11 \n", "\n", " high_level \n", "0 Management, Business, Science, and Arts Occupa... \n", "1 Management, Business, Science, and Arts Occupa... \n", "2 Management, Business, Science, and Arts Occupa... \n", "3 Management, Business, Science, and Arts Occupa... \n", "4 Management, Business, Science, and Arts Occupa... " ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 64, "id": "3f8c4a84-a50e-4dfe-9448-ac69c00750f4", "metadata": {}, "outputs": [], "source": [
 "# index=True is the pandas default: the row index is written as the first, unnamed CSV column.\n",
 "df.to_csv(path_or_buf=\"soc-professions-2018.csv\", index=True)"
 ] }, { "cell_type": "code", "execution_count": null, "id": "753cbdaf-41a5-4665-b13f-145702b293ec", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "b44845e3-5a9f-4009-894c-a8e7b43b4d1b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }