{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "08cf1c6f-0895-4e7b-9279-109c55dd6596",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd, spacy, nltk, numpy as np, re, ssl"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "e3a83c6d-bfb4-4aa2-a9dd-a4fd7ffe6d03",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"soc_2018_direct_match_title_file.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "afa91f8f-d7f6-47a0-adc3-b21866acc2fa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 2018 SOC Code | \n",
" 2018 SOC Title | \n",
" 2018 SOC Direct Match Title | \n",
" Illustrative Example | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Admiral | \n",
" x | \n",
"
\n",
" \n",
" 1 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" CEO | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Executive Officer | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Financial Officer | \n",
" x | \n",
"
\n",
" \n",
" 4 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Operating Officer | \n",
" x | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 2018 SOC Code 2018 SOC Title 2018 SOC Direct Match Title \\\n",
"0 11-1011 Chief Executives Admiral \n",
"1 11-1011 Chief Executives CEO \n",
"2 11-1011 Chief Executives Chief Executive Officer \n",
"3 11-1011 Chief Executives Chief Financial Officer \n",
"4 11-1011 Chief Executives Chief Operating Officer \n",
"\n",
" Illustrative Example \n",
"0 x \n",
"1 NaN \n",
"2 NaN \n",
"3 x \n",
"4 x "
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "c2cc8198-f1ba-4318-b4f0-ae2d525290ff",
"metadata": {},
"outputs": [],
"source": [
"df = df.drop(\"Illustrative Example\", axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "020c3356-8263-47af-b6e3-bf6d27bfee78",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 2018 SOC Code | \n",
" 2018 SOC Title | \n",
" 2018 SOC Direct Match Title | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Admiral | \n",
"
\n",
" \n",
" 1 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" CEO | \n",
"
\n",
" \n",
" 2 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Executive Officer | \n",
"
\n",
" \n",
" 3 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Financial Officer | \n",
"
\n",
" \n",
" 4 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Operating Officer | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 2018 SOC Code 2018 SOC Title 2018 SOC Direct Match Title\n",
"0 11-1011 Chief Executives Admiral\n",
"1 11-1011 Chief Executives CEO\n",
"2 11-1011 Chief Executives Chief Executive Officer\n",
"3 11-1011 Chief Executives Chief Financial Officer\n",
"4 11-1011 Chief Executives Chief Operating Officer"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "538a8047-9de8-4d29-961c-6b008c298e67",
"metadata": {},
"outputs": [],
"source": [
"df[\"Major\"] = df[\"2018 SOC Code\"].apply(lambda x: x[:2]).apply(int)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "5969d5bc-69a5-42f6-a774-73a28e85b019",
"metadata": {},
"outputs": [],
"source": [
"# https://www.bls.gov/soc/2018/soc_2018_class_and_coding_structure.pdf determines the categorization.\n",
"def high_level_agg(number):\n",
" if 11 <= number <= 29:\n",
" category = \"Management, Business, Science, and Arts Occupations\"\n",
" elif 31 <= number <= 39:\n",
" category = \"Service Occupations\"\n",
" elif 41 <= number <= 43:\n",
" category = \"Sales and Office Occupations\"\n",
" elif 45 <= number <= 49:\n",
" category = \"Natural Resources, Construction, and Maintenance Occupations\"\n",
" elif 51 <= number <= 53:\n",
" category = \"Production, Transportation, and Material Moving Occupations\"\n",
" else:\n",
" category = \"Military Specific Occupations\"\n",
" return category"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "ebd35a6d-e0cd-497f-9c0b-9acf24de25dc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43,\n",
" 45, 47, 49, 51, 53, 55])"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.Major.unique()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "729a6707-e442-4ad4-ad50-c6f701e00757",
"metadata": {},
"outputs": [],
"source": [
"df[\"high_level\"] = df.Major.apply(high_level_agg)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "8017e2e0-5635-47fc-bef6-be13e6988177",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 2018 SOC Code | \n",
" 2018 SOC Title | \n",
" 2018 SOC Direct Match Title | \n",
" Major | \n",
" high_level | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Admiral | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
" 1 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" CEO | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
" 2 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Executive Officer | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
" 3 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Financial Officer | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
" 4 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Operating Officer | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 2018 SOC Code 2018 SOC Title 2018 SOC Direct Match Title Major \\\n",
"0 11-1011 Chief Executives Admiral 11 \n",
"1 11-1011 Chief Executives CEO 11 \n",
"2 11-1011 Chief Executives Chief Executive Officer 11 \n",
"3 11-1011 Chief Executives Chief Financial Officer 11 \n",
"4 11-1011 Chief Executives Chief Operating Officer 11 \n",
"\n",
" high_level \n",
"0 Management, Business, Science, and Arts Occupa... \n",
"1 Management, Business, Science, and Arts Occupa... \n",
"2 Management, Business, Science, and Arts Occupa... \n",
"3 Management, Business, Science, and Arts Occupa... \n",
"4 Management, Business, Science, and Arts Occupa... "
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "885a1379-3795-4e52-a6a6-b1f03476101e",
"metadata": {},
"outputs": [],
"source": [
"names = {\"2018 SOC Code\":\"SOC_code\", \"2018 SOC Title\": \"Category\", \"2018 SOC Direct Match Title\":\"Words\"}"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "b77202c7-8e4a-4bed-bc89-e7f146e857ba",
"metadata": {},
"outputs": [],
"source": [
"df = df.rename(columns=names)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "7035d6dc-0638-4069-8a17-074b7bab5366",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SOC_code | \n",
" Category | \n",
" Words | \n",
" Major | \n",
" high_level | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Admiral | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
" 1 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" CEO | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
" 2 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Executive Officer | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
" 3 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Financial Officer | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
" 4 | \n",
" 11-1011 | \n",
" Chief Executives | \n",
" Chief Operating Officer | \n",
" 11 | \n",
" Management, Business, Science, and Arts Occupa... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SOC_code Category Words Major \\\n",
"0 11-1011 Chief Executives Admiral 11 \n",
"1 11-1011 Chief Executives CEO 11 \n",
"2 11-1011 Chief Executives Chief Executive Officer 11 \n",
"3 11-1011 Chief Executives Chief Financial Officer 11 \n",
"4 11-1011 Chief Executives Chief Operating Officer 11 \n",
"\n",
" high_level \n",
"0 Management, Business, Science, and Arts Occupa... \n",
"1 Management, Business, Science, and Arts Occupa... \n",
"2 Management, Business, Science, and Arts Occupa... \n",
"3 Management, Business, Science, and Arts Occupa... \n",
"4 Management, Business, Science, and Arts Occupa... "
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "3f8c4a84-a50e-4dfe-9448-ac69c00750f4",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"soc-professions-2018.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "753cbdaf-41a5-4665-b13f-145702b293ec",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b44845e3-5a9f-4009-894c-a8e7b43b4d1b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}