# ISCO-08 hierarchical accuracy measure

In [1]:
import evaluate

ham = evaluate.load("/home/dux/workspace/1-IEA_RnD/isco_hierarchical_accuracy")
print(ham.description)

ISCO CSV file downloaded
Weighted ISCO hierarchy dictionary created as isco_hierarchy

The ISCO-08 Hierarchical Accuracy Measure is an implementation of the measure described in [Functional Annotation of Genes Using Hierarchical Text Categorization](https://www.researchgate.net/publication/44046343_Functional_Annotation_of_Genes_Using_Hierarchical_Text_Categorization) (Kiritchenko, Svetlana and Famili, Fazel. 2005) and adapted for the ISCO-08 classification scheme by the International Labour Organization.

The measure rewards more precise classifications that correctly identify an occupation's placement down to the specific Unit group level and applies penalties for misclassifications based on the hierarchical distance between the correct and assigned categories.




In [2]:
references = ["1111", "1112", "1113", "1114", "1120"]
predictions = ["1111", "1113", "1120", "1211", "2111"]

print(f"References: {references}")
print(f"Predictions: {predictions}")
print(ham.compute(references=references, predictions=predictions))

References: ['1111', '1112', '1113', '1114', '1120']
Predictions: ['1111', '1113', '1120', '1211', '2111']
Accuracy: 0.2, Hierarchical Precision: 0.5, Hierarchical Recall: 0.7777777777777778, Hierarchical F-measure: 0.6086956521739131
{'accuracy': 0.2, 'hierarchical_precision': 0.5, 'hierarchical_recall': 0.7777777777777778, 'hierarchical_fmeasure': 0.6086956521739131}


In [16]:
# Compute all test cases and print the results
from tests import test_cases

test_number = 1

for test_case in test_cases:
    references = test_case["references"]
    predictions = test_case["predictions"]
    print(f"TEST CASE #{test_number}")
    print(f"References: {references}")
    print(f"Predictions: {predictions}")
    print(ham.compute(references=references, predictions=predictions))
    print()
    test_number += 1

TEST CASE #1
References: ['1111', '1111', '1111', '1111', '1111', '1111', '1111', '1111', '1111', '1111']
Predictions: ['1111', '1112', '1120', '1211', '1311', '2111', '111', '11', '1', '9999']
Accuracy: 0.1, Hierarchical Precision: 0.2222222222222222, Hierarchical Recall: 1.0, Hierarchical F-measure: 0.3636363636363636
{'accuracy': 0.1, 'hierarchical_precision': 0.2222222222222222, 'hierarchical_recall': 1.0, 'hierarchical_fmeasure': 0.3636363636363636}

TEST CASE #2
References: ['1111']
Predictions: ['1111']
Accuracy: 1.0, Hierarchical Precision: 1.0, Hierarchical Recall: 1.0, Hierarchical F-measure: 1.0
{'accuracy': 1.0, 'hierarchical_precision': 1.0, 'hierarchical_recall': 1.0, 'hierarchical_fmeasure': 1.0}

TEST CASE #3
References: ['1111']
Predictions: ['1112']
Accuracy: 0.0, Hierarchical Precision: 0.75, Hierarchical Recall: 0.75, Hierarchical F-measure: 0.75
{'accuracy': 0.0, 'hierarchical_precision': 0.75, 'hierarchical_recall': 0.75, 'hierarchical_fmeasure': 0.75}

TEST CASE 

# Model evaluation using the test split of the dataset

In [None]:
from datasets import load_dataset, get_dataset_config_names, get_dataset_infos, get_dataset_split_names

dataset = load_dataset("ICILS/multilingual_parental_occupations", "ilo")
dataset

In [2]:
import os
from datasets import load_dataset
from transformers import pipeline
import evaluate
import json

# Ensure that the HF_TOKEN environment variable is set
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN environment variable is not set.")

test_split = load_dataset("ICILS/multilingual_parental_occupations", "icils", split="test", token=hf_token)
validation_split = load_dataset("ICILS/multilingual_parental_occupations", "icils", split="validation", token=hf_token)

# Load the dataset
test_data_subset = (
   test_split.shuffle(seed=42).select(range(100))
)

# Initialize the pipeline
model = "danieldux/XLM-R-ISCO-v2" # ICILS/XLM-R-ISCO
pipe = pipeline("text-classification", model=model, token=hf_token)

# Initialize the hierarchical accuracy measure
hierarchical_accuracy = evaluate.load("danieldux/isco_hierarchical_accuracy")

Downloading builder script:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

ISCO CSV file downloaded
Weighted ISCO hierarchy dictionary created as isco_hierarchy


In [3]:
test_data_subset

Dataset({
    features: ['IDSTUD', 'JOB_DUTIES', 'ISCO', 'ISCO_REL', 'ISCO_TITLE', 'ISCO_CODE_TITLE', 'COUNTRY', 'LANGUAGE'],
    num_rows: 100
})

## Test set

In [4]:
import datetime

stamp = datetime.datetime.now().strftime("%Y-%m-%d--%H:%M")
print(stamp)

# Evaluate the model
predictions = []
references = []
for example in test_data_subset:

    # Predict
    prediction = pipe(
        example["JOB_DUTIES"]
    )  # Use the key "JOB_DUTIES" for the text data
    # predicted_label = extract_isco_code(prediction[0]["label"])
    predicted_label = prediction[0]["label"]
    predictions.append(predicted_label)

    # Reference
    reference_label = example["ISCO"]  # Use the key "ISCO" for the ISCO code
    references.append(reference_label)

# Compute the hierarchical accuracy
test_results = hierarchical_accuracy.compute(predictions=predictions, references=references)

# Save the results to a JSON file
with open(f"test_split_results-{stamp}.json", "w") as f:
    json.dump(test_results, f)

print(f"Evaluation results saved to test_split_results-{stamp}.json")



2024-03-31--01:29
Evaluation results saved to test_split_results-2024-03-31--01:29.json


In [5]:
test_results

{'accuracy': 0.82,
 'hierarchical_precision': 0.9090909090909091,
 'hierarchical_recall': 0.8839779005524862,
 'hierarchical_fmeasure': 0.8963585434173669}

In [6]:
import pandas as pd

test_data_df = test_data.to_pandas()
results_df = pd.DataFrame(columns=['Language', 'Accuracy', 'Hierarchical Precision', 'Hierarchical Recall', 'Hierarchical F1'])

# Iterate over unique languages
for language in test_data_df["LANGUAGE"].unique():
    # Filter test data for the current language
    test_data_subset = test_data_df[test_data_df["LANGUAGE"] == language]

    # Evaluate the model for the current language
    predictions = []
    references = []
    for example in test_data_subset.to_dict("records"):
        # Predict
        prediction = pipe(example["JOB_DUTIES"])
        predicted_label = extract_isco_code(prediction[0]["label"])
        predictions.append(predicted_label)

        # Reference
        reference_label = example["ISCO"]
        references.append(reference_label)

    # Compute the hierarchical accuracy for the current language
    test_results = hierarchical_accuracy.compute(
        predictions=predictions, references=references
    )

    # Save the results to a JSON file
    results_df = pd.concat(
        [
            pd.DataFrame(
                {
                    "Language": [language],
                    "Accuracy": [test_results["accuracy"]],
                    "Hierarchical Precision": [test_results["hierarchical_precision"]],
                    "Hierarchical Recall": [test_results["hierarchical_recall"]],
                    "Hierarchical F1": [test_results["hierarchical_fmeasure"]],
                }
            ),
            results_df,
        ],
        ignore_index=True
    )

# Print the evaluation results
print(results_df)

Accuracy: 0.8523316062176166, Hierarchical Precision: 0.9711751662971175, Hierarchical Recall: 0.9733333333333334, Hierarchical F-measure: 0.9722530521642619


  results_df = pd.concat(


Accuracy: 0.8549323017408124, Hierarchical Precision: 0.9425981873111783, Hierarchical Recall: 0.96, Hierarchical F-measure: 0.9512195121951218
Accuracy: 0.817351598173516, Hierarchical Precision: 0.9076305220883534, Hierarchical Recall: 0.9377593360995851, Hierarchical F-measure: 0.9224489795918367
Accuracy: 0.8160919540229885, Hierarchical Precision: 0.9140893470790378, Hierarchical Recall: 0.9204152249134948, Hierarchical F-measure: 0.9172413793103448
Accuracy: 0.7801724137931034, Hierarchical Precision: 0.8776978417266187, Hierarchical Recall: 0.9207547169811321, Hierarchical F-measure: 0.8987108655616942
Accuracy: 0.8200836820083682, Hierarchical Precision: 0.9007352941176471, Hierarchical Recall: 0.9176029962546817, Hierarchical F-measure: 0.9090909090909092
Accuracy: 0.5149253731343284, Hierarchical Precision: 0.7487684729064039, Hierarchical Recall: 0.8, Hierarchical F-measure: 0.7735368956743003
Accuracy: 0.9, Hierarchical Precision: 0.9244444444444444, Hierarchical Recall: 0.

In [7]:
results_df.to_csv('model_language_results.csv', index=False)

## Validation set

In [78]:
# Evaluate the model
predictions = []
references = []
for example in validation_data:

    # Predict
    prediction = pipe(
        example["JOB_DUTIES"]
    )  # Use the key "JOB_DUTIES" for the text data
    predicted_label = extract_isco_code(prediction[0]["label"])
    predictions.append(predicted_label)

    # Reference
    reference_label = example["ISCO"]  # Use the key "ISCO" for the ISCO code
    references.append(reference_label)

# Compute the hierarchical accuracy
validation_results = hierarchical_accuracy.compute(predictions=predictions, references=references)

# Save the results to a JSON file
with open("isco_validation_results.json", "w") as f:
    json.dump(validation_results, f)

print("Evaluation results saved to isco_validation_results.json")

Accuracy: 0.8576800694243564, Hierarchical Precision: 0.9757462686567164, Hierarchical Recall: 0.9812382739212008, Hierarchical F-measure: 0.9784845650140319
Evaluation results saved to isco_validation_results.json


# Inter rater agreement

## All ICILS 2018 data

In [8]:
import pandas as pd

# icils_isco_int_ml = "/datasets/isco-data/processed/2018/icils_2018_isco_ml.parquet"
icils_isco_int_ml = "gs://isco-data-asia-southeast1/processed/2018/icils_2018_isco_ml.parquet"

icils_df = pd.read_parquet(icils_isco_int_ml)[['JOB', 'DUTIES', 'ISCO', 'ISCO_REL', 'LANGUAGE']]

# Create a new pandas dataframe with samples that have ISCO_REL values
isco_rel_df = icils_df[icils_df['ISCO'].notna()].copy()

# remove rows with None values in ISCO_REL
isco_rel_df = isco_rel_df[isco_rel_df['ISCO_REL'].notna()]

# Group the DataFrame by LANGUAGE column
grouped_df = isco_rel_df.groupby('LANGUAGE')

### By language

In [None]:

results_df = pd.DataFrame(columns=['Language', 'Accuracy', 'Hierarchical Precision', 'Hierarchical Recall', 'Hierarchical F1'])

# Iterate over each group
for language, group in grouped_df:
    references = group['ISCO'].tolist()
    predictions = group['ISCO_REL'].tolist()
    
    # Apply the compute function
    rel_result = hierarchical_accuracy.compute(references=references, predictions=predictions)
    
    # Create a new DataFrame with the result for the current group
    group_result_df = pd.DataFrame({'Language': [language], 'Accuracy': [rel_result['accuracy']], 'Hierarchical Precision': [rel_result['hierarchical_precision']], 'Hierarchical Recall': [rel_result['hierarchical_recall']], 'Hierarchical F1': [rel_result['hierarchical_fmeasure']]})
    
    # Concatenate the group_result_df with the results_df
    results_df = pd.concat([results_df, group_result_df], ignore_index=True)
    
    # Print the result
    print(f"Language: {language}")
    # print(f"References: {references}")
    # print(f"Predictions: {predictions}")
    print(f"Result: {rel_result}")
    print()

average_accuracy = results_df['Accuracy'].mean()
average_hierarchical_precision = results_df['Hierarchical Precision'].mean()
average_hierarchical_recall = results_df['Hierarchical Recall'].mean()
average_hierarchical_f1 = results_df['Hierarchical F1'].mean()

average_row = ['Average', average_accuracy, average_hierarchical_precision, average_hierarchical_recall, average_hierarchical_f1]
results_df.loc[len(results_df)] = average_row


results_df.to_csv('inter-rater_language_results.csv', index=False)

## Training data

In [None]:
import pandas as pd

test_data_df = test_data.to_pandas()
unknown_reliability_samples = test_data_df[test_data_df['ISCO_REL'].isna() | test_data_df['ISCO_REL'].isin(["9998", "9999"])]

# Exclude unknown reliability samples from test_data_df
test_split_rel_df = test_data_df[~test_data_df['ISCO_REL'].isna() & ~test_data_df['ISCO_REL'].isin(["9998", "9999"])]

# Group the DataFrame by LANGUAGE column
test_split_rel_grouped_df = test_split_rel_df.groupby('LANGUAGE')

## Validation data

## Test data

In [None]:
# create a dataframe with samples where ISCO and ISCO_REL the same
isco_rel_df_same = isco_rel_df[isco_rel_df['ISCO'] == isco_rel_df['ISCO_REL']]

isco_rel_df_same

In [None]:
# create a dataframe with samples where ISCO and ISCO_REL are different
isco_rel_df_diff = isco_rel_df[isco_rel_df['ISCO'] != isco_rel_df['ISCO_REL']]

isco_rel_df_diff

In [64]:
# Make a list of all values in ISCO and ISCO_REL columns
coder1 = list(isco_rel_df['ISCO'])
coder2 = list(isco_rel_df['ISCO_REL'])

In [None]:
# Compute the hierarchical accuracy
reliability_results = hierarchical_accuracy.compute(predictions=coder2, references=coder1)

# Save the results to a JSON file
with open("isco_rel_results.json", "w") as f:
    json.dump(reliability_results, f)

print("Evaluation results saved to isco_rel_results.json")