Spaces:
Running
Running
File size: 224,865 Bytes
32fbd07 |
|
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Path of the analyzer report to inspect.\n",
"# NOTE(review): absolute, machine-specific path - consider making it configurable.\n",
"analyzer_report_fn = \"/home/michal/Development/github/pl-asr-bigos-tools/data/analyzer-reports/bigos-20240425.json\"\n",
"\n",
"# Parse the report and echo the parsed object for a quick look.\n",
"with open(analyzer_report_fn, \"r\") as f:\n",
"    analyzer_report = json.load(f)\n",
"\n",
"print(analyzer_report)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_num_of_samples_per_split(dataset_hf):\n",
"    \"\"\"Count the samples contained in every split.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split object exposing ``num_rows``.\n",
"\n",
"    Returns:\n",
"        ``{\"samples_count\": {<split>: count, ..., \"all_splits\": total}}``.\n",
"    \"\"\"\n",
"    metric = \"samples_count\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    per_split = {}\n",
"    for split_name in dataset_hf.keys():\n",
"        row_count = dataset_hf[split_name].num_rows\n",
"        print(split_name, row_count)\n",
"        per_split[split_name] = row_count\n",
"\n",
"    # Grand total, computed before the \"all_splits\" key itself is inserted.\n",
"    per_split[\"all_splits\"] = sum(per_split.values())\n",
"    return {metric: per_split}\n",
"\n",
"def get_audio_duration_per_split(dataset_hf):\n",
"    \"\"\"Compute the total audio duration, in hours, of every split.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split whose \"audio\" column yields\n",
"            dicts holding the samples under \"array\" and their \"sampling_rate\".\n",
"\n",
"    Returns:\n",
"        ``{\"audio_duration[h]\": {<split>: hours, ..., \"all_splits\": hours}}``,\n",
"        every value rounded to two decimals.\n",
"    \"\"\"\n",
"    out_dict = {}\n",
"    metric = \"audio_duration[h]\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    out_dict[metric] = {}\n",
"    total_seconds = 0.0\n",
"    for split in dataset_hf.keys():\n",
"        # Duration of one recording = number of samples / sampling rate.\n",
"        split_seconds = sum(\n",
"            len(audio_file[\"array\"]) / audio_file[\"sampling_rate\"]\n",
"            for audio_file in dataset_hf[split][\"audio\"]\n",
"        )\n",
"        total_seconds += split_seconds\n",
"        split_hours = round(split_seconds / 3600, 2)\n",
"        out_dict[metric][split] = split_hours\n",
"        print(split, split_hours)\n",
"\n",
"    # Round the grand total once from the unrounded seconds: adding up the\n",
"    # already-rounded per-split hours accumulates rounding error and float noise.\n",
"    out_dict[metric][\"all_splits\"] = round(total_seconds / 3600, 2)\n",
"    return out_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_speakers_per_split(dataset_hf):\n",
"    \"\"\"Count the distinct speakers of every split.\n",
"\n",
"    The speaker id is read from the fifth \"-\"-separated field of each\n",
"    \"audioname\" value.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with an \"audioname\" column.\n",
"\n",
"    Returns:\n",
"        ``{\"speakers_count\": {<split>: count, ..., \"all_splits\": total}}``.\n",
"    \"\"\"\n",
"    out_dict = {}\n",
"    metric = \"speakers_count\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    out_dict[metric] = {}\n",
"    for split in dataset_hf.keys():\n",
"        # Extract the speaker id from every file id and de-duplicate.\n",
"        speakers_ids_uniq = {str(fileid).split(\"-\")[4] for fileid in dataset_hf[split][\"audioname\"]}\n",
"        speakers_count = len(speakers_ids_uniq)\n",
"        print(split, speakers_count)\n",
"        # Store the per-split value; without it the result held only a zero total.\n",
"        out_dict[metric][split] = speakers_count\n",
"\n",
"    # NOTE(review): the total is a plain sum, which assumes no speaker occurs\n",
"    # in more than one split - confirm against the dataset.\n",
"    out_dict[metric][\"all_splits\"] = sum(out_dict[metric].values())\n",
"    return out_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_uniq_utts_per_split(dataset_hf):\n",
"    \"\"\"Count the distinct reference transcriptions of every split.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with a \"ref_orig\" column.\n",
"\n",
"    Returns:\n",
"        ``{\"utterances_unique_count\": {<split>: count, ..., \"all_splits\": \"N/A\"}}``.\n",
"        No cross-split total is computed.\n",
"    \"\"\"\n",
"    out_dict = {}\n",
"    metric = \"utterances_unique_count\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    out_dict[metric] = {}\n",
"    for split in dataset_hf.keys():\n",
"        utts_uniq_count = len(set(dataset_hf[split][\"ref_orig\"]))\n",
"        print(split, utts_uniq_count)\n",
"        # Store the per-split value; previously it was printed but never returned.\n",
"        out_dict[metric][split] = utts_uniq_count\n",
"\n",
"    out_dict[metric][\"all_splits\"] = \"N/A\"\n",
"    return out_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_words_per_split(dataset_hf):\n",
"    \"\"\"Count the words of the reference transcriptions in every split.\n",
"\n",
"    Words are obtained by splitting every \"ref_orig\" value on single spaces.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with a \"ref_orig\" column.\n",
"\n",
"    Returns:\n",
"        ``{\"words_count\": {<split>: count, ..., \"all_splits\": total}}``.\n",
"    \"\"\"\n",
"    out_dict = {}\n",
"    metric = \"words_count\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    out_dict[metric] = {}\n",
"    for split in dataset_hf.keys():\n",
"        utts_all = dataset_hf[split][\"ref_orig\"]\n",
"        words_all_count = sum(len(utt.split(\" \")) for utt in utts_all)\n",
"        print(split, words_all_count)\n",
"        # Store the per-split value; without it the total below was always 0.\n",
"        out_dict[metric][split] = words_all_count\n",
"\n",
"    # Total number of words over all splits.\n",
"    out_dict[metric][\"all_splits\"] = sum(out_dict[metric].values())\n",
"    return out_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_unique_words_per_split(dataset_hf):\n",
"    \"\"\"Count the distinct words of every split and of the whole dataset.\n",
"\n",
"    Words are obtained by joining the \"ref_orig\" values of a split with single\n",
"    spaces and splitting the result on single spaces.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with a \"ref_orig\" column.\n",
"\n",
"    Returns:\n",
"        A tuple ``(out_dict, out_words_uniq)`` where ``out_dict`` is\n",
"        ``{\"words_unique\": {<split>: count, ..., \"all_splits\": count}}`` and\n",
"        ``out_words_uniq`` is the list of distinct words over all splits\n",
"        (in no particular order).\n",
"    \"\"\"\n",
"    out_dict = {}\n",
"    metric = \"words_unique\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    out_dict[metric] = {}\n",
"    words_seen = set()\n",
"    for split in dataset_hf.keys():\n",
"        utts_all = dataset_hf[split][\"ref_orig\"]\n",
"        words_uniq = set(\" \".join(utts_all).split(\" \"))\n",
"        words_seen |= words_uniq\n",
"        words_uniq_count = len(words_uniq)\n",
"        print(split, words_uniq_count)\n",
"        # Store the per-split value; previously it was printed but never returned.\n",
"        out_dict[metric][split] = words_uniq_count\n",
"\n",
"    # Vocabulary of the whole dataset: union over the splits, not a sum.\n",
"    out_words_uniq = list(words_seen)\n",
"    out_words_uniq_count = len(out_words_uniq)\n",
"    out_dict[metric][\"all_splits\"] = out_words_uniq_count\n",
"    print(\"all\", out_words_uniq_count)\n",
"\n",
"    return out_dict, out_words_uniq"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_chars_per_split(dataset_hf):\n",
"    \"\"\"Count the characters of the reference transcriptions in every split.\n",
"\n",
"    The count is the length of all \"ref_orig\" values of a split joined with\n",
"    single spaces, so spaces are included.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with a \"ref_orig\" column.\n",
"\n",
"    Returns:\n",
"        ``{\"chars\": {<split>: count, ..., \"all_splits\": total}}``.\n",
"    \"\"\"\n",
"    out_dict = {}\n",
"\n",
"    metric = \"chars\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    out_dict[metric] = {}\n",
"    for split in dataset_hf.keys():\n",
"        utts_all = dataset_hf[split][\"ref_orig\"]\n",
"        # Splitting on \" \" and re-joining with \" \" is a no-op, so the joined\n",
"        # text is measured directly.\n",
"        chars_all_count = len(\" \".join(utts_all))\n",
"        print(split, chars_all_count)\n",
"        # Store the per-split value; without it the total below was always 0.\n",
"        out_dict[metric][split] = chars_all_count\n",
"\n",
"    # Total number of characters over all splits.\n",
"    out_dict[metric][\"all_splits\"] = sum(out_dict[metric].values())\n",
"    return out_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_unique_chars_per_split(dataset_hf):\n",
"    \"\"\"Count the distinct characters of every split and of the whole dataset.\n",
"\n",
"    The space character used to separate words is not counted.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with a \"ref_orig\" column.\n",
"\n",
"    Returns:\n",
"        A tuple ``(out_dict, out_chars_uniq)`` where ``out_dict`` is\n",
"        ``{\"chars_unique\": {<split>: count, ..., \"all_splits\": count}}`` and\n",
"        ``out_chars_uniq`` is the list of distinct characters over all splits\n",
"        (in no particular order).\n",
"    \"\"\"\n",
"    out_dict = {}\n",
"    metric = \"chars_unique\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    out_dict[metric] = {}\n",
"    chars_seen = set()\n",
"    for split in dataset_hf.keys():\n",
"        utts_all = dataset_hf[split][\"ref_orig\"]\n",
"        # Same character set as splitting into words on \" \" and joining them.\n",
"        chars_uniq = set(\" \".join(utts_all).replace(\" \", \"\"))\n",
"        chars_uniq_count = len(chars_uniq)\n",
"        print(split, chars_uniq_count)\n",
"        chars_seen |= chars_uniq\n",
"        # Store the per-split value; previously it was printed but never returned.\n",
"        out_dict[metric][split] = chars_uniq_count\n",
"\n",
"    # Alphabet of the whole dataset: union over the splits, not a sum.\n",
"    out_chars_uniq = list(chars_seen)\n",
"    out_chars_uniq_count = len(out_chars_uniq)\n",
"    out_dict[metric][\"all_splits\"] = out_chars_uniq_count\n",
"    print(\"all\", out_chars_uniq_count)\n",
"\n",
"    return out_dict, out_chars_uniq"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_meta_coverage_sex_per_split(dataset_hf):\n",
"    \"\"\"Compute, per split, the fraction of samples with a known speaker sex.\n",
"\n",
"    A \"speaker_sex\" value counts as known when it differs from \"N/A\".\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with a \"speaker_sex\" column.\n",
"\n",
"    Returns:\n",
"        ``{\"meta_coverage_sex\": {<split>: fraction, ..., \"all_splits\": \"N/A\"}}``\n",
"        with fractions rounded to two decimals.\n",
"    \"\"\"\n",
"    metric = \"meta_coverage_sex\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    coverage_by_split = {}\n",
"    for split_name in dataset_hf.keys():\n",
"        sex_values = dataset_hf[split_name][\"speaker_sex\"]\n",
"        total = len(sex_values)\n",
"        known = sum(1 for value in sex_values if value != \"N/A\")\n",
"        coverage = round(known / total, 2)\n",
"        print(split_name, coverage)\n",
"        coverage_by_split[split_name] = coverage\n",
"\n",
"    # No aggregate coverage is computed across splits.\n",
"    coverage_by_split[\"all_splits\"] = \"N/A\"\n",
"    return {metric: coverage_by_split}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_meta_coverage_age_per_split(dataset_hf):\n",
"    \"\"\"Compute, per split, the fraction of samples with a known speaker age.\n",
"\n",
"    A \"speaker_age\" value counts as known when it differs from \"N/A\".\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with a \"speaker_age\" column.\n",
"\n",
"    Returns:\n",
"        ``{\"meta_coverage_age\": {<split>: fraction, ..., \"all_splits\": \"N/A\"}}``\n",
"        with fractions rounded to two decimals.\n",
"    \"\"\"\n",
"    metric = \"meta_coverage_age\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    coverage_by_split = {}\n",
"    for split_name in dataset_hf.keys():\n",
"        age_values = dataset_hf[split_name][\"speaker_age\"]\n",
"        total = len(age_values)\n",
"        known = sum(1 for value in age_values if value != \"N/A\")\n",
"        coverage = round(known / total, 2)\n",
"        print(split_name, coverage)\n",
"        coverage_by_split[split_name] = coverage\n",
"\n",
"    # No aggregate coverage is computed across splits.\n",
"    coverage_by_split[\"all_splits\"] = \"N/A\"\n",
"    return {metric: coverage_by_split}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def speech_rate_per_split(dataset_hf):\n",
"    \"\"\"Compute the speech rate (words per second of audio) of every split.\n",
"\n",
"    Words come from joining all \"ref_orig\" values with single spaces and\n",
"    splitting on single spaces; seconds come from the \"audio\" column as\n",
"    number of samples divided by sampling rate.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with \"ref_orig\" and \"audio\"\n",
"            columns.\n",
"\n",
"    Returns:\n",
"        ``{\"speech_rate\": {<split>: rate, ..., \"all_splits\": \"N/A\"}}`` with\n",
"        rates rounded to two decimals.\n",
"    \"\"\"\n",
"    metric = \"speech_rate\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    rate_by_split = {}\n",
"    for split_name in dataset_hf.keys():\n",
"        transcript = \" \".join(dataset_hf[split_name][\"ref_orig\"])\n",
"        word_total = len(transcript.split(\" \"))\n",
"\n",
"        seconds_total = 0\n",
"        for clip in dataset_hf[split_name][\"audio\"]:\n",
"            seconds_total += len(clip[\"array\"]) / clip[\"sampling_rate\"]\n",
"\n",
"        rate = round(word_total / seconds_total, 2)\n",
"        print(split_name, rate)\n",
"        rate_by_split[split_name] = rate\n",
"\n",
"    # No aggregate rate is computed across splits.\n",
"    rate_by_split[\"all_splits\"] = \"N/A\"\n",
"    return {metric: rate_by_split}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# distribution of speaker age\n",
"def get_speaker_age_distribution(dataset_hf):\n",
"    \"\"\"Compute the share of samples per speaker-age bucket for every split.\n",
"\n",
"    \"speaker_age\" values equal to \"N/A\" are ignored. Per split, each share is\n",
"    the bucket count divided by the number of non-\"N/A\" values.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with a \"speaker_age\" column.\n",
"\n",
"    Returns:\n",
"        ``{\"speaker_age_distribution\": {<split>: {<bucket>: share}, ...,\n",
"        \"all_splits\": {<bucket>: share} or \"N/A\"}}``. A split without any age\n",
"        metadata maps every bucket to \"N/A\"; \"all_splits\" is \"N/A\" when any\n",
"        split lacks metadata or when no value falls into a bucket.\n",
"    \"\"\"\n",
"    # Labels are compared verbatim with the column values.\n",
"    # NOTE(review): 'fourties' is kept as found - presumably it mirrors the\n",
"    # dataset's own label; confirm before correcting the spelling.\n",
"    age_buckets = ['teens','twenties', 'thirties', 'fourties', 'fifties', 'sixties', 'seventies', 'eighties', 'nineties']\n",
"    no_meta = False\n",
"    out_dict = {}\n",
"    metric = \"speaker_age_distribution\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    out_dict[metric] = {}\n",
"    values_count_total = {age: 0 for age in age_buckets}\n",
"    for split in dataset_hf.keys():\n",
"        meta_info = dataset_hf[split][\"speaker_age\"]\n",
"        meta_info_not_null = [x for x in meta_info if x != \"N/A\"]\n",
"\n",
"        if len(meta_info_not_null) == 0:\n",
"            # Mark every bucket; previously only the bucket left over in a\n",
"            # loop variable from an earlier loop was set.\n",
"            out_dict[metric][split] = {age: \"N/A\" for age in age_buckets}\n",
"            no_meta = True\n",
"            continue\n",
"\n",
"        out_dict[metric][split] = {}\n",
"        for age in age_buckets:\n",
"            values_count = meta_info_not_null.count(age)\n",
"            values_count_total[age] += values_count\n",
"            out_dict[metric][split][age] = round(values_count/len(meta_info_not_null),2)\n",
"        print(split, out_dict[metric][split])\n",
"\n",
"    # Aggregate shares use the number of bucketed samples as denominator;\n",
"    # guard against a zero total to avoid a ZeroDivisionError.\n",
"    total_samples = sum(values_count_total.values())\n",
"    if no_meta or total_samples == 0:\n",
"        out_dict[metric][\"all_splits\"] = \"N/A\"\n",
"        return out_dict\n",
"\n",
"    out_dict[metric][\"all_splits\"] = {\n",
"        age: round(values_count_total[age]/total_samples,2) for age in age_buckets\n",
"    }\n",
"    return out_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# distribution of speaker sex\n",
"def get_speaker_sex_distribution(dataset_hf):\n",
"    \"\"\"Compute the share of samples per speaker sex for every split.\n",
"\n",
"    \"speaker_sex\" values equal to \"N/A\" are ignored. Per split, each share is\n",
"    the count of that sex divided by the number of non-\"N/A\" values.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with a \"speaker_sex\" column.\n",
"\n",
"    Returns:\n",
"        ``{\"speaker_sex_distribution\": {<split>: {<sex>: share}, ...,\n",
"        \"all_splits\": {<sex>: share} or \"N/A\"}}``. A split without any sex\n",
"        metadata maps every sex to \"N/A\"; \"all_splits\" is \"N/A\" when any split\n",
"        lacks metadata or when no value matches a known sex.\n",
"    \"\"\"\n",
"    sex_types = ['male', 'female']\n",
"    no_meta = False\n",
"\n",
"    out_dict = {}\n",
"    metric = \"speaker_sex_distribution\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"    out_dict[metric] = {}\n",
"    values_count_total = {sex: 0 for sex in sex_types}\n",
"    for split in dataset_hf.keys():\n",
"        meta_info = dataset_hf[split][\"speaker_sex\"]\n",
"        meta_info_not_null = [x for x in meta_info if x != \"N/A\"]\n",
"\n",
"        if len(meta_info_not_null) == 0:\n",
"            # Mark every sex; previously only the value left over in a loop\n",
"            # variable from an earlier loop was set.\n",
"            out_dict[metric][split] = {sex: \"N/A\" for sex in sex_types}\n",
"            no_meta = True\n",
"            continue\n",
"\n",
"        out_dict[metric][split] = {}\n",
"        for sex in sex_types:\n",
"            values_count = meta_info_not_null.count(sex)\n",
"            values_count_total[sex] += values_count\n",
"            out_dict[metric][split][sex] = round(values_count/len(meta_info_not_null),2)\n",
"        print(split, out_dict[metric][split])\n",
"\n",
"    # Aggregate shares use the number of matched samples as denominator;\n",
"    # guard against a zero total to avoid a ZeroDivisionError.\n",
"    total_samples = sum(values_count_total.values())\n",
"    if no_meta or total_samples == 0:\n",
"        out_dict[metric][\"all_splits\"] = \"N/A\"\n",
"        return out_dict\n",
"\n",
"    out_dict[metric][\"all_splits\"] = {\n",
"        sex: round(values_count_total[sex]/total_samples,2) for sex in sex_types\n",
"    }\n",
"    return out_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"recordings_per_speaker_stats_dict = {}\n",
"def recordings_per_speaker_stats(dataset_hf):\n",
"    \"\"\"Compute per-speaker recording counts and summary statistics per split.\n",
"\n",
"    A speaker is identified by the first five \"-\"-separated fields of the\n",
"    \"audioname\" value.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split with an \"audioname\" column.\n",
"\n",
"    Returns:\n",
"        ``{\"recordings_per_speaker\": {<split>: {\n",
"            \"recordings_per_speaker_list\": {<speaker prefix>: count},\n",
"            \"recordings_per_speaker_stats\": {\"average\", \"std\", \"median\", \"min\", \"max\"}},\n",
"          ..., \"all_splits\": \"N/A\"}}``.\n",
"    \"\"\"\n",
"    from collections import Counter\n",
"\n",
"    out_dict = {}\n",
"    metric = \"recordings_per_speaker\"\n",
"    print(\"Calculating {}\".format(metric))\n",
"\n",
"    out_dict[metric] = {}\n",
"    for split in dataset_hf.keys():\n",
"        audiopaths = dataset_hf[split][\"audioname\"]\n",
"\n",
"        # Count recordings by their exact speaker prefix. The previous substring\n",
"        # test (`prefix in audio_path`) credited speaker \"...-1\" with the\n",
"        # recordings of \"...-10\" and compared every path with every prefix.\n",
"        recordings_per_speaker_stats_dict = dict(\n",
"            Counter(\"-\".join(str(fileid).split(\"-\")[0:5]) for fileid in audiopaths)\n",
"        )\n",
"\n",
"        counts = list(recordings_per_speaker_stats_dict.values())\n",
"        speakers = len(counts)\n",
"        recordings_total = len(audiopaths)\n",
"\n",
"        out_dict[metric][split] = {\n",
"            \"recordings_per_speaker_list\": recordings_per_speaker_stats_dict,\n",
"            \"recordings_per_speaker_stats\": {\n",
"                \"average\": round(recordings_total / speakers, 2),\n",
"                \"std\": round(np.std(counts), 2),\n",
"                \"median\": np.median(counts),\n",
"                \"min\": min(counts),\n",
"                \"max\": max(counts),\n",
"            },\n",
"        }\n",
"\n",
"    # No aggregate statistics are computed across splits.\n",
"    out_dict[metric][\"all_splits\"] = \"N/A\"\n",
"    return out_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from datasets import load_dataset\n",
"import os\n",
"\n",
"def distribution_audio_duration(dataset_hf, output_dir, metric = \"audio_duration_seconds\", dimension = \"speaker_sex\"):\n",
"    \"\"\"Draw and save one violin plot of `metric` split by `dimension` per split.\n",
"\n",
"    Rows whose `dimension` value is \"N/A\" or \"other\" are left out. Splits with\n",
"    no remaining rows are skipped.\n",
"\n",
"    Args:\n",
"        dataset_hf: mapping of split name -> split convertible with\n",
"            ``pd.DataFrame`` and containing the columns \"audio\", \"dataset\",\n",
"            `metric` and `dimension`.\n",
"        output_dir: directory the PNG files are written to (created on demand).\n",
"        metric: column plotted on the y axis.\n",
"        dimension: column used as hue.\n",
"\n",
"    Returns:\n",
"        ``{metric: {<split>: path of the saved PNG}}`` for the plotted splits.\n",
"    \"\"\"\n",
"    out_dict = {}\n",
"\n",
"    print(\"Calculating {}\".format(metric))\n",
"    out_dict[metric] = {}\n",
"    for split in dataset_hf.keys():\n",
"        df_dataset = pd.DataFrame(dataset_hf[split])\n",
"        df_dataset = df_dataset.drop(columns=[\"audio\"])\n",
"\n",
"        # Drop rows without a usable value in the dimension column.\n",
"        usable = (df_dataset[dimension] != \"N/A\") & (df_dataset[dimension] != \"other\")\n",
"        df_filtered = df_dataset[usable]\n",
"        if df_filtered.empty:\n",
"            print(\"No data for split {} and dimension {}\".format(split, dimension))\n",
"            continue\n",
"\n",
"        fig, ax = plt.subplots(figsize=(15, 10))\n",
"        sns.violinplot(data = df_filtered, hue=dimension, x='dataset', y=metric, split=True, fill = False, inner=\"box\", legend='auto', common_norm=True, ax=ax)\n",
"        ax.set_title('Violin plot of {} by {} for split {}'.format(metric, dimension, split))\n",
"        # The x axis carries the 'dataset' column; `dimension` is shown as hue.\n",
"        ax.set_xlabel('dataset')\n",
"        ax.set_ylabel(metric)\n",
"\n",
"        # Save before showing: after plt.show() the current figure may already\n",
"        # be released, in which case savefig would write an empty image.\n",
"        os.makedirs(output_dir, exist_ok=True)\n",
"        output_fn = os.path.join(output_dir, metric + \"-\" + dimension + \"-\" + split + \".png\")\n",
"        fig.savefig(output_fn)\n",
"        plt.show()\n",
"        # Release the figure so that plotting many splits does not pile up memory.\n",
"        plt.close(fig)\n",
"        out_dict[metric][split] = output_fn\n",
"\n",
"    return out_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from datasets import load_dataset\n",
"from datasets import get_dataset_config_names\n",
"dataset_name = \"amu-cai/pl-asr-bigos-v2\"\n",
"# get dataset config names\n",
"dataset_config_names = get_dataset_config_names(dataset_name)\n",
"# load dataset\n",
"dataset_hf = load_dataset(dataset_name, \"all\")\n",
"\n",
"dataset_statistics = {}\n",
"dataset_contents = {}\n",
"output_dir_plots = \"./plots\"\n",
"os.makedirs(output_dir_plots, exist_ok=True)\n",
"output_dir_reports = \"./reports\"\n",
"# Create the reports directory (the plots directory used to be created twice).\n",
"os.makedirs(output_dir_reports, exist_ok=True)\n",
"\n",
"for config_name in dataset_config_names:\n",
"    print(config_name)\n",
"    dataset_hf_subset = load_dataset(dataset_name, config_name)\n",
"\n",
"    # Every metric helper returns {metric_name: {...}}; the results are merged\n",
"    # so that enabling several helpers does not overwrite earlier statistics.\n",
"    dataset_statistics[config_name] = {}\n",
"\n",
"    # Optional metrics - uncomment to enable.\n",
"    #dataset_statistics[config_name].update(get_num_of_samples_per_split(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(get_uniq_utts_per_split(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(get_words_per_split(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(get_chars_per_split(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(get_audio_duration_per_split(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(get_speakers_per_split(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(get_meta_coverage_sex_per_split(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(get_meta_coverage_age_per_split(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(speech_rate_per_split(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(get_speaker_age_distribution(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(get_speaker_sex_distribution(dataset_hf_subset))\n",
"    #dataset_statistics[config_name].update(uniq_utterances_per_speaker_stats(dataset_hf_subset))\n",
"\n",
"    # These two helpers additionally return the unique items themselves.\n",
"    #dataset_contents[config_name] = {}\n",
"    #stats, dataset_contents[config_name][\"unique_words\"] = get_unique_words_per_split(dataset_hf_subset)\n",
"    #dataset_statistics[config_name].update(stats)\n",
"    #stats, dataset_contents[config_name][\"unique_chars\"] = get_unique_chars_per_split(dataset_hf_subset)\n",
"    #dataset_statistics[config_name].update(stats)\n",
"\n",
"    # TODO: metadata coverage per subset in percent - speaker accent\n",
"    # TODO: number of words per speaker (min, max, med, avg, std)\n",
"\n",
"    dataset_statistics[config_name].update(recordings_per_speaker_stats(dataset_hf_subset))\n",
"\n",
"    # Distribution plots. The plotting helper is called for its side effect\n",
"    # (figures written to disk); its return value must not replace the\n",
"    # statistics collected above.\n",
"    output_dir_plots_subset = os.path.join(output_dir_plots, config_name)\n",
"    # distribution of audio duration per sex\n",
"    distribution_audio_duration(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_sex')\n",
"    # distribution of audio duration per age\n",
"    distribution_audio_duration(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')\n",
"\n",
"    # TODO: distribution of speaking rate per subset\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/michal/.pyenv/versions/3.10.11/envs/streamlit/lib/python3.10/site-packages/datasets/load.py:1486: FutureWarning: The repository for amu-cai/pl-asr-bigos-v2 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/amu-cai/pl-asr-bigos-v2\n",
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset({\n",
" features: ['audioname', 'split', 'dataset', 'speaker_id', 'ref_orig', 'audio', 'audio_duration_samples', 'audio_duration_seconds', 'samplingrate_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'speaker_age', 'speaker_sex'],\n",
" num_rows: 44\n",
"})\n"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: './reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 22\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(json_stats_secret, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 20\u001b[0m stats_dict_secret \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(file)\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mjson_contents_public\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 23\u001b[0m contents_dict_public \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(file)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(json_stats_public, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file:\n",
"File \u001b[0;32m~/.pyenv/versions/3.10.11/envs/streamlit/lib/python3.10/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 322\u001b[0m )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json'"
]
}
],
"source": [
"import json\n",
"import pandas as pd\n",
"\n",
"#dataset_public = \"amu-cai/pl-asr-bigos-v2\"\n",
"#dataset_secret = \"amu-cai/pl-asr-bigos-v2-secret\"\n",
"\n",
"dataset_public = \"pelcra/pl-asr-pelcra-for-bigos\"\n",
"dataset_secret = \"pelcra/pl-asr-pelcra-for-bigos-secret\"\n",
"\n",
"# Report locations for the public and the secret variant of the dataset.\n",
"json_contents_public = f\"./reports/{dataset_public}/dataset_contents.json\"\n",
"json_stats_public = f\"reports/{dataset_public}/dataset_statistics.json\"\n",
"\n",
"json_contents_secret = f\"./reports/{dataset_secret}/dataset_contents.json\"\n",
"json_stats_secret = f\"reports/{dataset_secret}/dataset_statistics.json\"\n",
"\n",
"\n",
"def _read_json(path):\n",
"    \"\"\"Parse the JSON file at `path` and return the resulting object.\"\"\"\n",
"    with open(path, 'r') as file:\n",
"        return json.load(file)\n",
"\n",
"\n",
"# Read order is unchanged: secret reports first, then the public ones.\n",
"contents_dict_secret = _read_json(json_contents_secret)\n",
"stats_dict_secret = _read_json(json_stats_secret)\n",
"contents_dict_public = _read_json(json_contents_public)\n",
"stats_dict_public = _read_json(json_stats_public)\n"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ul-diabiz_poleval-22\n",
"ul-spokes_mix_emo-18\n",
"ul-spokes_mix_luz-18\n",
"ul-spokes_mix_parl-18\n",
"ul-spokes_biz_bio-23\n",
"ul-spokes_biz_int-23\n",
"ul-spokes_biz_luz-23\n",
"ul-spokes_biz_pod-23\n",
"ul-spokes_biz_pres-23\n",
"ul-spokes_biz_vc-23\n",
"ul-spokes_biz_vc2-23\n",
"ul-spokes_biz_wyw-23\n",
"all\n"
]
}
],
"source": [
"# For every dataset and metric, replace the public \"test\" split statistic\n",
"# with the value stored in the secret statistics.\n",
"for dataset in stats_dict_public.keys():\n",
"    print(dataset)\n",
"    secret_metrics = stats_dict_secret[dataset]\n",
"    for metric in secret_metrics.keys():\n",
"        if \"test\" in secret_metrics[metric].keys():\n",
"            stats_dict_public[dataset][metric][\"test\"] = secret_metrics[metric][\"test\"]\n"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" value\n",
"metric split \n",
"samples test 947\n",
" train 7719\n",
" validation 284\n",
"utts_unique test 944\n",
" train 7556\n",
" validation 280\n",
"words test 12051\n",
" train 89255\n",
" validation 3900\n",
"words_unique test 2772\n",
" train 12341\n",
" validation 1209\n",
"chars test 66433\n",
" train 495454\n",
" validation 23594\n",
"audio[h] test 2.05\n",
" train 16.59\n",
" validation 1.04\n",
"speakers test 24\n",
" train 132\n",
" validation 14\n",
"speech_rate test 1.63\n",
" train 1.49\n",
" validation 1.04\n",
"meta_cov_sex test N/A\n",
" train N/A\n",
" validation N/A\n",
"meta_cov_age test N/A\n",
" train N/A\n",
" validation N/A\n",
"meta_dist_sex test N/A\n",
" train N/A\n",
" validation N/A\n",
"meta_dist_age test N/A\n",
" train N/A\n",
" validation N/A\n",
"samples_per_spk test {'average': 39.46, 'std': 52.52, 'median': 22....\n",
" train {'average': 58.48, 'std': 99.27, 'median': 24....\n",
" validation {'average': 20.29, 'std': 7.64, 'median': 19.0...\n"
]
}
],
"source": [
"# Flatten the nested statistics into (dataset, metric, split, value) records,\n",
"# leaving out the aggregated \"all\" dataset and the \"all_splits\" entries.\n",
"rows = []\n",
"for dataset, metrics in stats_dict_public.items():\n",
"    if dataset != \"all\":\n",
"        for metric, splits in metrics.items():\n",
"            rows.extend(\n",
"                (dataset, metric, split, value)\n",
"                for split, value in splits.items()\n",
"                if split != \"all_splits\"\n",
"            )\n",
"\n",
"# Build the frame and index it by dataset / metric / split in one expression.\n",
"df = pd.DataFrame(rows, columns=['dataset', 'metric', 'split', 'value']).set_index(['dataset', 'metric', 'split'])\n",
"\n",
"print(df.loc['ul-diabiz_poleval-22'])"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>metric</th>\n",
" <th>samples</th>\n",
" <th>utts_unique</th>\n",
" <th>words</th>\n",
" <th>words_unique</th>\n",
" <th>chars</th>\n",
" <th>audio[h]</th>\n",
" <th>speakers</th>\n",
" </tr>\n",
" <tr>\n",
" <th>dataset</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>ul-diabiz_poleval-22</th>\n",
" <td>8950</td>\n",
" <td>8780</td>\n",
" <td>105206</td>\n",
" <td>16322</td>\n",
" <td>585481</td>\n",
" <td>19.68</td>\n",
" <td>170</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_biz_bio-23</th>\n",
" <td>54917</td>\n",
" <td>54136</td>\n",
" <td>1278269</td>\n",
" <td>137520</td>\n",
" <td>7694395</td>\n",
" <td>275.96</td>\n",
" <td>158</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_biz_int-23</th>\n",
" <td>1109</td>\n",
" <td>1101</td>\n",
" <td>23123</td>\n",
" <td>6665</td>\n",
" <td>141643</td>\n",
" <td>4.51</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_biz_luz-23</th>\n",
" <td>41966</td>\n",
" <td>41641</td>\n",
" <td>786593</td>\n",
" <td>108535</td>\n",
" <td>4490695</td>\n",
" <td>148.55</td>\n",
" <td>158</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_biz_pod-23</th>\n",
" <td>22807</td>\n",
" <td>22762</td>\n",
" <td>605852</td>\n",
" <td>83807</td>\n",
" <td>3650700</td>\n",
" <td>110.0</td>\n",
" <td>113</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_biz_pres-23</th>\n",
" <td>17174</td>\n",
" <td>17158</td>\n",
" <td>251841</td>\n",
" <td>54253</td>\n",
" <td>1642817</td>\n",
" <td>64.49</td>\n",
" <td>55</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_biz_vc-23</th>\n",
" <td>45272</td>\n",
" <td>44710</td>\n",
" <td>568780</td>\n",
" <td>77754</td>\n",
" <td>3348648</td>\n",
" <td>104.13</td>\n",
" <td>78</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_biz_vc2-23</th>\n",
" <td>25802</td>\n",
" <td>25596</td>\n",
" <td>755885</td>\n",
" <td>99850</td>\n",
" <td>4526688</td>\n",
" <td>162.08</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_biz_wyw-23</th>\n",
" <td>11357</td>\n",
" <td>11204</td>\n",
" <td>259517</td>\n",
" <td>45114</td>\n",
" <td>1552980</td>\n",
" <td>56.41</td>\n",
" <td>38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_mix_emo-18</th>\n",
" <td>24329</td>\n",
" <td>21063</td>\n",
" <td>252380</td>\n",
" <td>19819</td>\n",
" <td>1379695</td>\n",
" <td>51.23</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_mix_luz-18</th>\n",
" <td>20919</td>\n",
" <td>19668</td>\n",
" <td>204587</td>\n",
" <td>26106</td>\n",
" <td>1132428</td>\n",
" <td>37.48</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ul-spokes_mix_parl-18</th>\n",
" <td>8656</td>\n",
" <td>8521</td>\n",
" <td>100992</td>\n",
" <td>18681</td>\n",
" <td>669210</td>\n",
" <td>24.55</td>\n",
" <td>48</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"metric samples utts_unique words words_unique chars \\\n",
"dataset \n",
"ul-diabiz_poleval-22 8950 8780 105206 16322 585481 \n",
"ul-spokes_biz_bio-23 54917 54136 1278269 137520 7694395 \n",
"ul-spokes_biz_int-23 1109 1101 23123 6665 141643 \n",
"ul-spokes_biz_luz-23 41966 41641 786593 108535 4490695 \n",
"ul-spokes_biz_pod-23 22807 22762 605852 83807 3650700 \n",
"ul-spokes_biz_pres-23 17174 17158 251841 54253 1642817 \n",
"ul-spokes_biz_vc-23 45272 44710 568780 77754 3348648 \n",
"ul-spokes_biz_vc2-23 25802 25596 755885 99850 4526688 \n",
"ul-spokes_biz_wyw-23 11357 11204 259517 45114 1552980 \n",
"ul-spokes_mix_emo-18 24329 21063 252380 19819 1379695 \n",
"ul-spokes_mix_luz-18 20919 19668 204587 26106 1132428 \n",
"ul-spokes_mix_parl-18 8656 8521 100992 18681 669210 \n",
"\n",
"metric audio[h] speakers \n",
"dataset \n",
"ul-diabiz_poleval-22 19.68 170 \n",
"ul-spokes_biz_bio-23 275.96 158 \n",
"ul-spokes_biz_int-23 4.51 9 \n",
"ul-spokes_biz_luz-23 148.55 158 \n",
"ul-spokes_biz_pod-23 110.0 113 \n",
"ul-spokes_biz_pres-23 64.49 55 \n",
"ul-spokes_biz_vc-23 104.13 78 \n",
"ul-spokes_biz_vc2-23 162.08 84 \n",
"ul-spokes_biz_wyw-23 56.41 38 \n",
"ul-spokes_mix_emo-18 51.23 40 \n",
"ul-spokes_mix_luz-18 37.48 21 \n",
"ul-spokes_mix_parl-18 24.55 48 "
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the total number of speakers, samples, unique utts, words, unique words, chars, unique chars and speech rate\n",
"metrics = [\"samples\", \"utts_unique\", \"words\", \"words_unique\", \"chars\", \"audio[h]\", \"speakers\"]\n",
" # unique utts, words, unique words, chars, unique chars and speech rate\n",
"# filter the multiindex dataframe to leave only specific metrics\n",
"df_total = df.loc[(slice(None), metrics), :]\n",
"df_total = df_total.unstack(level ='split')\n",
"df_total['value', 'total'] = df_total['value'].sum(axis=1)\n",
"df_total.columns = df_total.columns.droplevel(0)\n",
"columns_to_drop = ['test', 'train', 'validation']\n",
"df_total.drop(columns = columns_to_drop, inplace = True)\n",
"df_total = df_total.unstack(level ='metric')\n",
"df_total.columns = df_total.columns.droplevel(0)\n",
"df_total"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"4\" halign=\"left\">value</th>\n",
" </tr>\n",
" <tr>\n",
" <th>split</th>\n",
" <th>test</th>\n",
" <th>train</th>\n",
" <th>validation</th>\n",
" <th>total</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>fair-mls-20</th>\n",
" <td>519</td>\n",
" <td>25042</td>\n",
" <td>511</td>\n",
" <td>26072</td>\n",
" </tr>\n",
" <tr>\n",
" <th>google-fleurs-22</th>\n",
" <td>758</td>\n",
" <td>2841</td>\n",
" <td>338</td>\n",
" <td>3937</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mailabs-corpus_librivox-19</th>\n",
" <td>1501</td>\n",
" <td>11834</td>\n",
" <td>1527</td>\n",
" <td>14862</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mozilla-common_voice_15-23</th>\n",
" <td>8896</td>\n",
" <td>19119</td>\n",
" <td>8895</td>\n",
" <td>36910</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pjatk-clarin_mobile-15</th>\n",
" <td>392</td>\n",
" <td>2861</td>\n",
" <td>242</td>\n",
" <td>3495</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pjatk-clarin_studio-15</th>\n",
" <td>1404</td>\n",
" <td>44</td>\n",
" <td>40</td>\n",
" <td>1488</td>\n",
" </tr>\n",
" <tr>\n",
" <th>polyai-minds14-21</th>\n",
" <td>53</td>\n",
" <td>462</td>\n",
" <td>47</td>\n",
" <td>562</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-azon_read-20</th>\n",
" <td>586</td>\n",
" <td>1820</td>\n",
" <td>382</td>\n",
" <td>2788</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-azon_spont-20</th>\n",
" <td>48</td>\n",
" <td>357</td>\n",
" <td>51</td>\n",
" <td>456</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-maleset-unk</th>\n",
" <td>477</td>\n",
" <td>3783</td>\n",
" <td>478</td>\n",
" <td>4738</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-shortwords-unk</th>\n",
" <td>92</td>\n",
" <td>761</td>\n",
" <td>86</td>\n",
" <td>939</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-viu-unk</th>\n",
" <td>267</td>\n",
" <td>2146</td>\n",
" <td>290</td>\n",
" <td>2703</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Total</th>\n",
" <td>14993</td>\n",
" <td>71070</td>\n",
" <td>12887</td>\n",
" <td>98950</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value \n",
"split test train validation total\n",
"fair-mls-20 519 25042 511 26072\n",
"google-fleurs-22 758 2841 338 3937\n",
"mailabs-corpus_librivox-19 1501 11834 1527 14862\n",
"mozilla-common_voice_15-23 8896 19119 8895 36910\n",
"pjatk-clarin_mobile-15 392 2861 242 3495\n",
"pjatk-clarin_studio-15 1404 44 40 1488\n",
"polyai-minds14-21 53 462 47 562\n",
"pwr-azon_read-20 586 1820 382 2788\n",
"pwr-azon_spont-20 48 357 51 456\n",
"pwr-maleset-unk 477 3783 478 4738\n",
"pwr-shortwords-unk 92 761 86 939\n",
"pwr-viu-unk 267 2146 290 2703\n",
"Total 14993 71070 12887 98950"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Access all data where Metric is 'utts'\n",
"df_utts = df.xs('samples', level='metric')\n",
"\n",
"# change split to columns\n",
"df_utts = df_utts.unstack(level='split')\n",
"df_utts\n",
"\n",
"# add column with total number of samples\n",
"df_utts['value', 'total'] = df_utts['value'].sum(axis=1)\n",
"df_utts\n",
"\n",
"# create a new row with total number of samples and concatenate it to the DataFrame\n",
"df_total = df_utts.sum()\n",
"df_total.name = ('Total')\n",
"df_utts = pd.concat([df_utts, pd.DataFrame(df_total).T])\n",
"df_utts\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" value\n",
"dataset metric \n",
"pjatk-clarin_mobile-15 samples 2861\n",
" utts 2857\n",
" words 74634\n",
" words_unique 23166\n",
" chars 507238\n",
"... ...\n",
"all meta_cov_sex 0.57\n",
" meta_cov_age 0.24\n",
" meta_dist_sex {'male': 0.64, 'female': 0.36}\n",
" meta_dist_age {'teens': 0.03, 'twenties': 0.43, 'thirties': ...\n",
" samples_per_spk {'average': 194.71, 'std': 689.86, 'median': 4...\n",
"\n",
"[169 rows x 1 columns]\n"
]
}
],
"source": [
"\n",
"# Access all 'train' splits across all metrics\n",
"print(df.xs('train', level='split'))\n",
"\n",
"# xs is the best for single level indexing. It can be chained, but is less effective than loc or boolean masking"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" value\n",
"dataset metric split \n",
"mozilla-common_voice_15-23 samples test 8896\n",
"mailabs-corpus_librivox-19 samples test 1501\n",
"pjatk-clarin_studio-15 samples test 1404\n",
"google-fleurs-22 samples test 758\n",
"pwr-azon_read-20 samples test 586\n",
"fair-mls-20 samples test 519\n",
"pwr-maleset-unk samples test 477\n",
"pjatk-clarin_mobile-15 samples test 392\n",
"pwr-viu-unk samples test 267\n"
]
}
],
"source": [
"# Boolean masking for a more complex condition across levels\n",
"mask_test_set = (df.index.get_level_values('metric') == 'samples') & (df.index.get_level_values('split') == 'test') \n",
"df_test = df.loc[mask_test_set]\n",
"# convert value to numbric\n",
"# sort by value\n",
"df_test = df_test.sort_values(by='value', ascending=False)\n",
"# filter out values smaller than 100\n",
"df_test = df_test[df_test['value'] > 100]\n",
"\n",
"# remove dataset \"all\"\n",
"df_test = df_test.drop('all', level='dataset')\n",
"print(df_test)\n"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"3\" halign=\"left\">value</th>\n",
" </tr>\n",
" <tr>\n",
" <th>split</th>\n",
" <th>test</th>\n",
" <th>train</th>\n",
" <th>validation</th>\n",
" </tr>\n",
" <tr>\n",
" <th>dataset</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>fair-mls-20</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>google-fleurs-22</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mailabs-corpus_librivox-19</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mozilla-common_voice_15-23</th>\n",
" <td>{'teens': 0.11, 'twenties': 0.38, 'thirties': ...</td>\n",
" <td>{'teens': 0.03, 'twenties': 0.43, 'thirties': ...</td>\n",
" <td>{'teens': 0.15, 'twenties': 0.46, 'thirties': ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pjatk-clarin_mobile-15</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pjatk-clarin_studio-15</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>polyai-minds14-21</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-azon_read-20</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-azon_spont-20</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-maleset-unk</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-shortwords-unk</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pwr-viu-unk</th>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value \\\n",
"split test \n",
"dataset \n",
"fair-mls-20 N/A \n",
"google-fleurs-22 N/A \n",
"mailabs-corpus_librivox-19 N/A \n",
"mozilla-common_voice_15-23 {'teens': 0.11, 'twenties': 0.38, 'thirties': ... \n",
"pjatk-clarin_mobile-15 N/A \n",
"pjatk-clarin_studio-15 N/A \n",
"polyai-minds14-21 N/A \n",
"pwr-azon_read-20 N/A \n",
"pwr-azon_spont-20 N/A \n",
"pwr-maleset-unk N/A \n",
"pwr-shortwords-unk N/A \n",
"pwr-viu-unk N/A \n",
"\n",
" \\\n",
"split train \n",
"dataset \n",
"fair-mls-20 N/A \n",
"google-fleurs-22 N/A \n",
"mailabs-corpus_librivox-19 N/A \n",
"mozilla-common_voice_15-23 {'teens': 0.03, 'twenties': 0.43, 'thirties': ... \n",
"pjatk-clarin_mobile-15 N/A \n",
"pjatk-clarin_studio-15 N/A \n",
"polyai-minds14-21 N/A \n",
"pwr-azon_read-20 N/A \n",
"pwr-azon_spont-20 N/A \n",
"pwr-maleset-unk N/A \n",
"pwr-shortwords-unk N/A \n",
"pwr-viu-unk N/A \n",
"\n",
" \n",
"split validation \n",
"dataset \n",
"fair-mls-20 N/A \n",
"google-fleurs-22 N/A \n",
"mailabs-corpus_librivox-19 N/A \n",
"mozilla-common_voice_15-23 {'teens': 0.15, 'twenties': 0.46, 'thirties': ... \n",
"pjatk-clarin_mobile-15 N/A \n",
"pjatk-clarin_studio-15 N/A \n",
"polyai-minds14-21 N/A \n",
"pwr-azon_read-20 N/A \n",
"pwr-azon_spont-20 N/A \n",
"pwr-maleset-unk N/A \n",
"pwr-shortwords-unk N/A \n",
"pwr-viu-unk N/A "
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# What is the total distribution of age in common voice dataset and overall?\n",
"df_age = df.xs('meta_dist_age', level='metric')\n",
"df_age = df_age.unstack(level='split')\n",
"#df_age['value', 'total'] = df_age['value'].sum(axis=1)\n",
"df_age"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset \n",
"pelcra = load_dataset(\"pelcra/pl-asr-pelcra-for-bigos\", \"all\", split=\"test\")\n",
"df_test = pelcra.to_pandas()\n",
"df_test = df_test.drop(columns=[\"audio\"])\n",
"df_test.to_csv(\"test.tsv\", sep=\"\\t\",index=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1400x800 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"\n",
"# Load the data from the TSV file into a DataFrame\n",
"#file_path = '/mnt/data/test.tsv'\n",
"#data = pd.read_csv(file_path, sep='\\t')\n",
"data = df_test\n",
"# Group the data by audio duration and calculate the count and total duration for each group\n",
"duration_group = data.groupby('audio_duration_seconds').agg(\n",
" sample_count=('audio_duration_seconds', 'size'),\n",
" total_duration=('audio_duration_seconds', 'sum')\n",
").reset_index()\n",
"\n",
"# eliminate outliers - samples with duration longer than 150 seconds\n",
"duration_group = duration_group[duration_group['audio_duration_seconds'] < 120]\n",
"\n",
"# Calculate the cumulative percentage of the total duration\n",
"duration_group['cumulative_duration'] = (duration_group['total_duration'].cumsum() / \n",
" duration_group['total_duration'].sum()) * 100\n",
"\n",
"# Plotting the data\n",
"fig, ax1 = plt.subplots(figsize=(14, 8))\n",
"\n",
"# Left axis - Count of samples (blue line)\n",
"ax1.set_xlabel('Sample Duration (seconds)')\n",
"ax1.set_ylabel('Sample Count', color='blue')\n",
"ax1.plot(duration_group['audio_duration_seconds'], duration_group['sample_count'], color='blue')\n",
"ax1.tick_params(axis='y', labelcolor='blue')\n",
"\n",
"# Right axis - Total duration (orange line)\n",
"ax2 = ax1.twinx()\n",
"ax2.set_ylabel('Total Duration (hours)', color='orange')\n",
"ax2.plot(duration_group['audio_duration_seconds'], duration_group['total_duration'] / 3600, color='orange')\n",
"ax2.tick_params(axis='y', labelcolor='orange')\n",
"\n",
"# Adding Cumulative % (green dashed line)\n",
"ax3 = ax1.twinx()\n",
"ax3.spines[\"right\"].set_position((\"axes\", 1.15))\n",
"ax3.set_ylabel('Cumulative % of Corpus Total', color='green')\n",
"ax3.plot(duration_group['audio_duration_seconds'], duration_group['cumulative_duration'], color='green', linestyle='--')\n",
"ax3.tick_params(axis='y', labelcolor='green')\n",
"\n",
"# Title and legend\n",
"plt.title('Sample Duration Distributions')\n",
"fig.tight_layout() # Adjust the layout to make room for the third y-axis\n",
"\n",
"# Show plot\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/michal/.pyenv/versions/3.10.11/envs/streamlit/lib/python3.10/site-packages/datasets/load.py:1486: FutureWarning: The repository for amu-cai/pl-asr-bigos-v2 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/amu-cai/pl-asr-bigos-v2\n",
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
" warnings.warn(\n",
"Downloading data: 100%|ββββββββββ| 976M/976M [01:07<00:00, 14.5MB/s] \n",
"Downloading data: 100%|ββββββββββ| 78.8M/78.8M [00:05<00:00, 14.6MB/s]\n",
"Downloading data: 100%|ββββββββββ| 129M/129M [00:08<00:00, 16.1MB/s] \n",
"Downloading data: 100%|ββββββββββ| 934k/934k [00:00<00:00, 11.4MB/s]\n",
"Downloading data: 100%|ββββββββββ| 77.5k/77.5k [00:00<00:00, 7.19MB/s]\n",
"Downloading data: 100%|ββββββββββ| 52.6k/52.6k [00:00<00:00, 3.63MB/s]\n",
"Generating test split: 22 examples [00:00, 206.09 examples/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generating examples\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Generating test split: 392 examples [00:01, 310.58 examples/s]\n",
"Generating train split: 36 examples [00:00, 335.23 examples/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generating examples\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Generating train split: 2861 examples [00:08, 321.16 examples/s]\n",
"Generating validation split: 34 examples [00:00, 330.58 examples/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generating examples\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Generating validation split: 242 examples [00:00, 317.93 examples/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"DatasetDict({\n",
" test: Dataset({\n",
" features: ['audioname', 'split', 'dataset', 'speaker_id', 'ref_orig', 'audio', 'audio_duration_samples', 'audio_duration_seconds', 'samplingrate_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'speaker_age', 'speaker_sex'],\n",
" num_rows: 392\n",
" })\n",
" train: Dataset({\n",
" features: ['audioname', 'split', 'dataset', 'speaker_id', 'ref_orig', 'audio', 'audio_duration_samples', 'audio_duration_seconds', 'samplingrate_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'speaker_age', 'speaker_sex'],\n",
" num_rows: 2861\n",
" })\n",
" validation: Dataset({\n",
" features: ['audioname', 'split', 'dataset', 'speaker_id', 'ref_orig', 'audio', 'audio_duration_samples', 'audio_duration_seconds', 'samplingrate_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'speaker_age', 'speaker_sex'],\n",
" num_rows: 242\n",
" })\n",
"})\n"
]
}
],
"source": [
"import os\n",
"from datasets import load_dataset\n",
"from datasets import get_dataset_config_names\n",
"dataset_name = \"amu-cai/pl-asr-bigos-v2\"\n",
"# get dataset config names\n",
"dataset_config_names = get_dataset_config_names(dataset_name)\n",
"# load dataset\n",
"dataset_hf = load_dataset(dataset_name, \"pjatk-clarin_mobile-15\")\n",
"\n",
"print(dataset_hf)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "bigos-hf",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|