Upload 17 files
- analyze.py +148 -0
- app.py +147 -0
- dataspeech/__init__.py +2 -0
- dataspeech/__pycache__/__init__.cpython-38.pyc +0 -0
- dataspeech/cpu_enrichments/__init__.py +2 -0
- dataspeech/cpu_enrichments/__pycache__/__init__.cpython-38.pyc +0 -0
- dataspeech/cpu_enrichments/__pycache__/rate.cpython-38.pyc +0 -0
- dataspeech/cpu_enrichments/rate.py +34 -0
- dataspeech/gpu_enrichments/__init__.py +2 -0
- dataspeech/gpu_enrichments/__pycache__/__init__.cpython-38.pyc +0 -0
- dataspeech/gpu_enrichments/__pycache__/pitch.cpython-38.pyc +0 -0
- dataspeech/gpu_enrichments/__pycache__/snr_and_reverb.cpython-38.pyc +0 -0
- dataspeech/gpu_enrichments/pitch.py +64 -0
- dataspeech/gpu_enrichments/snr_and_reverb.py +47 -0
- metadata_to_text.py +287 -0
- v01_bin_edges.json +1 -0
- v01_text_bins.json +12 -0
analyze.py
ADDED
@@ -0,0 +1,148 @@
from itertools import count, islice
from typing import Any, Iterable, Literal, Optional, TypeVar, Union, overload, Dict, List, Tuple
from collections import defaultdict
import json

import torch

from datasets import Dataset, Audio

from dataspeech import rate_apply, pitch_apply, snr_apply
from metadata_to_text import bins_to_text, speaker_level_relative_to_gender

Row = Dict[str, Any]
T = TypeVar("T")
BATCH_SIZE = 20


@overload
def batched(it: Iterable[T], n: int) -> Iterable[List[T]]:
    ...


@overload
def batched(it: Iterable[T], n: int, with_indices: Literal[False]) -> Iterable[List[T]]:
    ...


@overload
def batched(it: Iterable[T], n: int, with_indices: Literal[True]) -> Iterable[Tuple[List[int], List[T]]]:
    ...


def batched(
    it: Iterable[T], n: int, with_indices: bool = False
) -> Union[Iterable[List[T]], Iterable[Tuple[List[int], List[T]]]]:
    it, indices = iter(it), count()
    while batch := list(islice(it, n)):
        yield (list(islice(indices, len(batch))), batch) if with_indices else batch


def analyze(
    batch: List[Dict[str, Any]],
    cache: Optional[Dict[str, List[Any]]] = None,
) -> List[Dict[str, Any]]:
    cache = {} if cache is None else cache
    return batch


def run_dataspeech(
    rows: Iterable[Row], audio_column_name: str, text_column_name: str
) -> Iterable[Any]:
    cache: Dict[str, List[Any]] = {}

    # TODO: add speaker and gender to app
    speaker_id_column_name = "speaker_id"
    gender_column_name = "gender"

    for batch in batched(rows, BATCH_SIZE):
        tmp_dict = defaultdict(list)
        for sample in batch:
            for key in sample:
                if key in [audio_column_name, text_column_name, speaker_id_column_name, gender_column_name]:
                    if key == audio_column_name:
                        tmp_dict[key].append(sample[key][0]["src"])
                    else:
                        tmp_dict[key].append(sample[key])

        tmp_dataset = Dataset.from_dict(tmp_dict).cast_column(audio_column_name, Audio())

        ## 1. Extract continuous tags
        pitch_dataset = tmp_dataset.map(
            pitch_apply,
            batched=True,
            batch_size=BATCH_SIZE,
            with_rank=torch.cuda.device_count() > 0,
            num_proc=torch.cuda.device_count() or None,  # fall back to a single process on CPU-only machines
            remove_columns=[audio_column_name],  # trick to avoid rewriting audio
            fn_kwargs={"audio_column_name": audio_column_name, "penn_batch_size": 4096},
        )

        snr_dataset = tmp_dataset.map(
            snr_apply,
            batched=True,
            batch_size=BATCH_SIZE,
            with_rank=torch.cuda.device_count() > 0,
            num_proc=torch.cuda.device_count() or None,  # fall back to a single process on CPU-only machines
            remove_columns=[audio_column_name],  # trick to avoid rewriting audio
            fn_kwargs={"audio_column_name": audio_column_name},
        )

        rate_dataset = tmp_dataset.map(
            rate_apply,
            with_rank=False,
            num_proc=1,
            remove_columns=[audio_column_name],  # trick to avoid rewriting audio
            fn_kwargs={"audio_column_name": audio_column_name, "text_column_name": text_column_name},
        )

        enriched_dataset = pitch_dataset.add_column("snr", snr_dataset["snr"]).add_column("c50", snr_dataset["c50"])
        enriched_dataset = enriched_dataset.add_column("speaking_rate", rate_dataset["speaking_rate"]).add_column("phonemes", rate_dataset["phonemes"])

        ## 2. Map continuous tags to text tags
        with open("./create_dataset_app/v01_text_bins.json") as json_file:
            text_bins_dict = json.load(json_file)

        with open("./create_dataset_app/v01_bin_edges.json") as json_file:
            bin_edges_dict = json.load(json_file)

        speaker_level_pitch_bins = text_bins_dict.get("speaker_level_pitch_bins")
        speaker_rate_bins = text_bins_dict.get("speaker_rate_bins")
        snr_bins = text_bins_dict.get("snr_bins")
        reverberation_bins = text_bins_dict.get("reverberation_bins")
        utterance_level_std = text_bins_dict.get("utterance_level_std")

        enriched_dataset = [enriched_dataset]
        if "gender" in batch[0] and "speaker_id" in batch[0]:
            bin_edges = None
            if "pitch_bins_male" in bin_edges_dict and "pitch_bins_female" in bin_edges_dict:
                bin_edges = {"male": bin_edges_dict["pitch_bins_male"], "female": bin_edges_dict["pitch_bins_female"]}

            enriched_dataset, _ = speaker_level_relative_to_gender(enriched_dataset, speaker_level_pitch_bins, "speaker_id", "gender", "utterance_pitch_mean", "pitch", batch_size=20, num_workers=1, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges)

        enriched_dataset, _ = bins_to_text(enriched_dataset, speaker_rate_bins, "speaking_rate", "speaking_rate", batch_size=20, num_workers=1, leading_split_for_bins=None, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("speaking_rate", None))
        enriched_dataset, _ = bins_to_text(enriched_dataset, snr_bins, "snr", "noise", batch_size=20, num_workers=1, leading_split_for_bins=None, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("noise", None), lower_range=None)
        enriched_dataset, _ = bins_to_text(enriched_dataset, reverberation_bins, "c50", "reverberation", batch_size=20, num_workers=1, leading_split_for_bins=None, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("reverberation", None))
        enriched_dataset, _ = bins_to_text(enriched_dataset, utterance_level_std, "utterance_pitch_std", "speech_monotony", batch_size=20, num_workers=1, leading_split_for_bins=None, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("speech_monotony", None))

        enriched_dataset = enriched_dataset[0]

        for i, sample in enumerate(batch):
            new_sample = {}
            new_sample[audio_column_name] = f"<audio src='{sample[audio_column_name][0]['src']}' controls></audio>"
            for col in ["speaking_rate", "reverberation", "noise", "speech_monotony", "c50", "snr"]:  # phonemes, speaking_rate, utterance_pitch_std, utterance_pitch_mean
                new_sample[col] = enriched_dataset[col][i]
            if "gender" in batch[0] and "speaker_id" in batch[0]:
                new_sample["pitch"] = enriched_dataset["pitch"][i]
                new_sample[gender_column_name] = sample[gender_column_name]
                new_sample[speaker_id_column_name] = sample[speaker_id_column_name]

            new_sample[text_column_name] = sample[text_column_name]
            batch[i] = new_sample

        yield analyze(
            batch=batch,
            cache=cache,
        )
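A quick usage sketch for the `batched` helper above (a minimal illustration, not part of the commit): the shared `count()` iterator means the indices keep advancing across chunks.

from analyze import batched

rows = ({"id": i} for i in range(5))
for indices, chunk in batched(rows, 2, with_indices=True):
    print(indices, chunk)
# [0, 1] [{'id': 0}, {'id': 1}]
# [2, 3] [{'id': 2}, {'id': 3}]
# [4] [{'id': 4}]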
app.py
ADDED
@@ -0,0 +1,147 @@
from itertools import count, islice
from typing import Any, Dict, Iterable, Iterator, Optional, Tuple, TypeVar

import gradio as gr
import requests
import pandas as pd
from datasets import Features
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from requests.adapters import HTTPAdapter, Retry

from analyze import run_dataspeech

MAX_ROWS = 100
T = TypeVar("T")
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))


def stream_rows(dataset: str, config: str, split: str) -> Iterable[Dict[str, Any]]:
    batch_size = 100
    for i in count():
        rows_resp = session.get(f"https://datasets-server.huggingface.co/rows?dataset={dataset}&config={config}&split={split}&offset={i * batch_size}&length={batch_size}", timeout=10).json()
        if "error" in rows_resp:
            raise RuntimeError(rows_resp["error"])
        if not rows_resp["rows"]:
            break
        for row_item in rows_resp["rows"]:
            yield row_item["row"]


class track_iter:
    """Wraps an iterable and tracks how many items have been consumed."""

    def __init__(self, it: Iterable[T]):
        self.it = it
        self.next_idx = 0

    def __iter__(self) -> Iterator[T]:
        for item in self.it:
            self.next_idx += 1
            yield item


def report(next_row_idx: int, num_rows: int) -> str:
    if num_rows == next_row_idx:
        return f"Scan finished: {num_rows} samples analyzed"
    else:
        return f"Tagging in progress - {next_row_idx / num_rows * 100:.0f}% of rows analyzed..."


def analyze_dataset(dataset: str, audio_column_name: str, text_column_name: str, configuration_name: Optional[str] = None, split_name: Optional[str] = None) -> Iterator[Tuple[str, Any]]:
    info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
    if "error" in info_resp:
        yield "❌ " + info_resp["error"], pd.DataFrame()
        return

    if configuration_name in info_resp["dataset_info"]:
        config = configuration_name
    elif configuration_name != "" and configuration_name is not None:
        yield "❌ " + f"The configuration you've passed `{configuration_name}` was not found in the dataset configs: {', '.join(info_resp['dataset_info'].keys())}. Try again with the right config name.", gr.DataFrame()
        return
    else:
        config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))

    features = Features.from_dict(info_resp["dataset_info"][config]["features"])
    if split_name in info_resp["dataset_info"][config]["splits"]:
        split = split_name
    elif split_name != "" and split_name is not None:
        yield "❌ " + f"The split you've passed `{split_name}` was not found in the dataset splits: {', '.join(info_resp['dataset_info'][config]['splits'])}. Try again with the right split name.", gr.DataFrame()
        return
    else:
        split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
    num_rows = min(info_resp["dataset_info"][config]["splits"][split]["num_examples"], MAX_ROWS)
    rows = track_iter(islice(stream_rows(dataset, config, split), MAX_ROWS))

    if audio_column_name not in features:
        yield "❌ " + f"The audio column name you've passed `{audio_column_name}` was not found in the dataset columns: {', '.join(features.keys())}. Try again with the right column name.", gr.DataFrame()
        return
    if text_column_name not in features:
        yield "❌ " + f"The text column name you've passed `{text_column_name}` was not found in the dataset columns: {', '.join(features.keys())}. Try again with the right column name.", gr.DataFrame()
        return
    if "gender" in features:
        yield "Gender has been detected. We'll compute pitch.", pd.DataFrame()

    dataframe = []
    for batch in run_dataspeech(
        rows, audio_column_name, text_column_name
    ):
        headers = list(batch[0].keys())
        batch = [list(sample.values()) for sample in batch]
        dataframe.extend(batch)
        datatype = ["str" if col != audio_column_name else "markdown" for col in headers]
        yield (report(next_row_idx=rows.next_idx, num_rows=num_rows), gr.DataFrame(dataframe, headers=headers, datatype=datatype, wrap=True))
    yield (report(next_row_idx=rows.next_idx, num_rows=num_rows), gr.DataFrame(dataframe, headers=headers, datatype=datatype, wrap=True))


with gr.Blocks() as demo:
    gr.Markdown("# Analyze speech dataset using Data-Speech")
    gr.Markdown("The space takes an HF dataset name as an input, as well as the audio column name to analyze, and returns the speaking rate, noise level, reverberation level, monotony level and pitch. Note that pitch is only computed if a `speaker_id` column and a `gender` column are found.")
    hub_search = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
    )
    audio_column_name = gr.Textbox(
        value="audio",
        label="Audio column name.",
    )
    text_column_name = gr.Textbox(
        value="text",
        label="Transcription column name.",
    )

    with gr.Accordion("(Optional) specify configuration and split of the dataset to be analysed", open=False):
        configuration_name = gr.Textbox(
            value=None,
            label="Configuration name.",
        )
        split_name = gr.Textbox(
            value=None,
            label="Split name.",
        )

    button = gr.Button("Run Data-Speech Scan")
    outputs = [
        gr.Label(show_label=False),
        gr.DataFrame(),
    ]

    button.click(analyze_dataset, [hub_search, audio_column_name, text_column_name, configuration_name, split_name], outputs)
    gr.Examples(
        [
            ["blabble-io/libritts_r", "audio", "text_normalized", "clean"],
            ["blabble-io/libritts_r", "audio", "text_normalized", "other"],
            ["espnet/yodas", "audio", "text", "en000"],
            ["ylacombe/english_dialects", "audio", "text"],
        ],
        [hub_search, audio_column_name, text_column_name, configuration_name],
        outputs,
        fn=analyze_dataset,
        run_on_click=True,
        cache_examples=False,
    )

demo.launch(debug=False)
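Because `analyze_dataset` is a plain generator, it can be exercised without the UI. A hypothetical smoke test (the dataset, config, and split are illustrative; importing `app` as-is would also launch the demo, so this assumes the launch call is guarded):

for status, table in analyze_dataset("blabble-io/libritts_r", "audio", "text_normalized", "clean", "test.clean"):
    print(status)  # e.g. "Tagging in progress - 20% of rows analyzed..."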
dataspeech/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .cpu_enrichments import rate_apply
from .gpu_enrichments import pitch_apply, snr_apply
dataspeech/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (276 Bytes).
dataspeech/cpu_enrichments/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .rate import rate_apply
dataspeech/cpu_enrichments/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (210 Bytes).
dataspeech/cpu_enrichments/__pycache__/rate.cpython-38.pyc
ADDED
Binary file (860 Bytes).
dataspeech/cpu_enrichments/rate.py
ADDED
@@ -0,0 +1,34 @@
from g2p import make_g2p

transducer = make_g2p('eng', 'eng-ipa')


def rate_apply(batch, rank=None, audio_column_name="audio", text_column_name="text"):
    if isinstance(batch[audio_column_name], list):
        speaking_rates = []
        phonemes_list = []
        for text, audio in zip(batch[text_column_name], batch[audio_column_name]):
            phonemes = transducer(text).output_string

            sample_rate = audio["sampling_rate"]
            audio_length = len(audio["array"].squeeze()) / sample_rate

            speaking_rate = len(phonemes) / audio_length

            speaking_rates.append(speaking_rate)
            phonemes_list.append(phonemes)

        batch["speaking_rate"] = speaking_rates
        batch["phonemes"] = phonemes_list
    else:
        phonemes = transducer(batch[text_column_name]).output_string

        sample_rate = batch[audio_column_name]["sampling_rate"]
        audio_length = len(batch[audio_column_name]["array"].squeeze()) / sample_rate

        speaking_rate = len(phonemes) / audio_length

        batch["speaking_rate"] = speaking_rate
        batch["phonemes"] = phonemes

    return batch
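The speaking rate above is simply phoneme characters per second of audio; a worked example (the IPA string length is an assumption for illustration):

num_phoneme_chars = 40                            # assumed len(transducer(text).output_string)
num_samples, sample_rate = 40_000, 16_000         # 2.5 s of 16 kHz audio
audio_length = num_samples / sample_rate          # 2.5
speaking_rate = num_phoneme_chars / audio_length  # 16.0 phonemes/s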
dataspeech/gpu_enrichments/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .pitch import pitch_apply
from .snr_and_reverb import snr_apply
dataspeech/gpu_enrichments/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (260 Bytes).
dataspeech/gpu_enrichments/__pycache__/pitch.cpython-38.pyc
ADDED
Binary file (1.25 kB).
dataspeech/gpu_enrichments/__pycache__/snr_and_reverb.cpython-38.pyc
ADDED
Binary file (1.3 kB).
dataspeech/gpu_enrichments/pitch.py
ADDED
@@ -0,0 +1,64 @@
import torch
import penn


# Here we'll use a 10 millisecond hopsize
hopsize = .01

# Provide a sensible frequency range given your domain and model
fmin = 30.
fmax = 1000.

# Select a checkpoint to use for inference. Selecting None will
# download and use FCNF0++ pretrained on MDB-stem-synth and PTDB
checkpoint = None

# Centers frames at hopsize / 2, 3 * hopsize / 2, 5 * hopsize / 2, ...
center = 'half-hop'

# (Optional) Linearly interpolate unvoiced regions below periodicity threshold
interp_unvoiced_at = .065


def pitch_apply(batch, rank=None, audio_column_name="audio", output_column_name="utterance_pitch", penn_batch_size=4096):
    if isinstance(batch[audio_column_name], list):
        utterance_pitch_mean = []
        utterance_pitch_std = []
        for sample in batch[audio_column_name]:
            # Infer pitch and periodicity
            pitch, periodicity = penn.from_audio(
                torch.tensor(sample["array"][None, :]).float(),
                sample["sampling_rate"],
                hopsize=hopsize,
                fmin=fmin,
                fmax=fmax,
                checkpoint=checkpoint,
                batch_size=penn_batch_size,
                center=center,
                interp_unvoiced_at=interp_unvoiced_at,
                gpu=(rank or 0) % torch.cuda.device_count() if rank else rank,
            )

            utterance_pitch_mean.append(pitch.mean().cpu())
            utterance_pitch_std.append(pitch.std().cpu())

        batch[f"{output_column_name}_mean"] = utterance_pitch_mean
        batch[f"{output_column_name}_std"] = utterance_pitch_std
    else:
        sample = batch[audio_column_name]
        pitch, periodicity = penn.from_audio(
            torch.tensor(sample["array"][None, :]).float(),
            sample["sampling_rate"],
            hopsize=hopsize,
            fmin=fmin,
            fmax=fmax,
            checkpoint=checkpoint,
            batch_size=penn_batch_size,
            center=center,
            interp_unvoiced_at=interp_unvoiced_at,
            gpu=(rank or 0) % torch.cuda.device_count() if rank else rank,
        )
        batch[f"{output_column_name}_mean"] = pitch.mean().cpu()
        batch[f"{output_column_name}_std"] = pitch.std().cpu()

    return batch
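A rough sanity check on what penn returns under the settings above (the frame count is approximate and depends on `center` and padding): with a 10 ms hopsize, a 3 s utterance yields on the order of 300 pitch/periodicity frames, which `pitch_apply` reduces to two utterance-level statistics.

duration_s = 3.0
expected_frames = int(duration_s / hopsize)  # ~300 frames of pitch and periodicity
# pitch.mean() / pitch.std() collapse that contour into utterance_pitch_mean/_std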
dataspeech/gpu_enrichments/snr_and_reverb.py
ADDED
@@ -0,0 +1,47 @@
from pyannote.audio import Model
from pathlib import Path
from brouhaha.pipeline import RegressiveActivityDetectionPipeline
import torch
from huggingface_hub import hf_hub_download

model = None

def snr_apply(batch, rank=None, audio_column_name="audio"):
    global model
    if model is None:
        model = Model.from_pretrained(
            Path(hf_hub_download(repo_id="ylacombe/brouhaha-best", filename="best.ckpt")),
            strict=False,
        )
    if rank is not None:
        # move the model to the right GPU if not there already
        device = f"cuda:{(rank or 0) % torch.cuda.device_count()}"
        # move to device and create pipeline here because the pipeline moves to the first GPU it finds anyway
        model.to(device)

    pipeline = RegressiveActivityDetectionPipeline(segmentation=model)
    if rank is not None:
        pipeline.to(torch.device(device))

    device = pipeline._models["segmentation"].device

    if isinstance(batch[audio_column_name], list):
        snr = []
        c50 = []
        for sample in batch[audio_column_name]:
            res = pipeline({"sample_rate": sample["sampling_rate"],
                            "waveform": torch.tensor(sample["array"][None, :]).to(device).float()})

            snr.append(res["snr"].mean())
            c50.append(res["c50"].mean())

        batch["snr"] = snr
        batch["c50"] = c50
    else:
        res = pipeline({"sample_rate": batch[audio_column_name]["sampling_rate"],
                        "waveform": torch.tensor(batch[audio_column_name]["array"][None, :]).to(device).float()})

        batch["snr"] = res["snr"].mean()
        batch["c50"] = res["c50"].mean()

    return batch
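A minimal sketch of calling `snr_apply` on a single, non-batched sample (assumes pyannote.audio and brouhaha are installed; the first call downloads the ylacombe/brouhaha-best checkpoint):

import numpy as np
from dataspeech import snr_apply

sample = {"audio": {"array": np.random.randn(16_000), "sampling_rate": 16_000}}  # 1 s of noise
out = snr_apply(sample, rank=None)  # runs on CPU when rank is None
print(out["snr"], out["c50"])       # utterance-level SNR and C50 estimates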
metadata_to_text.py
ADDED
@@ -0,0 +1,287 @@
import numpy as np
import pandas as pd
from datasets import load_dataset
from multiprocess import set_start_method
import argparse
from pathlib import Path
import os
import matplotlib.pyplot as plt
import json

SPEAKER_RATE_BINS = ["very slowly", "quite slowly", "slightly slowly", "moderate speed", "slightly fast", "quite fast", "very fast"]
SNR_BINS = ["very noisy", "quite noisy", "slightly noisy", "moderate ambient sound", "slightly clear", "quite clear", "very clear"]
REVERBERATION_BINS = ["very roomy sounding", "quite roomy sounding", "slightly roomy sounding", "moderate reverberation", "slightly confined sounding", "quite confined sounding", "very confined sounding"]
UTTERANCE_LEVEL_STD = ["very monotone", "quite monotone", "slightly monotone", "moderate intonation", "slightly expressive", "quite expressive", "very expressive"]

# this one is meant to be applied to speaker-level mean pitch, relative to gender
SPEAKER_LEVEL_PITCH_BINS = ["very low pitch", "quite low pitch", "slightly low pitch", "moderate pitch", "slightly high pitch", "quite high pitch", "very high pitch"]


def visualize_bins_to_text(values_1, values_2, name_1, name_2, text_bins, save_dir, output_column_name, default_bins=100, lower_range=None):
    # Save both histograms into a single figure
    fig, axs = plt.subplots(2, figsize=(8, 6), sharex=True)

    # Plot histogram and vertical lines for subplot 1
    axs[0].hist(values_1, bins=default_bins, color='blue', alpha=0.7)
    _, bin_edges1 = np.histogram(values_1, bins=len(text_bins), range=(lower_range, values_1.max()) if lower_range else None)
    for edge in bin_edges1:
        axs[0].axvline(x=edge, color='red', linestyle='--', linewidth=1)

    # Plot histogram and vertical lines for subplot 2
    axs[1].hist(values_2, bins=50, color='green', alpha=0.7)
    _, bin_edges2 = np.histogram(values_2, bins=len(text_bins), range=(lower_range, values_2.max()) if lower_range else None)
    for edge in bin_edges2:
        axs[1].axvline(x=edge, color='red', linestyle='--', linewidth=1)

    # Add labels and title
    axs[0].set_title(name_1)
    axs[1].set_title(name_2)
    axs[0].set_yscale('log')
    axs[1].set_yscale('log')
    axs[0].set_ylabel('Frequency')
    axs[1].set_ylabel('Frequency')
    axs[1].set_xlabel(f'{output_column_name}')

    # Adjust layout
    plt.tight_layout()

    filename = f"{output_column_name}.png"
    filepath = os.path.join(save_dir, filename)
    plt.savefig(filepath)
    print(f"Plots saved at '{filepath}'!")

def bins_to_text(dataset, text_bins, column_name, output_column_name, leading_split_for_bins="train", batch_size=4, num_workers=1, std_tolerance=5, save_dir=None, only_save_plot=False, lower_range=None, bin_edges=None):
    '''
    Compute bins of `column_name` from the splits `leading_split_for_bins` and apply text bins to every split.
    `leading_split_for_bins` can be a string or a list.
    '''
    if bin_edges is None:
        values = []
        for df in dataset:
            for split in df:
                if leading_split_for_bins is None or leading_split_for_bins in split:
                    values.extend(df[split][column_name])

        # filter out outliers
        values = np.array(values)
        if std_tolerance is not None:
            filtered_values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]

            if save_dir is not None:
                visualize_bins_to_text(values, filtered_values, "Before filtering", "After filtering", text_bins, save_dir, output_column_name, lower_range=lower_range)

            # speaking_rate can easily have outliers
            if save_dir is not None and output_column_name == "speaking_rate":
                visualize_bins_to_text(filtered_values, filtered_values, "After filtering", "After filtering", text_bins, save_dir, f"{output_column_name}_after_filtering", lower_range=lower_range)

            values = filtered_values
        hist, bin_edges = np.histogram(values, bins=len(text_bins), range=(lower_range, values.max()) if lower_range else None)

        if only_save_plot:
            return dataset, bin_edges
    else:
        print(f"Already computed bin edges have been passed for {output_column_name}. Will use: {bin_edges}.")

    def batch_association(batch):
        index_bins = np.searchsorted(bin_edges, batch, side="left")
        # do min(max(...)) when values are outside of the main bins
        # it happens when value = min or max or have been filtered out from bins computation
        batch_bins = [text_bins[min(max(i - 1, 0), len(text_bins) - 1)] for i in index_bins]
        return {
            output_column_name: batch_bins
        }

    dataset = [df.map(batch_association, batched=True, batch_size=batch_size, input_columns=[column_name], num_proc=num_workers) for df in dataset]
    return dataset, bin_edges

def speaker_level_relative_to_gender(dataset, text_bins, speaker_column_name, gender_column_name, column_name, output_column_name, batch_size=4, num_workers=1, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=None):
    '''
    Computes mean values on a speaker level and computes bins on top, relative to the gender column name.
    Then associates a text bin to the column.
    This time, doesn't use leading_split_for_bins; computes it for all splits. Could probably be optimized.
    '''
    list_data = []
    for df in dataset:
        for split in df:
            panda_data = df[split].remove_columns([col for col in df[split].column_names if col not in {speaker_column_name, column_name, gender_column_name}]).to_pandas()
            list_data.append(panda_data)

    dataframe = pd.concat(list_data, ignore_index=True)
    dataframe = dataframe.groupby(speaker_column_name).agg({column_name: "mean", gender_column_name: "first"})
    if bin_edges is None:
        bin_edges = {}
        if save_dir is not None:
            save_dict = {}
            save_dict_after_filtering = {}
        for category in ["male", "female"]:
            values = dataframe[dataframe[gender_column_name] == category][column_name]
            values = np.array(values)
            if save_dir is not None:
                save_dict[category] = values
            if std_tolerance is not None:
                # filter out outliers
                values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]
                if save_dir is not None:
                    save_dict_after_filtering[category] = values
            bin_edges[category] = np.histogram(values, len(text_bins))[1]

        if save_dir is not None:
            visualize_bins_to_text(save_dict["male"], save_dict["female"], "Male distribution", "Female distribution", text_bins, save_dir, output_column_name)
            if std_tolerance is not None:
                visualize_bins_to_text(save_dict_after_filtering["male"], save_dict_after_filtering["female"], "Male distribution", "Female distribution", text_bins, save_dir, f"{output_column_name}_after_filtering")

        if only_save_plot:
            return dataset, bin_edges

    speaker_id_to_bins = dataframe.apply(lambda x: np.searchsorted(bin_edges[x[gender_column_name]], x[column_name]), axis=1).to_dict()

    def batch_association(batch):
        index_bins = [speaker_id_to_bins[speaker] for speaker in batch]
        # do min(max(...)) when values are outside of the main bins
        # it happens when value = min or max or have been filtered out from bins computation
        batch_bins = [text_bins[min(max(i - 1, 0), len(text_bins) - 1)] for i in index_bins]
        return {
            output_column_name: batch_bins
        }

    dataset = [df.map(batch_association, batched=True, input_columns=[speaker_column_name], batch_size=batch_size, num_proc=num_workers) for df in dataset]
    return dataset, bin_edges

if __name__ == "__main__":
    set_start_method("spawn")
    parser = argparse.ArgumentParser()

    parser.add_argument("dataset_name", type=str, help="Path or name of the dataset(s). If multiple datasets, names have to be separated by `+`.")
    parser.add_argument("--configuration", default=None, type=str, help="Dataset configuration(s) to use (or configurations separated by `+`).")
    parser.add_argument("--output_dir", default=None, type=str, help="If specified, save the dataset(s) on disk. If multiple datasets, paths have to be separated by `+`.")
    parser.add_argument("--repo_id", default=None, type=str, help="If specified, push the dataset(s) to the hub. If multiple datasets, names have to be separated by `+`.")
    parser.add_argument("--path_to_text_bins", default=None, type=str, help="If specified, points to a JSON file which contains the text bins that will be associated with each bin. If not specified, will use the default bins.")
    parser.add_argument("--path_to_bin_edges", default=None, type=str, help="If specified, points to a JSON file which contains the bin edges. Useful if you want to apply already computed bins to new datasets. If not specified, will recompute bin edges from scratch.")
    parser.add_argument("--save_bin_edges", default=None, type=str, help="If specified, the name of the JSON file in which the computed bin edges will be saved. Useful if you want to reuse those bin edges on new datasets. By default, it won't save those edges.")
    parser.add_argument("--avoid_pitch_computation", default=False, action="store_true", help="If `True`, will not compute `pitch`. Note that `pitch` is computed on a speaker level, relative to gender, so you don't need it in a mono-speaker setting.")
    parser.add_argument("--cpu_num_workers", default=1, type=int, help="Number of CPU workers.")
    parser.add_argument("--batch_size", default=16, type=int, help="Batch size in `Dataset.map` operations. https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Dataset.map")
    parser.add_argument("--speaker_id_column_name", default="speaker_id", type=str, help="Speaker id column name. Only used if `avoid_pitch_computation=False`.")
    parser.add_argument("--gender_column_name", default="gender", type=str, help="Gender column name. Only used if `avoid_pitch_computation=False`.")
    parser.add_argument("--pitch_std_tolerance", default=2., type=float, help="Standard deviation tolerance for pitch estimation. Any value outside mean ± std * tolerance is discarded. Only used if `avoid_pitch_computation=False`.")
    parser.add_argument("--speaking_rate_std_tolerance", default=4., type=float, help="Standard deviation tolerance for speaking rate estimation. Any value outside mean ± std * tolerance is discarded. Only used if `path_to_bin_edges=False`.")
    parser.add_argument("--snr_std_tolerance", default=3.5, type=float, help="Standard deviation tolerance for SNR estimation. Any value outside mean ± std * tolerance is discarded. Only used if `path_to_bin_edges=False`.")
    parser.add_argument("--reverberation_std_tolerance", default=4, type=float, help="Standard deviation tolerance for reverberation estimation. Any value outside mean ± std * tolerance is discarded. Only used if `path_to_bin_edges=False`.")
    parser.add_argument("--speech_monotony_std_tolerance", default=4, type=float, help="Standard deviation tolerance for speech monotony estimation. Any value outside mean ± std * tolerance is discarded. Only used if `path_to_bin_edges=False`.")
    parser.add_argument("--leading_split_for_bins", default=None, type=str, help="If specified, will use every split that contains this string to compute statistics. If not specified, will use every split. Only used if `path_to_bin_edges=False`.")
    parser.add_argument("--plot_directory", default=None, type=str, help="If specified, will save visualization plots to this directory. Only used if `path_to_bin_edges=False`.")
    parser.add_argument("--only_save_plot", default=False, action="store_true", help="If `True` and `--plot_directory` is specified, will only compute the plots. Only used if `path_to_bin_edges=False`.")
    parser.add_argument("--snr_lower_range", default=50, type=float, help="The lower range of the SNR bins.")

    args = parser.parse_args()

    if args.plot_directory is None and args.only_save_plot:
        raise ValueError("`only_save_plot=true` but `plot_directory` is not specified. Please give a path to the directory where you want the plot to be saved.")
    if args.only_save_plot and args.path_to_bin_edges:
        raise ValueError("`only_save_plot=true` but `path_to_bin_edges` is specified. Since the latter is specified, we won't redo computations that would have been used for plotting. Choose one or the other. Note that if you use this script to label a new dataset for fine-tuning, I'd recommend avoiding plotting and setting `only_save_plot=false`.")

    text_bins_dict = {}
    if args.path_to_text_bins:
        with open(args.path_to_text_bins) as json_file:
            text_bins_dict = json.load(json_file)

    bin_edges_dict = {}
    if args.path_to_bin_edges:
        with open(args.path_to_bin_edges) as json_file:
            bin_edges_dict = json.load(json_file)

    speaker_level_pitch_bins = text_bins_dict.get("speaker_level_pitch_bins", SPEAKER_LEVEL_PITCH_BINS)
    speaker_rate_bins = text_bins_dict.get("speaker_rate_bins", SPEAKER_RATE_BINS)
    snr_bins = text_bins_dict.get("snr_bins", SNR_BINS)
    reverberation_bins = text_bins_dict.get("reverberation_bins", REVERBERATION_BINS)
    utterance_level_std = text_bins_dict.get("utterance_level_std", UTTERANCE_LEVEL_STD)

    output_dirs = [args.output_dir] if args.output_dir is not None else None
    repo_ids = [args.repo_id] if args.repo_id is not None else None
    if args.configuration:
        if "+" in args.dataset_name:
            dataset_names = args.dataset_name.split("+")
            dataset_configs = args.configuration.split("+")
            if len(dataset_names) != len(dataset_configs):
                raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(dataset_configs)} configurations spotted")

            if args.repo_id is not None:
                repo_ids = args.repo_id.split("+")
                if len(dataset_names) != len(repo_ids):
                    raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(repo_ids)} repository ids spotted")

            if args.output_dir is not None:
                output_dirs = args.output_dir.split("+")
                if len(dataset_names) != len(output_dirs):
                    raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(output_dirs)} local paths on which to save the datasets spotted")

            dataset = []
            for dataset_name, dataset_config in zip(dataset_names, dataset_configs):
                tmp_dataset = load_dataset(dataset_name, dataset_config)
                dataset.append(tmp_dataset)
        else:
            dataset = [load_dataset(args.dataset_name, args.configuration)]
            dataset_configs = [args.configuration]
    else:
        if "+" in args.dataset_name:
            dataset_names = args.dataset_name.split("+")
            if args.repo_id is not None:
                repo_ids = args.repo_id.split("+")
                if len(dataset_names) != len(repo_ids):
                    raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(repo_ids)} repository ids spotted")

            if args.output_dir is not None:
                output_dirs = args.output_dir.split("+")
                if len(dataset_names) != len(output_dirs):
                    raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(output_dirs)} local paths on which to save the datasets spotted")

            dataset = []
            for dataset_name in dataset_names:
                tmp_dataset = load_dataset(dataset_name)
                dataset.append(tmp_dataset)
        else:
            dataset = [load_dataset(args.dataset_name)]

    if args.plot_directory:
        Path(args.plot_directory).mkdir(parents=True, exist_ok=True)

    if not args.avoid_pitch_computation:
        bin_edges = None
        if "pitch_bins_male" in bin_edges_dict and "pitch_bins_female" in bin_edges_dict:
            bin_edges = {"male": bin_edges_dict["pitch_bins_male"], "female": bin_edges_dict["pitch_bins_female"]}

        dataset, pitch_bin_edges = speaker_level_relative_to_gender(dataset, speaker_level_pitch_bins, args.speaker_id_column_name, args.gender_column_name, "utterance_pitch_mean", "pitch", batch_size=args.batch_size, num_workers=args.cpu_num_workers, std_tolerance=args.pitch_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges)

    dataset, speaking_rate_bin_edges = bins_to_text(dataset, speaker_rate_bins, "speaking_rate", "speaking_rate", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.speaking_rate_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("speaking_rate", None))
    dataset, noise_bin_edges = bins_to_text(dataset, snr_bins, "snr", "noise", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.snr_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("noise", None), lower_range=args.snr_lower_range)
    dataset, reverberation_bin_edges = bins_to_text(dataset, reverberation_bins, "c50", "reverberation", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.reverberation_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("reverberation", None))
    dataset, speech_monotony_bin_edges = bins_to_text(dataset, utterance_level_std, "utterance_pitch_std", "speech_monotony", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.speech_monotony_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("speech_monotony", None))

    if args.save_bin_edges:
        bin_edges = {
            "speaking_rate": speaking_rate_bin_edges.tolist(),
            "noise": noise_bin_edges.tolist(),
            "reverberation": reverberation_bin_edges.tolist(),
            "speech_monotony": speech_monotony_bin_edges.tolist(),
        }
        if not args.avoid_pitch_computation:
            bin_edges["pitch_bins_male"] = pitch_bin_edges["male"].tolist()
            bin_edges["pitch_bins_female"] = pitch_bin_edges["female"].tolist()

        with open(args.save_bin_edges, "w") as outfile:
            json.dump(bin_edges, outfile)

    if not args.only_save_plot:
        if args.output_dir:
            for output_dir, df in zip(output_dirs, dataset):
                df.save_to_disk(output_dir)
        if args.repo_id:
            for i, (repo_id, df) in enumerate(zip(repo_ids, dataset)):
                if args.configuration:
                    df.push_to_hub(repo_id, dataset_configs[i])
                else:
                    df.push_to_hub(repo_id)
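A minimal sketch of reusing the shipped v01 edges on a new, already-enriched dataset via `bins_to_text` (the dataset id is hypothetical; it needs an `snr` column, as produced by the enrichment step):

import json
from datasets import load_dataset
from metadata_to_text import bins_to_text

with open("v01_text_bins.json") as f:
    text_bins = json.load(f)
with open("v01_bin_edges.json") as f:
    edges = json.load(f)

dataset = [load_dataset("user/enriched-dataset")]  # hypothetical repo with an "snr" column
dataset, _ = bins_to_text(dataset, text_bins["snr_bins"], "snr", "noise", bin_edges=edges["noise"])
print(dataset[0]["train"][0]["noise"])  # e.g. "quite clear"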
v01_bin_edges.json
ADDED
@@ -0,0 +1 @@
{"speaking_rate": [3.508771929824561, 6.187242299296628, 8.865712668768696, 11.544183038240764, 14.22265340771283, 16.901123777184896, 19.579594146656966, 22.258064516129032], "noise": [50.0, 53.460838317871094, 56.92167663574219, 60.38251495361328, 63.843353271484375, 67.30419158935547, 70.76502990722656, 74.22586822509766], "reverberation": [30.498437881469727, 34.706024169921875, 38.91361045837402, 43.12119674682617, 47.32878303527832, 51.53636932373047, 55.74395561218262, 59.951541900634766], "speech_monotony": [0.0, 17.430070059640066, 34.86014011928013, 52.2902101789202, 69.72028023856026, 87.15035029820032, 104.5804203578404, 122.01049041748047], "pitch_bins_male": [74.04898071289062, 88.6379623413086, 103.22694396972656, 117.81592559814453, 132.4049072265625, 146.993896484375, 161.58287048339844, 176.17185974121094], "pitch_bins_female": [130.46119689941406, 149.0537567138672, 167.64630126953125, 186.23886108398438, 204.83140563964844, 223.42396545410156, 242.01651000976562, 260.60906982421875]}
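How these edges map a continuous value to one of the seven text bins, using the same np.searchsorted plus min(max(...)) clamping as `batch_association` above:

import numpy as np

noise_edges = [50.0, 53.46, 56.92, 60.38, 63.84, 67.30, 70.77, 74.23]  # "noise" edges, rounded
snr_bins = ["very noisy", "quite noisy", "slightly noisy", "moderate ambient sound",
            "slightly clear", "quite clear", "very clear"]

i = np.searchsorted(noise_edges, 65.0, side="left")     # -> 5
print(snr_bins[min(max(i - 1, 0), len(snr_bins) - 1)])  # "slightly clear"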
v01_text_bins.json
ADDED
@@ -0,0 +1,12 @@
{
    "speaker_rate_bins":
        ["very slowly", "quite slowly", "slightly slowly", "moderate speed", "slightly fast", "quite fast", "very fast"],
    "snr_bins":
        ["very noisy", "quite noisy", "slightly noisy", "moderate ambient sound", "slightly clear", "quite clear", "very clear"],
    "reverberation_bins":
        ["very roomy sounding", "quite roomy sounding", "slightly roomy sounding", "moderate reverberation", "slightly confined sounding", "quite confined sounding", "very confined sounding"],
    "utterance_level_std":
        ["very monotone", "quite monotone", "slightly monotone", "moderate intonation", "slightly expressive", "quite expressive", "very expressive"],
    "speaker_level_pitch_bins":
        ["very low pitch", "quite low pitch", "slightly low pitch", "moderate pitch", "slightly high pitch", "quite high pitch", "very high pitch"]
}