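# NOTE: the lines below are residue from the hosting web page, kept as comments so the file parses.
# Jiarong / Narua
# pangloss / pangloss.py
# Last commit: "Remove deprecated tasks (#1)" by albertvillanova, 78e2ab2 (verified)
# File size: 8.85 kB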
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pangloss datasets for Yongning Na (yong1288) and Japhug (japh1234)"""
import csv
import json
import os
import datasets
_CITATION = {
"yong1288": """
@misc{michaud_alexis_2021_5336698,
author = {Michaud, Alexis and
Galliot, Benjamin and
Guillaume, Séverine},
title = {{Yongning Na for Natural Language Processing: a
single-speaker audio corpus with transcriptions}},
month = aug,
year = 2021,
publisher = {Zenodo},
version = {1.0},
doi = {10.5281/zenodo.5336698},
url = {https://doi.org/10.5281/zenodo.5336698}
}
""",
"japh1234": """\
@misc{jacques_guillaume_2021_5521112,
author = {Jacques, Guillaume and
Galliot, Benjamin and
Guillaume, Séverine},
title = {{Japhug for Natural Language Processing: a single-
speaker audio corpus with transcriptions}},
month = sep,
year = 2021,
publisher = {Zenodo},
version = {1.0},
doi = {10.5281/zenodo.5521112},
url = {https://doi.org/10.5281/zenodo.5521112}
}
"""
}
_DESCRIPTION = """\
These datasets are extracts from the Pangloss collection and have
been preprocessed for ASR experiments in Na and Japhug.
"""
_HOMEPAGE = "https://pangloss.cnrs.fr/"
_LICENSE = "https://creativecommons.org/licenses/by-nc-sa/4.0/fr/legalcode"
# The HuggingFace Datasets library does not host the data itself; it only points to the original files.
# The download URL of each configuration is stored under the "url" key of `_LANGUAGES` below.
# Version attached to every builder configuration of this script.
_VERSION = datasets.Version("1.0.0")
_LANGUAGES = {
"yong1288": {
"url": "https://mycore.core-cloud.net/index.php/s/vaGMeRf4Iij8MWR/download",
"homepage": "https://zenodo.org/record/5336698",
"description": "Yongning Na dataset",
"translations": ["fr", "en", "zh"]
},
"japh1234": {
"url": "https://mycore.core-cloud.net/index.php/s/kuQCxmyVcUFWroV/download",
"homepage": "https://zenodo.org/record/5521112",
"description": "Japhug dataset",
"translations": ["fr", "zh"]
}
}
# TODO: The name of the dataset class usually matches the script name, in CamelCase instead of snake_case.
class PanglossDataset(datasets.GeneratorBasedBuilder):
    """Builder for the Pangloss extracts usable for ASR experiments.

    One builder configuration is created per entry of ``_LANGUAGES``; the
    selected configuration name (``self.config.name``) picks the download URL,
    the translation features and the citation.
    """

    # Maps the column names found in the CSV files to the feature names
    # exposed by the dataset. Columns absent from this mapping are ignored.
    field_translations = {
        "chemin_audio": "path",
        "nature": "doctype",
        "forme": "sentence",
        "traduction:fr": "translation:fr",
        "traduction:en": "translation:en",
        "traduction:zh": "translation:zh",
    }

    # One configuration per language, e.g.
    #   datasets.load_dataset("pangloss.py", "yong1288")
    #   datasets.load_dataset("pangloss.py", "japh1234")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name=language_name, version=_VERSION, description=language_data["description"])
        for language_name, language_data in _LANGUAGES.items()
    ]

    # No DEFAULT_CONFIG_NAME is defined: the configuration must be chosen explicitly.

    def _info(self):
        """Return the ``datasets.DatasetInfo`` of the selected configuration.

        The translation features depend on the configuration: one
        ``translation:<code>`` string feature is declared for each language
        code listed under ``"translations"`` in ``_LANGUAGES``.
        """
        language = _LANGUAGES[self.config.name]
        features = datasets.Features(
            {
                "path": datasets.Value("string"),
                "audio": datasets.features.Audio(sampling_rate=16_000),
                "sentence": datasets.Value("string"),
                "doctype": datasets.Value("string"),
                **{
                    f"translation:{language_code}": datasets.Value("string")
                    for language_code in language["translations"]
                },
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            # Features differ between configurations, hence built above.
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            # `_CITATION` is a dict keyed by configuration name; only the
            # entry of the selected configuration is a citation string.
            citation=_CITATION[self.config.name],
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then declare the three splits.

        Args:
            dl_manager: download manager; its ``download_and_extract`` method
                is called with the URL of the selected configuration.

        Returns:
            A list of ``datasets.SplitGenerator`` (train, test, validation)
            whose ``gen_kwargs`` are forwarded to ``_generate_examples``.
        """
        data_dir = dl_manager.download_and_extract(_LANGUAGES[self.config.name]["url"])
        # The CSV files are looked up in a sub-directory named after the configuration.
        corpus_dir = os.path.join(data_dir, self.config.name)
        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "validation"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": os.path.join(corpus_dir, f"{stem}.csv"),
                    "split": stem,
                },
            )
            for split_name, stem in splits
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` tuples read from one CSV file.

        Args:
            filepath: path of the CSV file of the split.
            split: split name given in ``gen_kwargs``; not used here.

        Yields:
            The zero-based row index and a dict holding the renamed CSV
            columns plus an ``"audio"`` entry with the audio file path.

        Raises:
            KeyError: if the CSV file has no ``chemin_audio`` column.
        """
        base_dir = os.path.dirname(filepath)
        # newline="" is what the csv module expects, so that newlines embedded
        # in quoted fields are interpreted correctly.
        with open(filepath, encoding="utf-8", newline="") as file_descriptor:
            reader = csv.DictReader(file_descriptor)
            for key, row in enumerate(reader):
                # Rename by column name rather than by position, so that an
                # unknown column cannot shift values onto the wrong field.
                data = {
                    self.field_translations[fieldname]: value
                    for fieldname, value in row.items()
                    if fieldname in self.field_translations
                }
                # Audio paths in the CSV are joined onto the CSV's directory.
                data["audio"] = os.path.join(base_dir, data["path"])
                yield key, data
if __name__ == "__main__":
    # Manual smoke test: build one configuration from the local script.
    # To cover every configuration, loop over `_LANGUAGES` instead.
    script_path, config_name = "pangloss.py", "japh1234"
    datasets.load_dataset(script_path, config_name)
    # Related command-line invocations:
    #   datasets-cli test datasets/pangloss --save_infos --all_configs
    #   datasets-cli dummy_data datasets/pangloss --auto_generate