Commit
·
2b994e2
1
Parent(s):
06d47ea
remove dataset scripts
Browse files- upload_nst_sv_to_hf_dataset.py +0 -116
upload_nst_sv_to_hf_dataset.py
DELETED
@@ -1,116 +0,0 @@
|
|
1 |
-
""" Script to load, transform and upload swedish NST dataset to 🤗 datasets.
|
2 |
-
|
3 |
-
Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/
|
4 |
-
|
5 |
-
Procedure:
|
6 |
-
1. Loop over annotations
|
7 |
-
2. Decide whether to discard specific item
|
8 |
-
3. Create DatasetDict = {
|
9 |
-
features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
|
10 |
-
num_rows: 11030
|
11 |
-
}
|
12 |
-
3b. Mapping common_voice <---> NST
|
13 |
-
- 'client_id': info.Speaker_ID
|
14 |
-
- 'path': val_recording.file
|
15 |
-
- 'audio': wav file (binary)
|
16 |
-
- 'sentence': val_recording.text
|
17 |
-
- 'up_votes': 0
|
18 |
-
- 'down_votes': 0
|
19 |
-
- 'age': info.Age
|
20 |
-
- 'gender': info.Sex
|
21 |
-
- 'accent': ""
|
22 |
-
- 'locale': "sv"
|
23 |
-
- 'segment': ""
|
24 |
-
4. Dump to parquet
|
25 |
-
5. Upload to hub
|
26 |
-
|
27 |
-
Filter out:
|
28 |
-
- examples with single words
|
29 |
-
- examples with single characters
|
30 |
-
- examples with words split into single characters
|
31 |
-
- remove "\\Punkt", "\\Komma" from sentences
|
32 |
-
|
33 |
-
"""
|
34 |
-
|
35 |
-
import json
|
36 |
-
import os
|
37 |
-
|
38 |
-
import pandas as pd
|
39 |
-
import torchaudio
|
40 |
-
|
41 |
-
from datasets import load_dataset
|
42 |
-
|
43 |
-
|
44 |
-
# Target dataset repository on the Hugging Face Hub.
hf_dataset_repo = "marinone94/nst_sv"
# Local paths to the downloaded NST Swedish corpus (machine-specific;
# NOTE(review): hard-coded to one developer's machine — parameterize before reuse).
audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"
|
47 |
-
|
48 |
-
|
49 |
-
def load_audio_file(rel_filepath):
    """Load one wav file and return it in the 🤗 audio-feature dict layout.

    *rel_filepath* is resolved against the module-level ``audio_files_path``.
    """
    full_path = f'{audio_files_path}/{rel_filepath}'
    waveform, sr = torchaudio.load(full_path)
    # First channel only, as a 1-D numpy array (datasets' audio format).
    samples = waveform[0].t().numpy()
    return {"path": rel_filepath, "array": samples, "sampling_rate": sr}
|
57 |
-
|
58 |
-
|
59 |
-
def is_record_valid(text):
    """Return True when *text* is a usable training sentence.

    Rejects utterances with fewer than two tokens, and utterances where
    every token is a lone character (e.g. a spelled-out word).
    """
    tokens = text.split()
    if len(tokens) < 2:
        return False
    # Valid only if at least one token is longer than a single character.
    return not all(len(token) == 1 for token in tokens)
|
74 |
-
|
75 |
-
|
76 |
-
def clean_text(text):
    """Strip NST punctuation-annotation markers from *text*."""
    for marker in ("\\\\Komma", "\\\\Punkt"):
        text = text.replace(marker, "")
    return text
|
78 |
-
|
79 |
-
|
80 |
-
def create_dataset_row(annotation_filename):
    """Parse one NST annotation file and build common_voice-style rows.

    Despite the singular name (kept for compatibility), one annotation file
    describes many recordings, so a list of row dicts is returned.
    Recordings whose text fails ``is_record_valid`` are skipped.

    :param annotation_filename: file name inside ``annotations_path``.
    :return: list of dicts matching the common_voice feature schema.
    """
    annotations_filepath = os.path.join(annotations_path, annotation_filename)
    # Annotations are JSON: read as UTF-8 explicitly so the platform default
    # encoding cannot corrupt Swedish characters (å, ä, ö).
    with open(annotations_filepath, "r", encoding="utf-8") as f:
        annotation = json.load(f)

    dataset_rows = []
    for recording in annotation["val_recordings"]:
        if not is_record_valid(recording["text"]):
            continue
        # Audio lives under <pid>/<pid>_<file>; the "-2" wav variant is used
        # (presumably the usable channel of the recording — TODO confirm).
        rel_filepath = f'{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'.replace(".wav", "-2.wav")
        dataset_rows.append({
            "client_id": annotation["info"]["Speaker_ID"],
            "path": rel_filepath,
            "audio": load_audio_file(rel_filepath),
            "sentence": clean_text(recording["text"]),
            "up_votes": 0,
            "down_votes": 0,
            "age": annotation["info"]["Age"],
            "gender": annotation["info"]["Sex"],
            "accent": "",
            "locale": "sv",
            "segment": "",
        })

    return dataset_rows
|
105 |
-
|
106 |
-
|
107 |
-
# Build rows from the annotation files.
dataset_rows = []
for i, filename in enumerate(os.listdir(annotations_path)):
    dataset_rows.extend(create_dataset_row(filename))
    # NOTE(review): stops after two annotation files — looks like a leftover
    # debug/sampling limit; confirm removal before a full upload.
    if i == 1:
        break

# Dump to parquet, reload as a 🤗 dataset, and push to the hub.
df = pd.DataFrame(dataset_rows)
df.to_parquet("dataset.parquet")
dataset = load_dataset("parquet", data_files="dataset.parquet")
# Use the module-level constant instead of duplicating the repo id literal.
dataset.push_to_hub(hf_dataset_repo)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|