Yurii Paniv committed
Commit 03f568d
1 Parent(s): 369ee40
Update and rename converter.py to import_ukrainian.py
- scripts/converter.py +0 -97
- scripts/import_ukrainian.py +240 -0
scripts/converter.py
DELETED
@@ -1,97 +0,0 @@
-import os
-from random import shuffle
-from shutil import copyfile
-
-# file template needed for import script
-template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}"
-# structure example below
-# client_id path sentence up_votes down_votes age gender accent locale segment
-structure = template.format("client_id", "path", "sentence", "up_votes",
-                            "down_votes", "age", "gender", "accent", "locale", "segment")
-
-iterator = 1
-speaker_iterator = 1
-
-
-def write_dataset(path, name, data):
-    """
-    Function to write converted data list
-    """
-    global iterator
-    global speaker_iterator
-    file_path = os.path.join(path, name)
-    clip_path = os.path.join(os.path.dirname(path), "wav")
-    result = open(file_path, mode="w", encoding="utf-8")
-    result.write(structure)
-    result.write("\n")
-    for row in data:
-        file_name = row[0]
-        if file_name.endswith(".wav"):
-            pass
-        elif file_name.endswith(".mp3"):
-            pass
-        elif file_name.find(".") == -1:
-            file_name += ".wav"
-        parted_name = file_name.split(".")
-
-        new_file_name = f"{iterator}." + parted_name[1]
-
-        old_file_path = os.path.join(clip_path, file_name)
-        new_file_path = os.path.join("clips", new_file_name)
-        if os.path.exists(old_file_path):
-            copyfile(old_file_path,
-                     new_file_path)
-            result.write(template.format(
-                speaker_iterator, new_file_name, row[1], "", "", "", "", "", "uk", "\n"))
-            speaker_iterator += 1
-            iterator += 1
-        else:
-            print("File not found", old_file_path)
-    result.close()
-
-
-if not os.path.exists("clips"):
-    os.makedirs("clips")  # create folder to contain processed clips
-
-# iterate over all data lists and write converted version near them
-for subdir, dirs, files in os.walk(os.path.abspath(os.path.curdir)):
-    print(subdir)
-    for file in files:
-        if file == "txt.final.data":
-            file_path = os.path.join(subdir, file)
-            file = open(file_path, mode="r")
-            data = [row.replace(" \n", "").split(" ", 1)
-                    for row in file.readlines()]
-            file.close()
-
-            shuffle(data)
-
-            dataset_size = len(data)
-            train_point = int(dataset_size*0.8)
-            dev_point = int(train_point + (dataset_size - train_point) / 2)
-            # split dataset
-            write_dataset(subdir, "train.tsv", data[:train_point])
-            write_dataset(subdir, "dev.tsv", data[train_point:dev_point])
-            write_dataset(subdir, "test.tsv", data[dev_point:])
-
-# write dataset splits into single files
-final_files = {
-    "train.tsv": open("train.tsv", mode="w", encoding="utf-8"),
-    "dev.tsv": open("dev.tsv", mode="w", encoding="utf-8"),
-    "test.tsv": open("test.tsv", mode="w", encoding="utf-8")
-}
-for file in final_files.values():
-    file.write(structure)
-    file.write("\n")
-
-for subdir, dirs, files in os.walk(os.path.curdir):
-    for file in files:
-        if file in ["train.tsv", "dev.tsv", "test.tsv"]:
-            input_file = open(os.path.join(subdir, file))
-            data = [row for row in input_file.readlines()][1::]
-            input_file.close()
-            for row in data:
-                final_files[file].write(row)
-
-for file in final_files.values():
-    file.close()
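Both the deleted converter and the new importer parse the same txt.final.data lists, where each line is an audio file name, a single space, and the transcript, terminated by " \n". A minimal sketch of that parsing step (the sample line below is invented for illustration):

    # Hypothetical txt.final.data line: "<file_name> <transcript> \n"
    row = "sample001 добрий день \n"
    file_name, transcript = row.replace(" \n", "").split(" ", 1)
    assert file_name == "sample001"     # no extension, so ".wav" is appended later
    assert transcript == "добрий день"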
scripts/import_ukrainian.py
ADDED
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+"""
+This script transforms custom dataset, gathered from Internet into
+DeepSpeech-ready .csv file
+Use "python3 import_ukrainian.py -h" for help
+"""
+import csv
+import os
+import subprocess
+import unicodedata
+from multiprocessing import Pool
+
+import progressbar
+import sox
+
+from deepspeech_training.util.downloader import SIMPLE_BAR
+from deepspeech_training.util.importers import (
+    get_counter,
+    get_imported_samples,
+    get_importers_parser,
+    get_validate_label,
+    print_import_report,
+)
+from ds_ctcdecoder import Alphabet
+
+FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
+SAMPLE_RATE = 16000
+CHANNELS = 1
+MAX_SECS = 10
+PARAMS = None
+FILTER_OBJ = None
+AUDIO_DIR = None
+
+
+class LabelFilter:
+    def __init__(self, normalize, alphabet, validate_fun):
+        self.normalize = normalize
+        self.alphabet = alphabet
+        self.validate_fun = validate_fun
+
+    def filter(self, label):
+        if self.normalize:
+            label = unicodedata.normalize("NFKD", label.strip()).encode(
+                "ascii", "ignore").decode("ascii", "ignore")
+        label = self.validate_fun(label)
+        if self.alphabet and label and not self.alphabet.CanEncode(label):
+            label = None
+        return label
+
+
+def init_worker(params):
+    global FILTER_OBJ  # pylint: disable=global-statement
+    global AUDIO_DIR  # pylint: disable=global-statement
+    AUDIO_DIR = params.audio_dir if params.audio_dir else os.path.join(
+        params.tsv_dir, "clips")
+    validate_label = get_validate_label(params)
+    alphabet = Alphabet(
+        params.filter_alphabet) if params.filter_alphabet else None
+    FILTER_OBJ = LabelFilter(params.normalize, alphabet, validate_label)
+
+
+def one_sample(sample):
+    """ Take an audio file, and optionally convert it to 16kHz WAV """
+    global AUDIO_DIR
+    source_filename = sample[0]
+    if not os.path.splitext(source_filename.lower())[1] == ".wav":
+        source_filename += ".wav"
+    # Storing wav files next to the mp3 ones - just with a different suffix
+    output_filename = f"{sample[2]}.wav"
+    output_filepath = os.path.join(AUDIO_DIR, output_filename)
+    _maybe_convert_wav(source_filename, output_filepath)
+    file_size = -1
+    frames = 0
+    if os.path.exists(output_filepath):
+        file_size = os.path.getsize(output_filepath)
+        if file_size == 0:
+            frames = 0
+        else:
+            frames = int(
+                subprocess.check_output(
+                    ["soxi", "-s", output_filepath], stderr=subprocess.STDOUT
+                )
+            )
+    label = FILTER_OBJ.filter(sample[1])
+    rows = []
+    counter = get_counter()
+    if file_size == -1:
+        # Excluding samples that failed upon conversion
+        counter["failed"] += 1
+    elif label is None:
+        # Excluding samples that failed on label validation
+        counter["invalid_label"] += 1
+    # + 1 added for filtering surname dataset with too short audio files
+    elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)) + 1:
+        # Excluding samples that are too short to fit the transcript
+        counter["too_short"] += 1
+    elif frames / SAMPLE_RATE > MAX_SECS:
+        # Excluding very long samples to keep a reasonable batch-size
+        counter["too_long"] += 1
+    else:
+        # This one is good - keep it for the target CSV
+        rows.append((os.path.split(output_filename)
+                     [-1], file_size, label, sample[2]))
+        counter["imported_time"] += frames
+    counter["all"] += 1
+    counter["total_time"] += frames
+
+    return (counter, rows)
+
+
+def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
+    # iterate over all data lists and write converted version near them
+    speaker_iterator = 1
+
+    samples = []
+    for subdir, dirs, files in os.walk(dataset_dir):
+        for file in files:
+            # Get audiofile path and transcript for each sentence in tsv
+            if file == "txt.final.data":
+                file_path = os.path.join(subdir, file)
+                file = open(file_path, mode="r")
+                data = []
+                for row in file.readlines():
+                    file_name, transcript = row.replace(
+                        " \n", "").split(" ", 1)
+
+                    if file_name.endswith(".wav"):
+                        pass
+                    elif file_name.endswith(".mp3"):
+                        pass
+                    elif file_name.find(".") == -1:
+                        file_name += ".wav"
+                    file_name = os.path.join(os.path.join(
+                        os.path.dirname(subdir), "wav"), file_name)
+                    data.append(
+                        (file_name, transcript, speaker_iterator))
+                    speaker_iterator += 1
+
+                file.close()
+
+                samples += data
+
+    if rows is None:
+        rows = []
+    counter = get_counter()
+    num_samples = len(samples)
+    print("Importing dataset files...")
+    pool = Pool(initializer=init_worker, initargs=(PARAMS,))
+    bar = progressbar.ProgressBar(
+        max_value=num_samples, widgets=SIMPLE_BAR)
+    for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
+        counter += processed[0]
+        rows += processed[1]
+        bar.update(i)
+    bar.update(num_samples)
+    pool.close()
+    pool.join()
+
+    imported_samples = get_imported_samples(counter)
+    assert counter["all"] == num_samples
+    assert len(rows) == imported_samples
+    print_import_report(counter, SAMPLE_RATE, MAX_SECS)
+
+    output_csv = os.path.join(os.path.abspath(audio_dir), "train.csv")
+    print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)
+    with open(output_csv, "w", encoding="utf-8", newline="") as output_csv_file:
+        print("Writing CSV file for DeepSpeech.py as: ", output_csv)
+        writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
+        writer.writeheader()
+        bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
+        for filename, file_size, transcript, speaker in bar(rows):
+            if space_after_every_character:
+                writer.writerow(
+                    {
+                        "wav_filename": filename,
+                        "wav_filesize": file_size,
+                        "transcript": " ".join(transcript),
+                    }
+                )
+            else:
+                writer.writerow(
+                    {
+                        "wav_filename": filename,
+                        "wav_filesize": file_size,
+                        "transcript": transcript,
+                    }
+                )
+    return rows
+
+
+def _preprocess_data(tsv_dir, audio_dir, space_after_every_character=False):
+    set_samples = _maybe_convert_set(
+        tsv_dir, audio_dir, space_after_every_character)
+
+
+def _maybe_convert_wav(mp3_filename, wav_filename):
+    if not os.path.exists(wav_filename):
+        transformer = sox.Transformer()
+        transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
+        try:
+            transformer.build(mp3_filename, wav_filename)
+        except sox.core.SoxError:
+            pass
+
+
+def parse_args():
+    parser = get_importers_parser(
+        description="Import CommonVoice v2.0 corpora")
+    parser.add_argument("tsv_dir", help="Directory containing tsv files")
+    parser.add_argument(
+        "--audio_dir",
+        help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
+    )
+    parser.add_argument(
+        "--filter_alphabet",
+        help="Exclude samples with characters not in provided alphabet",
+    )
+    parser.add_argument(
+        "--normalize",
+        action="store_true",
+        help="Converts diacritic characters to their base ones",
+    )
+    parser.add_argument(
+        "--space_after_every_character",
+        action="store_true",
+        help="To help transcript join by white space",
+    )
+    return parser.parse_args()
+
+
+def main():
+    audio_dir = PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(
+        PARAMS.tsv_dir, "clips")
+    _preprocess_data(PARAMS.tsv_dir, audio_dir,
+                     PARAMS.space_after_every_character)
+
+
+if __name__ == "__main__":
+    PARAMS = parse_args()
+    main()
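For context, the "too_short" branch in one_sample keeps a clip only when its count of 10 ms windows, halved, is at least the transcript length plus one (the "+ 1" the comment attributes to the surname dataset). A worked sketch with invented numbers (len(label) is equivalent to the script's len(str(label)) since label is already a string):

    SAMPLE_RATE = 16000
    frames = 24000                  # invented: 1.5 s of 16 kHz audio
    label = "привіт"                # invented: 6-character transcript
    # 1.5 s -> 150 windows of 10 ms -> 75 after halving
    steps = int(frames / SAMPLE_RATE * 1000 / 10 / 2)
    assert steps == 75
    too_short = steps < len(label) + 1   # 75 < 7 is False, so the clip is kept
    assert not too_short

As the module docstring says, "python3 import_ukrainian.py -h" lists the options; a typical run passes the dataset directory as tsv_dir, optionally with --audio_dir, --filter_alphabet, --normalize, or --space_after_every_character.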