marinone94
committed on
Commit
•
393fd68
1
Parent(s):
9be1ce7
correct filtering column
Browse files
- run.sh +1 -1
- run_speech_recognition_ctc.py +3 -3
run.sh
CHANGED
@@ -17,7 +17,7 @@ python run_speech_recognition_ctc.py \
|
|
17 |
--evaluation_strategy="epoch" \
|
18 |
--save_strategy="epoch" \
|
19 |
--text_column_name="sentence" \
|
20 |
-
--chars_to_ignore , ? . ! \- \; \: \" β % β β οΏ½ β β β¦ β /
|
21 |
--logging_steps="100" \
|
22 |
--layerdrop="0.0" \
|
23 |
--activation_dropout="0.1" \
|
|
|
17 |
--evaluation_strategy="epoch" \
|
18 |
--save_strategy="epoch" \
|
19 |
--text_column_name="sentence" \
|
20 |
+
--chars_to_ignore , ? . ! \- \; \: \" β % β β οΏ½ β β β¦ β / \
|
21 |
--logging_steps="100" \
|
22 |
--layerdrop="0.0" \
|
23 |
--activation_dropout="0.1" \
|
run_speech_recognition_ctc.py
CHANGED
@@ -521,9 +521,9 @@ def main():
|
|
521 |
|
522 |
def remove_special_characters(batch):
|
523 |
if chars_to_ignore_regex is not None:
|
524 |
-
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt", "").replace("\\\\Komma", "").lower()
|
525 |
else:
|
526 |
-
batch["target_text"] = batch[text_column_name].replace("\\\\Punkt", "").replace("\\\\Komma", "").lower()
|
527 |
return batch
|
528 |
|
529 |
num_workers = data_args.preprocessing_num_workers
|
@@ -537,7 +537,7 @@ def main():
|
|
537 |
raw_datasets = raw_datasets.filter(
|
538 |
is_text_valid,
|
539 |
num_proc=num_workers,
|
540 |
-
input_columns=["
|
541 |
desc="remove single words, single chars and 'W O R D S'",
|
542 |
)
|
543 |
|
|
|
521 |
|
522 |
def remove_special_characters(batch):
|
523 |
if chars_to_ignore_regex is not None:
|
524 |
+
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
|
525 |
else:
|
526 |
+
batch["target_text"] = batch[text_column_name].replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
|
527 |
return batch
|
528 |
|
529 |
num_workers = data_args.preprocessing_num_workers
|
|
|
537 |
raw_datasets = raw_datasets.filter(
|
538 |
is_text_valid,
|
539 |
num_proc=num_workers,
|
540 |
+
input_columns=["target_text"],
|
541 |
desc="remove single words, single chars and 'W O R D S'",
|
542 |
)
|
543 |
|