Commit
·
044dff6
1
Parent(s):
fcf680c
log df of train and test data
Browse files- run.sh +1 -1
- run_speech_recognition_ctc.py +6 -0
run.sh
CHANGED
@@ -5,7 +5,7 @@ python run_speech_recognition_ctc.py \
|
|
5 |
--train_split_name="train+validation,train" \
|
6 |
--eval_split_name="test,None" \
|
7 |
--output_dir="./" \
|
8 |
-
--
|
9 |
--num_train_epochs="3" \
|
10 |
--per_device_train_batch_size="32" \
|
11 |
--per_device_eval_batch_size="32" \
|
|
|
5 |
--train_split_name="train+validation,train" \
|
6 |
--eval_split_name="test,None" \
|
7 |
--output_dir="./" \
|
8 |
+
--preprocessing_only \
|
9 |
--num_train_epochs="3" \
|
10 |
--per_device_train_batch_size="32" \
|
11 |
--per_device_eval_batch_size="32" \
|
run_speech_recognition_ctc.py
CHANGED
@@ -750,6 +750,12 @@ def main():
|
|
750 |
# If dataset_seed is set, shuffle train
|
751 |
if data_args.dataset_seed is not None:
|
752 |
vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(seed=data_args.dataset_seed)
|
|
|
|
|
|
|
|
|
|
|
|
|
753 |
|
754 |
# for large datasets it is advised to run the preprocessing on a
|
755 |
# single machine first with ``args.preprocessing_only`` since there will most likely
|
|
|
750 |
# If dataset_seed is set, shuffle train
|
751 |
if data_args.dataset_seed is not None:
|
752 |
vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(seed=data_args.dataset_seed)
|
753 |
+
|
754 |
+
# Log sample of datasets
|
755 |
+
pd_train = vectorized_datasets["train"].select(range(10)).to_pandas()
|
756 |
+
pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
|
757 |
+
wandb.log({"train_sample": pd_train})
|
758 |
+
wandb.log({"eval_sample": pd_eval})  # wandb.log expects a dict mapping metric name -> value
|
759 |
|
760 |
# for large datasets it is advised to run the preprocessing on a
|
761 |
# single machine first with ``args.preprocessing_only`` since there will most likely
|