# split_xml_mgb2.py
# Split MGB-2 recordings into per-segment wav/txt pairs using the
# UTF-8 XML transcripts shipped with the archive.
import xml.etree.ElementTree as ET
from pathlib import Path

import soundfile as sf
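# soundfile is the only third-party dependency (pip install soundfile);
# pathlib and xml.etree are in the standard library.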
split = "train"  # or "dev"

# set the following path to where you extracted the mgb2 archive
archive_path = Path("data/train")
wav_dir = archive_path / "wav"
xml_dir = archive_path / "xml" / "utf8"  # per-recording XML transcripts

# output directories
output_wav_dir = archive_path / "dataset" / split / "wav"
output_txt_dir = archive_path / "dataset" / split / "txt"

# create directories for output datasets
output_wav_dir.mkdir(parents=True, exist_ok=True)
output_txt_dir.mkdir(parents=True, exist_ok=True)
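
# The loop below relies on the XML layout implied by this script's index
# accesses; a rough sketch (names other than <recording>, <segment>, and
# their attributes are inferred from the indexing, not the official
# MGB-2 schema):
#
#   root
#   |-- root[0]        header, contains <recording filename="..."/>
#   `-- root[1]
#       `-- root[1][0] the <segment starttime="..." endtime="..."> list,
#                      whose child elements carry the transcript words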
# for every xml transcript file under the utf8 directory of the archive
for s_file in xml_dir.glob("*.xml"):
    tree = ET.parse(str(s_file))
    root = tree.getroot()
    head = root[0]
    segments = root[1][0]

    # get the name of the wav file from the <recording> tag in the header
    file_name = None
    for child in head:
        if child.tag == "recording":
            print(child.attrib)
            file_name = child.attrib.get("filename")

    # get the start and end times from each <segment> under the segments tag
    # and join the text of its children to construct the transcript
    for segment in segments:
        # times are given in seconds; the MGB-2 wavs are 16 kHz, so
        # convert them to sample offsets for soundfile
        start_time = int(float(segment.attrib.get("starttime")) * 16_000)
        end_time = int(float(segment.attrib.get("endtime")) * 16_000)
        text = " ".join(x.text for x in segment if x.text)

        # store the trimmed audio and its transcript in the output
        # directories, one wav/txt pair per segment
        wav_path = wav_dir / f"{file_name}.wav"
        sound, _ = sf.read(wav_path, start=start_time, stop=end_time)
        seg_name = f"{file_name}_seg{start_time}_{end_time}"
        sf.write(output_wav_dir / f"{seg_name}.wav", sound, 16_000)
        (output_txt_dir / f"{seg_name}.txt").write_text(text, encoding="utf-8")
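
# Optional sanity check (an addition, not part of the original script):
# every wav emitted above should have a matching transcript file.
for wav_file in output_wav_dir.glob("*.wav"):
    txt_file = output_txt_dir / f"{wav_file.stem}.txt"
    assert txt_file.exists(), f"missing transcript for {wav_file.name}"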