Spaces:

RamAnanth1
/

Youtube-to-HF-Dataset

Youtube-to-HF-Dataset / downloader /whisper_post_processor.py

Update downloader/whisper_post_processor.py

d297f8f almost 2 years ago

1.66 kB

	import os
	import re

	from datasets import Dataset
	from yt_dlp.postprocessor import PostProcessor

	from interpreter import WhisperInterpreter
	from utils import VIDEO_INFO, json_dump

	class WhisperPP(PostProcessor):
	    """yt-dlp post-processor that runs a Whisper interpreter on each video.

	    For every processed download, the selected ``VIDEO_INFO`` fields are
	    merged with the interpreter output into one record. The record is
	    stored in ``data`` and optionally written next to the media file as
	    JSON.

	    Args:
	        data: Container that collects the records. Either a ``list``
	            (records are appended) or an object exposing ``add_item``,
	            ``num_rows`` and ``push_to_hub`` -- presumably a
	            ``datasets.Dataset``; confirm against the caller.
	        name: Repository id passed to ``push_to_hub``.
	        **whisper_options: Options for this post-processor and the
	            interpreter. The keys below are consumed here; everything else
	            is forwarded unchanged to the interpreter call in ``run``.

	            * ``model_size`` (default ``"base"``): passed to
	              ``WhisperInterpreter``.
	            * ``mode`` (default ``"transcribe"``): name of the interpreter
	              method invoked for each file.
	            * ``write`` (default ``False``): when truthy, dump each record
	              to a ``.json`` file beside the media file.
	            * ``number_videos`` (default ``0``): row count at which the
	              data is pushed to the hub; ``0`` disables pushing.
	    """

	    # Captures the text between "datasets/" and "/resolve" in a URL.
	    _REPO_ID_PATTERN = re.compile(r"(?<=datasets\/)(.*?)(?=\/resolve)")

	    def __init__(self, data, name, **whisper_options):
	        super().__init__()
	        self._options = whisper_options
	        interpreter = WhisperInterpreter(self._options.pop("model_size", "base"))
	        self.data = data
	        # Resolve the interpreter method once; ``run`` calls it per file.
	        self._process = getattr(interpreter, self._options.pop("mode", "transcribe"))
	        # Default to False so that omitting ``write`` does not raise KeyError,
	        # in line with the other optional settings.
	        self._write = self._options.pop("write", False)
	        self.videos_to_process = self._options.pop("number_videos", 0)
	        self.repoId = name

	    def run(self, info):
	        """Process one downloaded file and record the result.

	        Args:
	            info: yt-dlp info dict. Must contain ``"id"``, ``"filepath"``
	                and every key listed in ``VIDEO_INFO``.

	        Returns:
	            ``([], info)`` -- per the yt-dlp post-processor contract, an
	            empty list of files to delete and the unmodified info dict.
	        """
	        self.to_screen(f"Processing Video {info['id']}")
	        result = {key: info[key] for key in VIDEO_INFO}
	        result.update(self._process(info["filepath"], **self._options))
	        self.to_screen(f"Processed Video {info['id']} and appended results.")
	        self._update_data(result)
	        if self._write:
	            # Strip only the final extension. Splitting on the first "."
	            # breaks relative paths ("./dir/a.mp4") and dotted titles.
	            base_path, _ = os.path.splitext(info["filepath"])
	            json_dump(result, f"{base_path}.json")
	        return [], info

	    def _update_data(self, record):
	        """Add ``record`` to ``data`` and push to the hub when the limit is hit.

	        Lists are appended to in place. Any other container is replaced by
	        the result of its ``add_item`` call, and is pushed to ``repoId`` once
	        it holds at least ``videos_to_process`` rows (``0`` means never).
	        """
	        if isinstance(self.data, list):
	            self.data.append(record)
	            return
	        self.data = self.data.add_item(record)
	        if self.videos_to_process != 0 and self.data.num_rows >= self.videos_to_process:
	            self.data.push_to_hub(self.repoId)

	    def get_data(self):
	        """Return the container holding all records collected so far."""
	        return self.data

	    def _get_name(self):
	        """Return the repo id parsed from the data's first download URL.

	        Reads the first key of ``data.info.download_checksums`` and extracts
	        the text between ``datasets/`` and ``/resolve``. Returns ``""`` when
	        the data carries no checksum info (e.g. it is a plain list), the
	        mapping is empty, or the URL does not match.
	        """
	        dataset_info = getattr(self.data, "info", None)
	        checksums = getattr(dataset_info, "download_checksums", None)
	        if not checksums:
	            return ""
	        first_url = next(iter(checksums))
	        match = self._REPO_ID_PATTERN.search(first_url)
	        return match.group(1) if match else ""