# video-search/handler.py
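"""
Custom inference handler for the video-search endpoint.

The EndpointHandler class below matches the custom-handler interface used by
Hugging Face Inference Endpoints (an __init__(path) constructor and a
__call__(data) method). It downloads the audio track of a YouTube video with
pytube and transcribes it with OpenAI Whisper.
"""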
import time
from typing import Any, Dict

import pytube
import torch
import whisper


class EndpointHandler:
def __init__(self, path=""):
# load the model
MODEL_NAME = "tiny.en"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'whisper will use: {device}')
t0 = time.time()
self.model = whisper.load_model(MODEL_NAME).to(device)
t1 = time.time()
total = t1-t0
print(f'Finished loading model in {total} seconds')

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                includes the URL of the video to transcribe under the "inputs" key
        Return:
            A :obj:`dict` containing the transcript (full text plus segments)
        """
# process input
print('data', data)
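        # read the video URL from the "inputs" key, falling back to the raw payload if it is missing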
video_url = data.pop("inputs", data)
        decode_options = {
            # Set language to None to support multilingual audio,
            # but transcription will take longer while the language is detected.
            # Realized this by running in verbose mode and seeing how much time
            # was spent on the language-detection step.
            "language": "en",
            "verbose": True
        }
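        # download the audio-only stream of the video with pytube and save it locally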
yt = pytube.YouTube(video_url)
stream = yt.streams.filter(only_audio=True)[0]
path_to_audio = f"{yt.video_id}.mp3"
stream.download(filename=path_to_audio)
t0 = time.time()
transcript = self.model.transcribe(path_to_audio, **decode_options)
t1 = time.time()
total = t1-t0
print(f'Finished transcription in {total} seconds')
for segment in transcript['segments']:
            # Remove the tokens array; it was making the response too verbose
segment.pop('tokens', None)
# postprocess the prediction
return {"transcript": transcript}
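

# Minimal local-usage sketch, assuming this file is run directly; on a deployed
# endpoint the serving runtime constructs and calls EndpointHandler itself.
# The URL below is a placeholder ("VIDEO_ID" is not a real video id).
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": "https://www.youtube.com/watch?v=VIDEO_ID"})
    # the returned transcript mirrors whisper's output: full text plus per-segment details
    print(result["transcript"]["text"])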