from typing import Any, Dict

import time

import pytube
import torch
import whisper


class EndpointHandler():
    def __init__(self, path=""):
        # Load the model once at startup so every request reuses it.
        MODEL_NAME = "tiny.en"
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"whisper will use: {device}")

        t0 = time.time()
        self.model = whisper.load_model(MODEL_NAME).to(device)
        t1 = time.time()
        print(f"Finished loading model in {t1 - t0} seconds")

    def __call__(self, data: Dict[str, str]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`): includes the URL of the video to transcribe
                under the "inputs" key.
        Return:
            A :obj:`dict` containing the transcription result.
        """
        # Process input.
        print("data", data)
        video_url = data.pop("inputs", data)

        decode_options = {
            # Set language to None to support multilingual input, but
            # transcription then takes longer while Whisper detects the
            # language. (Observed by running in verbose mode and seeing how
            # much time was spent on the language-detection step.)
            "language": "en",
            "verbose": True,
        }

        # Download the audio-only stream of the video.
        yt = pytube.YouTube(video_url)
        stream = yt.streams.filter(only_audio=True)[0]
        path_to_audio = f"{yt.video_id}.mp3"
        stream.download(filename=path_to_audio)

        t0 = time.time()
        transcript = self.model.transcribe(path_to_audio, **decode_options)
        t1 = time.time()
        print(f"Finished transcription in {t1 - t0} seconds")

        # Remove the tokens arrays; they made the response too verbose.
        for segment in transcript["segments"]:
            segment.pop("tokens", None)

        # Postprocess the prediction.
        return {"transcript": transcript}
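
# --- Local smoke test: a minimal sketch, not part of the Inference Endpoints
# contract (the platform instantiates EndpointHandler and calls it per
# request). The URL below is a placeholder assumption; substitute any short
# YouTube video you have the rights to transcribe.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": "https://www.youtube.com/watch?v=EXAMPLE_ID"})
    # Print the first few transcribed segments as a sanity check.
    for segment in result["transcript"]["segments"][:3]:
        print(segment["start"], segment["text"])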