File size: 1,945 Bytes

0a4aac9
 
 
 
e0a80e0
f376027
0a4aac9
 
 
 
 
e0a80e0
 
 
 
 
f376027
6df6849
f376027
 
 
 
0a4aac9
 
 
 
 
 
91f436b
0a4aac9
91f436b
0a4aac9
 
e0a80e0
 
 
 
 
 
 
 
f376027
e0a80e0
47e7453
e0a80e0
 
 
f376027
e0a80e0
f376027
 
 
c8f6664
 
 
f376027
0a4aac9
 
e0a80e0

import os
import tempfile
import time
from typing import Any, Dict

import pytube
import torch
import whisper
from transformers.pipelines.audio_utils import ffmpeg_read


class EndpointHandler():
    """Inference handler that transcribes the audio track of a YouTube video.

    The Whisper model is loaded once at construction time and reused for
    every request handled through ``__call__``.
    """

    def __init__(self, path=""):
        """Load the Whisper model onto the GPU when available, else the CPU.

        Args:
            path: Unused by this handler; kept so the constructor signature
                stays compatible with callers that pass a model directory.
        """
        # load the model
        MODEL_NAME = "tiny.en"

        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f'whisper will use: {device}')

        t0 = time.perf_counter()
        # Load straight onto the target device instead of loading to the
        # default device first and moving the weights afterwards.
        self.model = whisper.load_model(MODEL_NAME, device=device)
        total = time.perf_counter() - t0
        print(f'Finished loading model in {total} seconds')

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Download the audio of a YouTube video and transcribe it.

        Args:
            data (:obj:`dict`):
                Request payload; ``data["inputs"]`` must be the URL of the
                video to transcribe. The payload is not modified.

        Return:
            A :obj:`dict` with a single ``"transcript"`` key holding the
            result of ``model.transcribe`` with the per-segment ``tokens``
            arrays removed.

        Raises:
            ValueError: If ``"inputs"`` is missing, empty, or not a string,
                or if the video exposes no audio-only stream.
        """
        # process input
        print('data', data)
        video_url = data.get("inputs")
        if not isinstance(video_url, str) or not video_url.strip():
            raise ValueError(
                'Expected "inputs" to be a non-empty video URL string, '
                f'got {video_url!r}'
            )

        decode_options = {
            # Set language to None to support multilingual,
            # but it will take longer to process while it detects the language.
            # Realized this by running in verbose mode and seeing how much time
            # was spent on the decoding language step
            "language": "en",
            "verbose": True,
        }

        yt = pytube.YouTube(video_url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            raise ValueError(f'No audio-only stream available for {video_url!r}')

        # Download into a per-request temporary directory so concurrent
        # requests cannot overwrite each other's files and the audio is
        # always removed afterwards, even when transcription fails.
        with tempfile.TemporaryDirectory() as workdir:
            filename = f"{yt.video_id}.mp3"
            downloaded = stream.download(output_path=workdir, filename=filename)
            # Fall back to the expected location if download() returns nothing.
            path_to_audio = downloaded or os.path.join(workdir, filename)

            t0 = time.perf_counter()
            transcript = self.model.transcribe(path_to_audio, **decode_options)
            total = time.perf_counter() - t0
            print(f'Finished transcription in {total} seconds')

        for segment in transcript['segments']:
            # Remove the tokens array, it was making response too verbose
            segment.pop('tokens', None)

        # postprocess the prediction
        return {"transcript": transcript}