File size: 1,945 Bytes
0a4aac9
 
 
 
e0a80e0
f376027
0a4aac9
 
 
 
 
e0a80e0
 
 
 
 
f376027
6df6849
f376027
 
 
 
0a4aac9
 
 
 
 
 
91f436b
0a4aac9
91f436b
0a4aac9
 
e0a80e0
 
 
 
 
 
 
 
f376027
e0a80e0
47e7453
e0a80e0
 
 
f376027
e0a80e0
f376027
 
 
c8f6664
 
 
f376027
0a4aac9
 
e0a80e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import tempfile
import time
from typing import Dict

import pytube
import torch
import whisper
from transformers.pipelines.audio_utils import ffmpeg_read


class EndpointHandler:
    """Endpoint handler that transcribes the audio track of a YouTube video.

    The Whisper model is loaded once in ``__init__`` and reused by every
    request handled through ``__call__``.
    """

    # Checkpoint name handed to ``whisper.load_model``.
    MODEL_NAME = "tiny.en"

    # Options forwarded to ``model.transcribe`` on every request.
    DECODE_OPTIONS = {
        # Set language to None to support multilingual,
        # but it will take longer to process while it detects the language.
        # Realized this by running in verbose mode and seeing how much time
        # was spent on the decoding language step
        "language": "en",
        "verbose": True,
    }

    def __init__(self, path=""):
        """Load the Whisper model on the GPU when one is available, else CPU.

        Args:
            path: not used by this handler; kept so existing callers that
                pass a path keep working.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f'whisper will use: {device}')

        started = time.perf_counter()
        # Load directly onto the target device instead of loading and then
        # moving the model with ``.to(device)``.
        self.model = whisper.load_model(self.MODEL_NAME, device=device)
        elapsed = time.perf_counter() - started
        print(f'Finished loading model in {elapsed} seconds')

    def __call__(self, data: Dict[str, object]) -> Dict[str, dict]:
        """Download the audio of a YouTube video and transcribe it.

        Args:
            data (:obj:`dict`):
                request payload; ``data["inputs"]`` must be the URL of the
                video to transcribe. The payload is left unmodified.
        Return:
            A :obj:`dict`: ``{"transcript": result}`` where ``result`` is what
            ``model.transcribe`` returned, with the ``tokens`` entry removed
            from every segment.
        Raises:
            ValueError: if ``inputs`` is missing, is not a string or is blank,
                or if the video exposes no audio-only stream.
        """
        # process input
        print('data', data)
        # Read without popping so the caller's payload is not mutated.
        video_url = data.get("inputs")
        if not isinstance(video_url, str) or not video_url.strip():
            raise ValueError(
                'Expected a non-empty video URL string under the "inputs" key'
            )
        video_url = video_url.strip()

        yt = pytube.YouTube(video_url)
        # ``first()`` yields None on an empty query, which lets us report the
        # problem explicitly instead of failing with a bare IndexError.
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            raise ValueError(f"No audio-only stream available for {video_url!r}")

        # Download into a per-request temporary directory: the file is removed
        # once transcription finishes (even on error), and simultaneous
        # requests for the same video do not share a path.
        with tempfile.TemporaryDirectory() as work_dir:
            path_to_audio = stream.download(
                output_path=work_dir,
                filename=f"{yt.video_id}.mp3",
            )
            started = time.perf_counter()
            transcript = self.model.transcribe(path_to_audio, **self.DECODE_OPTIONS)
            elapsed = time.perf_counter() - started
        print(f'Finished transcription in {elapsed} seconds')

        for segment in transcript['segments']:
            # Remove the tokens array, it was making response too verbose
            segment.pop('tokens', None)

        # postprocess the prediction
        return {"transcript": transcript}