File size: 1,945 Bytes
0a4aac9 e0a80e0 f376027 0a4aac9 e0a80e0 f376027 6df6849 f376027 0a4aac9 91f436b 0a4aac9 91f436b 0a4aac9 e0a80e0 f376027 e0a80e0 47e7453 e0a80e0 f376027 e0a80e0 f376027 c8f6664 f376027 0a4aac9 e0a80e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import tempfile
import time
from typing import Dict

import pytube
import torch
import whisper
from transformers.pipelines.audio_utils import ffmpeg_read
class EndpointHandler():
    """Callable handler that fetches a YouTube video's audio and transcribes it with Whisper."""

    def __init__(self, path=""):
        """Load the Whisper model once so that every call reuses it.

        Args:
            path: Not read by this handler.
                NOTE(review): presumably kept because the hosting runtime
                passes a model directory to the constructor -- confirm.
        """
        # load the model
        MODEL_NAME = "tiny.en"
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f'whisper will use: {device}')
        t0 = time.perf_counter()
        self.model = whisper.load_model(MODEL_NAME).to(device)
        total = time.perf_counter() - t0
        print(f'Finished loading model in {total} seconds')

    @staticmethod
    def _extract_url(data):
        """Return the video URL stored under ``data["inputs"]`` without mutating ``data``."""
        # ``get`` instead of ``pop``: the caller's payload must stay intact.
        video_url = data.get("inputs", data)
        if not isinstance(video_url, str):
            # Fail here with a clear message instead of deep inside pytube's
            # URL parsing when the key is missing or holds a non-string.
            raise TypeError(
                'expected data["inputs"] to be a video URL string, '
                f'got {type(video_url).__name__}'
            )
        return video_url

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, dict]:
        """Transcribe the audio track of the video referenced by the request.

        Args:
            data: Request payload; ``data["inputs"]`` must be the URL of the
                video to transcribe. The payload is not modified.

        Returns:
            ``{"transcript": transcript}`` where ``transcript`` is the result
            of ``self.model.transcribe`` with the ``tokens`` entry removed
            from every item of ``transcript["segments"]``.

        Raises:
            TypeError: If ``data["inputs"]`` is missing or is not a string.
            IndexError: If the video exposes no audio-only stream.
        """
        # process input
        print('data', data)
        video_url = self._extract_url(data)
        decode_options = {
            # Set language to None to support multilingual,
            # but it will take longer to process while it detects the language.
            # Realized this by running in verbose mode and seeing how much time
            # was spent on the decoding language step
            "language": "en",
            "verbose": True,
        }
        yt = pytube.YouTube(video_url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            # Same exception type the previous ``[0]`` indexing produced,
            # but with a message that explains what went wrong.
            raise IndexError(f'no audio-only stream available for {video_url!r}')

        # Download into a throw-away directory so the audio file is removed
        # after transcription, even when transcription raises; previously
        # every request left a file behind in the working directory.
        with tempfile.TemporaryDirectory() as workdir:
            # NOTE(review): the file is named ".mp3" regardless of the
            # stream's real container, as before -- confirm the decoder
            # does not rely on the extension.
            path_to_audio = stream.download(
                output_path=workdir,
                filename=f"{yt.video_id}.mp3",
            )
            t0 = time.perf_counter()
            transcript = self.model.transcribe(path_to_audio, **decode_options)
            total = time.perf_counter() - t0
        print(f'Finished transcription in {total} seconds')

        for segment in transcript['segments']:
            # Remove the tokens array, it was making response too verbose
            segment.pop('tokens', None)
        # postprocess the prediction
        return {"transcript": transcript}
|