|
from typing import Dict |
|
from transformers.pipelines.audio_utils import ffmpeg_read |
|
import whisper |
|
import torch |
|
import pytube |
|
import time |
|
|
|
|
|
class EndpointHandler():
    """Endpoint handler that transcribes the audio of a YouTube video with Whisper.

    The Whisper model is loaded once in ``__init__`` and reused by every
    call; ``__call__`` downloads the audio track of the requested video and
    runs the model on it.
    """

    def __init__(self, path=""):
        """Load the Whisper model and keep it on the instance.

        Args:
            path: Accepted for interface compatibility; this handler does not
                read anything from it.
        """
        MODEL_NAME = "tiny.en"

        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f'whisper will use: {device}')

        t0 = time.time()
        # Must live on the instance: __call__ reads ``self.model``. Binding the
        # model to a local name here made every request fail with
        # AttributeError.
        self.model = whisper.load_model(MODEL_NAME).to(device)
        t1 = time.time()

        total = t1-t0
        print(f'Finished loading model in {total} seconds')

    def __call__(self, data: Dict[str, object]) -> Dict[str, object]:
        """
        Args:
            data (:obj:`dict`):
                request payload; ``data["inputs"]`` is the URL of the video
                to transcribe. The ``"inputs"`` key is popped from ``data``.
        Return:
            A :obj:`dict` of the form ``{"transcript": result}`` where
            ``result`` is whatever ``self.model.transcribe`` returns.
        Raises:
            ValueError: if ``"inputs"`` is missing, is not a non-empty string,
                or the video exposes no audio-only stream.
        """
        # Function-scope import so the block does not depend on edits to the
        # file-level import list.
        import tempfile

        print('data', data)
        video_url = data.pop("inputs", data)
        # Without "inputs" the fallback above yields the payload dict itself;
        # reject it here instead of letting the downloader fail obscurely.
        if not isinstance(video_url, str) or not video_url.strip():
            raise ValueError(
                'request payload must contain a non-empty string under "inputs" '
                '(the URL of the video to transcribe)'
            )

        decode_options = {
            "language": "en",
            "verbose": True,
        }

        yt = pytube.YouTube(video_url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            raise ValueError(f'no audio-only stream available for {video_url!r}')

        # Download into a per-request temporary directory so the audio file is
        # always removed afterwards (even if transcription raises) and
        # concurrent requests for the same video never share a file.
        with tempfile.TemporaryDirectory() as workdir:
            path_to_audio = stream.download(
                output_path=workdir,
                filename=f"{yt.video_id}.mp3",
            )

            t0 = time.time()
            transcript = self.model.transcribe(path_to_audio, **decode_options)
            t1 = time.time()

        total = t1-t0
        print(f'Finished transcription in {total} seconds')

        return {"transcript": transcript}
|
|