whisper-medium

Sleeping

File size: 2,664 Bytes

ccfa27a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37fa3c8
ccfa27a
 
 
 
 
16476dc
 
ccfa27a
 
 
16476dc
ccfa27a
 
 
1728aa5
536d4b9
ccfa27a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44f5653
ccfa27a
d74df01
ccfa27a
 
 
 
 
 
 
44f5653
ccfa27a
 
 
16476dc
ccfa27a
d74df01
ccfa27a
 
 
 
 
 
44f5653
ccfa27a

import html
import tempfile
from urllib.parse import parse_qs, urlparse

import gradio as gr
import pytube as pt
import torch
from huggingface_hub import model_info
from transformers import pipeline

# Hub identifier of the Whisper checkpoint served by this demo.
MODEL_NAME = "openai/whisper-medium"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Speech-recognition pipeline shared by every request; audio is processed in
# 30-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Languages listed under the "language" key of the model card metadata.
langs = model_info(MODEL_NAME).cardData["language"]

# Collapsible footer shown under each interface. The summary line reports how
# many languages are listed and the body shows the list itself. (A stray ">"
# that used to follow </summary> was rendered as literal text and is removed.)
article = f"<details><summary>模型支持 {len(langs)} 语言! (单击展开)</summary> {langs}</details>"

def transcribe(microphone, file_upload):
    """Transcribe a microphone recording or an uploaded audio file as Chinese.

    Args:
        microphone: Microphone recording to transcribe, or ``None`` when
            nothing was recorded.
        file_upload: Uploaded audio file to transcribe, or ``None`` when
            nothing was uploaded.

    Returns:
        The transcribed text. When both inputs are provided the microphone
        recording is used and the text is prefixed with a warning. When
        neither is provided an error message is returned instead.
    """
    # Nothing to transcribe: report the problem without touching the model.
    if microphone is None and file_upload is None:
        return "ERROR: 你必须使用麦克风录制或上传音频文件"

    warn_output = ""
    if microphone is not None and file_upload is not None:
        warn_output = (
            "WARNING:上传一个音频文件或者使用麦克风录制. "
            "使用麦克风录制音频文件丢弃上传的音频文件.\n"
        )

    # The microphone recording takes precedence over the uploaded file.
    file = microphone if microphone is not None else file_upload

    # The pipeline call has no `language` argument of its own; decoding
    # options must be forwarded to generation through `generate_kwargs`.
    # "chinese" is Whisper's full language name for the "zh" code.
    text = pipe(file, generate_kwargs={"language": "chinese"})["text"]
    # TODO: translation target: Chinese
    return warn_output + text


def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str


def yt_transcribe(yt_url):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        A ``(html, text)`` tuple: the HTML snippet embedding the video and
        the transcription of its audio.
    """
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    stream = yt.streams.filter(only_audio=True)[0]

    # Download into a per-call temporary directory instead of a fixed
    # "audio.mp3" in the working directory: concurrent requests no longer
    # overwrite each other's audio, a leftover file from an earlier request
    # cannot be picked up by mistake, and the file is removed afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        text = pipe(audio_path)["text"]

    return html_embed_str, text


# Root container that hosts both demos as tabs.
demo = gr.Blocks()

# Presentation options common to the two interfaces.
_shared_options = dict(
    layout="horizontal",
    theme="huggingface",
    article=article,
    allow_flagging="never",
)

# --- Tab 1: transcribe a microphone recording or an uploaded file ----------
microphone_input = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
upload_input = gr.inputs.Audio(source="upload", type="filepath", optional=True)

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[microphone_input, upload_input],
    outputs="text",
    title="口译示例: 音频转录",
    description="转录麦克风录制或上传的音频文件!",
    **_shared_options,
)

# --- Tab 2: transcribe the audio of a YouTube video ------------------------
url_input = gr.inputs.Textbox(lines=1, placeholder="请粘贴视频地址", label="视频地址URL")

# NOTE: this rebinds the module-level name `yt_transcribe` from the function
# to the interface wrapping it; `fn=` is evaluated before the assignment, so
# the interface still receives the function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[url_input],
    outputs=["html", "text"],
    title="口译示例: 视频转录",
    description="转录上传的视频文件!",
    **_shared_options,
)

# Mount both interfaces as tabs inside the root container.
with demo:
    tab_interfaces = [mf_transcribe, yt_transcribe]
    tab_titles = ["音频转录", "视频转录"]
    gr.TabbedInterface(tab_interfaces, tab_titles)

# Start the app with request queuing enabled.
demo.launch(enable_queue=True)