# nb-whisper-demo / app.py
import time
import os
import re
import base64
import torch
import torchaudio
import gradio as gr
import spaces
from transformers import pipeline
try:
    import flash_attn  # optional dependency; enables Flash Attention 2 when present
    FLASH_ATTENTION = True
except ImportError:
    FLASH_ATTENTION = False

import yt_dlp  # yt-dlp is used to download YouTube audio
MODEL_NAME = "NbAiLab/nb-whisper-large"
lang = "no"

# Embed the logo as base64 so it can be inlined in the HTML header below.
with open("Logonew.png", "rb") as img_file:
    base64_image = base64.b64encode(img_file.read()).decode('utf-8')

max_audio_length = 30 * 60  # maximum supported audio length in seconds (30 minutes)
# A first character of t/y/1 (e.g. "true", "yes", "1") enables a public share link.
share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
auth_token = os.environ.get("AUTH_TOKEN") or True

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
@spaces.GPU(duration=60 * 2)
def pipe(file, return_timestamps=False):
    """Build the ASR pipeline on the GPU worker and transcribe one file."""
    asr = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=28,
        device=device,
        token=auth_token,
        torch_dtype=torch.float16,
        model_kwargs={
            "attn_implementation": "flash_attention_2" if FLASH_ATTENTION else "sdpa",
            "num_beams": 5,
        },
    )
    asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    return asr(file, return_timestamps=return_timestamps, batch_size=24)
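# Illustrative usage of pipe() (the file name is a placeholder): the 🤗 ASR
# pipeline returns a dict with a "text" key, plus "chunks" when timestamps
# are requested:
#
#   result = pipe("sample.wav", return_timestamps=True)
#   result["text"]    # full transcript as one string
#   result["chunks"]  # [{"timestamp": (0.0, 4.2), "text": " Hei ..."}, ...]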
def format_output(text):
    # Insert an HTML line break after ".", "!", ":", "?" or an ellipsis ("...")
    # so the transcript renders as separate lines.
    text = re.sub(r'(\.{3,}|[.!:?])', lambda m: m.group() + '<br>', text)
    return text
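# For example, format_output("Hei. Går det bra?") returns
# "Hei.<br> Går det bra?<br>".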
def transcribe(file, return_timestamps=False):
    waveform, sample_rate = torchaudio.load(file)
    audio_duration = waveform.size(1) / sample_rate

    if audio_duration > max_audio_length:
        # Trim the waveform to the first 30 minutes.
        waveform = waveform[:, : int(max_audio_length * sample_rate)]
        truncated_file = "truncated_audio.wav"
        torchaudio.save(truncated_file, waveform, sample_rate)
        file_to_transcribe = truncated_file
        truncated = True
    else:
        file_to_transcribe = file
        truncated = False

    if not return_timestamps:
        text = pipe(file_to_transcribe)["text"]
        formatted_text = format_output(text)
    else:
        chunks = pipe(file_to_transcribe, return_timestamps=True)["chunks"]
        text = []
        for chunk in chunks:
            start_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][0])) if chunk["timestamp"][0] is not None else "??:??:??"
            end_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][1])) if chunk["timestamp"][1] is not None else "??:??:??"
            line = f"[{start_time} -> {end_time}] {chunk['text']}"
            text.append(line)
        formatted_text = "\n".join(text)

    if truncated:
        link = "https://github.com/NbAiLab/nostram/blob/main/leverandorer.md"
        # Note: the last line must be an f-string, otherwise the literal "{link}"
        # is shown to the user instead of the URL.
        disclaimer = (
            "\n\nDette er en demo. Det er ikke tillatt å bruke denne teksten i profesjonell sammenheng. "
            "Vi anbefaler at hvis du trenger å transkribere lengre opptak, så kjører du enten modellen lokalt "
            "eller sjekker denne siden for å se hvem som leverer løsninger basert på NB-Whisper: "
            f"<a href='{link}' target='_blank'>denne siden</a>."
        )
        formatted_text += f"<br><br><i>{disclaimer}</i>"

    formatted_text += "<br><br><i>Transkribert med NB-Whisper demo</i>"
    return formatted_text
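# Sketch of the two output shapes produced by transcribe() (the file name and
# transcript values are illustrative):
#
#   transcribe("tale.wav")                          # "Hei.<br> Velkommen.<br> ..."
#   transcribe("tale.wav", return_timestamps=True)  # "[00:00:00 -> 00:00:04]  Hei ...\n..."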
def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str
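# For example, _return_yt_html_embed("https://www.youtube.com/watch?v=abc123")
# embeds https://www.youtube.com/embed/abc123. Note that the naive "?v=" split
# does not handle extra query parameters or youtu.be short links.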
def yt_transcribe(yt_url, return_timestamps=False):
    html_embed_str = _return_yt_html_embed(yt_url)
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])
    # The FFmpeg postprocessor always produces "audio.mp3" given the outtmpl above.
    text = transcribe("audio.mp3", return_timestamps=return_timestamps)
    return html_embed_str, text
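# Illustrative call (the URL is the example listed for the commented-out
# YouTube interface below):
#
#   html, text = yt_transcribe("https://www.youtube.com/watch?v=mukeSSa5GKo",
#                              return_timestamps=True)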
# Build the Gradio app (no tabs).
demo = gr.Blocks()

with demo:
    with gr.Column():
        gr.HTML(f"<img src='data:image/png;base64,{base64_image}' style='width:200px;'>")
    with gr.Column(scale=8):
        # Use Markdown for the title.
        gr.Markdown(
            """
            <h1 style="font-size: 3em;">NB-Whisper Demo</h1>
            """
        )

    mf_transcribe = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
            gr.components.Checkbox(label="Inkluder tidsstempler"),
        ],
        outputs=gr.HTML(label="text"),
        description=(
            "Transkriber lange lydopptak fra mikrofon eller lydfiler med et enkelt klikk! Demoen bruker den fintunede"
            f" modellen [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler opp til 30 minutter."
        ),
        allow_flagging="never",
    )
    # Uncomment to add the YouTube transcription interface if needed:
    # yt_transcribe_interface = gr.Interface(
    #     fn=yt_transcribe,
    #     inputs=[
    #         gr.components.Textbox(lines=1, placeholder="Lim inn URL til en YouTube-video her", label="YouTube URL"),
    #         gr.components.Checkbox(label="Inkluder tidsstempler"),
    #     ],
    #     examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
    #     outputs=["html", "text"],
    #     title="Whisper Demo: Transkriber YouTube",
    #     description=(
    #         "Transkriber lange YouTube-videoer med et enkelt klikk! Demoen bruker den fintunede modellen:"
    #         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler av"
    #         " vilkårlig lengde."
    #     ),
    #     allow_flagging="never",
    # )
# Launch the demo (no tabs); queue() must be applied before launch() to take effect.
demo.queue().launch(share=share, show_api=False, allowed_paths=["Logo_2.png"])