import gradio as gr from whisperplus.utils.download_utils import download_and_convert_to_mp3 import logging import torch from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') class SpeechToTextPipeline: """Class for converting audio to text using a pre-trained speech recognition model.""" def __init__(self, model_id: str = "openai/whisper-large-v3"): self.model = None self.device = None if self.model is None: self.load_model(model_id) else: logging.info("Model already loaded.") def load_model(self, model_id: str = "openai/whisper-large-v3"): """ Loads the pre-trained speech recognition model and moves it to the specified device. Args: model_id (str): Identifier of the pre-trained model to be loaded. """ logging.info("Loading model...") model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True) model.to(self.device) logging.info("Model loaded successfully.") self.model = model def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"): """ Converts audio to text using the pre-trained speech recognition model. Args: audio_path (str): Path to the audio file to be transcribed. model_id (str): Identifier of the pre-trained model to be used for transcription. Returns: str: Transcribed text from the audio. """ processor = AutoProcessor.from_pretrained(model_id) pipe = pipeline( "automatic-speech-recognition", model=self.model, torch_dtype=torch.float16, chunk_length_s=30, max_new_tokens=128, batch_size=24, return_timestamps=True, device="cuda", tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, model_kwargs={"use_flash_attention_2": True}, generate_kwargs={"language": language}, ) logging.info("Transcribing audio...") result = pipe(audio_path)["text"] return result def youtube_url_to_text(url, model_id, language_choice): """ Main function that downloads and converts a video to MP3 format, performs speech-to-text conversion using a specified model, and returns the transcript along with the video path. Args: url (str): The URL of the video to download and convert. model_id (str): The ID of the speech-to-text model to use. language_choice (str): The language choice for the speech-to-text conversion. Returns: transcript (str): The transcript of the speech-to-text conversion. video_path (str): The path of the downloaded video. """ video_path = download_and_convert_to_mp3(url) pipeline = SpeechToTextPipeline(model_id) transcript = pipeline(audio_path=video_path, model_id=model_id, language=language_choice) return transcript, video_path def youtube_url_to_text_app(): with gr.Blocks(): with gr.Row(): with gr.Column(): youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL") language_choice = gr.Dropdown( choices=[ "English", "Turkish", "Spanish", "French", "Chinese", "Japanese", "Korean", ], value="Turkish", label="Language", ) whisper_model_id = gr.Dropdown( choices=[ "openai/whisper-large-v3", "openai/whisper-large", "openai/whisper-medium", "openai/whisper-base", "openai/whisper-small", "openai/whisper-tiny", ], value="openai/whisper-large-v3", label="Whisper Model", ) whisperplus_in_predict = gr.Button(value="Generator") with gr.Column(): output_text = gr.Textbox(label="Output Text") output_audio = gr.Audio(label="Output Audio") whisperplus_in_predict.click( fn=youtube_url_to_text, inputs=[ youtube_url_path, whisper_model_id, language_choice, ], outputs=[output_text, output_audio], ) gradio_app = gr.Blocks() with gradio_app: gr.HTML( """