# Spaces: Sleeping
import subprocess
import sys

import gradio as gr
from whisperplus.pipelines.whisper import SpeechToTextPipeline
from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
from whisperplus.utils.download_utils import download_and_convert_to_mp3
from whisperplus.utils.text_utils import format_speech_to_dialogue
def install_package(package):
    """Install ``package`` with pip into the interpreter running this script.

    pip is invoked as ``<sys.executable> -m pip`` instead of through whichever
    ``pip`` executable happens to be first on ``PATH``, so the package is
    installed for the same Python that is executing this module.

    Args:
        package (str): Requirement specifier handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # --no-build-isolation: build against the packages already installed
    # rather than inside a fresh, isolated build environment.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', package, '--no-build-isolation'])


# Install flash-attn when this module is loaded.
install_package('flash-attn')
def youtube_url_to_text(url, model_id, language_choice):
    """
    Transcribe the audio track of an online video.

    The media behind ``url`` is downloaded and converted to MP3, then handed to a
    speech-to-text pipeline built for ``model_id``.

    Args:
        url (str): Address of the video to download and convert.
        model_id (str): Identifier of the speech-to-text model to load.
        language_choice (str): Language forwarded to the pipeline.

    Returns:
        tuple: ``(transcript, mp3_path)`` - the pipeline's result for the audio
        and the path returned by the download step.
    """
    mp3_path = download_and_convert_to_mp3(url)
    stt_pipeline = SpeechToTextPipeline(model_id)
    # Echo the downloaded file's path to stdout.
    print(mp3_path)
    return stt_pipeline(audio_path=mp3_path, language=language_choice), mp3_path
def speaker_diarization(url, model_id, num_speakers, min_speaker, max_speaker):
    """
    Download a video's audio and transcribe it with speaker diarization.

    Args:
        url (str): The URL of the video to download and convert to MP3.
        model_id (str): The ID of the ASR model used inside the diarization pipeline.
        num_speakers (int | float | None): Number of speakers, forwarded to the pipeline.
        min_speaker (int | float | None): Minimum number of speakers, forwarded to the pipeline.
        max_speaker (int | float | None): Maximum number of speakers, forwarded to the pipeline.

    Returns:
        dialogue: The pipeline output after ``format_speech_to_dialogue``.
        audio_path: The path returned by ``download_and_convert_to_mp3``.
    """

    def _as_count(value):
        # The UI's gr.Number fields may deliver floats (e.g. 2.0); speaker counts
        # are whole numbers, so normalise them. None (empty field) is passed on.
        # NOTE(review): assumes the diarizer wants ints - confirm against whisperplus.
        return None if value is None else int(value)

    # The pipeline is built before the download, as in the original flow.
    pipeline = ASRDiarizationPipeline.from_pretrained(
        asr_model=model_id,
        diarizer_model="pyannote/speaker-diarization",
        chunk_length_s=30,
        device="cuda",
    )

    audio_path = download_and_convert_to_mp3(url)
    output_text = pipeline(
        audio_path,
        num_speakers=_as_count(num_speakers),
        min_speaker=_as_count(min_speaker),
        max_speaker=_as_count(max_speaker),
    )
    dialogue = format_speech_to_dialogue(output_text)
    return dialogue, audio_path
def youtube_url_to_text_app():
    """Build the "Youtube URL to Text" UI inside the currently open Gradio context.

    Lays out a URL box, language and model dropdowns and a button on the left,
    the transcript and audio outputs on the right, and wires the button and an
    example row to ``youtube_url_to_text``.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                language_choice = gr.Dropdown(
                    choices=[
                        "English",
                        "Turkish",
                        "Spanish",
                        "French",
                        "Chinese",
                        "Japanese",
                        "Korean",
                    ],
                    value="Turkish",
                    label="Language",
                )
                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "openai/whisper-large",
                        "openai/whisper-medium",
                        "openai/whisper-base",
                        "openai/whisper-small",
                        "openai/whisper-tiny",
                        # Needed so the example row below refers to a model that
                        # is actually one of the dropdown's choices.
                        "distil-whisper/distil-large-v3",
                    ],
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )
                whisperplus_in_predict = gr.Button(value="Generator")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        # Button click: (url, model, language) -> (transcript, audio file).
        whisperplus_in_predict.click(
            fn=youtube_url_to_text,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                language_choice,
            ],
            outputs=[output_text, output_audio],
        )
        # Example values follow the same order as `inputs`.
        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/watch?v=di3rHkEZuUw",
                    "distil-whisper/distil-large-v3",
                    "English",
                ],
            ],
            fn=youtube_url_to_text,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                language_choice,
            ],
            outputs=[output_text, output_audio],
            cache_examples=True,
        )
def speaker_diarization_app():
    """Build the "Speaker Diarization" UI inside the currently open Gradio context.

    Lays out a URL box, model dropdown, three speaker-count fields and a button
    on the left, the dialogue text and audio outputs on the right, and wires the
    button and an example row to ``speaker_diarization``.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "distil-whisper/distil-large-v3",
                        "distil-whisper/distil-large-v2",
                    ],
                    value="distil-whisper/distil-large-v3",
                    label="Whisper Model",
                )
                # precision=0 makes Gradio round these fields and pass them to the
                # callback as ints; speaker counts are whole numbers.
                num_speakers = gr.Number(value=2, precision=0, label="Number of Speakers")
                min_speaker = gr.Number(value=1, precision=0, label="Minimum Number of Speakers")
                max_speaker = gr.Number(value=2, precision=0, label="Maximum Number of Speakers")
                whisperplus_in_predict = gr.Button(value="Generator")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        # Button click: (url, model, speaker counts) -> (dialogue, audio file).
        whisperplus_in_predict.click(
            fn=speaker_diarization,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                num_speakers,
                min_speaker,
                max_speaker,
            ],
            outputs=[output_text, output_audio],
        )
        # Example values follow the same order as `inputs`; outputs are not cached.
        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/shorts/o8PgLUgte2k",
                    "distil-whisper/distil-large-v3",
                    2,
                    1,
                    2,
                ],
            ],
            fn=speaker_diarization,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                num_speakers,
                min_speaker,
                max_speaker,
            ],
            outputs=[output_text, output_audio],
            cache_examples=False,
        )
# Static HTML banners rendered above the tabs.
_TITLE_HTML = """
<h1 style='text-align: center'>
WhisperPlus: Advancing Speech-to-Text Processing 🚀
</h1>
"""
_LINKS_HTML = """
<h3 style='text-align: center'>
Follow me for more!
<a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>
</h3>
"""

# Assemble the page: the two banners, then one tab per tool.
with gr.Blocks() as gradio_app:
    gr.HTML(_TITLE_HTML)
    gr.HTML(_LINKS_HTML)

    with gr.Row(), gr.Column():
        with gr.Tab(label="Youtube URL to Text"):
            youtube_url_to_text_app()
        with gr.Tab(label="Speaker Diarization"):
            speaker_diarization_app()

# Start the server as soon as the module runs.
gradio_app.launch(debug=True)