kadirnar committed on
Commit
19d9763
·
verified ·
1 Parent(s): ec5308a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -159
app.py CHANGED
@@ -1,17 +1,19 @@
1
- import gradio as gr
2
 
3
- from whisperplus.pipelines.whisper import SpeechToTextPipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
5
- from whisperplus.utils.download_utils import download_and_convert_to_mp3
6
- from whisperplus.utils.text_utils import format_speech_to_dialogue
7
-
8
- import subprocess
9
-
10
- def install_package(package):
11
- subprocess.check_call(['pip', 'install', package, '--no-build-isolation'])
12
-
13
- # Then install flash-attn
14
- install_package('flash-attn')
15
 
16
 
17
  def youtube_url_to_text(url, model_id, language_choice):
@@ -26,17 +28,71 @@ def youtube_url_to_text(url, model_id, language_choice):
26
 
27
  Returns:
28
  transcript (str): The transcript of the speech-to-text conversion.
29
- video_path (str): The path of the downloaded video.
30
  """
31
- video_path = download_and_convert_to_mp3(url)
32
- output = SpeechToTextPipeline(model_id)
33
- print(video_path)
34
- transcript = output(audio_path=video_path, language=language_choice)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- return transcript, video_path
37
 
38
 
39
- def speaker_diarization(url, model_id, num_speakers, min_speaker, max_speaker):
40
  """
41
  Main function that downloads and converts a video to MP3 format, performs speech-to-text conversion using
42
  a specified model, and returns the transcript along with the video path.
@@ -54,160 +110,104 @@ def speaker_diarization(url, model_id, num_speakers, min_speaker, max_speaker):
54
  pipeline = ASRDiarizationPipeline.from_pretrained(
55
  asr_model=model_id,
56
  diarizer_model="pyannote/speaker-diarization",
 
57
  chunk_length_s=30,
58
- device="cuda",
59
  )
60
 
61
- audio_path = download_and_convert_to_mp3(url)
62
  output_text = pipeline(
63
  audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker)
64
  dialogue = format_speech_to_dialogue(output_text)
65
  return dialogue, audio_path
66
 
67
 
68
- def youtube_url_to_text_app():
69
- with gr.Blocks():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  with gr.Row():
71
  with gr.Column():
72
- youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
73
-
74
- language_choice = gr.Dropdown(
75
- choices=[
76
- "English",
77
- "Turkish",
78
- "Spanish",
79
- "French",
80
- "Chinese",
81
- "Japanese",
82
- "Korean",
83
- ],
84
- value="Turkish",
85
- label="Language",
86
- )
87
- whisper_model_id = gr.Dropdown(
88
- choices=[
89
- "openai/whisper-large-v3",
90
- "openai/whisper-large",
91
- "openai/whisper-medium",
92
- "openai/whisper-base",
93
- "openai/whisper-small",
94
- "openai/whisper-tiny",
95
- ],
96
- value="openai/whisper-large-v3",
97
- label="Whisper Model",
98
- )
99
- whisperplus_in_predict = gr.Button(value="Generator")
100
 
 
 
101
  with gr.Column():
102
- output_text = gr.Textbox(label="Output Text")
103
- output_audio = gr.Audio(label="Output Audio")
104
-
105
- whisperplus_in_predict.click(
106
- fn=youtube_url_to_text,
107
- inputs=[
108
- youtube_url_path,
109
- whisper_model_id,
110
- language_choice,
111
- ],
112
- outputs=[output_text, output_audio],
113
- )
114
- gr.Examples(
115
- examples=[
116
- [
117
- "https://www.youtube.com/watch?v=di3rHkEZuUw",
118
- "distil-whisper/distil-large-v3",
119
- "English",
120
- ],
121
- ],
122
- fn=youtube_url_to_text,
123
- inputs=[
124
- youtube_url_path,
125
- whisper_model_id,
126
- language_choice,
127
- ],
128
- outputs=[output_text, output_audio],
129
- cache_examples=True,
130
- )
131
-
132
-
133
- def speaker_diarization_app():
134
- with gr.Blocks():
135
  with gr.Row():
136
  with gr.Column():
137
- youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
138
-
139
- whisper_model_id = gr.Dropdown(
140
- choices=[
141
- "openai/whisper-large-v3",
142
- "distil-whisper/distil-large-v3",
143
- "distil-whisper/distil-large-v2",
144
- ],
145
- value="distil-whisper/distil-large-v3",
146
- label="Whisper Model",
147
- )
148
- num_speakers = gr.Number(value=2, label="Number of Speakers")
149
- min_speaker = gr.Number(value=1, label="Minimum Number of Speakers")
150
- max_speaker = gr.Number(value=2, label="Maximum Number of Speakers")
151
- whisperplus_in_predict = gr.Button(value="Generator")
152
 
 
 
 
 
 
 
 
153
  with gr.Column():
154
- output_text = gr.Textbox(label="Output Text")
155
- output_audio = gr.Audio(label="Output Audio")
156
-
157
- whisperplus_in_predict.click(
158
- fn=speaker_diarization,
159
- inputs=[
160
- youtube_url_path,
161
- whisper_model_id,
162
- num_speakers,
163
- min_speaker,
164
- max_speaker,
165
- ],
166
- outputs=[output_text, output_audio],
167
- )
168
- gr.Examples(
169
- examples=[
170
- [
171
- "https://www.youtube.com/shorts/o8PgLUgte2k",
172
- "distil-whisper/distil-large-v3",
173
- 2,
174
- 1,
175
- 2,
176
- ],
177
- ],
178
- fn=speaker_diarization,
179
- inputs=[
180
- youtube_url_path,
181
- whisper_model_id,
182
- num_speakers,
183
- min_speaker,
184
- max_speaker,
185
- ],
186
- outputs=[output_text, output_audio],
187
- cache_examples=False,
188
- )
189
-
190
-
191
- gradio_app = gr.Blocks()
192
- with gradio_app:
193
- gr.HTML(
194
- """
195
- <h1 style='text-align: center'>
196
- WhisperPlus: Advancing Speech-to-Text Processing 🚀
197
- </h1>
198
- """)
199
- gr.HTML(
200
- """
201
- <h3 style='text-align: center'>
202
- Follow me for more!
203
- <a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>
204
- </h3>
205
- """)
206
- with gr.Row():
207
- with gr.Column():
208
- with gr.Tab(label="Youtube URL to Text"):
209
- youtube_url_to_text_app()
210
- with gr.Tab(label="Speaker Diarization"):
211
- speaker_diarization_app()
212
-
213
- gradio_app.launch(debug=True)
 
 
1
 
2
+ import gradio as gr
3
+ import torch
4
+ from transformers import BitsAndBytesConfig, HqqConfig
5
+
6
+ from whisperplus import (
7
+ SpeechToTextPipeline,
8
+ download_youtube_to_mp3,
9
+ download_youtube_to_mp4,
10
+ format_speech_to_dialogue,
11
+ )
12
+ from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline
13
+ from whisperplus.pipelines.summarization import TextSummarizationPipeline
14
+ from whisperplus.pipelines.text2speech import TextToSpeechPipeline
15
+ from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
16
  from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
  def youtube_url_to_text(url, model_id, language_choice):
 
28
 
29
  Returns:
30
  transcript (str): The transcript of the speech-to-text conversion.
 
31
  """
32
+ audio_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")
33
+
34
+ hqq_config = HqqConfig(
35
+ nbits=4,
36
+ group_size=64,
37
+ quant_zero=False,
38
+ quant_scale=False,
39
+ axis=0,
40
+ offload_meta=False,
41
+ ) # axis=0 is used by default
42
+
43
+ pipeline = SpeechToTextPipeline(
44
+ model_id=model_id,
45
+ quant_config=hqq_config,
46
+ flash_attention_2=True,
47
+ )
48
+
49
+ transcript = pipeline(
50
+ audio_path=audio_path,
51
+ chunk_length_s=30,
52
+ stride_length_s=5,
53
+ max_new_tokens=128,
54
+ batch_size=100,
55
+ language=language_choice,
56
+ return_timestamps=False,
57
+ )
58
+ return transcript
59
+
60
+
61
def summarization(text, model_id="facebook/bart-large-cnn"):
    """
    Summarize ``text`` with a ``TextSummarizationPipeline``.

    Args:
        text (str): The text to summarize.
        model_id (str): The ID of the summarization model to use.

    Returns:
        summary (str): The ``summary_text`` entry of the first result returned by the
            summarizer, or an empty string when ``text`` is empty or whitespace-only.
    """
    # Nothing to summarize (e.g. the textbox was left blank): skip building the
    # pipeline entirely instead of loading a model for no input.
    if not text or not text.strip():
        return ""

    summarizer = TextSummarizationPipeline(model_id=model_id)
    summary = summarizer.summarize(text)

    # The summarizer result is indexed as a sequence of dicts; only the text is returned.
    return summary[0]["summary_text"]
76
+
77
+
78
def long_text_summarization(text, model_id="facebook/bart-large-cnn"):
    """
    Summarize a long ``text`` with a ``LongTextSummarizationPipeline``.

    Args:
        text (str): The text to summarize.
        model_id (str): The ID of the summarization model to use.

    Returns:
        summary (str): The value returned by the pipeline's ``summarize`` call, or an
            empty string when ``text`` is empty or whitespace-only.
    """
    # Nothing to summarize (e.g. the textbox was left blank): skip building the
    # pipeline entirely instead of loading a model for no input.
    if not text or not text.strip():
        return ""

    summarizer = LongTextSummarizationPipeline(model_id=model_id)
    summary_text = summarizer.summarize(text)

    return summary_text
93
 
94
 
95
+ def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
96
  """
97
  Main function that downloads and converts a video to MP3 format, performs speech-to-text conversion using
98
  a specified model, and returns the transcript along with the video path.
 
110
  pipeline = ASRDiarizationPipeline.from_pretrained(
111
  asr_model=model_id,
112
  diarizer_model="pyannote/speaker-diarization",
113
+ use_auth_token=False,
114
  chunk_length_s=30,
115
+ device=device,
116
  )
117
 
118
+ audio_path = download_youtube_to_mp3(url)
119
  output_text = pipeline(
120
  audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker)
121
  dialogue = format_speech_to_dialogue(output_text)
122
  return dialogue, audio_path
123
 
124
 
125
def text2spech_bark(text, model_id="suno/bark", voice_preset="v2/en_speaker_6"):
    """
    Generate speech for ``text`` with a ``TextToSpeechPipeline``.

    Args:
        text (str): The text to convert to speech.
        model_id (str): The ID of the text-to-speech model to use.
        voice_preset (str): The voice preset forwarded to the pipeline call.

    Returns:
        The value returned by calling the pipeline with ``text`` and ``voice_preset``.
    """
    synthesizer = TextToSpeechPipeline(model_id=model_id)
    return synthesizer(text=text, voice_preset=voice_preset)
129
+
130
+
131
def whisper_autocaption(url, language, model_id="openai/whisper-large-v3"):
    """
    Download a YouTube video as MP4 and run ``WhisperAutoCaptionPipeline`` on it.

    Args:
        url (str): The YouTube URL passed to ``download_youtube_to_mp4``.
        language (str): The language forwarded to the captioning pipeline.
        model_id (str): The ID of the Whisper model to use.

    Returns:
        The value returned by the captioning pipeline call, which is asked to write
        its result to ``output.mp4``.
    """
    # Download first so the captioning pipeline is only built once a video is available.
    downloaded_video = download_youtube_to_mp4(url)

    captioner = WhisperAutoCaptionPipeline(model_id=model_id)
    return captioner(video_path=downloaded_video, output_path="output.mp4", language=language)
137
+
138
+
139
# Gradio UI: one tab per pipeline function, each with an input column, an output
# column, and a button wired to the corresponding function.
with gr.Blocks() as demo:
    # --- Tab: transcribe a YouTube video -------------------------------------
    with gr.Tab("YouTube URL to Text"):
        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(label="Enter YouTube URL")
                model_id_input = gr.Textbox(value="openai/whisper-medium", label="Enter Model ID")
                language_input = gr.Textbox(value="en", label="Enter Language")
                submit_btn1 = gr.Button("Submit")
            with gr.Column():
                output1 = gr.Textbox(label="Transcript")
        submit_btn1.click(
            fn=youtube_url_to_text,
            inputs=[url_input, model_id_input, language_input],
            outputs=output1,
        )

    # --- Tab: summarize a short text -----------------------------------------
    with gr.Tab("Text Summarization"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(lines=5, label="Enter Text")
                model_id_input2 = gr.Textbox(value="facebook/bart-large-cnn", label="Enter Model ID")
                submit_btn2 = gr.Button("Summarize")
            with gr.Column():
                output2 = gr.Textbox(label="Summary")
        submit_btn2.click(
            fn=summarization,
            inputs=[text_input, model_id_input2],
            outputs=output2,
        )

    # --- Tab: summarize a long text ------------------------------------------
    with gr.Tab("Long Text Summarization"):
        with gr.Row():
            with gr.Column():
                long_text_input = gr.Textbox(lines=10, label="Enter Long Text")
                model_id_input3 = gr.Textbox(value="facebook/bart-large-cnn", label="Enter Model ID")
                submit_btn3 = gr.Button("Summarize Long Text")
            with gr.Column():
                output3 = gr.Textbox(label="Long Text Summary")
        submit_btn3.click(
            fn=long_text_summarization,
            inputs=[long_text_input, model_id_input3],
            outputs=output3,
        )

    # --- Tab: speaker diarization --------------------------------------------
    with gr.Tab("Speaker Diarization"):
        with gr.Row():
            with gr.Column():
                url_input2 = gr.Textbox(label="Enter YouTube URL")
                model_id_input4 = gr.Textbox(label="Enter Model ID")
                num_speakers = gr.Number(value=2, label="Number of Speakers")
                min_speakers = gr.Number(value=1, label="Min Speakers")
                max_speakers = gr.Number(value=4, label="Max Speakers")
                device = gr.Textbox(value="cpu", label="Device")
                submit_btn4 = gr.Button("Diarize")
            with gr.Column():
                output4 = gr.DataFrame(headers=["Speaker", "Text"], datatype=["str", "str"])
        # Input order follows speaker_diarization's signature: device comes before
        # the speaker counts.
        # NOTE(review): speaker_diarization returns two values (dialogue, audio_path)
        # while only one output component is wired here — confirm this is intended.
        submit_btn4.click(
            fn=speaker_diarization,
            inputs=[url_input2, model_id_input4, device, num_speakers, min_speakers, max_speakers],
            outputs=output4,
        )

    # --- Tab: text to speech -------------------------------------------------
    with gr.Tab("Text to Speech"):
        with gr.Row():
            with gr.Column():
                text_input2 = gr.Textbox(lines=3, label="Enter Text")
                model_id_input5 = gr.Textbox(value="suno/bark", label="Enter Model ID")
                voice_preset = gr.Textbox(value="v2/en_speaker_6", label="Voice Preset")
                submit_btn5 = gr.Button("Generate Audio")
            with gr.Column():
                output5 = gr.Audio(label="Generated Audio")
        submit_btn5.click(
            fn=text2spech_bark,
            inputs=[text_input2, model_id_input5, voice_preset],
            outputs=output5,
        )

    # --- Tab: burn Whisper captions into a video -----------------------------
    with gr.Tab("Whisper Autocaption"):
        with gr.Row():
            with gr.Column():
                url_input3 = gr.Textbox(label="Enter YouTube URL")
                language = gr.Textbox(value="en", label="Language")
                model_id_input6 = gr.Textbox(value="openai/whisper-large-v2", label="Enter Model ID")
                submit_btn6 = gr.Button("Generate Captions")
            with gr.Column():
                output6 = gr.Video(label="Captioned Video")
        submit_btn6.click(
            fn=whisper_autocaption,
            inputs=[url_input3, language, model_id_input6],
            outputs=output6,
        )

demo.launch()