Spaces:
Running
Running
abdullah-alnahas
committed on
Commit
·
dccede6
1
Parent(s):
356f877
feat(app.py): [WIP] format transcription with gemini
Browse files
app.py
CHANGED
@@ -2,12 +2,14 @@ import random
|
|
2 |
import streamlit as st
|
3 |
import io
|
4 |
import os
|
|
|
5 |
from transformers import pipeline
|
6 |
import torch
|
7 |
import yt_dlp
|
8 |
from silero_vad import load_silero_vad, get_speech_timestamps
|
9 |
import numpy as np
|
10 |
import pydub
|
|
|
11 |
|
12 |
# --- Model Loading and Caching ---
|
13 |
@st.cache_resource
|
@@ -291,8 +293,9 @@ def process_transcription(video_url, vad_sensitivity, batch_size, transcriber, v
|
|
291 |
start_time = format_seconds(chunk['start'])
|
292 |
end_time = format_seconds(chunk['end'])
|
293 |
full_transcription += f"[{start_time} - {end_time}]: {chunk['text'].strip()}\n\n"
|
|
|
294 |
|
295 |
-
return full_transcription, audio_data, audio_format, info
|
296 |
|
297 |
def format_seconds(seconds):
|
298 |
"""Formats seconds into HH:MM:SS string."""
|
@@ -327,18 +330,26 @@ def download_video(video_url, video_format):
|
|
327 |
st.error(f"Error during video download: {e}")
|
328 |
return None, None, None
|
329 |
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
from transformers import pipeline
|
335 |
-
import torch
|
336 |
-
import yt_dlp
|
337 |
-
from silero_vad import load_silero_vad, get_speech_timestamps
|
338 |
-
import numpy as np
|
339 |
-
import pydub
|
340 |
|
341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
342 |
|
343 |
def main():
|
344 |
"""Main function to run the Streamlit application."""
|
@@ -360,9 +371,9 @@ def main():
|
|
360 |
# Call setup_ui() to get UI element values
|
361 |
video_url, language, batch_size, transcribe_option, download_audio_option, download_video_option, process_button, vad_sensitivity, audio_format, video_format = setup_ui()
|
362 |
|
363 |
-
transcription_output = st.empty()
|
364 |
if st.session_state.full_transcription:
|
365 |
-
|
366 |
|
367 |
if process_button:
|
368 |
st.session_state.full_transcription = None
|
@@ -377,9 +388,12 @@ def main():
|
|
377 |
return
|
378 |
|
379 |
if transcribe_option:
|
380 |
-
st.session_state.full_transcription, st.session_state.audio_data, st.session_state.audio_format, st.session_state.info = process_transcription(video_url, vad_sensitivity, batch_size, transcriber, vad_model, audio_format, language)
|
381 |
if st.session_state.full_transcription:
|
382 |
-
|
|
|
|
|
|
|
383 |
|
384 |
if download_audio_option:
|
385 |
if st.session_state.audio_data is None or st.session_state.audio_format is None or st.session_state.info is None:
|
|
|
2 |
import streamlit as st
|
3 |
import io
|
4 |
import os
|
5 |
+
|
6 |
from transformers import pipeline
|
7 |
import torch
|
8 |
import yt_dlp
|
9 |
from silero_vad import load_silero_vad, get_speech_timestamps
|
10 |
import numpy as np
|
11 |
import pydub
|
12 |
+
from litellm import completion
|
13 |
|
14 |
# --- Model Loading and Caching ---
|
15 |
@st.cache_resource
|
|
|
293 |
start_time = format_seconds(chunk['start'])
|
294 |
end_time = format_seconds(chunk['end'])
|
295 |
full_transcription += f"[{start_time} - {end_time}]: {chunk['text'].strip()}\n\n"
|
296 |
+
formatted_transcription = format_transcript(full_transcription)
|
297 |
|
298 |
+
return full_transcription, formatted_transcription, audio_data, audio_format, info
|
299 |
|
300 |
def format_seconds(seconds):
|
301 |
"""Formats seconds into HH:MM:SS string."""
|
|
|
330 |
st.error(f"Error during video download: {e}")
|
331 |
return None, None, None
|
332 |
|
333 |
+
def format_transcript(input_transcription):
    """Reformat a raw video transcription into readable text using Gemini.

    The transcription is sent through litellm's ``completion`` to the
    ``gemini/gemini-2.0-flash-exp`` model together with a system prompt that
    asks for cleaner sentences, marked-up read-aloud passages and removal of
    needless repetitions, without adding information.

    Args:
        input_transcription: The raw transcription text to format.

    Returns:
        The formatted text returned by the model. Returns ``""`` when the
        input is empty or whitespace-only (no request is made), and ``None``
        when the request fails or the model returns no content. Callers
        should treat any falsy result as "no formatted transcription".
    """
    # Nothing to format: avoid a pointless (and billable) network round-trip.
    if not input_transcription or not input_transcription.strip():
        return ""

    # NOTE(review): the Gemini credentials are presumably supplied through the
    # GEMINI_API_KEY environment variable read by litellm -- confirm in the
    # deployment settings. Never hard-code the key in this file.
    sys_prompt = "\n".join(
        [
            "Video Transcription Formatting",
            "",
            "As an LLM formatting provided video transcriptions (in any language), transform spoken language into clear, readable text. Prioritize readability, consistency, and context, adapting to the specific language conventions. **Do not hallucinate or add any information not present in the original transcript.**",
            "",
            "* **Sentences:** Restructure long, rambling sentences; correct grammatical errors *while preserving the original meaning*; use proper punctuation appropriate for the language.",
            "* **Reading:** Italicize/quote read text; clearly separate from explanations.",
            "* **Repetitions:** Remove unnecessary repetitions unless for emphasis.",
        ]
    )
    messages = [
        {"content": sys_prompt, "role": "system"},
        {
            "content": f"Format the following video transcription: {input_transcription}",
            "role": "user",
        },
    ]

    try:
        response = completion(model="gemini/gemini-2.0-flash-exp", messages=messages)
    except Exception as e:
        # Formatting is an optional post-processing step: a failed API call
        # must not discard the raw transcription the caller already holds, so
        # report the problem in the UI and signal "no formatted text".
        st.error(f"Error during transcript formatting: {e}")
        return None

    # ``content`` may be None when the model produces no text; it is passed
    # through unchanged so callers can rely on a simple truthiness check.
    return response.choices[0].message.content
|
353 |
|
354 |
def main():
|
355 |
"""Main function to run the Streamlit application."""
|
|
|
371 |
# Call setup_ui() to get UI element values
|
372 |
video_url, language, batch_size, transcribe_option, download_audio_option, download_video_option, process_button, vad_sensitivity, audio_format, video_format = setup_ui()
|
373 |
|
374 |
+
# transcription_output = st.empty()
|
375 |
if st.session_state.full_transcription:
|
376 |
+
st.text_area("Transcription:", value=st.session_state.full_transcription, height=300, key=random.random())
|
377 |
|
378 |
if process_button:
|
379 |
st.session_state.full_transcription = None
|
|
|
388 |
return
|
389 |
|
390 |
if transcribe_option:
|
391 |
+
st.session_state.full_transcription, st.session_state.formatted_transcription, st.session_state.audio_data, st.session_state.audio_format, st.session_state.info = process_transcription(video_url, vad_sensitivity, batch_size, transcriber, vad_model, audio_format, language)
|
392 |
if st.session_state.full_transcription:
|
393 |
+
st.text_area("Transcription:", value=st.session_state.full_transcription, height=300, key=random.random())
|
394 |
+
if st.session_state.formatted_transcription:
|
395 |
+
st.text_area("Formatted Transcription:", value=st.session_state.formatted_transcription, height=300, key=random.random())
|
396 |
+
|
397 |
|
398 |
if download_audio_option:
|
399 |
if st.session_state.audio_data is None or st.session_state.audio_format is None or st.session_state.info is None:
|