abdullah-alnahas committed on
Commit
dccede6
·
1 Parent(s): 356f877

feat(app.py): [WIP] format transcription with gemini

Browse files
Files changed (1) hide show
  1. app.py +30 -16
app.py CHANGED
@@ -2,12 +2,14 @@ import random
2
  import streamlit as st
3
  import io
4
  import os
 
5
  from transformers import pipeline
6
  import torch
7
  import yt_dlp
8
  from silero_vad import load_silero_vad, get_speech_timestamps
9
  import numpy as np
10
  import pydub
 
11
 
12
  # --- Model Loading and Caching ---
13
  @st.cache_resource
@@ -291,8 +293,9 @@ def process_transcription(video_url, vad_sensitivity, batch_size, transcriber, v
291
  start_time = format_seconds(chunk['start'])
292
  end_time = format_seconds(chunk['end'])
293
  full_transcription += f"[{start_time} - {end_time}]: {chunk['text'].strip()}\n\n"
 
294
 
295
- return full_transcription, audio_data, audio_format, info
296
 
297
  def format_seconds(seconds):
298
  """Formats seconds into HH:MM:SS string."""
@@ -327,18 +330,26 @@ def download_video(video_url, video_format):
327
  st.error(f"Error during video download: {e}")
328
  return None, None, None
329
 
330
- import random
331
- import streamlit as st
332
- import io
333
- import os
334
- from transformers import pipeline
335
- import torch
336
- import yt_dlp
337
- from silero_vad import load_silero_vad, get_speech_timestamps
338
- import numpy as np
339
- import pydub
340
 
341
- # ... (rest of your code, including model loading, audio functions, etc.)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
  def main():
344
  """Main function to run the Streamlit application."""
@@ -360,9 +371,9 @@ def main():
360
  # Call setup_ui() to get UI element values
361
  video_url, language, batch_size, transcribe_option, download_audio_option, download_video_option, process_button, vad_sensitivity, audio_format, video_format = setup_ui()
362
 
363
- transcription_output = st.empty()
364
  if st.session_state.full_transcription:
365
- transcription_output.text_area("Transcription:", value=st.session_state.full_transcription, height=300, key=random.random())
366
 
367
  if process_button:
368
  st.session_state.full_transcription = None
@@ -377,9 +388,12 @@ def main():
377
  return
378
 
379
  if transcribe_option:
380
- st.session_state.full_transcription, st.session_state.audio_data, st.session_state.audio_format, st.session_state.info = process_transcription(video_url, vad_sensitivity, batch_size, transcriber, vad_model, audio_format, language)
381
  if st.session_state.full_transcription:
382
- transcription_output.text_area("Transcription:", value=st.session_state.full_transcription, height=300, key=random.random())
 
 
 
383
 
384
  if download_audio_option:
385
  if st.session_state.audio_data is None or st.session_state.audio_format is None or st.session_state.info is None:
 
2
  import streamlit as st
3
  import io
4
  import os
5
+
6
  from transformers import pipeline
7
  import torch
8
  import yt_dlp
9
  from silero_vad import load_silero_vad, get_speech_timestamps
10
  import numpy as np
11
  import pydub
12
+ from litellm import completion
13
 
14
  # --- Model Loading and Caching ---
15
  @st.cache_resource
 
293
  start_time = format_seconds(chunk['start'])
294
  end_time = format_seconds(chunk['end'])
295
  full_transcription += f"[{start_time} - {end_time}]: {chunk['text'].strip()}\n\n"
296
+ formatted_transcription = format_transcript(full_transcription)
297
 
298
+ return full_transcription, formatted_transcription, audio_data, audio_format, info
299
 
300
  def format_seconds(seconds):
301
  """Formats seconds into HH:MM:SS string."""
 
330
  st.error(f"Error during video download: {e}")
331
  return None, None, None
332
 
333
def format_transcript(input_transcription):
    """Reformat a raw video transcription into readable text using Gemini.

    Sends the transcription to the ``gemini/gemini-2.0-flash-exp`` model
    through ``litellm.completion`` together with a system prompt that asks
    for cleaned-up sentences, quoted/italicized read text, and removal of
    unnecessary repetitions, without adding new information.

    Args:
        input_transcription: The raw transcription text to format.

    Returns:
        The formatted transcription text returned by the model; an empty
        string when there is nothing to format; or ``None`` when the request
        fails (the error is shown to the user via ``st.error``).
    """
    # Nothing to format: skip the LLM round-trip entirely. Without this guard
    # a None input would be sent to the model as the literal text "None".
    if not input_transcription or not input_transcription.strip():
        return ""

    # NOTE: the API key is expected to be provided through the
    # GEMINI_API_KEY environment variable; never hard-code it here.

    sys_prompt = (
        "Video Transcription Formatting\n"
        "\n"
        "As an LLM formatting provided video transcriptions (in any language), transform spoken language into clear, readable text. Prioritize readability, consistency, and context, adapting to the specific language conventions. **Do not hallucinate or add any information not present in the original transcript.**\n"
        "\n"
        "* **Sentences:** Restructure long, rambling sentences; correct grammatical errors *while preserving the original meaning*; use proper punctuation appropriate for the language.\n"
        "* **Reading:** Italicize/quote read text; clearly separate from explanations.\n"
        "* **Repetitions:** Remove unnecessary repetitions unless for emphasis."
    )
    messages = [
        {"content": sys_prompt, "role": "system"},
        {
            "content": f"Format the following video transcription: {input_transcription}",
            "role": "user",
        },
    ]

    # Formatting is an optional post-processing step: a failed request must
    # not discard the raw transcription the caller already produced, so the
    # error is reported in the UI (same style as download_video) instead of
    # being raised.
    try:
        response = completion(model="gemini/gemini-2.0-flash-exp", messages=messages)
        formatted_text = response.choices[0].message.content
    except Exception as e:
        st.error(f"Error during transcription formatting: {e}")
        return None
    return formatted_text
353
 
354
  def main():
355
  """Main function to run the Streamlit application."""
 
371
  # Call setup_ui() to get UI element values
372
  video_url, language, batch_size, transcribe_option, download_audio_option, download_video_option, process_button, vad_sensitivity, audio_format, video_format = setup_ui()
373
 
374
+ # transcription_output = st.empty()
375
  if st.session_state.full_transcription:
376
+ st.text_area("Transcription:", value=st.session_state.full_transcription, height=300, key=random.random())
377
 
378
  if process_button:
379
  st.session_state.full_transcription = None
 
388
  return
389
 
390
  if transcribe_option:
391
+ st.session_state.full_transcription, st.session_state.formatted_transcription, st.session_state.audio_data, st.session_state.audio_format, st.session_state.info = process_transcription(video_url, vad_sensitivity, batch_size, transcriber, vad_model, audio_format, language)
392
  if st.session_state.full_transcription:
393
+ st.text_area("Transcription:", value=st.session_state.full_transcription, height=300, key=random.random())
394
+ if st.session_state.formatted_transcription:
395
+ st.text_area("Formatted Transcription:", value=st.session_state.formatted_transcription, height=300, key=random.random())
396
+
397
 
398
  if download_audio_option:
399
  if st.session_state.audio_data is None or st.session_state.audio_format is None or st.session_state.info is None: