kobakhit committed on
Commit
e7045c6
·
1 Parent(s): 0a5f4ba

replaced pytube with yt-dlp

Browse files
Files changed (1) hide show
  1. app.py +69 -9
app.py CHANGED
@@ -2,8 +2,9 @@ import streamlit as st
2
  import streamlit_ext as ste
3
  import openai
4
  from pydub import AudioSegment
5
- from pytube import YouTube
6
- import pytube
 
7
  import io
8
  from pyannote.audio import Pipeline
9
  from pyannote.audio.pipelines.utils.hook import ProgressHook
@@ -16,6 +17,9 @@ import urllib.parse as urlparse
16
  from urllib.parse import urlencode
17
  import os
18
 
 
 
 
19
  import matplotlib
20
  matplotlib.use('Agg')
21
  from matplotlib import pyplot as plt
@@ -45,6 +49,22 @@ def add_query_parameter(link, params):
45
 
46
  return urlparse.urlunparse(url_parts)
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def youtube_video_id(value):
49
  """
50
  Examples:
@@ -68,7 +88,11 @@ def youtube_video_id(value):
68
  return None
69
 
70
  @st.cache_data
71
- def process_youtube_link(youtube_link):
 
 
 
 
72
  try:
73
  yt = YouTube(youtube_link)
74
  audio_stream = yt.streams.filter(only_audio=True).first()
@@ -88,6 +112,41 @@ def process_youtube_link(youtube_link):
88
  st.audio(create_audio_stream(audio), format="audio/mp4", start_time=0)
89
  return audio, audio_name
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  @st.cache_data
92
  def load_rttm_file(rttm_path):
93
  return load_rttm(rttm_path)['stream']
@@ -104,7 +163,7 @@ if "prompt_request_counter" not in st.session_state:
104
  st.session_state["prompt_request_counter"] = 0
105
 
106
  initial_prompt = [{"role": "system", "content": "You are helping to analyze and summarize a transcript of a conversation."},
107
- {"role": 'user', "content": 'Please summarize briefly below transcript. Also, inlcude a list of tags with a hash for SEO. \n{}'}]
108
  if "messages" not in st.session_state:
109
  st.session_state.messages = initial_prompt
110
 
@@ -324,7 +383,7 @@ if "audio" in locals():
324
  my_bar.progress((i+1)/len(sp_chunks_loaded), text=progress_text)
325
 
326
  transcript_json = sp_chunks_loaded
327
- transcript_path = f'{audio_name.split(".mp4")[0]}-transcript.json'
328
 
329
  else:
330
  sp_chunks_updated = []
@@ -346,13 +405,14 @@ if "audio" in locals():
346
  sp_chunks_updated.append({'speaker':s['speaker'],
347
  'start':s['start'], 'end':s['end'],
348
  'duration': s['duration'],'transcript': transcript})
349
-
350
- progress_text = f"Processing {i+1}/{len(sp_chunks[:TRANSCRIPTION_REQUEST_LIMIT])}..."
351
- my_bar.progress((i+1)/len(sp_chunks[:TRANSCRIPTION_REQUEST_LIMIT]), text=progress_text)
352
  st.markdown(transcript_summary)
353
 
 
 
 
 
354
  transcript_json = [dict((k, d[k]) for k in ['speaker','start','end','duration','transcript'] if k in d) for d in sp_chunks_updated]
355
- transcript_path = f'{audio_name.split(".mp4")[0]}-transcript.json'
356
  st.session_state.transcript_file = transcript_path
357
 
358
  # save the transcript file
 
2
  import streamlit_ext as ste
3
  import openai
4
  from pydub import AudioSegment
5
+ # from pytube import YouTube
6
+ # import pytube
7
+ import yt_dlp
8
  import io
9
  from pyannote.audio import Pipeline
10
  from pyannote.audio.pipelines.utils.hook import ProgressHook
 
17
  from urllib.parse import urlencode
18
  import os
19
 
20
+ import unicodedata
21
+ import re
22
+
23
  import matplotlib
24
  matplotlib.use('Agg')
25
  from matplotlib import pyplot as plt
 
49
 
50
  return urlparse.urlunparse(url_parts)
51
 
52
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a slug (e.g. usable as a file-name stem).

    Adapted from Django's ``django.utils.text.slugify``.

    The value is stringified and Unicode-normalized. Unless
    ``allow_unicode`` is true, everything that cannot be expressed in ASCII
    is dropped. The text is then lowercased, characters other than
    alphanumerics, underscores, hyphens and whitespace are removed, every
    run of whitespace and/or hyphens becomes a single hyphen, and leading
    and trailing hyphens and underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        # NFKD splits accented characters into base letter + combining mark,
        # so the ASCII round-trip keeps the base letter and drops the mark.
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    cleaned = re.sub(r'[^\w\s-]', '', text.lower())
    collapsed = re.sub(r'[-\s]+', '-', cleaned)
    return collapsed.strip('-_')
67
+
68
  def youtube_video_id(value):
69
  """
70
  Examples:
 
88
  return None
89
 
90
  @st.cache_data
91
+ def process_youtube_link2(youtube_link):
92
+ '''
93
+ uses pytube https://github.com/pytube/pytube
94
+ issue with https://github.com/pytube/pytube/issues/84
95
+ '''
96
  try:
97
  yt = YouTube(youtube_link)
98
  audio_stream = yt.streams.filter(only_audio=True).first()
 
112
  st.audio(create_audio_stream(audio), format="audio/mp4", start_time=0)
113
  return audio, audio_name
114
 
115
+
116
@st.cache_data
def process_youtube_link(youtube_link):
    """Download a YouTube video's audio track with yt-dlp and load it.

    Uses yt-dlp (https://github.com/yt-dlp/yt-dlp). The audio is written to
    ``./sample.<ext>`` in the working directory, loaded with ``load_audio``,
    rendered in the page with ``st.audio`` and returned to the caller. If
    the download fails, the error is shown as a warning and the Streamlit
    script run is stopped.

    Args:
        youtube_link: URL of the video whose audio should be fetched.

    Returns:
        A ``(audio, audio_name)`` tuple: the value returned by
        ``load_audio`` for the downloaded file, and the slugified video
        title.
    """
    # Best-effort removal of a previous m4a download; a missing file is fine.
    try:
        os.remove('sample.m4a')
    except OSError:
        pass

    ydl_opts = {
        # Prefer m4a, but fall back to the best audio-only / combined stream,
        # so the downloaded file is not guaranteed to have an .m4a extension.
        'format': 'm4a/bestaudio/best',
        # ℹ️ See help(yt_dlp.postprocessor) for a list of available Postprocessors and their arguments
        'outtmpl': './sample.%(ext)s',
        # A leftover sample.<ext> from an earlier video must not be reused:
        # without this yt-dlp skips the download when the target exists.
        'overwrites': True,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Blocks until the download has finished, so no extra wait is
            # needed before reading the file.
            info = ydl.extract_info(youtube_link, download=True)
            # Ask yt-dlp for the path it actually wrote instead of assuming
            # the extension is always m4a.
            audio_path = ydl.prepare_filename(info)
            audio_name = slugify(info['title'])
            st.write(f"Downloaded {info['title']}")
    except Exception as e:
        # UI boundary: surface whatever went wrong and halt this script run.
        st.warning(e)
        st.stop()

    audio_ext = os.path.splitext(audio_path)[1].lstrip('.') or 'm4a'
    # NOTE(review): assumes load_audio can decode whatever container yt-dlp
    # fell back to when no m4a stream exists - confirm against load_audio.
    audio = load_audio(audio_path)
    st.audio(create_audio_stream(audio), format=f"audio/{audio_ext}", start_time=0)
    return audio, audio_name
149
+
150
  @st.cache_data
151
  def load_rttm_file(rttm_path):
152
  return load_rttm(rttm_path)['stream']
 
163
  st.session_state["prompt_request_counter"] = 0
164
 
165
  initial_prompt = [{"role": "system", "content": "You are helping to analyze and summarize a transcript of a conversation."},
166
+ {"role": 'user', "content": 'Please summarize briefly below transcript and inlcude a list of tags with a hash for SEO. \n{}'}]
167
  if "messages" not in st.session_state:
168
  st.session_state.messages = initial_prompt
169
 
 
383
  my_bar.progress((i+1)/len(sp_chunks_loaded), text=progress_text)
384
 
385
  transcript_json = sp_chunks_loaded
386
+ transcript_path = f'{audio_name.split(".")[0]}-transcript.json'
387
 
388
  else:
389
  sp_chunks_updated = []
 
405
  sp_chunks_updated.append({'speaker':s['speaker'],
406
  'start':s['start'], 'end':s['end'],
407
  'duration': s['duration'],'transcript': transcript})
 
 
 
408
  st.markdown(transcript_summary)
409
 
410
+ progress_text = f"Processing {i+1}/{len(sp_chunks[:TRANSCRIPTION_REQUEST_LIMIT])}..."
411
+ my_bar.progress((i+1)/len(sp_chunks[:TRANSCRIPTION_REQUEST_LIMIT]), text=progress_text)
412
+
413
+
414
  transcript_json = [dict((k, d[k]) for k in ['speaker','start','end','duration','transcript'] if k in d) for d in sp_chunks_updated]
415
+ transcript_path = f'{audio_name.split(".")[0]}-transcript.json'
416
  st.session_state.transcript_file = transcript_path
417
 
418
  # save the transcript file