ai chat, state logic
app.py
CHANGED
@@ -25,6 +25,13 @@ st.set_page_config(
     page_icon = 'π'
 )
 
+# Set your OpenAI, Hugging Face API keys
+openai.api_key = st.secrets['openai']
+hf_api_key = st.secrets['hf']
+
+TRANSCRIPTION_REQUEST_LIMIT = 150
+PROMPT_REQUEST_LIMIT = 2
+
 def create_audio_stream(audio):
     return io.BytesIO(audio.export(format="wav").read())
 
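Note: the two keys are read from Streamlit's secrets store rather than hard-coded. A minimal sketch of how they are typically supplied when running the app locally; the `.streamlit/secrets.toml` location and key names follow the standard Streamlit convention and are not part of this diff.

```python
# .streamlit/secrets.toml (assumed local setup, not part of the commit):
#   openai = "sk-..."   # OpenAI key for Whisper + ChatCompletion
#   hf     = "hf_..."   # Hugging Face token for the pyannote pipeline
import streamlit as st
import openai

openai.api_key = st.secrets["openai"]
hf_api_key = st.secrets["hf"]
```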
@@ -59,31 +66,62 @@ def youtube_video_id(value):
     # fail?
     return None
 
+@st.cache_data
+def process_youtube_link(youtube_link):
+    st.write(f"Fetching audio from YouTube: {youtube_link}")
+    try:
+        yt = YouTube(youtube_link)
+        audio_stream = yt.streams.filter(only_audio=True).first()
+        audio_name = audio_stream.default_filename
+        st.write(f"Downloaded {audio_name}")
+    except pytube.exceptions.AgeRestrictedError:
+        st.stop('Age restricted videos cannot be processed.')
+
+    try:
+        os.remove('sample.mp4')
+    except OSError:
+        pass
+    audio_file = audio_stream.download(filename='sample.mp4')
+    time.sleep(2)
+    audio = load_audio('sample.mp4')
+    st.audio(create_audio_stream(audio), format="audio/mp4", start_time=0)
+    return audio, audio_name
 
+@st.cache_data
 def load_rttm_file(rttm_path):
     return load_rttm(rttm_path)['stream']
 
-
+@st.cache_resource
 def load_audio(uploaded_audio):
     return AudioSegment.from_file(uploaded_audio)
 
 
-
-
-
+if "openai_model" not in st.session_state:
+    st.session_state["openai_model"] = "gpt-3.5-turbo"
+
+if "prompt_request_counter" not in st.session_state:
+    st.session_state["prompt_request_counter"] = 0
+
+initial_prompt = [{"role": "system", "content": "You are helping to analyze and summarize a transcript of a conversation."},
+                  {"role": 'user', "content": 'Please summarize briefly the following transcript\n{}'}]
+if "messages" not in st.session_state:
+    st.session_state.messages = initial_prompt
+
 
-
+
+
+st.title("Speech to Chat")
 reddit_thread = 'https://www.reddit.com/r/dataisbeautiful/comments/17413bq/oc_speech_diarization_app_that_transcribes_audio'
 with st.expander('About', expanded=True):
     st.markdown(f'''
     Given an audio file this app will
       - [x] 1. Identify and diarize the speakers using `pyannote` [HuggingFace Speaker Diarization api](https://huggingface.co/pyannote/speaker-diarization-3.0)
       - [x] 2. Transcribe the audio and attribute to speakers using [OpenAi Whisper API](https://platform.openai.com/docs/guides/speech-to-text/quickstart)
-      - [
+      - [x] 3. Set up an LLM chat with the transcript loaded into its knowledge database, so that a user can "talk" to the transcript of the audio file
 
     This version will only process up to first 6 minutes of an audio file due to limited resources of Streamlit.io apps.
     A local version with access to a GPU can process 1 hour of audio in 1 to 5 minutes.
-    If you would like to use this app at scale reach out directly by creating an issue on github
+    If you would like to use this app at scale reach out directly by creating an issue on [githubπ€](https://github.com/KobaKhit/speech-to-text-app/issues)!
 
     Rule of thumb, for this Streamlit.io hosted app it takes half the duration of the audio to complete processing, ex. g. 6 minute youtube video will take 3 minutes to diarize.
 
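Note the split between the two caching decorators: `@st.cache_data` memoizes serializable return values per argument (the RTTM dict, the downloaded YouTube audio), while `@st.cache_resource` keeps one shared, unserialized object alive across reruns, which suits the pydub `AudioSegment` handle. A small sketch of the distinction, assuming the same `load_rttm` helper app.py already imports:

```python
import streamlit as st
from pydub import AudioSegment
from pyannote.database.util import load_rttm  # assumption: the helper app.py imports

@st.cache_data        # result is hashed/copied per argument value
def load_rttm_file(rttm_path):
    return load_rttm(rttm_path)['stream']

@st.cache_resource    # one shared object, returned as-is on every rerun
def load_audio(uploaded_audio):
    return AudioSegment.from_file(uploaded_audio)
```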
@@ -95,10 +133,13 @@ option = st.radio("Select source:", ["Upload an audio file", "Use YouTube link",
 
 # Upload audio file
 if option == "Upload an audio file":
-
+    with st.form('uploaded-file', clear_on_submit=True):
+        uploaded_audio = st.file_uploader("Upload an audio file (MP3 or WAV)", type=["mp3", "wav","mp4"])
+        st.form_submit_button()
+        if st.form_submit_button(): st.session_state.messages = initial_prompt
     with st.expander('Optional Parameters'):
-        rttm
-        transcript_file = st.file_uploader("Upload transcipt json", type=["json"])
+        # st.session_state.rttm = st.file_uploader("Upload .rttm if you already have one", type=["rttm"])
+        # st.session_state.transcript_file = st.file_uploader("Upload transcipt json", type=["json"])
         youtube_link = st.text_input('Youtube link of the audio sample')
 
     if uploaded_audio is not None:
@@ -110,32 +151,23 @@ if option == "Upload an audio file":
     # audio = audio.set_frame_rate(sample_rate)
 
 # use youtube link
-elif option == "Use YouTube link":
-
-    youtube_link_raw = st.text_input("Enter the YouTube video URL:")
-    youtube_link = f'https://youtu.be/{youtube_video_id(youtube_link_raw)}'
+elif option == "Use YouTube link":
 
-    with st.
-
-
+    with st.form('youtube-link', clear_on_submit=True):
+        youtube_link_raw = st.text_input("Enter the YouTube video URL:")
+        youtube_link = f'https://youtu.be/{youtube_video_id(youtube_link_raw)}'
+
+        if st.form_submit_button(): # reset variables on new link submit
+            st.session_state.messages = initial_prompt
+            st.session_state.rttm = None
+            st.session_state.transcript_file = None
+            st.session_state.prompt_request_counter = 0
+
+    # with st.expander('Optional Parameters'):
+    #     st.session_state.rttm = st.file_uploader("Upload .rttm if you already have one", type=["rttm"])
+    #     st.session_state.transcript_file = st.file_uploader("Upload transcipt json", type=["json"])
     if youtube_link_raw:
-
-        try:
-            yt = YouTube(youtube_link)
-            audio_stream = yt.streams.filter(only_audio=True).first()
-            audio_name = audio_stream.default_filename
-            st.write(f"Downloaded {audio_name}")
-        except pytube.exceptions.AgeRestrictedError:
-            st.stop('Age restricted videos cannot be processed.')
-
-        try:
-            os.remove('sample.mp4')
-        except OSError:
-            pass
-        audio_file = audio_stream.download(filename='sample.mp4')
-        time.sleep(2)
-        audio = load_audio('sample.mp4')
-        st.audio(create_audio_stream(audio), format="audio/mp4", start_time=0)
+        audio, audio_name = process_youtube_link(youtube_link)
     # sample_rate = st.number_input("Enter the sample rate of the audio", min_value=8000, max_value=48000)
    # audio = audio.set_frame_rate(sample_rate)
     # except Exception as e:
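The YouTube branch now batches input behind `st.form` and delegates the download to the cached `process_youtube_link`, so chat reruns do not re-fetch the video; a submit also clears the chat history, cached RTTM path, transcript path, and prompt counter so state from a previous link cannot leak into the new one. A minimal sketch of that submit-and-reset pattern (session-state key names taken from this commit, the rest illustrative):

```python
import streamlit as st

with st.form("youtube-link", clear_on_submit=True):
    url = st.text_input("Enter the YouTube video URL:")
    submitted = st.form_submit_button()

if submitted:
    # a new link invalidates the previous chat, diarization and transcript
    st.session_state["messages"] = []              # would be `initial_prompt` in app.py
    st.session_state["rttm"] = None
    st.session_state["transcript_file"] = None
    st.session_state["prompt_request_counter"] = 0
```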
@@ -143,7 +175,7 @@ elif option == "Use YouTube link":
 elif option == 'See Example':
     youtube_link = 'https://www.youtube.com/watch?v=TamrOZX9bu8'
     audio_name = 'Stephen A. Smith has JOKES with Shannon Sharpe'
-    st.write(f'Loaded audio file from {youtube_link} -
+    st.write(f'Loaded audio file from {youtube_link} - {audio_name} ππ')
     if os.path.isfile('example/steve a smith jokes.mp4'):
         audio = load_audio('example/steve a smith jokes.mp4')
     else:
@@ -154,14 +186,13 @@ elif option == 'See Example':
         audio = load_audio('sample.mp4')
 
     if os.path.isfile("example/steve a smith jokes.rttm"):
-        rttm = "example/steve a smith jokes.rttm"
+        st.session_state.rttm = "example/steve a smith jokes.rttm"
     if os.path.isfile('example/steve a smith jokes.json'):
-        transcript_file = 'example/steve a smith jokes.json'
+        st.session_state.transcript_file = 'example/steve a smith jokes.json'
 
     st.audio(create_audio_stream(audio), format="audio/mp4", start_time=0)
 
 
-
 # Diarize
 if "audio" in locals():
     st.write('Performing Diarization...')
@@ -182,9 +213,9 @@ if "audio" in locals():
         pipeline.to(torch.device('cuda'))
 
     # run the pipeline on an audio file
-    if 'rttm' in
-        st.write(f'Loading {rttm}')
-        diarization = load_rttm_file(rttm)
+    if 'rttm' in st.session_state and st.session_state.rttm != None:
+        st.write(f'Loading {st.session_state.rttm}')
+        diarization = load_rttm_file(st.session_state.rttm )
     else:
         # with ProgressHook() as hook:
         audio_ = create_audio_stream(audio)
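The pipeline itself is constructed in unchanged lines above this hunk; the change only decides between re-running diarization and reloading a cached RTTM path kept in `st.session_state`. For orientation, a hedged sketch of that diarize-once / reload flow: the model id follows the About text, and the token handling and URI lookup are assumptions, not code from this diff.

```python
from pyannote.audio import Pipeline
from pyannote.database.util import load_rttm

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0",
                                    use_auth_token="hf_...")   # st.secrets['hf'] in app.py
diarization = pipeline("sample.wav")        # Annotation: who spoke when

with open("sample.rttm", "w") as f:         # cache the result so later reruns can skip this step
    diarization.write_rttm(f)

cached = load_rttm("sample.rttm")           # dict keyed by uri ('stream' in app.py, which diarizes a BytesIO)
diarization_again = next(iter(cached.values()))
```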
@@ -193,6 +224,7 @@ if "audio" in locals():
         # dump the diarization output to disk using RTTM format
         with open(f'{audio_name.split(".")[0]}.rttm', "w") as f:
             diarization.write_rttm(f)
+        st.session_state.rttm = f'{audio_name.split(".")[0]}.rttm'
 
     # Display the diarization results
     st.write("Diarization Results:")
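For reference, the `.rttm` file written and cached here is plain text with one speech turn per line. A representative, made-up record expressed as a Python string (fields: type, uri, channel, onset, duration, then speaker label among the `<NA>` placeholders):

```python
# One illustrative RTTM record, not taken from the example audio
rttm_line = "SPEAKER stream 1 12.47 3.82 <NA> <NA> SPEAKER_00 <NA> <NA>"
```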
@@ -211,7 +243,7 @@ if "audio" in locals():
         temp = {'speaker': speaker,
                 'start': turn.start, 'end': turn.end, 'duration': turn.end-turn.start,
                 'audio': audio[turn.start*1000:turn.end*1000]}
-        if 'transcript_file' in
+        if 'transcript_file' in st.session_state and st.session_state.transcript_file == None:
             temp['audio_stream'] = create_audio_stream(audio[turn.start*1000:turn.end*1000])
         sp_chunks.append(temp)
 
@@ -224,7 +256,7 @@ if "audio" in locals():
     st.pyplot(figure)
 
     st.write('Speakers and Audio Samples')
-    with st.expander('Samples', expanded=
+    with st.expander('Samples', expanded=False):
         for speaker in set(s['speaker'] for s in sp_chunks):
             temp = max(filter(lambda d: d['speaker'] == speaker, sp_chunks), key=lambda x: x['duration'])
             speak_time = sum(c['duration'] for c in filter(lambda d: d['speaker'] == speaker, sp_chunks))
@@ -240,32 +272,36 @@ if "audio" in locals():
 
     st.divider()
     # # Perform transcription with Whisper ASR
+
+
+    # Transcript containers
+    container_transcript_chat = st.container()
     st.write('Transcribing using Whisper API (150 requests limit)...')
-
+    container_transcript_completed = st.container()
 
-
-    progress_text = f"Processing 1/{len(sp_chunks[:limit])}..."
+    progress_text = f"Processing 1/{len(sp_chunks[:TRANSCRIPTION_REQUEST_LIMIT])}..."
     my_bar = st.progress(0, text=progress_text)
+    # rework the loop. Simplify if Else
     with st.expander('Transcript', expanded=True):
-        if 'transcript_file' in
-            with open(transcript_file,'r') as f:
+        if 'transcript_file' in st.session_state and st.session_state.transcript_file != None:
+            with open(st.session_state.transcript_file,'r') as f:
                 sp_chunks_loaded = json.load(f)
             for i,s in enumerate(sp_chunks_loaded):
                 if s['transcript'] != None:
-                    transcript_summary = f"{s['speaker']} start={float(s['start']):.1f}s end={float(s['end']):.1f}s: {s['transcript']}"
-                    if youtube_link != None:
+                    transcript_summary = f"**{s['speaker']}** start={float(s['start']):.1f}s end={float(s['end']):.1f}s: {s['transcript']}"
+                    if youtube_link != None and youtube_link != '':
                         transcript_summary += f" {add_query_parameter(youtube_link, {'t':str(int(s['start']))})}"
 
-                    st.
+                    st.markdown(transcript_summary)
                 progress_text = f"Processing {i+1}/{len(sp_chunks_loaded)}..."
                 my_bar.progress((i+1)/len(sp_chunks_loaded), text=progress_text)
 
             transcript_json = sp_chunks_loaded
-            transcript_path = f'
+            transcript_path = f'{audio_name.split(".mp4")[0]}-transcript.json'
 
         else:
             sp_chunks_updated = []
-            for i,s in enumerate(sp_chunks[:
+            for i,s in enumerate(sp_chunks[:TRANSCRIPTION_REQUEST_LIMIT]):
                 if s['duration'] > 0.1:
                     audio_path = s['audio'].export('temp.wav',format='wav')
                     try:
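Inside this loop each diarized chunk is exported to `temp.wav` and sent to Whisper; the request itself sits in context lines outside the hunk. A hedged sketch of what one per-chunk call typically looks like with the same pre-1.0 `openai` client used elsewhere in this diff; the model name and error handling are assumptions.

```python
import openai

def transcribe_chunk(chunk):
    """chunk is a pydub AudioSegment slice, e.g. sp_chunks[i]['audio'] above."""
    chunk.export('temp.wav', format='wav')
    with open('temp.wav', 'rb') as f:
        try:
            result = openai.Audio.transcribe('whisper-1', f)  # legacy 0.x client API
            return result['text']
        except openai.error.OpenAIError:
            return None
```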
@@ -276,7 +312,7 @@ if "audio" in locals():
 
                     if transcript !='' and transcript != None:
                         s['transcript'] = transcript
-                        transcript_summary = f"{s['speaker']} start={s['start']:.1f}s end={s['end']:.1f}s : {s['transcript']}"
+                        transcript_summary = f"**{s['speaker']}** start={s['start']:.1f}s end={s['end']:.1f}s : {s['transcript']}"
                         if youtube_link != None:
                             transcript_summary += f" {add_query_parameter(youtube_link, {'t':str(int(s['start']))})}"
 
@@ -284,27 +320,101 @@ if "audio" in locals():
                             'start':s['start'], 'end':s['end'],
                             'duration': s['duration'],'transcript': transcript})
 
-                    progress_text = f"Processing {i+1}/{len(sp_chunks[:
-                    my_bar.progress((i+1)/len(sp_chunks[:
-                    st.
+                    progress_text = f"Processing {i+1}/{len(sp_chunks[:TRANSCRIPTION_REQUEST_LIMIT])}..."
+                    my_bar.progress((i+1)/len(sp_chunks[:TRANSCRIPTION_REQUEST_LIMIT]), text=progress_text)
+                    st.markdown(transcript_summary)
 
             transcript_json = [dict((k, d[k]) for k in ['speaker','start','end','duration','transcript'] if k in d) for d in sp_chunks_updated]
-            transcript_path = f'{audio_name.split(".")[0]}-transcript.json'
+            transcript_path = f'{audio_name.split(".mp4")[0]}-transcript.json'
+            st.session_state.transcript_file = transcript_path
 
+        # save the trancript file
         with open(transcript_path,'w') as f:
             json.dump(transcript_json, f)
+
+        # generate transcript string
+        transcript_string = '\n'.join([f"{s['speaker']} start={s['start']:.1f}s end={s['end']:.1f}s : {s['transcript']}" for s in transcript_json])
 
-
+        @st.cache_data
+        def get_initial_response(transcript):
+            st.session_state.messages[1]['content'] = st.session_state.messages[1]['content'].format(transcript)
+            initial_response = openai.ChatCompletion.create(
+                model=st.session_state["openai_model"],
+                messages=st.session_state.messages
+            )
+            return initial_response['choices'][0]['message']['content']
+
+        # Chat container
+        with container_transcript_chat:
+            # get a summary of transcript from ChatGpt
+            init = get_initial_response(transcript_string)
+            # pass transcript to initial prompt
+            st.session_state.messages[1]['content'] = st.session_state.messages[1]['content'].format(transcript_string)
+
+            # LLM Chat
+            with st.expander('Summary of the Transcribed Audio File Generated by ChatGPT', expanded = True):
+                # display the AI generated summary.
+                with st.chat_message("assistant", avatar='https://upload.wikimedia.org/wikipedia/commons/0/04/ChatGPT_logo.svg'):
+                    st.write(init)
+
+            # chat field
+            with st.form("Chat",clear_on_submit=True):
+                prompt = st.text_input("Chat with the Transcript (2 prompts limit)")
+                st.form_submit_button()
+
+            # message list
+            # for message in st.session_state.messages[2:]:
+            #     with st.chat_message(message["role"]):
+            #         st.markdown(message["content"])
+
+            # make request if prompt was entered
+            if prompt:
+                st.session_state.prompt_request_counter += 1
+                if st.session_state.prompt_request_counter > PROMPT_REQUEST_LIMIT:
+                    st.warning('Exceeded prompt limit.');
+                    st.stop()
+                # append user prompt to messages
+                st.session_state.messages.append({"role": "user", "content": prompt})
+
+                # dislay user prompt
+                with st.chat_message("user"):
+                    st.markdown(prompt)
+
+                # stream LLM Assisstant response
+                with st.chat_message("assistant"):
+                    message_placeholder = st.empty()
+                    full_response = ""
+
+                    # stream response
+                    for response in openai.ChatCompletion.create(
+                        model=st.session_state["openai_model"],
+                        messages=[
+                            {"role": m["role"], "content": m["content"]}
+                            for m in st.session_state.messages
+                        ],
+                        stream=True,
+                    ):
+                        full_response += response.choices[0].delta.get("content", "")
+                        message_placeholder.markdown(full_response + "β")
+                    message_placeholder.markdown(full_response)
+
+                # append ai response to messages
+                st.session_state.messages.append({"role": "assistant", "content": full_response})
+
+        # Trancription Completed Section
+        with container_transcript_completed:
             st.info(f'Completed transcribing')
 
             @st.cache_data
             def convert_df(string):
                 # IMPORTANT: Cache the conversion to prevent computation on every rerun
                 return string.encode('utf-8')
-
+            # encode transcript string
             transcript_json_download = convert_df(json.dumps(transcript_json))
-
+            # transcript download buttons
             c1_b,c2_b = st.columns((1,2))
+
+            # json button
             with c1_b:
                 ste.download_button(
                     "Download transcript as json",
@@ -312,10 +422,12 @@ if "audio" in locals():
                     transcript_path,
                 )
 
+            # create csv string
             header = ','.join(transcript_json[0].keys()) + '\n'
             for s in transcript_json:
                 header += ','.join([str(e) if ',' not in str(e) else '"' + str(e) + '"' for e in s.values()]) + '\n'
 
+            # csv button
             transcript_csv_download = convert_df(header)
             with c2_b:
                 ste.download_button(
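The CSV offered for download is assembled by hand, quoting only values that contain commas. A hedged alternative sketch (not in the commit) using Python's csv module, which also handles embedded quotes and newlines and yields the same bytes payload `convert_df()` produces:

```python
import csv
import io

def transcript_to_csv(rows):
    # rows is transcript_json: dicts with speaker/start/end/duration/transcript keys
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=['speaker', 'start', 'end', 'duration', 'transcript'])
    writer.writeheader()
    writer.writerows(rows)
    return buf.getvalue().encode('utf-8')
```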