Spaces:
Sleeping
Sleeping
def get_video_duration(video_id):
Browse files
app.py
CHANGED
@@ -415,7 +415,14 @@ def generate_transcription_by_whisper(video_id):
|
|
415 |
|
416 |
def get_video_duration(video_id):
|
417 |
yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
|
418 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
419 |
|
420 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
421 |
print("====process_transcript_and_screenshots_on_gcs====")
|
@@ -428,7 +435,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
428 |
# 检查逐字稿是否存在
|
429 |
is_new_transcript = False
|
430 |
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
431 |
-
|
432 |
if not is_transcript_exists:
|
433 |
print("逐字稿文件不存在于GCS中,重新建立")
|
434 |
# 从YouTube获取逐字稿并上传
|
@@ -444,8 +451,9 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
444 |
else:
|
445 |
print("沒有找到字幕")
|
446 |
transcript = generate_transcription_by_whisper(video_id)
|
447 |
-
|
448 |
-
|
|
|
449 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
450 |
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
451 |
|
@@ -455,7 +463,8 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
455 |
print("逐字稿已存在于GCS中")
|
456 |
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
457 |
transcript = json.loads(transcript_text)
|
458 |
-
|
|
|
459 |
|
460 |
# print("===確認其他衍生文件===")
|
461 |
# source = "gcs"
|
|
|
415 |
|
416 |
def get_video_duration(video_id):
    """Look up the length of a YouTube video on a best-effort basis.

    Args:
        video_id: The YouTube video id, interpolated into a
            ``https://www.youtube.com/watch?v=<id>`` URL.

    Returns:
        The value of ``YouTube(...).length`` for the video, or ``None`` when
        reading it fails. Callers compare this against transcript ``start``
        values, so it is presumably in seconds -- TODO confirm against the
        ``YouTube`` class this file imports.
    """
    yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
    try:
        video_duration = yt.length
    except Exception as error:
        # Best-effort: a failed lookup must not abort transcript processing,
        # so fall back to None. Catching Exception (rather than a bare
        # ``except``) lets KeyboardInterrupt/SystemExit propagate, and the
        # reason is printed instead of being silently discarded.
        print(f"get_video_duration failed for {video_id}: {error}")
        video_duration = None

    print(f"video_duration: {video_duration}")

    return video_duration
|
426 |
|
427 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
428 |
print("====process_transcript_and_screenshots_on_gcs====")
|
|
|
435 |
# 检查逐字稿是否存在
|
436 |
is_new_transcript = False
|
437 |
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
438 |
+
video_duration = get_video_duration(video_id)
|
439 |
if not is_transcript_exists:
|
440 |
print("逐字稿文件不存在于GCS中,重新建立")
|
441 |
# 从YouTube获取逐字稿并上传
|
|
|
451 |
else:
|
452 |
print("沒有找到字幕")
|
453 |
transcript = generate_transcription_by_whisper(video_id)
|
454 |
+
if video_duration:
|
455 |
+
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
456 |
+
|
457 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
458 |
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
459 |
|
|
|
463 |
print("逐字稿已存在于GCS中")
|
464 |
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
465 |
transcript = json.loads(transcript_text)
|
466 |
+
if video_duration:
|
467 |
+
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
468 |
|
469 |
# print("===確認其他衍生文件===")
|
470 |
# source = "gcs"
|