youngtsai committed on
Commit
2c1b15d
·
1 Parent(s): 6b24245

def get_video_duration(video_id):

Browse files
Files changed (1) hide show
  1. app.py +14 -5
app.py CHANGED
@@ -415,7 +415,14 @@ def generate_transcription_by_whisper(video_id):
415
 
416
  def get_video_duration(video_id):
417
  yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
418
- return yt.length
 
 
 
 
 
 
 
419
 
420
  def process_transcript_and_screenshots_on_gcs(video_id):
421
  print("====process_transcript_and_screenshots_on_gcs====")
@@ -428,7 +435,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
428
  # 检查逐字稿是否存在
429
  is_new_transcript = False
430
  is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
431
- video_length = get_video_duration(video_id)
432
  if not is_transcript_exists:
433
  print("逐字稿文件不存在于GCS中,重新建立")
434
  # 从YouTube获取逐字稿并上传
@@ -444,8 +451,9 @@ def process_transcript_and_screenshots_on_gcs(video_id):
444
  else:
445
  print("沒有找到字幕")
446
  transcript = generate_transcription_by_whisper(video_id)
447
-
448
- transcript = [entry for entry in transcript if entry['start'] <= video_length]
 
449
  transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
450
  GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
451
 
@@ -455,7 +463,8 @@ def process_transcript_and_screenshots_on_gcs(video_id):
455
  print("逐字稿已存在于GCS中")
456
  transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
457
  transcript = json.loads(transcript_text)
458
- transcript = [entry for entry in transcript if entry['start'] <= video_length]
 
459
 
460
  # print("===確認其他衍生文件===")
461
  # source = "gcs"
 
415
 
416
  def get_video_duration(video_id):
417
  yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
418
+ try:
419
+ video_duration = yt.length
420
+ except:
421
+ video_duration = None
422
+
423
+ print(f"video_duration: {video_duration}")
424
+
425
+ return video_duration
426
 
427
  def process_transcript_and_screenshots_on_gcs(video_id):
428
  print("====process_transcript_and_screenshots_on_gcs====")
 
435
  # 检查逐字稿是否存在
436
  is_new_transcript = False
437
  is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
438
+ video_duration = get_video_duration(video_id)
439
  if not is_transcript_exists:
440
  print("逐字稿文件不存在于GCS中,重新建立")
441
  # 从YouTube获取逐字稿并上传
 
451
  else:
452
  print("沒有找到字幕")
453
  transcript = generate_transcription_by_whisper(video_id)
454
+ if video_duration:
455
+ transcript = [entry for entry in transcript if entry['start'] <= video_duration]
456
+
457
  transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
458
  GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
459
 
 
463
  print("逐字稿已存在于GCS中")
464
  transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
465
  transcript = json.loads(transcript_text)
466
+ if video_duration:
467
+ transcript = [entry for entry in transcript if entry['start'] <= video_duration]
468
 
469
  # print("===確認其他衍生文件===")
470
  # source = "gcs"