Spaces:

mozilla-foundation
/

youtube_video_similarity

Build error

aapot

Add demo application

f3772cc over 2 years ago

4.12 kB

	import itertools
	import random
	import requests
	import pandas as pd
	from pytube import YouTube
	from youtube_transcript_api import YouTubeTranscriptApi
	from youtube_transcript_api.formatters import TextFormatter


	def is_youtube_video_available(url):
	    """Return True if the title of the video at *url* can be read via pytube.

	    This is a best-effort probe: any failure while building the ``YouTube``
	    object or while reading its ``title`` is reported as "not available"
	    rather than raised.

	    Args:
	        url: Video URL handed to ``pytube.YouTube``.

	    Returns:
	        True when ``YouTube(url).title`` could be evaluated, False otherwise.
	    """
	    try:
	        # Build the object inside the try as well: the constructor itself can
	        # raise (e.g. for a URL pytube cannot parse), and that must also count
	        # as "unavailable" instead of crashing the caller.
	        _ = YouTube(url).title
	    except Exception:
	        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
	        return False
	    return True


	def get_example_videos(rr_examples_url, num_rr_examples, timeout=10):
	    """Build the two lists of example video pairs.

	    The first list comes from a fixed set of pairs; the second is downloaded
	    from *rr_examples_url*. Both are restricted to pairs in which both videos
	    pass ``is_youtube_video_available``.

	    Args:
	        rr_examples_url: URL queried with ``requests.get``; the response is
	            read as JSON and iterated as a sequence of mappings carrying the
	            keys ``rejected_video_id`` and ``recommendation_id``.
	        num_rr_examples: Maximum number of downloaded pairs to return. When
	            more are available, a random sample of this size is drawn.
	        timeout: Seconds to wait for the HTTP request before giving up.

	    Returns:
	        A tuple ``(example_videos, example_videos_rr)``. Each element is a
	        list of ``[url, url]`` pairs. ``example_videos_rr`` is empty when the
	        download fails or the payload does not have the expected shape.
	    """
	    availability = {}

	    def is_available(url):
	        # The same video appears in several pairs; probe each URL only once.
	        if url not in availability:
	            availability[url] = is_youtube_video_available(url)
	        return availability[url]

	    def both_available(pair):
	        return is_available(pair[0]) and is_available(pair[1])

	    candidate_videos = [
	        ['https://www.youtube.com/watch?v=WfVF-Ec4naQ',
	         'https://www.youtube.com/watch?v=4hrNt28t7Cw'],
	        ['https://www.youtube.com/watch?v=GbpjLP-UvIU',
	         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
	        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
	         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
	        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
	         'https://www.youtube.com/watch?v=9gIVGJQ3xWE'],
	    ]
	    example_videos = [pair for pair in candidate_videos if both_available(pair)]

	    try:
	        # A timeout keeps a stalled server from hanging application start-up.
	        response = requests.get(rr_examples_url, timeout=timeout)
	        response.raise_for_status()
	        # A set of tuples removes duplicate pairs (the feed contains some).
	        unique_pairs = {
	            (f'https://www.youtube.com/watch?v={ex["rejected_video_id"]}',
	             f'https://www.youtube.com/watch?v={ex["recommendation_id"]}')
	            for ex in response.json()
	        }
	    except (requests.RequestException, ValueError, KeyError, TypeError):
	        # Network/HTTP failure, invalid JSON, or an unexpected payload shape:
	        # the downloaded examples are optional, so fall back to none.
	        unique_pairs = set()

	    # Sorted for a deterministic order before the availability filter.
	    example_videos_rr = [
	        list(pair) for pair in sorted(unique_pairs) if both_available(pair)
	    ]
	    if len(example_videos_rr) > num_rr_examples:
	        example_videos_rr = random.sample(example_videos_rr, num_rr_examples)

	    return example_videos, example_videos_rr


	def get_youtube_embedded_html(embed_url, video_position):
	    """Return an HTML snippet that labels and embeds one video.

	    Args:
	        embed_url: Value placed in the ``src`` attribute of the ``<iframe>``.
	        video_position: Value shown after "Video" in the paragraph above the
	            player.

	    Returns:
	        A string holding a ``<p>`` caption followed by the ``<iframe>``.
	    """
	    markup = f'''
	<p>Video {video_position}</p>
	<iframe width="100%" height="360px" src="{embed_url}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; fullscreen" allowfullscreen></iframe>
	'''
	    return markup


	def update_youtube_embedded_html(video_url, video_position):
	    """Return embed markup for *video_url*, or an error paragraph on failure.

	    Args:
	        video_url: Video URL handed to ``pytube.YouTube``.
	        video_position: Forwarded to ``get_youtube_embedded_html`` as the
	            caption number.

	    Returns:
	        The markup produced by ``get_youtube_embedded_html`` when the embed
	        URL could be resolved; otherwise a ``<p>`` element reporting the
	        failure together with the (HTML-escaped) URL.
	    """
	    # Function-scope import so this block works without touching the file's
	    # import section.
	    import html

	    try:
	        embed_url = YouTube(video_url).embed_url
	    except Exception:
	        # video_url is echoed back into markup; escape it so text such as
	        # "<script>" is displayed literally instead of being interpreted.
	        safe_url = html.escape(str(video_url))
	        return f'''
	<p>There was error in fetching details for video with the URL: {safe_url}</p>
	'''
	    return get_youtube_embedded_html(embed_url, video_position)


	def get_youtube_video_data(url):
	    """Collect channel id, title, description and transcript text for a video.

	    Args:
	        url: Video URL handed to ``pytube.YouTube``.

	    Returns:
	        A 4-tuple ``(channel_id, title, description, transcript)``. The
	        transcript is a single string with newlines replaced by spaces, or
	        ``None`` when listing or downloading the transcript fails.

	    Raises:
	        Whatever ``pytube`` raises while constructing the video object or
	        reading its metadata; those errors are deliberately not caught here.
	    """
	    # Languages tried first, in this order; any other language the video
	    # offers is appended afterwards as a fallback.
	    preferred_langs = ['en', 'en-US', 'es', 'de']

	    video = YouTube(url)
	    channel_id = video.channel_id
	    video_title = video.title
	    video_description = video.description

	    try:
	        transcript_list = YouTubeTranscriptApi.list_transcripts(video.video_id)
	        fallback_langs = [
	            tr.language_code for tr in transcript_list
	            if tr.language_code not in preferred_langs
	        ]
	        # Kept inside the try: a failed download should degrade to "no
	        # transcript" exactly like a failed listing does.
	        raw_transcript = YouTubeTranscriptApi.get_transcript(
	            video.video_id, languages=preferred_langs + fallback_langs)
	    except Exception:
	        return channel_id, video_title, video_description, None

	    video_transcript = TextFormatter().format_transcript(
	        raw_transcript).replace('\n', ' ')
	    return channel_id, video_title, video_description, video_transcript


	def get_input_data_df(video1_url, video2_url):
	    """Assemble a one-row DataFrame describing a pair of videos.

	    Data for both URLs is obtained through ``get_youtube_video_data``. The
	    first video fills the ``regret_*`` columns, the second fills the
	    ``recommendation_*`` columns, and ``channel_sim`` is 1 when both videos
	    report the same channel id and 0 otherwise.

	    Args:
	        video1_url: URL of the video stored in the ``regret_*`` columns.
	        video2_url: URL of the video stored in the ``recommendation_*``
	            columns.

	    Returns:
	        A ``pandas.DataFrame`` with exactly one row and seven columns.
	    """
	    regret_channel, regret_title, regret_desc, regret_text = \
	        get_youtube_video_data(video1_url)
	    rec_channel, rec_title, rec_desc, rec_text = \
	        get_youtube_video_data(video2_url)

	    if regret_channel == rec_channel:
	        same_channel = 1
	    else:
	        same_channel = 0

	    columns = [
	        'regret_title',
	        'regret_description',
	        'regret_transcript',
	        'recommendation_title',
	        'recommendation_description',
	        'recommendation_transcript',
	        'channel_sim',
	    ]
	    row = [
	        regret_title,
	        regret_desc,
	        regret_text,
	        rec_title,
	        rec_desc,
	        rec_text,
	        same_channel,
	    ]
	    return pd.DataFrame([row], columns=columns)