import itertools
import random

import pandas as pd
import requests
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

# Prefix shared by every watch URL built from a bare video id.
_YOUTUBE_WATCH_URL = 'https://www.youtube.com/watch?v='

# Transcript languages tried first, in this order, before any other language
# the video happens to offer.
_PREFERRED_TRANSCRIPT_LANGS = ['en', 'en-US', 'es', 'de']

# Upper bound (seconds) on the remote examples request so that an
# unresponsive server cannot block the caller forever.
_REQUEST_TIMEOUT_SECONDS = 10


def is_youtube_video_available(url):
    """Return True if the title of the video at *url* can be fetched.

    Any ordinary error while building the ``YouTube`` object or reading its
    title is treated as "not available" and yields False.
    """
    try:
        # The constructor is inside the ``try`` on purpose: it parses the URL
        # and can raise on its own for a malformed URL or video id.
        _ = YouTube(url).title
    except Exception:
        return False
    return True


def _both_videos_available(pair):
    """Return True if both URLs of a ``[first, second]`` pair are available."""
    return (is_youtube_video_available(pair[0])
            and is_youtube_video_available(pair[1]))


def get_example_videos(rr_examples_url, num_rr_examples):
    """Build the built-in and the remotely fetched example video pairs.

    Args:
        rr_examples_url: URL expected to return a JSON list of objects with
            ``rejected_video_id`` and ``recommendation_id`` keys.
        num_rr_examples: Maximum number of remote pairs to return; when more
            are available a random sample of this size is drawn.

    Returns:
        A tuple ``(example_videos, example_videos_rr)``. Each element is a
        list of ``[first_url, second_url]`` pairs in which both videos passed
        ``is_youtube_video_available``. ``example_videos_rr`` is empty when
        the remote examples cannot be fetched or parsed.
    """
    example_videos = [
        ['https://www.youtube.com/watch?v=WfVF-Ec4naQ',
         'https://www.youtube.com/watch?v=4hrNt28t7Cw'],
        ['https://www.youtube.com/watch?v=GbpjLP-UvIU',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=9gIVGJQ3xWE'],
    ]
    example_videos = [
        pair for pair in example_videos if _both_videos_available(pair)
    ]

    # Fetching the remote examples is best effort: a network failure, a
    # timeout, a non-JSON body or entries without the expected keys all
    # degrade to "no remote examples" instead of failing the caller.
    try:
        response = requests.get(
            rr_examples_url, timeout=_REQUEST_TIMEOUT_SECONDS)
        example_videos_rr = [
            [f'{_YOUTUBE_WATCH_URL}{ex["rejected_video_id"]}',
             f'{_YOUTUBE_WATCH_URL}{ex["recommendation_id"]}']
            for ex in response.json()
        ]
    except Exception:
        example_videos_rr = []

    # Remove duplicate video pairs: groupby only merges adjacent equal items,
    # hence the sort first.
    example_videos_rr.sort()
    example_videos_rr = [
        pair for pair, _ in itertools.groupby(example_videos_rr)
    ]

    example_videos_rr = [
        pair for pair in example_videos_rr if _both_videos_available(pair)
    ]

    if len(example_videos_rr) > num_rr_examples:
        example_videos_rr = random.sample(example_videos_rr, num_rr_examples)

    return example_videos, example_videos_rr


def get_youtube_embedded_html(embed_url, video_position):
    """Return the text snippet shown for the video at *video_position*.

    NOTE(review): ``embed_url`` is accepted but not interpolated into the
    template as it appears here; the template may originally have contained
    markup referencing it -- confirm against the original file.
    """
    return f'''

Video {video_position}

'''


def update_youtube_embedded_html(video_url, video_position):
    """Return the snippet for *video_url*, or an error message on failure.

    The embed URL is resolved through pytube; if that raises, a message
    naming the offending URL is returned instead of propagating the error.
    """
    try:
        embed_url = YouTube(video_url).embed_url
    except Exception:
        return f'''

There was error in fetching details for video with the URL: {video_url}

'''
    return get_youtube_embedded_html(embed_url, video_position)


def get_youtube_video_data(url):
    """Fetch channel id, title, description and transcript text of a video.

    Returns:
        A tuple ``(channel_id, title, description, transcript)``.
        ``transcript`` is None when the transcript listing cannot be
        retrieved; otherwise it is the transcript formatted as plain text
        with newlines replaced by spaces.
    """
    video = YouTube(url)
    channel_id = video.channel_id
    video_title = video.title
    video_description = video.description

    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video.video_id)
    except Exception:
        return channel_id, video_title, video_description, None

    # Preferred languages come first; every other language the video offers
    # is appended as a fallback.
    fallback_langs = [
        tr.language_code
        for tr in transcript_list
        if tr.language_code not in _PREFERRED_TRANSCRIPT_LANGS
    ]
    raw_transcript = YouTubeTranscriptApi.get_transcript(
        video.video_id,
        languages=_PREFERRED_TRANSCRIPT_LANGS + fallback_langs,
    )
    video_transcript = TextFormatter().format_transcript(
        raw_transcript).replace('\n', ' ')

    return channel_id, video_title, video_description, video_transcript


def get_input_data_df(video1_url, video2_url):
    """Build a one-row DataFrame describing a pair of videos.

    The row holds title, description and transcript of the first video
    (``regret_*`` columns) and of the second video (``recommendation_*``
    columns), plus ``channel_sim``, which is 1 when both videos share a
    channel id and 0 otherwise.
    """
    channel_id, video_title, video_description, video_transcript = (
        get_youtube_video_data(video1_url))
    channel_id2, video_title2, video_description2, video_transcript2 = (
        get_youtube_video_data(video2_url))

    channel_sim = 1 if channel_id == channel_id2 else 0

    columns = [
        'regret_title', 'regret_description', 'regret_transcript',
        'recommendation_title', 'recommendation_description',
        'recommendation_transcript', 'channel_sim',
    ]
    row = [
        video_title, video_description, video_transcript,
        video_title2, video_description2, video_transcript2,
        channel_sim,
    ]
    return pd.DataFrame([row], columns=columns)