Spaces:
Build error
Build error
import itertools | |
import random | |
import requests | |
import pandas as pd | |
from pytube import YouTube | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from youtube_transcript_api.formatters import TextFormatter | |
def is_youtube_video_available(url):
    """Report whether the YouTube video at ``url`` can currently be loaded.

    The probe builds a pytube ``YouTube`` object and reads its ``title``;
    any failure along the way is reported as "not available".

    Args:
        url: Watch URL of the video to probe.

    Returns:
        True if the title could be read, False otherwise.
    """
    try:
        # Building the object can fail as well as reading the title, so both
        # steps sit inside the try and a bad URL yields False, not a crash.
        _ = YouTube(url).title
    except Exception:
        # Deliberately broad: this is a best-effort probe and every failure
        # means the same thing to the caller. Unlike a bare ``except`` it
        # still lets KeyboardInterrupt / SystemExit propagate.
        return False
    return True
def get_example_videos(rr_examples_url, num_rr_examples):
    """Collect example video pairs: a built-in set and a fetched set.

    The built-in pairs are hard-coded below. The fetched pairs are read as
    JSON from ``rr_examples_url``; each entry must provide the keys
    ``rejected_video_id`` and ``recommendation_id``. Pairs in which either
    video fails ``is_youtube_video_available`` are dropped from both lists.

    Args:
        rr_examples_url: URL returning the JSON list of fetched examples.
        num_rr_examples: Upper bound on the number of fetched pairs returned;
            when more pairs survive filtering, a random sample of this size
            is drawn.

    Returns:
        A tuple ``(example_videos, example_videos_rr)``, each a list of
        ``[first_url, second_url]`` pairs.
    """
    availability = {}

    def both_available(pair):
        # Several pairs share a video, so remember each URL's result instead
        # of probing it again. Stops at the first unavailable video.
        for url in pair:
            if url not in availability:
                availability[url] = is_youtube_video_available(url)
            if not availability[url]:
                return False
        return True

    example_videos = [
        ['https://www.youtube.com/watch?v=WfVF-Ec4naQ',
         'https://www.youtube.com/watch?v=4hrNt28t7Cw'],
        ['https://www.youtube.com/watch?v=GbpjLP-UvIU',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=9gIVGJQ3xWE'],
    ]
    example_videos = [pair for pair in example_videos if both_available(pair)]

    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(rr_examples_url, timeout=10)
        # A set removes duplicate pairs (the feed seems to contain one).
        unique_pairs = {
            (f'https://www.youtube.com/watch?v={ex["rejected_video_id"]}',
             f'https://www.youtube.com/watch?v={ex["recommendation_id"]}')
            for ex in response.json()
        }
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: a failed request, a non-JSON body or an unexpected
        # payload shape all mean "no fetched examples".
        unique_pairs = set()

    # Sorting keeps the pre-sampling order deterministic.
    example_videos_rr = [list(pair) for pair in sorted(unique_pairs)]
    example_videos_rr = [
        pair for pair in example_videos_rr if both_available(pair)]

    if len(example_videos_rr) > num_rr_examples:
        example_videos_rr = random.sample(example_videos_rr, num_rr_examples)
    return example_videos, example_videos_rr
def get_youtube_embedded_html(embed_url, video_position):
    """Return the HTML snippet that embeds a YouTube player.

    Args:
        embed_url: URL placed in the iframe's ``src`` attribute.
        video_position: Label shown after "Video " in the caption.

    Returns:
        A string holding a ``<p>`` caption followed by the ``<iframe>``.
    """
    from html import escape

    # Both values end up inside markup; escaping keeps a stray quote or angle
    # bracket from breaking out of the attribute or the paragraph.
    safe_url = escape(str(embed_url), quote=True)
    safe_position = escape(str(video_position))
    return f'''
    <p>Video {safe_position}</p>
    <iframe width="100%" height="360px" src="{safe_url}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; fullscreen" allowfullscreen></iframe>
    '''
def update_youtube_embedded_html(video_url, video_position):
    """Return embed HTML for ``video_url``, or an error paragraph on failure.

    Args:
        video_url: URL of the video to embed.
        video_position: Label forwarded to ``get_youtube_embedded_html``.

    Returns:
        The player snippet from ``get_youtube_embedded_html`` when the embed
        URL could be derived; otherwise a ``<p>`` element reporting the URL
        that failed.
    """
    from html import escape

    try:
        embed_url = YouTube(video_url).embed_url
    except Exception:
        # Any failure to resolve the video is reported the same way. The URL
        # is echoed back into HTML, so it is escaped first.
        return f'''
    <p>There was error in fetching details for video with the URL: {escape(str(video_url))}</p>
    '''
    return get_youtube_embedded_html(embed_url, video_position)
def get_youtube_video_data(url):
    """Fetch channel id, title, description and transcript text for a video.

    Transcript languages are tried in the order en, en-US, es, de, followed
    by every other language the video offers.

    Args:
        url: URL of the YouTube video.

    Returns:
        A tuple ``(channel_id, video_title, video_description,
        video_transcript)``. ``video_transcript`` is the transcript text with
        newlines replaced by spaces, or None when no transcript could be
        retrieved.
    """
    preferred_langs = ['en', 'en-US', 'es', 'de']

    video = YouTube(url)
    channel_id = video.channel_id
    video_title = video.title
    video_description = video.description

    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video.video_id)
        fallback_langs = [
            tr.language_code for tr in transcript_list
            if tr.language_code not in preferred_langs
        ]
        # Pick the transcript from the listing already in hand rather than
        # asking the API to list the video's transcripts a second time.
        transcript = transcript_list.find_transcript(
            preferred_langs + fallback_langs).fetch()
    except Exception:
        # Listing and downloading can both fail (no transcripts, disabled
        # captions, network trouble); each case means "no transcript".
        return channel_id, video_title, video_description, None

    video_transcript = TextFormatter().format_transcript(
        transcript).replace('\n', ' ')
    return channel_id, video_title, video_description, video_transcript
def get_input_data_df(video1_url, video2_url):
    """Build a one-row DataFrame describing a pair of videos.

    The first video fills the ``regret_*`` columns and the second the
    ``recommendation_*`` columns. ``channel_sim`` is 1 when both videos
    report the same channel id and 0 otherwise.

    Args:
        video1_url: URL of the first ("regret") video.
        video2_url: URL of the second ("recommendation") video.

    Returns:
        A pandas DataFrame with exactly one row.
    """
    (regret_channel, regret_title,
     regret_description, regret_transcript) = get_youtube_video_data(video1_url)
    (rec_channel, rec_title,
     rec_description, rec_transcript) = get_youtube_video_data(video2_url)

    column_names = [
        'regret_title',
        'regret_description',
        'regret_transcript',
        'recommendation_title',
        'recommendation_description',
        'recommendation_transcript',
        'channel_sim',
    ]
    same_channel = regret_channel == rec_channel
    row = [
        regret_title,
        regret_description,
        regret_transcript,
        rec_title,
        rec_description,
        rec_transcript,
        int(bool(same_channel)),
    ]
    return pd.DataFrame([row], columns=column_names)