File size: 6,675 Bytes

a45bd3f

import requests
try: # pragma: no cover
    import http.cookiejar as cookiejar
    CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
except ImportError: # pragma: no cover
    import cookielib as cookiejar
    CookieLoadError = IOError

from ._transcripts import TranscriptListFetcher

from ._errors import (
    CookiePathInvalid,
    CookiesInvalid
)


class YouTubeTranscriptApi(object):
    @classmethod
    def list_transcripts(cls, video_id, proxies=None, cookies=None):
        """
        Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
        which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
        over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
        metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
        `transcript.translate('en')`. Example::

            # retrieve the available transcripts
            transcript_list = YouTubeTranscriptApi.get('video_id')

            # iterate over all available transcripts
            for transcript in transcript_list:
                # the Transcript object provides metadata properties
                print(
                    transcript.video_id,
                    transcript.language,
                    transcript.language_code,
                    # whether it has been manually created or generated by YouTube
                    transcript.is_generated,
                    # a list of languages the transcript can be translated to
                    transcript.translation_languages,
                )

                # fetch the actual transcript data
                print(transcript.fetch())

                # translating the transcript will return another transcript object
                print(transcript.translate('en').fetch())

            # you can also directly filter for the language you are looking for, using the transcript list
            transcript = transcript_list.find_transcript(['de', 'en'])

            # or just filter for manually created transcripts
            transcript = transcript_list.find_manually_created_transcript(['de', 'en'])

            # or automatically generated ones
            transcript = transcript_list.find_generated_transcript(['de', 'en'])

        :param video_id: the youtube video id
        :type video_id: str
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :return: the list of available transcripts
        :rtype TranscriptList:
        """
        with requests.Session() as http_client:
            if cookies:
                http_client.cookies = cls._load_cookies(cookies, video_id)
            http_client.proxies = proxies if proxies else {}
            return TranscriptListFetcher(http_client).fetch(video_id)

    @classmethod
    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
        """
        Retrieves the transcripts for a list of videos.

        :param video_ids: a list of youtube video ids
        :type video_ids: list[str]
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
        do so.
        :type languages: list[str]
        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
        one of the video transcripts
        :type continue_after_error: bool
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
        video ids, which could not be retrieved
        :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
        """
        data = {}
        unretrievable_videos = []

        for video_id in video_ids:
            try:
                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
            except Exception as exception:
                if not continue_after_error:
                    raise exception

                unretrievable_videos.append(video_id)

        return data, unretrievable_videos

    @classmethod
    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
        """
        Retrieves the transcript for a single video. This is just a shortcut for calling::

            YouTubeTranscriptApi.list_transcripts(video_id, proxies).find_transcript(languages).fetch()

        :param video_id: the youtube video id
        :type video_id: str
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
        do so.
        :type languages: list[str]
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'end': float}]:
        """
        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
    
    @classmethod
    def _load_cookies(cls, cookies, video_id):
        try:
            cookie_jar = cookiejar.MozillaCookieJar()
            cookie_jar.load(cookies)
            if not cookie_jar:
                raise CookiesInvalid(video_id)
            return cookie_jar
        except CookieLoadError:
            raise CookiePathInvalid(video_id)