Ferdowsi
/

pytube

Model card Files Files and versions Community

Taylor Fox Dahlin committed on May 9, 2021

Commit

f293f76

unverified ·

1 Parent(s): d7f6508

Initial implementation of Channel object (#932)

* Implements a Channel object for downloading videos from a YouTube channel.

* Minor changes to the Playlist class to make it easier to subclass.

* `.videos` and `.video_urls` now behave just like iterable lists, but defer web requests.

* Implements DeferredGeneratorList which converts generators to lazy list-like objects.

Files changed (8) hide show

pytube/__init__.py +1 -0
pytube/contrib/channel.py +137 -0
pytube/contrib/playlist.py +34 -15
pytube/extract.py +31 -0
pytube/helpers.py +95 -0
tests/conftest.py +20 -3
tests/contrib/test_channel.py +54 -0
tests/mocks/channel-videos.html.gz +0 -0

pytube/__init__.py CHANGED Viewed

@@ -15,3 +15,4 @@ from pytube.captions import Caption
 from pytube.query import CaptionQuery, StreamQuery
 from pytube.__main__ import YouTube
 from pytube.contrib.playlist import Playlist

 from pytube.query import CaptionQuery, StreamQuery
 from pytube.__main__ import YouTube
 from pytube.contrib.playlist import Playlist
+from pytube.contrib.channel import Channel

pytube/contrib/channel.py ADDED Viewed

	@@ -0,0 +1,137 @@

+# -*- coding: utf-8 -*-
+"""Module for interacting with a user's youtube channel."""
+import json
+import logging
+from typing import Dict, List, Optional, Tuple
+from pytube import extract, Playlist, request
+from pytube.helpers import uniqueify
+logger = logging.getLogger(__name__)
+class Channel(Playlist):
+    def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None):
+        super().__init__(url, proxies)
+        self.channel_name = extract.channel_name(url)
+        self.channel_url = (
+            f"https://www.youtube.com/c/{self.channel_name}"
+        )
+        self.videos_url = self.channel_url + '/videos'
+        self.playlists_url = self.channel_url + '/playlists'
+        self.community_url = self.channel_url + '/community'
+        self.featured_channels_url = self.channel_url + '/channels'
+        self.about_url = self.channel_url + '/about'
+        # Possible future additions
+        self._playlists_html = None
+        self._community_html = None
+        self._featured_channels_html = None
+        self._about_html = None
+    @property
+    def html(self):
+        if self._html:
+            return self._html
+        self._html = request.get(self.videos_url)
+        return self._html
+    @property
+    def playlists_html(self):
+        if self._playlists_html:
+            return self._playlists_html
+        else:
+            self._playlists_html = request.get(self.playlists_url)
+            return self._playlists_html
+    @property
+    def community_html(self):
+        if self._community_html:
+            return self._community_html
+        else:
+            self._community_html = request.get(self.community_url)
+            return self._community_html
+    @property
+    def featured_channels_html(self):
+        if self._featured_channels_html:
+            return self._featured_channels_html
+        else:
+            self._featured_channels_html = request.get(self.featured_channels_url)
+            return self._featured_channels_html
+    @property
+    def about_html(self):
+        if self._about_html:
+            return self._about_html
+        else:
+            self._about_html = request.get(self.about_url)
+            return self._about_html
+    @staticmethod
+    def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
+        """Extracts videos from a raw json page
+        :param str raw_json: Input json extracted from the page or the last
+            server response
+        :rtype: Tuple[List[str], Optional[str]]
+        :returns: Tuple containing a list of up to 100 video watch ids and
+            a continuation token, if more videos are available
+        """
+        initial_data = json.loads(raw_json)
+        # this is the json tree structure, if the json was extracted from
+        # html
+        try:
+            videos = initial_data["contents"][
+                "twoColumnBrowseResultsRenderer"][
+                "tabs"][1]["tabRenderer"]["content"][
+                "sectionListRenderer"]["contents"][0][
+                "itemSectionRenderer"]["contents"][0][
+                "gridRenderer"]["items"]
+        except (KeyError, IndexError, TypeError):
+            try:
+                # this is the json tree structure, if the json was directly sent
+                # by the server in a continuation response
+                important_content = initial_data[1]['response']['onResponseReceivedActions'][
+                    0
+                ]['appendContinuationItemsAction']['continuationItems']
+                videos = important_content
+            except (KeyError, IndexError, TypeError):
+                try:
+                    # this is the json tree structure, if the json was directly sent
+                    # by the server in a continuation response
+                    # no longer a list and no longer has the "response" key
+                    important_content = initial_data['onResponseReceivedActions'][0][
+                        'appendContinuationItemsAction']['continuationItems']
+                    videos = important_content
+                except (KeyError, IndexError, TypeError) as p:
+                    logger.info(p)
+                    return [], None
+        try:
+            continuation = videos[-1]['continuationItemRenderer'][
+                'continuationEndpoint'
+            ]['continuationCommand']['token']
+            videos = videos[:-1]
+        except (KeyError, IndexError):
+            # if there is an error, no continuation is available
+            continuation = None
+        # remove duplicates
+        return (
+            uniqueify(
+                list(
+                    # only extract the video ids from the video data
+                    map(
+                        lambda x: (
+                            f"/watch?v="
+                            f"{x['gridVideoRenderer']['videoId']}"
+                        ),
+                        videos
+                    )
+                ),
+            ),
+            continuation,
+        )

pytube/contrib/playlist.py CHANGED Viewed

@@ -7,7 +7,7 @@ from datetime import date, datetime
 from typing import Dict, Iterable, List, Optional, Tuple, Union
 from pytube import extract, request, YouTube
-from pytube.helpers import cache, install_proxy, regex_search, uniqueify
 logger = logging.getLogger(__name__)
@@ -19,15 +19,24 @@ class Playlist(Sequence):
         if proxies:
             install_proxy(proxies)
         # These need to be initialized as None for the properties.
         self._html = None
         self._ytcfg = None
-        self.playlist_id = extract.playlist_id(url)
-        self.playlist_url = (
-            f"https://www.youtube.com/playlist?list={self.playlist_id}"
-        )
     @property
     def html(self):
@@ -175,7 +184,7 @@ class Playlist(Sequence):
                     'appendContinuationItemsAction']['continuationItems']
                 videos = important_content
             except (KeyError, IndexError, TypeError) as p:
-                print(p)
                 return [], None
         try:
@@ -218,27 +227,37 @@ class Playlist(Sequence):
         for page in self._paginate(until_watch_id=video_id):
             yield from (self._video_url(watch_path) for watch_path in page)
     @property  # type: ignore
     @cache
-    def video_urls(self) -> List[str]:
         """Complete links of all the videos in playlist
         :rtype: List[str]
         :returns: List of video URLs
         """
-        return [
-            self._video_url(video)
-            for page in list(self._paginate())
-            for video in page
-        ]
     @property
     def videos(self) -> Iterable[YouTube]:
         """Yields YouTube objects of videos in this playlist
-        :Yields: YouTube
         """
-        yield from (YouTube(url) for url in self.video_urls)
     def __getitem__(self, i: Union[slice, int]) -> Union[str, List[str]]:
         return self.video_urls[i]
@@ -247,7 +266,7 @@ class Playlist(Sequence):
         return len(self.video_urls)
     def __repr__(self) -> str:
-        return f"{self.video_urls}"
     @property
     @cache

 from typing import Dict, Iterable, List, Optional, Tuple, Union
 from pytube import extract, request, YouTube
+from pytube.helpers import cache, DeferredGeneratorList, install_proxy, regex_search, uniqueify
 logger = logging.getLogger(__name__)
         if proxies:
             install_proxy(proxies)
+        self._input_url = url
         # These need to be initialized as None for the properties.
         self._html = None
         self._ytcfg = None
+        self._playlist_id = None
+    @property
+    def playlist_id(self):
+        if self._playlist_id:
+            return self._playlist_id
+        self._playlist_id = extract.playlist_id(self._input_url)
+        return self._playlist_id
+    @property
+    def playlist_url(self):
+        return f"https://www.youtube.com/playlist?list={self.playlist_id}"
     @property
     def html(self):
                     'appendContinuationItemsAction']['continuationItems']
                 videos = important_content
             except (KeyError, IndexError, TypeError) as p:
+                logger.info(p)
                 return [], None
         try:
         for page in self._paginate(until_watch_id=video_id):
             yield from (self._video_url(watch_path) for watch_path in page)
+    def url_generator(self):
+        """Generator that yields video URLs.
+        :Yields: Video URLs
+        """
+        for page in self._paginate():
+            for video in page:
+                yield self._video_url(video)
     @property  # type: ignore
     @cache
+    def video_urls(self) -> DeferredGeneratorList:
         """Complete links of all the videos in playlist
         :rtype: List[str]
         :returns: List of video URLs
         """
+        return DeferredGeneratorList(self.url_generator())
+    def videos_generator(self):
+        for url in self.video_urls:
+            yield YouTube(url)
     @property
     def videos(self) -> Iterable[YouTube]:
         """Yields YouTube objects of videos in this playlist
+        :rtype: List[YouTube]
+        :returns: List of YouTube
         """
+        return DeferredGeneratorList(self.videos_generator())
     def __getitem__(self, i: Union[slice, int]) -> Union[str, List[str]]:
         return self.video_urls[i]
         return len(self.video_urls)
     def __repr__(self) -> str:
+        return f"{repr(self.video_urls)}"
     @property
     @cache

pytube/extract.py CHANGED Viewed

@@ -178,6 +178,37 @@ def playlist_id(url: str) -> str:
     return parse_qs(parsed.query)['list'][0]
 def video_info_url(video_id: str, watch_url: str) -> str:
     """Construct the video_info url.

     return parse_qs(parsed.query)['list'][0]
+def channel_name(url: str) -> str:
+    """Extract the ``channel_name`` or ``channel_id`` from a YouTube url.
+    This function supports the following patterns:
+    - :samp:`https://youtube.com/c/{channel_name}/*`
+    - :samp:`https://youtube.com/channel/{channel_id}/*
+    :param str url:
+        A YouTube url containing a channel name.
+    :rtype: str
+    :returns:
+        YouTube channel name.
+    """
+    patterns = [
+        r"(?:\/c\/([\d\w_\-]+)(\/.*)?)",
+        r"(?:\/channel\/([\w\d_\-]+)(\/.*)?)"
+    ]
+    for pattern in patterns:
+        regex = re.compile(pattern)
+        function_match = regex.search(url)
+        if function_match:
+            logger.debug("finished regex search, matched: %s", pattern)
+            channel_id = function_match.group(1)
+            return channel_id
+    raise RegexMatchError(
+        caller="channel_name", pattern="patterns"
+    )
 def video_info_url(video_id: str, watch_url: str) -> str:
     """Construct the video_info url.

pytube/helpers.py CHANGED Viewed

@@ -14,6 +14,101 @@ from pytube.exceptions import RegexMatchError
 logger = logging.getLogger(__name__)
 def regex_search(pattern: str, string: str, group: int) -> str:
     """Shortcut method to search a string for a given pattern.

 logger = logging.getLogger(__name__)
+class DeferredGeneratorList:
+    """A wrapper class for deferring list generation.
+    Pytube has some continuation generators that create web calls, which means
+    that any time a full list is requested, all of those web calls must be
+    made at once, which could lead to slowdowns. This will allow individual
+    elements to be queried, so that slowdowns only happen as necessary. For
+    example, you can iterate over elements in the list without accessing them
+    all simultaneously. This should allow for speed improvements for playlist
+    and channel interactions.
+    """
+    def __init__(self, generator):
+        """Construct a :class:`DeferredGeneratorList <DeferredGeneratorList>`.
+        :param generator generator:
+            The deferrable generator to create a wrapper for.
+        :param func func:
+            (Optional) A function to call on the generator items to produce the list.
+        """
+        self.gen = generator
+        self._elements = []
+    def __eq__(self, other):
+        """We want to mimic list behavior for comparison."""
+        return list(self) == other
+    def __getitem__(self, key) -> Any:
+        """Only generate items as they're asked for."""
+        # We only allow querying with indexes.
+        if not isinstance(key, (int, slice)):
+            raise TypeError('Key must be either a slice or int.')
+        # Convert int keys to slice
+        key_slice = key
+        if isinstance(key, int):
+            key_slice = slice(key, key + 1, 1)
+        # Generate all elements up to the final item
+        while len(self._elements) < key_slice.stop:
+            try:
+                next_item = next(self.gen)
+            except StopIteration:
+                # If we can't find enough elements for the slice, raise an IndexError
+                raise IndexError
+            else:
+                self._elements.append(next_item)
+        return self._elements[key]
+    def __iter__(self):
+        """Custom iterator for dynamically generated list."""
+        iter_index = 0
+        while True:
+            try:
+                curr_item = self[iter_index]
+            except IndexError:
+                return
+            else:
+                yield curr_item
+                iter_index += 1
+    def __next__(self) -> Any:
+        """Fetch next element in iterator."""
+        try:
+            curr_element = self[self.iter_index]
+        except IndexError:
+            raise StopIteration
+        self.iter_index += 1
+        return curr_element  # noqa:R504
+    def __len__(self) -> int:
+        """Return length of list of all items."""
+        self.generate_all()
+        return len(self._elements)
+    def __repr__(self) -> str:
+        """String representation of all items."""
+        self.generate_all()
+        return str(self._elements)
+    def __reversed__(self):
+        self.generate_all()
+        return self._elements[::-1]
+    def generate_all(self):
+        """Generate all items."""
+        while True:
+            try:
+                next_item = next(self.gen)
+            except StopIteration:
+                break
+            else:
+                self._elements.append(next_item)
 def regex_search(pattern: str, string: str, group: int) -> str:
     """Shortcut method to search a string for a given pattern.

tests/conftest.py CHANGED Viewed

@@ -91,7 +91,8 @@ def region_blocked():
 @pytest.fixture
 def playlist_html():
     """Youtube playlist HTML loaded on 2020-01-25 from
-    https://www.youtube.com/playlist?list=PLzMcBGfZo4-mP7qA9cagf68V06sko5otr"""
     file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)),
         "mocks",
@@ -104,7 +105,8 @@ def playlist_html():
 @pytest.fixture
 def playlist_long_html():
     """Youtube playlist HTML loaded on 2020-01-25 from
-    https://www.youtube.com/playlist?list=PLzMcBGfZo4-mP7qA9cagf68V06sko5otr"""
     file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)),
         "mocks",
@@ -117,7 +119,8 @@ def playlist_long_html():
 @pytest.fixture
 def playlist_submenu_html():
     """Youtube playlist HTML loaded on 2020-01-24 from
-    https://www.youtube.com/playlist?list=PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr"""
     file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)),
         "mocks",
@@ -138,3 +141,17 @@ def stream_dict():
     with gzip.open(file_path, "rb") as f:
         content = json.loads(f.read().decode("utf-8"))
         return content['watch_html']

 @pytest.fixture
 def playlist_html():
     """Youtube playlist HTML loaded on 2020-01-25 from
+    https://www.youtube.com/playlist?list=PLzMcBGfZo4-mP7qA9cagf68V06sko5otr
+    """
     file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)),
         "mocks",
 @pytest.fixture
 def playlist_long_html():
     """Youtube playlist HTML loaded on 2020-01-25 from
+    https://www.youtube.com/playlist?list=PLzMcBGfZo4-mP7qA9cagf68V06sko5otr
+    """
     file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)),
         "mocks",
 @pytest.fixture
 def playlist_submenu_html():
     """Youtube playlist HTML loaded on 2020-01-24 from
+    https://www.youtube.com/playlist?list=PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr
+    """
     file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)),
         "mocks",
     with gzip.open(file_path, "rb") as f:
         content = json.loads(f.read().decode("utf-8"))
         return content['watch_html']
+@pytest.fixture
+def channel_videos_html():
+    """Youtube channel HTML loaded on 2021-05-05 from
+    https://www.youtube.com/c/ProgrammingKnowledge/videos
+    """
+    file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        "mocks",
+        "channel-videos.html.gz",
+    )
+    with gzip.open(file_path, 'rb') as f:
+        return f.read().decode('utf-8')

tests/contrib/test_channel.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from unittest import mock
+from pytube import Channel
+@mock.patch('pytube.request.get')
+def test_init_with_url(request_get, channel_videos_html):
+    request_get.return_value = channel_videos_html
+    c = Channel('https://www.youtube.com/c/ProgrammingKnowledge/videos')
+    assert c.channel_url == 'https://www.youtube.com/c/ProgrammingKnowledge'
+    assert c.videos_url == f'{c.channel_url}/videos'
+    assert c.playlists_url == f'{c.channel_url}/playlists'
+    assert c.community_url == f'{c.channel_url}/community'
+    assert c.featured_channels_url == f'{c.channel_url}/channels'
+    assert c.about_url == f'{c.channel_url}/about'
+@mock.patch('pytube.request.get')
+def test_channel_name(request_get, channel_videos_html):
+    request_get.return_value = channel_videos_html
+    c = Channel('https://www.youtube.com/c/ProgrammingKnowledge/videos')
+    assert c.channel_name == 'ProgrammingKnowledge'
+@mock.patch('pytube.request.get')
+def test_channel_video_list(request_get, channel_videos_html):
+    request_get.return_value = channel_videos_html
+    c = Channel('https://www.youtube.com/c/ProgrammingKnowledge/videos')
+    first_ten = [
+        'https://www.youtube.com/watch?v=t_xLpJo_35k',
+        'https://www.youtube.com/watch?v=ccbh5YhxouQ',
+        'https://www.youtube.com/watch?v=wDnFjDjxW_0',
+        'https://www.youtube.com/watch?v=F3W_p_4XftA',
+        'https://www.youtube.com/watch?v=_fxm0xGGEi4',
+        'https://www.youtube.com/watch?v=cRbKZzcuIsg',
+        'https://www.youtube.com/watch?v=sdDu3dfIuow',
+        'https://www.youtube.com/watch?v=10KIbp-gJCE',
+        'https://www.youtube.com/watch?v=wZIT-cRtd6s',
+        'https://www.youtube.com/watch?v=KucCvEbTj0w',
+    ]
+    assert c.video_urls[:10] == first_ten
+@mock.patch('pytube.request.get')
+def test_videos_html(request_get, channel_videos_html):
+    request_get.return_value = channel_videos_html
+    c = Channel('https://www.youtube.com/c/ProgrammingKnowledge')
+    assert c.html == channel_videos_html
+# Because the Channel object subclasses the Playlist object, most of the tests
+# are already taken care of by the Playlist test suite.

tests/mocks/channel-videos.html.gz ADDED Viewed

Binary file (48.6 kB). View file