Taylor Fox Dahlin committed on
Commit
f293f76
·
unverified ·
1 Parent(s): d7f6508

Initial implementation of Channel object (#932)

Browse files

* Implements a Channel object for downloading videos from a YouTube channel.

* Minor changes to the Playlist class to make it easier to subclass.

* `.videos` and `.video_urls` now behave just like iterable lists, but defer web requests.

* Implements DeferredGeneratorList which converts generators to lazy list-like objects.

pytube/__init__.py CHANGED
@@ -15,3 +15,4 @@ from pytube.captions import Caption
15
  from pytube.query import CaptionQuery, StreamQuery
16
  from pytube.__main__ import YouTube
17
  from pytube.contrib.playlist import Playlist
 
 
15
  from pytube.query import CaptionQuery, StreamQuery
16
  from pytube.__main__ import YouTube
17
  from pytube.contrib.playlist import Playlist
18
+ from pytube.contrib.channel import Channel
pytube/contrib/channel.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Module for interacting with a user's youtube channel."""
3
+ import json
4
+ import logging
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
+ from pytube import extract, Playlist, request
8
+ from pytube.helpers import uniqueify
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class Channel(Playlist):
    """A YouTube channel, handled as a playlist of the channel's videos.

    The class reuses the :class:`Playlist` machinery but sources its html
    from the channel's ``/videos`` page and parses the grid layout used
    there instead of the playlist layout.
    """

    def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None):
        """Construct a :class:`Channel <Channel>`.

        :param str url:
            A channel url of the form ``/c/{channel_name}`` or
            ``/channel/{channel_id}``.
        :param proxies:
            (Optional) proxy mapping, forwarded unchanged to
            :class:`Playlist`.
        """
        super().__init__(url, proxies)

        self.channel_name = extract.channel_name(url)

        # extract.channel_name tries the ``/c/`` pattern first and only then
        # ``/channel/``. If ``/c/{name}`` is absent from the input, the value
        # is a channel id and must be addressed through ``/channel/``;
        # a ``/c/{channel_id}`` url does not point at the same channel.
        url_prefix = "c" if f"/c/{self.channel_name}" in url else "channel"
        self.channel_url = (
            f"https://www.youtube.com/{url_prefix}/{self.channel_name}"
        )
        self.videos_url = self.channel_url + '/videos'
        self.playlists_url = self.channel_url + '/playlists'
        self.community_url = self.channel_url + '/community'
        self.featured_channels_url = self.channel_url + '/channels'
        self.about_url = self.channel_url + '/about'

        # Possible future additions
        self._playlists_html = None
        self._community_html = None
        self._featured_channels_html = None
        self._about_html = None

    @property
    def html(self):
        """Html of the ``/videos`` tab, fetched on first access and cached."""
        if self._html:
            return self._html
        self._html = request.get(self.videos_url)
        return self._html

    @property
    def playlists_html(self):
        """Html of the ``/playlists`` tab, fetched on first access and cached."""
        if self._playlists_html:
            return self._playlists_html
        self._playlists_html = request.get(self.playlists_url)
        return self._playlists_html

    @property
    def community_html(self):
        """Html of the ``/community`` tab, fetched on first access and cached."""
        if self._community_html:
            return self._community_html
        self._community_html = request.get(self.community_url)
        return self._community_html

    @property
    def featured_channels_html(self):
        """Html of the ``/channels`` tab, fetched on first access and cached."""
        if self._featured_channels_html:
            return self._featured_channels_html
        self._featured_channels_html = request.get(self.featured_channels_url)
        return self._featured_channels_html

    @property
    def about_html(self):
        """Html of the ``/about`` tab, fetched on first access and cached."""
        if self._about_html:
            return self._about_html
        self._about_html = request.get(self.about_url)
        return self._about_html

    @staticmethod
    def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
        """Extract video watch paths from a raw json page.

        :param str raw_json:
            Input json extracted from the page or the last server response.
        :rtype: Tuple[List[str], Optional[str]]
        :returns:
            Tuple containing the de-duplicated ``/watch?v=...`` paths found
            in the json and a continuation token, or ``None`` when no
            further page is advertised. ``([], None)`` is returned when
            none of the known json layouts matches.
        """
        initial_data = json.loads(raw_json)
        try:
            # Layout of the json embedded in the channel's html page.
            videos = initial_data["contents"][
                "twoColumnBrowseResultsRenderer"][
                "tabs"][1]["tabRenderer"]["content"][
                "sectionListRenderer"]["contents"][0][
                "itemSectionRenderer"]["contents"][0][
                "gridRenderer"]["items"]
        except (KeyError, IndexError, TypeError):
            try:
                # Layout of a continuation response sent directly by the
                # server: a list whose second entry holds a "response" key.
                videos = initial_data[1]['response'][
                    'onResponseReceivedActions'][0][
                    'appendContinuationItemsAction']['continuationItems']
            except (KeyError, IndexError, TypeError):
                try:
                    # Continuation response that is no longer a list and
                    # no longer has the "response" key.
                    videos = initial_data['onResponseReceivedActions'][0][
                        'appendContinuationItemsAction']['continuationItems']
                except (KeyError, IndexError, TypeError) as p:
                    logger.info(p)
                    return [], None

        try:
            # When more results exist, the final item carries the token
            # instead of a video and must not be treated as one.
            continuation = videos[-1]['continuationItemRenderer'][
                'continuationEndpoint'
            ]['continuationCommand']['token']
            videos = videos[:-1]
        except (KeyError, IndexError):
            # if there is an error, no continuation is available
            continuation = None

        # only extract the video ids from the video data
        watch_paths = [
            f"/watch?v={item['gridVideoRenderer']['videoId']}"
            for item in videos
        ]
        # remove duplicates
        return uniqueify(watch_paths), continuation
pytube/contrib/playlist.py CHANGED
@@ -7,7 +7,7 @@ from datetime import date, datetime
7
  from typing import Dict, Iterable, List, Optional, Tuple, Union
8
 
9
  from pytube import extract, request, YouTube
10
- from pytube.helpers import cache, install_proxy, regex_search, uniqueify
11
 
12
  logger = logging.getLogger(__name__)
13
 
@@ -19,15 +19,24 @@ class Playlist(Sequence):
19
  if proxies:
20
  install_proxy(proxies)
21
 
 
 
22
  # These need to be initialized as None for the properties.
23
  self._html = None
24
  self._ytcfg = None
25
 
26
- self.playlist_id = extract.playlist_id(url)
27
 
28
- self.playlist_url = (
29
- f"https://www.youtube.com/playlist?list={self.playlist_id}"
30
- )
 
 
 
 
 
 
 
31
 
32
  @property
33
  def html(self):
@@ -175,7 +184,7 @@ class Playlist(Sequence):
175
  'appendContinuationItemsAction']['continuationItems']
176
  videos = important_content
177
  except (KeyError, IndexError, TypeError) as p:
178
- print(p)
179
  return [], None
180
 
181
  try:
@@ -218,27 +227,37 @@ class Playlist(Sequence):
218
  for page in self._paginate(until_watch_id=video_id):
219
  yield from (self._video_url(watch_path) for watch_path in page)
220
 
 
 
 
 
 
 
 
 
 
221
  @property # type: ignore
222
  @cache
223
- def video_urls(self) -> List[str]:
224
  """Complete links of all the videos in playlist
225
 
226
  :rtype: List[str]
227
  :returns: List of video URLs
228
  """
229
- return [
230
- self._video_url(video)
231
- for page in list(self._paginate())
232
- for video in page
233
- ]
234
 
235
  @property
236
  def videos(self) -> Iterable[YouTube]:
237
  """Yields YouTube objects of videos in this playlist
238
 
239
- :Yields: YouTube
 
240
  """
241
- yield from (YouTube(url) for url in self.video_urls)
242
 
243
  def __getitem__(self, i: Union[slice, int]) -> Union[str, List[str]]:
244
  return self.video_urls[i]
@@ -247,7 +266,7 @@ class Playlist(Sequence):
247
  return len(self.video_urls)
248
 
249
  def __repr__(self) -> str:
250
- return f"{self.video_urls}"
251
 
252
  @property
253
  @cache
 
7
  from typing import Dict, Iterable, List, Optional, Tuple, Union
8
 
9
  from pytube import extract, request, YouTube
10
+ from pytube.helpers import cache, DeferredGeneratorList, install_proxy, regex_search, uniqueify
11
 
12
  logger = logging.getLogger(__name__)
13
 
 
19
  if proxies:
20
  install_proxy(proxies)
21
 
22
+ self._input_url = url
23
+
24
  # These need to be initialized as None for the properties.
25
  self._html = None
26
  self._ytcfg = None
27
 
28
+ self._playlist_id = None
29
 
30
@property
def playlist_id(self):
    """Playlist id parsed from the input url.

    The id is extracted on first access and stored in ``_playlist_id`` so
    later accesses reuse it.
    """
    if not self._playlist_id:
        self._playlist_id = extract.playlist_id(self._input_url)
    return self._playlist_id
36
+
37
@property
def playlist_url(self):
    """Canonical playlist url built from :attr:`playlist_id`."""
    return f"https://www.youtube.com/playlist?list={self.playlist_id}"
40
 
41
  @property
42
  def html(self):
 
184
  'appendContinuationItemsAction']['continuationItems']
185
  videos = important_content
186
  except (KeyError, IndexError, TypeError) as p:
187
+ logger.info(p)
188
  return [], None
189
 
190
  try:
 
227
  for page in self._paginate(until_watch_id=video_id):
228
  yield from (self._video_url(watch_path) for watch_path in page)
229
 
230
def url_generator(self):
    """Lazily produce the url of every video, one page at a time.

    :Yields: Video URLs
    """
    for page in self._paginate():
        yield from (self._video_url(watch_path) for watch_path in page)
238
+
239
  @property # type: ignore
  @cache
  def video_urls(self) -> DeferredGeneratorList:
      """Complete links of all the videos in playlist

      :rtype: DeferredGeneratorList
      :returns: Lazy, list-like wrapper around :meth:`url_generator`
      """
      # NOTE(review): ``cache`` presumably memoizes the wrapper so repeated
      # accesses share one generator - confirm against pytube.helpers.cache.
      return DeferredGeneratorList(self.url_generator())
248
+
249
def videos_generator(self):
    """Lazily produce a YouTube object for each url in ``video_urls``.

    :Yields: YouTube
    """
    yield from map(YouTube, self.video_urls)
252
 
253
  @property
  def videos(self) -> Iterable[YouTube]:
      """YouTube objects of the videos in this playlist

      :rtype: DeferredGeneratorList
      :returns: Lazy, list-like wrapper around :meth:`videos_generator`;
          a new wrapper is built on every access.
      """
      return DeferredGeneratorList(self.videos_generator())
261
 
262
  def __getitem__(self, i: Union[slice, int]) -> Union[str, List[str]]:
263
  return self.video_urls[i]
 
266
  return len(self.video_urls)
267
 
268
  def __repr__(self) -> str:
      """Return the repr of ``video_urls``."""
      return f"{repr(self.video_urls)}"
270
 
271
  @property
272
  @cache
pytube/extract.py CHANGED
@@ -178,6 +178,37 @@ def playlist_id(url: str) -> str:
178
  return parse_qs(parsed.query)['list'][0]
179
 
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  def video_info_url(video_id: str, watch_url: str) -> str:
182
  """Construct the video_info url.
183
 
 
178
  return parse_qs(parsed.query)['list'][0]
179
 
180
 
181
def channel_name(url: str) -> str:
    """Extract the ``channel_name`` or ``channel_id`` from a YouTube url.

    This function supports the following patterns:

    - :samp:`https://youtube.com/c/{channel_name}/*`
    - :samp:`https://youtube.com/channel/{channel_id}/*`

    :param str url:
        A YouTube url containing a channel name.
    :rtype: str
    :returns:
        YouTube channel name.
    :raises RegexMatchError:
        If neither pattern is found in ``url``.
    """
    # Order matters: the ``/c/`` form is tried before the ``/channel/`` form.
    candidate_patterns = (
        r"(?:\/c\/([\d\w_\-]+)(\/.*)?)",
        r"(?:\/channel\/([\w\d_\-]+)(\/.*)?)",
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, url)
        if found is None:
            continue
        logger.debug("finished regex search, matched: %s", pattern)
        return found.group(1)

    raise RegexMatchError(caller="channel_name", pattern="patterns")
210
+
211
+
212
  def video_info_url(video_id: str, watch_url: str) -> str:
213
  """Construct the video_info url.
214
 
pytube/helpers.py CHANGED
@@ -14,6 +14,101 @@ from pytube.exceptions import RegexMatchError
14
  logger = logging.getLogger(__name__)
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def regex_search(pattern: str, string: str, group: int) -> str:
18
  """Shortcut method to search a string for a given pattern.
19
 
 
14
  logger = logging.getLogger(__name__)
15
 
16
 
17
class DeferredGeneratorList:
    """A wrapper class for deferring list generation.

    Pytube has some continuation generators that create web calls, which means
    that any time a full list is requested, all of those web calls must be
    made at once, which could lead to slowdowns. This will allow individual
    elements to be queried, so that slowdowns only happen as necessary. For
    example, you can iterate over elements in the list without accessing them
    all simultaneously. This should allow for speed improvements for playlist
    and channel interactions.
    """

    def __init__(self, generator):
        """Construct a :class:`DeferredGeneratorList <DeferredGeneratorList>`.

        :param generator generator:
            The deferrable generator to create a wrapper for.
        """
        self.gen = generator
        self._elements = []
        # Cursor for __next__. It is independent of __iter__, which hands
        # out a fresh generator starting at index 0 on every call.
        self.iter_index = 0

    def __eq__(self, other):
        """We want to mimic list behavior for comparison."""
        return list(self) == other

    def __getitem__(self, key) -> Any:
        """Only generate items as they're asked for.

        :param key:
            An ``int`` index or a ``slice``.
        :raises TypeError:
            If ``key`` is neither an ``int`` nor a ``slice``.
        :raises IndexError:
            If an index, or the non-negative ``stop`` of a forward slice,
            lies beyond the number of items the generator can produce.
        """
        # We only allow querying with indexes.
        if not isinstance(key, (int, slice)):
            raise TypeError('Key must be either a slice or int.')

        if isinstance(key, int):
            if key < 0:
                # Negative indexes count from the end, so the total length
                # has to be known first.
                self.generate_all()
            else:
                self._generate_until(key + 1)
            return self._elements[key]

        forward_and_bounded = (
            key.stop is not None
            and key.stop >= 0
            and (key.start is None or key.start >= 0)
            and (key.step is None or key.step > 0)
        )
        if forward_and_bounded:
            # Generate all elements up to the final item
            self._generate_until(key.stop)
        else:
            # Open-ended, negative or reversed slices are only well defined
            # relative to the complete list.
            self.generate_all()

        return self._elements[key]

    def __iter__(self):
        """Custom iterator for dynamically generated list."""
        iter_index = 0
        while True:
            try:
                curr_item = self[iter_index]
            except IndexError:
                return
            else:
                yield curr_item
                iter_index += 1

    def __next__(self) -> Any:
        """Fetch next element in iterator."""
        try:
            curr_element = self[self.iter_index]
        except IndexError:
            raise StopIteration
        self.iter_index += 1
        return curr_element  # noqa:R504

    def __len__(self) -> int:
        """Return length of list of all items."""
        self.generate_all()
        return len(self._elements)

    def __repr__(self) -> str:
        """String representation of all items."""
        self.generate_all()
        return str(self._elements)

    def __reversed__(self):
        """Return all items as a list in reverse order."""
        self.generate_all()
        return self._elements[::-1]

    def generate_all(self):
        """Generate all items."""
        self._elements.extend(self.gen)

    def _generate_until(self, count):
        """Pull items from the generator until ``count`` items are stored.

        :raises IndexError:
            If the generator is exhausted before ``count`` items exist.
        """
        while len(self._elements) < count:
            try:
                next_item = next(self.gen)
            except StopIteration:
                # If we can't find enough elements, raise an IndexError
                raise IndexError
            else:
                self._elements.append(next_item)
110
+
111
+
112
  def regex_search(pattern: str, string: str, group: int) -> str:
113
  """Shortcut method to search a string for a given pattern.
114
 
tests/conftest.py CHANGED
@@ -91,7 +91,8 @@ def region_blocked():
91
  @pytest.fixture
92
  def playlist_html():
93
  """Youtube playlist HTML loaded on 2020-01-25 from
94
- https://www.youtube.com/playlist?list=PLzMcBGfZo4-mP7qA9cagf68V06sko5otr"""
 
95
  file_path = os.path.join(
96
  os.path.dirname(os.path.realpath(__file__)),
97
  "mocks",
@@ -104,7 +105,8 @@ def playlist_html():
104
  @pytest.fixture
105
  def playlist_long_html():
106
  """Youtube playlist HTML loaded on 2020-01-25 from
107
- https://www.youtube.com/playlist?list=PLzMcBGfZo4-mP7qA9cagf68V06sko5otr"""
 
108
  file_path = os.path.join(
109
  os.path.dirname(os.path.realpath(__file__)),
110
  "mocks",
@@ -117,7 +119,8 @@ def playlist_long_html():
117
  @pytest.fixture
118
  def playlist_submenu_html():
119
  """Youtube playlist HTML loaded on 2020-01-24 from
120
- https://www.youtube.com/playlist?list=PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr"""
 
121
  file_path = os.path.join(
122
  os.path.dirname(os.path.realpath(__file__)),
123
  "mocks",
@@ -138,3 +141,17 @@ def stream_dict():
138
  with gzip.open(file_path, "rb") as f:
139
  content = json.loads(f.read().decode("utf-8"))
140
  return content['watch_html']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  @pytest.fixture
92
  def playlist_html():
93
  """Youtube playlist HTML loaded on 2020-01-25 from
94
+ https://www.youtube.com/playlist?list=PLzMcBGfZo4-mP7qA9cagf68V06sko5otr
95
+ """
96
  file_path = os.path.join(
97
  os.path.dirname(os.path.realpath(__file__)),
98
  "mocks",
 
105
  @pytest.fixture
106
  def playlist_long_html():
107
  """Youtube playlist HTML loaded on 2020-01-25 from
108
+ https://www.youtube.com/playlist?list=PLzMcBGfZo4-mP7qA9cagf68V06sko5otr
109
+ """
110
  file_path = os.path.join(
111
  os.path.dirname(os.path.realpath(__file__)),
112
  "mocks",
 
119
  @pytest.fixture
120
  def playlist_submenu_html():
121
  """Youtube playlist HTML loaded on 2020-01-24 from
122
+ https://www.youtube.com/playlist?list=PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr
123
+ """
124
  file_path = os.path.join(
125
  os.path.dirname(os.path.realpath(__file__)),
126
  "mocks",
 
141
  with gzip.open(file_path, "rb") as f:
142
  content = json.loads(f.read().decode("utf-8"))
143
  return content['watch_html']
144
+
145
+
146
@pytest.fixture
def channel_videos_html():
    """Youtube channel HTML loaded on 2021-05-05 from
    https://www.youtube.com/c/ProgrammingKnowledge/videos
    """
    tests_dir = os.path.dirname(os.path.realpath(__file__))
    file_path = os.path.join(tests_dir, "mocks", "channel-videos.html.gz")
    with gzip.open(file_path, 'rb') as f:
        raw_bytes = f.read()
    return raw_bytes.decode('utf-8')
tests/contrib/test_channel.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from unittest import mock
2
+
3
+ from pytube import Channel
4
+
5
+
6
@mock.patch('pytube.request.get')
def test_init_with_url(request_get, channel_videos_html):
    """The channel url and each tab url are derived from the input url."""
    request_get.return_value = channel_videos_html
    c = Channel('https://www.youtube.com/c/ProgrammingKnowledge/videos')
    assert c.channel_url == 'https://www.youtube.com/c/ProgrammingKnowledge'
    assert c.videos_url == f'{c.channel_url}/videos'
    assert c.playlists_url == f'{c.channel_url}/playlists'
    assert c.community_url == f'{c.channel_url}/community'
    assert c.featured_channels_url == f'{c.channel_url}/channels'
    assert c.about_url == f'{c.channel_url}/about'
16
+
17
+
18
@mock.patch('pytube.request.get')
def test_channel_name(request_get, channel_videos_html):
    """The channel name is parsed out of a ``/c/{name}/videos`` url."""
    request_get.return_value = channel_videos_html

    c = Channel('https://www.youtube.com/c/ProgrammingKnowledge/videos')
    assert c.channel_name == 'ProgrammingKnowledge'
24
+
25
+
26
@mock.patch('pytube.request.get')
def test_channel_video_list(request_get, channel_videos_html):
    """The first ten video urls match those in the mocked channel page."""
    request_get.return_value = channel_videos_html

    c = Channel('https://www.youtube.com/c/ProgrammingKnowledge/videos')
    first_ten = [
        'https://www.youtube.com/watch?v=t_xLpJo_35k',
        'https://www.youtube.com/watch?v=ccbh5YhxouQ',
        'https://www.youtube.com/watch?v=wDnFjDjxW_0',
        'https://www.youtube.com/watch?v=F3W_p_4XftA',
        'https://www.youtube.com/watch?v=_fxm0xGGEi4',
        'https://www.youtube.com/watch?v=cRbKZzcuIsg',
        'https://www.youtube.com/watch?v=sdDu3dfIuow',
        'https://www.youtube.com/watch?v=10KIbp-gJCE',
        'https://www.youtube.com/watch?v=wZIT-cRtd6s',
        'https://www.youtube.com/watch?v=KucCvEbTj0w',
    ]
    assert c.video_urls[:10] == first_ten
44
+
45
+
46
@mock.patch('pytube.request.get')
def test_videos_html(request_get, channel_videos_html):
    """``Channel.html`` returns whatever the mocked request returns."""
    request_get.return_value = channel_videos_html

    c = Channel('https://www.youtube.com/c/ProgrammingKnowledge')
    assert c.html == channel_videos_html
52
+
53
+ # Because the Channel object subclasses the Playlist object, most of the tests
54
+ # are already taken care of by the Playlist test suite.
tests/mocks/channel-videos.html.gz ADDED
Binary file (48.6 kB). View file