Taylor Fox Dahlin
committed on
[Feature] Video metadata (#809)
Browse files
* Added accept-language to request headers to coerce certain strings sent by YouTube into English (e.g. 'This video is private.')
* Implemented metadata class.
- .github/workflows/ci.yml +1 -1
- pytube/__main__.py +25 -15
- pytube/extract.py +57 -0
- pytube/metadata.py +48 -0
- pytube/request.py +1 -1
- tests/test_extract.py +10 -0
- tests/test_metadata.py +18 -0
.github/workflows/ci.yml
CHANGED
@@ -13,7 +13,7 @@ jobs:
|
|
13 |
|
14 |
strategy:
|
15 |
matrix:
|
16 |
-
python: [3.
|
17 |
|
18 |
steps:
|
19 |
- name: Checkout repo
|
|
|
13 |
|
14 |
strategy:
|
15 |
matrix:
|
16 |
+
python: [3.6, 3.7, 3.8, 3.9]
|
17 |
|
18 |
steps:
|
19 |
- name: Checkout repo
|
pytube/__main__.py
CHANGED
@@ -27,6 +27,7 @@ from pytube.extract import apply_descrambler
|
|
27 |
from pytube.extract import apply_signature
|
28 |
from pytube.extract import get_ytplayer_config
|
29 |
from pytube.helpers import install_proxy
|
|
|
30 |
from pytube.monostate import Monostate
|
31 |
from pytube.monostate import OnComplete
|
32 |
from pytube.monostate import OnProgress
|
@@ -60,23 +61,17 @@ class YouTube:
|
|
60 |
|
61 |
"""
|
62 |
self.js: Optional[str] = None # js fetched by js_url
|
63 |
-
self.js_url: Optional[
|
64 |
-
str
|
65 |
-
] = None # the url to the js, parsed from watch html
|
66 |
|
67 |
# note: vid_info may eventually be removed. It sounds like it once had
|
68 |
# additional formats, but that doesn't appear to still be the case.
|
69 |
|
70 |
# the url to vid info, parsed from watch html
|
71 |
self.vid_info_url: Optional[str] = None
|
72 |
-
self.vid_info_raw: Optional[
|
73 |
-
str
|
74 |
-
] = None # content fetched by vid_info_url
|
75 |
self.vid_info: Optional[Dict] = None # parsed content of vid_info_raw
|
76 |
|
77 |
-
self.watch_html: Optional[
|
78 |
-
str
|
79 |
-
] = None # the html of /watch?v=<video_id>
|
80 |
self.embed_html: Optional[str] = None
|
81 |
self.player_config_args: Dict = {} # inline js in the html containing
|
82 |
self.player_response: Dict = {}
|
@@ -85,6 +80,10 @@ class YouTube:
|
|
85 |
|
86 |
self.fmt_streams: List[Stream] = []
|
87 |
|
|
|
|
|
|
|
|
|
88 |
# video_id part of /watch?v=<video_id>
|
89 |
self.video_id = extract.video_id(url)
|
90 |
|
@@ -187,6 +186,9 @@ class YouTube:
|
|
187 |
video_id=self.video_id, watch_url=self.watch_url
|
188 |
)
|
189 |
|
|
|
|
|
|
|
190 |
self.vid_info_raw = request.get(self.vid_info_url)
|
191 |
if not self.age_restricted:
|
192 |
self.js_url = extract.js_url(self.watch_html)
|
@@ -287,9 +289,7 @@ class YouTube:
|
|
287 |
:rtype: str
|
288 |
|
289 |
"""
|
290 |
-
return self.player_response.get("videoDetails", {}).get(
|
291 |
-
"shortDescription"
|
292 |
-
)
|
293 |
|
294 |
@property
|
295 |
def rating(self) -> float:
|
@@ -298,9 +298,7 @@ class YouTube:
|
|
298 |
:rtype: float
|
299 |
|
300 |
"""
|
301 |
-
return self.player_response.get("videoDetails", {}).get(
|
302 |
-
"averageRating"
|
303 |
-
)
|
304 |
|
305 |
@property
|
306 |
def length(self) -> int:
|
@@ -338,6 +336,18 @@ class YouTube:
|
|
338 |
"author", "unknown"
|
339 |
)
|
340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
def register_on_progress_callback(self, func: OnProgress):
|
342 |
"""Register a download progress callback function post initialization.
|
343 |
|
|
|
27 |
from pytube.extract import apply_signature
|
28 |
from pytube.extract import get_ytplayer_config
|
29 |
from pytube.helpers import install_proxy
|
30 |
+
from pytube.metadata import YouTubeMetadata
|
31 |
from pytube.monostate import Monostate
|
32 |
from pytube.monostate import OnComplete
|
33 |
from pytube.monostate import OnProgress
|
|
|
61 |
|
62 |
"""
|
63 |
self.js: Optional[str] = None # js fetched by js_url
|
64 |
+
self.js_url: Optional[str] = None # the url to the js, parsed from watch html
|
|
|
|
|
65 |
|
66 |
# note: vid_info may eventually be removed. It sounds like it once had
|
67 |
# additional formats, but that doesn't appear to still be the case.
|
68 |
|
69 |
# the url to vid info, parsed from watch html
|
70 |
self.vid_info_url: Optional[str] = None
|
71 |
+
self.vid_info_raw: Optional[str] = None # content fetched by vid_info_url
|
|
|
|
|
72 |
self.vid_info: Optional[Dict] = None # parsed content of vid_info_raw
|
73 |
|
74 |
+
self.watch_html: Optional[str] = None # the html of /watch?v=<video_id>
|
|
|
|
|
75 |
self.embed_html: Optional[str] = None
|
76 |
self.player_config_args: Dict = {} # inline js in the html containing
|
77 |
self.player_response: Dict = {}
|
|
|
80 |
|
81 |
self.fmt_streams: List[Stream] = []
|
82 |
|
83 |
+
self.initial_data_raw = None
|
84 |
+
self.initial_data = {}
|
85 |
+
self._metadata: Optional[YouTubeMetadata] = None
|
86 |
+
|
87 |
# video_id part of /watch?v=<video_id>
|
88 |
self.video_id = extract.video_id(url)
|
89 |
|
|
|
186 |
video_id=self.video_id, watch_url=self.watch_url
|
187 |
)
|
188 |
|
189 |
+
self.initial_data_raw = extract.initial_data(self.watch_html)
|
190 |
+
self.initial_data = json.loads(self.initial_data_raw)
|
191 |
+
|
192 |
self.vid_info_raw = request.get(self.vid_info_url)
|
193 |
if not self.age_restricted:
|
194 |
self.js_url = extract.js_url(self.watch_html)
|
|
|
289 |
:rtype: str
|
290 |
|
291 |
"""
|
292 |
+
return self.player_response.get("videoDetails", {}).get("shortDescription")
|
|
|
|
|
293 |
|
294 |
@property
|
295 |
def rating(self) -> float:
|
|
|
298 |
:rtype: float
|
299 |
|
300 |
"""
|
301 |
+
return self.player_response.get("videoDetails", {}).get("averageRating")
|
|
|
|
|
302 |
|
303 |
@property
|
304 |
def length(self) -> int:
|
|
|
336 |
"author", "unknown"
|
337 |
)
|
338 |
|
339 |
+
@property
def metadata(self) -> Optional[YouTubeMetadata]:
    """Get the metadata for the video.

    The metadata object is built from ``self.initial_data`` via
    ``extract.metadata`` on first access and cached on ``self._metadata``
    for subsequent accesses.

    :rtype: YouTubeMetadata
    """
    # Compare against None explicitly so the cache check does not depend on
    # the truthiness of the cached object.
    if self._metadata is None:
        self._metadata = extract.metadata(self.initial_data)
    return self._metadata
|
350 |
+
|
351 |
def register_on_progress_callback(self, func: OnProgress):
|
352 |
"""Register a download progress callback function post initialization.
|
353 |
|
pytube/extract.py
CHANGED
@@ -8,6 +8,7 @@ from datetime import datetime
|
|
8 |
from typing import Any
|
9 |
from typing import Dict
|
10 |
from typing import List
|
|
|
11 |
from typing import Tuple
|
12 |
from urllib.parse import parse_qs
|
13 |
from urllib.parse import parse_qsl
|
@@ -19,6 +20,7 @@ from pytube.cipher import Cipher
|
|
19 |
from pytube.exceptions import LiveStreamError
|
20 |
from pytube.exceptions import RegexMatchError
|
21 |
from pytube.helpers import regex_search
|
|
|
22 |
|
23 |
logger = logging.getLogger(__name__)
|
24 |
|
@@ -396,3 +398,58 @@ def apply_descrambler(stream_data: Dict, key: str) -> None:
|
|
396 |
]
|
397 |
|
398 |
logger.debug("applying descrambler")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
from typing import Any
|
9 |
from typing import Dict
|
10 |
from typing import List
|
11 |
+
from typing import Optional
|
12 |
from typing import Tuple
|
13 |
from urllib.parse import parse_qs
|
14 |
from urllib.parse import parse_qsl
|
|
|
20 |
from pytube.exceptions import LiveStreamError
|
21 |
from pytube.exceptions import RegexMatchError
|
22 |
from pytube.helpers import regex_search
|
23 |
+
from pytube.metadata import YouTubeMetadata
|
24 |
|
25 |
logger = logging.getLogger(__name__)
|
26 |
|
|
|
398 |
]
|
399 |
|
400 |
logger.debug("applying descrambler")
|
401 |
+
|
402 |
+
|
403 |
+
def initial_data(watch_html: str) -> str:
    """Extract the ytInitialData json from the watch_html page.

    This mostly contains metadata necessary for rendering the page on-load,
    such as video information, copyright notices, etc.

    @param watch_html: Html of the watch page
    @return: The raw JSON text assigned to ``window["ytInitialData"]``, or
        the string ``"{}"`` when no such assignment is found.
    """
    initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*([^\n]+)"
    try:
        match = regex_search(initial_data_pattern, watch_html, 1)
    except RegexMatchError:
        return "{}"

    # The capture group runs to the end of the line, so it still carries the
    # statement's terminating ";" (and a "\r" on CRLF pages). Strip exactly
    # that trailer rather than blindly dropping the last character, which
    # would corrupt the JSON whenever the line does not end in ";".
    return match.rstrip().rstrip(";")
|
419 |
+
|
420 |
+
|
421 |
+
def metadata(initial_data) -> Optional[YouTubeMetadata]:
    """Get the informational metadata for the video.

    e.g.:
    [
        {
            'Song': '강남스타일(Gangnam Style)',
            'Artist': 'PSY',
            'Album': 'PSY SIX RULES Pt.1',
            'Licensed to YouTube by': 'YG Entertainment Inc. [...]'
        }
    ]

    :param initial_data:
        Parsed ytInitialData mapping (the result of ``json.loads`` on the
        text returned by :func:`initial_data`).
    :rtype: YouTubeMetadata
    :returns:
        A YouTubeMetadata built from the page's ``metadataRowRenderer``
        rows; it is built from an empty list when the rows cannot be found.
    """
    try:
        metadata_rows: List = initial_data["contents"]["twoColumnWatchNextResults"][
            "results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"][
            "metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
    except (KeyError, IndexError, TypeError):
        # If there's an exception accessing this data, it probably doesn't
        # exist. TypeError covers an intermediate value that is None or
        # otherwise not subscriptable by the expected key.
        return YouTubeMetadata([])

    # Rows appear to only have "metadataRowRenderer" or
    # "metadataRowHeaderRenderer"; only the former carries data we care
    # about, so keep those rows and unwrap the renderer payload in one pass.
    return YouTubeMetadata(
        [
            row["metadataRowRenderer"]
            for row in metadata_rows
            if "metadataRowRenderer" in row
        ]
    )
|
pytube/metadata.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""This module contains the YouTubeMetadata class."""
|
3 |
+
import json
|
4 |
+
from typing import Dict
|
5 |
+
from typing import List
|
6 |
+
from typing import Optional
|
7 |
+
|
8 |
+
|
9 |
+
class YouTubeMetadata:
    """Grouped key/value view over a list of metadata row dicts.

    Each row with a ``title.simpleText`` contributes one entry to the
    current group; a row flagged with ``hasDividerLine`` closes the current
    group and starts a new one.
    """

    def __init__(self, metadata: List):
        """Build the grouped metadata from the raw row dicts.

        :param metadata:
            List of row dicts. The list itself is kept and exposed through
            :attr:`raw_metadata`.
        """
        self._raw_metadata: List = metadata
        self._metadata: List[Dict] = [{}]

        for el in metadata:
            # We only add metadata to the dict if it has a simpleText title.
            # Rows without one are skipped entirely, divider flag included.
            if 'title' not in el or 'simpleText' not in el['title']:
                continue
            metadata_title = el['title']['simpleText']

            # A titled row with missing or empty contents adds no entry
            # instead of raising KeyError/IndexError.
            contents = (el.get('contents') or [{}])[0]
            if 'simpleText' in contents:
                self._metadata[-1][metadata_title] = contents['simpleText']
            elif contents.get('runs'):
                self._metadata[-1][metadata_title] = contents['runs'][0]['text']

            # Upon reaching a dividing line, create a new grouping
            if el.get('hasDividerLine', False):
                self._metadata.append({})

        # If we happen to create an empty dict at the end, drop it
        if not self._metadata[-1]:
            self._metadata.pop()

    def __iter__(self):
        """Iterate over the metadata groups (one dict per group)."""
        yield from self._metadata

    def __str__(self):
        """Return the grouped metadata serialized as a JSON string."""
        return json.dumps(self._metadata)

    @property
    def raw_metadata(self) -> List:
        """The list of row dicts this object was constructed from."""
        return self._raw_metadata

    @property
    def metadata(self):
        """The grouped metadata: a list of title -> text dicts."""
        return self._metadata
|
pytube/request.py
CHANGED
@@ -16,7 +16,7 @@ default_range_size = 9437184 # 9MB
|
|
16 |
|
17 |
|
18 |
def _execute_request(url, method=None, headers=None):
|
19 |
-
base_headers = {"User-Agent": "Mozilla/5.0"}
|
20 |
if headers:
|
21 |
base_headers.update(headers)
|
22 |
if url.lower().startswith("http"):
|
|
|
16 |
|
17 |
|
18 |
def _execute_request(url, method=None, headers=None):
|
19 |
+
base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"}
|
20 |
if headers:
|
21 |
base_headers.update(headers)
|
22 |
if url.lower().startswith("http"):
|
tests/test_extract.py
CHANGED
@@ -102,3 +102,13 @@ def test_signature_cipher_does_not_error(stream_dict):
|
|
102 |
config_args = extract.get_ytplayer_config(stream_dict)['args']
|
103 |
extract.apply_descrambler(config_args, "url_encoded_fmt_stream_map")
|
104 |
assert "s" in config_args["url_encoded_fmt_stream_map"][0].keys()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
config_args = extract.get_ytplayer_config(stream_dict)['args']
|
103 |
extract.apply_descrambler(config_args, "url_encoded_fmt_stream_map")
|
104 |
assert "s" in config_args["url_encoded_fmt_stream_map"][0].keys()
|
105 |
+
|
106 |
+
|
107 |
+
def test_initial_data_missing():
    """An empty watch page yields the "{}" placeholder string."""
    assert extract.initial_data('') == "{}"
|
110 |
+
|
111 |
+
|
112 |
+
def test_initial_data(stream_dict):
    """initial_data() output for the stream_dict fixture mentions 'contents'."""
    extracted = extract.initial_data(stream_dict)
    assert 'contents' in extracted
|
tests/test_metadata.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""Unit tests for the :module:`metadata <metadata>` module."""
|
3 |
+
import json
|
4 |
+
from pytube import extract
|
5 |
+
|
6 |
+
|
7 |
+
def test_extract_metadata_empty():
    """Empty initial data produces a metadata object with no raw rows."""
    assert extract.metadata({})._raw_metadata == []
|
10 |
+
|
11 |
+
|
12 |
+
def test_metadata_from_initial_data(stream_dict):
    """Metadata parsed from the fixture has raw rows and a 'Song' entry."""
    parsed = json.loads(extract.initial_data(stream_dict))
    ytmd = extract.metadata(parsed)

    raw_rows = ytmd.raw_metadata
    assert len(raw_rows) > 0
    assert 'contents' in raw_rows[0]

    groups = ytmd.metadata
    assert len(groups) > 0
    assert 'Song' in groups[0]
|