Taylor Fox Dahlin
committed on
Fix #797 (#804)
Browse files
* Changed base_js get_ytplayer_config to work with embed html; updated js_url to reflect this.
* Updated helper to better reflect a real example of returned HTML.
- pytube/cipher.py +1 -1
- pytube/extract.py +23 -6
- tests/test_cipher.py +2 -2
- tests/test_helpers.py +2 -1
pytube/cipher.py
CHANGED
@@ -247,7 +247,7 @@ def splice(arr: List, b: int):
|
|
247 |
>>> splice([1, 2, 3, 4], 2)
|
248 |
[1, 2]
|
249 |
"""
|
250 |
-
return arr[:
|
251 |
|
252 |
|
253 |
def swap(arr: List, b: int):
|
|
|
247 |
>>> splice([1, 2, 3, 4], 2)
|
248 |
[1, 2]
|
249 |
"""
|
250 |
+
return arr[b:]
|
251 |
|
252 |
|
253 |
def swap(arr: List, b: int):
|
pytube/extract.py
CHANGED
@@ -174,7 +174,10 @@ def js_url(html: str) -> str:
|
|
174 |
:param str html:
|
175 |
The html contents of the watch page.
|
176 |
"""
|
177 |
-
|
|
|
|
|
|
|
178 |
return "https://youtube.com" + base_js
|
179 |
|
180 |
|
@@ -215,8 +218,7 @@ def get_ytplayer_js(html: str) -> Any:
|
|
215 |
Path to YouTube's base.js file.
|
216 |
"""
|
217 |
js_url_patterns = [
|
218 |
-
r"\
|
219 |
-
r"\"js\":\"([^\"]*base\.js)\""
|
220 |
]
|
221 |
for pattern in js_url_patterns:
|
222 |
regex = re.compile(pattern)
|
@@ -244,11 +246,10 @@ def get_ytplayer_config(html: str) -> Any:
|
|
244 |
:returns:
|
245 |
Substring of the html containing the encoded manifest data.
|
246 |
"""
|
|
|
247 |
config_patterns = [
|
248 |
r";ytplayer\.config\s*=\s*({.*?});",
|
249 |
-
r"yt\.setConfig\(.*'PLAYER_CONFIG':\s*({.+?})"
|
250 |
]
|
251 |
-
logger.debug("finding initial function name")
|
252 |
for pattern in config_patterns:
|
253 |
regex = re.compile(pattern)
|
254 |
function_match = regex.search(html)
|
@@ -257,8 +258,24 @@ def get_ytplayer_config(html: str) -> Any:
|
|
257 |
yt_player_config = function_match.group(1)
|
258 |
return json.loads(yt_player_config)
|
259 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
260 |
raise RegexMatchError(
|
261 |
-
caller="get_ytplayer_config", pattern="config_patterns"
|
262 |
)
|
263 |
|
264 |
|
|
|
174 |
:param str html:
|
175 |
The html contents of the watch page.
|
176 |
"""
|
177 |
+
try:
|
178 |
+
base_js = get_ytplayer_config(html)['assets']['js']
|
179 |
+
except KeyError:
|
180 |
+
base_js = get_ytplayer_js(html)
|
181 |
return "https://youtube.com" + base_js
|
182 |
|
183 |
|
|
|
218 |
Path to YouTube's base.js file.
|
219 |
"""
|
220 |
js_url_patterns = [
|
221 |
+
r"(/s/player/[\w\d]+/[\w\d_/.]+/base\.js)"
|
|
|
222 |
]
|
223 |
for pattern in js_url_patterns:
|
224 |
regex = re.compile(pattern)
|
|
|
246 |
:returns:
|
247 |
Substring of the html containing the encoded manifest data.
|
248 |
"""
|
249 |
+
logger.debug("finding initial function name")
|
250 |
config_patterns = [
|
251 |
r";ytplayer\.config\s*=\s*({.*?});",
|
|
|
252 |
]
|
|
|
253 |
for pattern in config_patterns:
|
254 |
regex = re.compile(pattern)
|
255 |
function_match = regex.search(html)
|
|
|
258 |
yt_player_config = function_match.group(1)
|
259 |
return json.loads(yt_player_config)
|
260 |
|
261 |
+
# setConfig() needs to be handled a little differently.
|
262 |
+
# We want to parse the entire argument to setConfig()
|
263 |
+
# and use then load that as json to find PLAYER_CONFIG
|
264 |
+
# inside of it.
|
265 |
+
setconfig_patterns = [
|
266 |
+
r"yt\.setConfig\((.*'PLAYER_CONFIG':\s*{.+?})\);",
|
267 |
+
r"yt\.setConfig\((.*\"PLAYER_CONFIG\":\s*{.+?})\);"
|
268 |
+
]
|
269 |
+
for pattern in setconfig_patterns:
|
270 |
+
regex = re.compile(pattern)
|
271 |
+
function_match = regex.search(html)
|
272 |
+
if function_match:
|
273 |
+
logger.debug("finished regex search, matched: %s", pattern)
|
274 |
+
yt_config = function_match.group(1)
|
275 |
+
return json.loads(yt_config)['PLAYER_CONFIG']
|
276 |
+
|
277 |
raise RegexMatchError(
|
278 |
+
caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
|
279 |
)
|
280 |
|
281 |
|
tests/test_cipher.py
CHANGED
@@ -26,5 +26,5 @@ def test_reverse():
|
|
26 |
|
27 |
|
28 |
def test_splice():
|
29 |
-
assert cipher.splice([1, 2, 3, 4], 2) == [
|
30 |
-
assert cipher.splice([1, 2, 3, 4], 1) == [
|
|
|
26 |
|
27 |
|
28 |
def test_splice():
|
29 |
+
assert cipher.splice([1, 2, 3, 4], 2) == [3, 4]
|
30 |
+
assert cipher.splice([1, 2, 3, 4], 1) == [2, 3, 4]
|
tests/test_helpers.py
CHANGED
@@ -121,7 +121,8 @@ def test_create_mock_html_json(mock_url_open, mock_open):
|
|
121 |
# 2. vid_info_raw
|
122 |
# 3. js
|
123 |
mock_url_open_object.read.side_effect = [
|
124 |
-
b'"
|
|
|
125 |
b'vid_info_raw',
|
126 |
b'js_result',
|
127 |
]
|
|
|
121 |
# 2. vid_info_raw
|
122 |
# 3. js
|
123 |
mock_url_open_object.read.side_effect = [
|
124 |
+
(b'yt.setConfig({"PLAYER_CONFIG":{"args":[]}});'
|
125 |
+
b'"jsUrl":"/s/player/13371337/player_ias.vflset/en_US/base.js"'),
|
126 |
b'vid_info_raw',
|
127 |
b'js_result',
|
128 |
]
|