finished caption support + docs
Browse files
- docs/api.rst +7 -0
- pytube/__init__.py +1 -0
- pytube/__main__.py +8 -1
- pytube/captions.py +26 -0
- pytube/compat.py +7 -0
- pytube/helpers.py +51 -0
- pytube/query.py +36 -3
docs/api.rst
CHANGED
@@ -27,6 +27,13 @@ StreamQuery Object
|
|
27 |
:members:
|
28 |
:inherited-members:
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
Extract
|
31 |
-------
|
32 |
|
|
|
27 |
:members:
|
28 |
:inherited-members:
|
29 |
|
30 |
+
Caption Object
|
31 |
+
-------------
|
32 |
+
|
33 |
+
.. autoclass:: pytube.Caption
|
34 |
+
:members:
|
35 |
+
:inherited-members:
|
36 |
+
|
37 |
Extract
|
38 |
-------
|
39 |
|
pytube/__init__.py
CHANGED
@@ -15,6 +15,7 @@ __license__ = 'MIT License'
|
|
15 |
__copyright__ = 'Copyright 2017 Nick Ficano'
|
16 |
|
17 |
from pytube.logging import create_logger
|
|
|
18 |
from pytube.query import StreamQuery
|
19 |
from pytube.streams import Stream
|
20 |
from pytube.captions import Caption
|
|
|
15 |
__copyright__ = 'Copyright 2017 Nick Ficano'
|
16 |
|
17 |
from pytube.logging import create_logger
|
18 |
+
from pytube.query import CaptionQuery
|
19 |
from pytube.query import StreamQuery
|
20 |
from pytube.streams import Stream
|
21 |
from pytube.captions import Caption
|
pytube/__main__.py
CHANGED
@@ -13,6 +13,7 @@ import json
|
|
13 |
import logging
|
14 |
|
15 |
from pytube import Caption
|
|
|
16 |
from pytube import extract
|
17 |
from pytube import mixins
|
18 |
from pytube import request
|
@@ -163,6 +164,11 @@ class YouTube(object):
|
|
163 |
self.fmt_streams.append(video)
|
164 |
|
165 |
def initialize_caption_objects(self):
|
|
|
|
|
|
|
|
|
|
|
166 |
if 'captions' not in self.player_config['args']['player_response']:
|
167 |
return
|
168 |
caption_tracks = (
|
@@ -177,7 +183,8 @@ class YouTube(object):
|
|
177 |
|
178 |
@property
|
179 |
def captions(self):
|
180 |
-
|
|
|
181 |
|
182 |
@property
|
183 |
def streams(self):
|
|
|
13 |
import logging
|
14 |
|
15 |
from pytube import Caption
|
16 |
+
from pytube import CaptionQuery
|
17 |
from pytube import extract
|
18 |
from pytube import mixins
|
19 |
from pytube import request
|
|
|
164 |
self.fmt_streams.append(video)
|
165 |
|
166 |
def initialize_caption_objects(self):
|
167 |
+
"""Populate instances of :class:`Caption <Caption>`.
|
168 |
+
|
169 |
+
Take the unscrambled player response data, and use it to initialize
|
170 |
+
instances of :class:`Caption <Caption>`.
|
171 |
+
"""
|
172 |
if 'captions' not in self.player_config['args']['player_response']:
|
173 |
return
|
174 |
caption_tracks = (
|
|
|
183 |
|
184 |
@property
|
185 |
def captions(self):
|
186 |
+
"""Interface to query caption tracks."""
|
187 |
+
return CaptionQuery([c for c in self.caption_tracks])
|
188 |
|
189 |
@property
|
190 |
def streams(self):
|
pytube/captions.py
CHANGED
@@ -1,9 +1,35 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
|
|
|
2 |
class Caption:
|
|
|
|
|
3 |
def __init__(self, caption_track):
|
|
|
|
|
|
|
|
|
|
|
4 |
self.url = caption_track.get('baseUrl')
|
5 |
self.name = caption_track['name']['simpleText']
|
6 |
self.code = caption_track['languageCode']
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
def __repr__(self):
|
|
|
9 |
return'<Caption lang="{s.name}" code="{s.code}">'.format(s=self)
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
+
"""This module contains a container for caption tracks."""
|
3 |
+
from pytube import request
|
4 |
+
from pytube.helpers import xml_caption_to_srt
|
5 |
+
|
6 |
+
|
7 |
class Caption:
|
8 |
+
"""Container for caption tracks."""
|
9 |
+
|
10 |
def __init__(self, caption_track):
|
11 |
+
"""Construct a :class:`Caption <Caption>`.
|
12 |
+
|
13 |
+
:param dict caption_track:
|
14 |
+
Caption track data extracted from ``watch_html``.
|
15 |
+
"""
|
16 |
self.url = caption_track.get('baseUrl')
|
17 |
self.name = caption_track['name']['simpleText']
|
18 |
self.code = caption_track['languageCode']
|
19 |
|
20 |
+
@property
|
21 |
+
def xml_captions(self):
|
22 |
+
"""Download the xml caption tracks."""
|
23 |
+
return request.get(self.url)
|
24 |
+
|
25 |
+
def generate_srt_captions(self):
|
26 |
+
"""Generate "SubRip Subtitle" captions.
|
27 |
+
|
28 |
+
Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and
|
29 |
+
recompiles them into the "SubRip Subtitle" format.
|
30 |
+
"""
|
31 |
+
return xml_caption_to_srt(self.xml_captions)
|
32 |
+
|
33 |
def __repr__(self):
|
34 |
+
"""Printable object representation."""
|
35 |
return'<Caption lang="{s.name}" code="{s.code}">'.format(s=self)
|
pytube/compat.py
CHANGED
@@ -13,6 +13,12 @@ if python_version == 2:
|
|
13 |
from urllib2 import unquote
|
14 |
from urllib2 import urlopen
|
15 |
from urlparse import parse_qsl
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def unicode(s):
|
18 |
"""Encode a string to utf-8."""
|
@@ -25,6 +31,7 @@ elif python_version == 3:
|
|
25 |
from urllib.parse import unquote
|
26 |
from urllib.parse import urlencode
|
27 |
from urllib.request import urlopen
|
|
|
28 |
|
29 |
def unicode(s):
|
30 |
"""No-op."""
|
|
|
13 |
from urllib2 import unquote
|
14 |
from urllib2 import urlopen
|
15 |
from urlparse import parse_qsl
|
16 |
+
from HTMLParser import HTMLParser
|
17 |
+
|
18 |
+
def unescape(s):
|
19 |
+
"""Unescape HTML entities in a string."""
|
20 |
+
html_parser = HTMLParser()
|
21 |
+
return html_parser.unescape(s)
|
22 |
|
23 |
def unicode(s):
|
24 |
"""Encode a string to utf-8."""
|
|
|
31 |
from urllib.parse import unquote
|
32 |
from urllib.parse import urlencode
|
33 |
from urllib.request import urlopen
|
34 |
+
from html import unescape
|
35 |
|
36 |
def unicode(s):
|
37 |
"""No-op."""
|
pytube/helpers.py
CHANGED
@@ -3,9 +3,13 @@
|
|
3 |
from __future__ import absolute_import
|
4 |
|
5 |
import logging
|
|
|
6 |
import pprint
|
7 |
import re
|
|
|
|
|
8 |
|
|
|
9 |
from pytube.compat import unicode
|
10 |
from pytube.exceptions import RegexMatchError
|
11 |
|
@@ -88,3 +92,50 @@ def safe_filename(s, max_length=255):
|
|
88 |
regex = re.compile(pattern, re.UNICODE)
|
89 |
filename = regex.sub('', s)
|
90 |
return unicode(filename[:max_length].rsplit(' ', 0)[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from __future__ import absolute_import
|
4 |
|
5 |
import logging
|
6 |
+
import math
|
7 |
import pprint
|
8 |
import re
|
9 |
+
import time
|
10 |
+
import xml.etree.ElementTree as ElementTree
|
11 |
|
12 |
+
from pytube.compat import unescape
|
13 |
from pytube.compat import unicode
|
14 |
from pytube.exceptions import RegexMatchError
|
15 |
|
|
|
92 |
regex = re.compile(pattern, re.UNICODE)
|
93 |
filename = regex.sub('', s)
|
94 |
return unicode(filename[:max_length].rsplit(' ', 0)[0])
|
95 |
+
|
96 |
+
|
97 |
+
def float_to_srt_time_format(d):
|
98 |
+
"""Convert decimal durations into proper srt format.
|
99 |
+
|
100 |
+
:rtype: str
|
101 |
+
:returns:
|
102 |
+
SubRip Subtitle (str) formatted time duration.
|
103 |
+
|
104 |
+
>>> float_to_srt_time_format(3.89)
|
105 |
+
'00:00:03,890'
|
106 |
+
"""
|
107 |
+
frac, whole = math.modf(d)
|
108 |
+
time_fmt = time.strftime('0%H:0%M:%S,', time.gmtime(whole))
|
109 |
+
ms = '{:.3f}'.format(frac).replace('0.', '')
|
110 |
+
return time_fmt + ms
|
111 |
+
|
112 |
+
|
113 |
+
def xml_caption_to_srt(xml_captions):
|
114 |
+
"""Convert xml caption tracks to "SubRip Subtitle (srt)".
|
115 |
+
|
116 |
+
:param str xml_captions:
|
117 |
+
XML formatted caption tracks.
|
118 |
+
"""
|
119 |
+
segments = []
|
120 |
+
root = ElementTree.fromstring(xml_captions)
|
121 |
+
for i, child in enumerate(root.getchildren()):
|
122 |
+
text = child.text or ''
|
123 |
+
caption = unescape(
|
124 |
+
text
|
125 |
+
.replace('\n', ' ')
|
126 |
+
.replace(' ', ' '),
|
127 |
+
)
|
128 |
+
duration = float(child.attrib['dur'])
|
129 |
+
start = float(child.attrib['start'])
|
130 |
+
end = start + duration
|
131 |
+
sequence_number = i + 1 # convert from 0-indexed to 1.
|
132 |
+
line = (
|
133 |
+
'{seq}\n{start} --> {end}\n{text}\n'.format(
|
134 |
+
seq=sequence_number,
|
135 |
+
start=float_to_srt_time_format(start),
|
136 |
+
end=float_to_srt_time_format(end),
|
137 |
+
text=caption,
|
138 |
+
)
|
139 |
+
)
|
140 |
+
segments.append(line)
|
141 |
+
return '\n'.join(segments).strip()
|
pytube/query.py
CHANGED
@@ -1,13 +1,16 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
-
"""This module provides a query interface for media streams."""
|
3 |
|
4 |
|
5 |
class StreamQuery:
|
6 |
"""Interface for querying the available media streams."""
|
7 |
|
8 |
def __init__(self, fmt_streams):
|
9 |
-
"""Construct a :class:`StreamQuery <StreamQuery>`.
|
10 |
-
|
|
|
|
|
|
|
11 |
self.fmt_streams = fmt_streams
|
12 |
self.itag_index = {int(s.itag): s for s in fmt_streams}
|
13 |
|
@@ -224,3 +227,33 @@ class StreamQuery:
|
|
224 |
def all(self):
|
225 |
"""Get all the results represented by this query as a list."""
|
226 |
return self.fmt_streams
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
+
"""This module provides a query interface for media streams and captions."""
|
3 |
|
4 |
|
5 |
class StreamQuery:
|
6 |
"""Interface for querying the available media streams."""
|
7 |
|
8 |
def __init__(self, fmt_streams):
|
9 |
+
"""Construct a :class:`StreamQuery <StreamQuery>`.
|
10 |
+
|
11 |
+
:param list fmt_streams:
|
12 |
+
list of :class:`Stream <Stream>` instances.
|
13 |
+
"""
|
14 |
self.fmt_streams = fmt_streams
|
15 |
self.itag_index = {int(s.itag): s for s in fmt_streams}
|
16 |
|
|
|
227 |
def all(self):
|
228 |
"""Get all the results represented by this query as a list."""
|
229 |
return self.fmt_streams
|
230 |
+
|
231 |
+
|
232 |
+
class CaptionQuery:
|
233 |
+
"""Interface for querying the available captions."""
|
234 |
+
|
235 |
+
def __init__(self, captions):
|
236 |
+
"""Construct a :class:`CaptionQuery <CaptionQuery>`.
|
237 |
+
|
238 |
+
:param list captions:
|
239 |
+
list of :class:`Caption <Caption>` instances.
|
240 |
+
|
241 |
+
"""
|
242 |
+
self.captions = captions
|
243 |
+
self.lang_code_index = {c.code: c for c in captions}
|
244 |
+
|
245 |
+
def get_by_language_code(self, lang_code):
|
246 |
+
"""Get the :class:`Caption <Caption>` for a given ``lang_code``.
|
247 |
+
|
248 |
+
:param str lang_code:
|
249 |
+
The code that identifies the caption language.
|
250 |
+
:rtype: :class:`Caption <Caption>` or ``None``
|
251 |
+
:returns:
|
252 |
+
The :class:`Caption <Caption>` matching the given ``lang_code`` or
|
253 |
+
``None`` if it does not exist.
|
254 |
+
"""
|
255 |
+
return self.lang_code_index.get(lang_code)
|
256 |
+
|
257 |
+
def all(self):
|
258 |
+
"""Get all the results represented by this query as a list."""
|
259 |
+
return self.captions
|