nficano committed on
Commit
8d4975a
·
1 Parent(s): 38a749a

finished caption support + docs

Browse files
docs/api.rst CHANGED
@@ -27,6 +27,13 @@ StreamQuery Object
27
  :members:
28
  :inherited-members:
29
 
 
 
 
 
 
 
 
30
  Extract
31
  -------
32
 
 
27
  :members:
28
  :inherited-members:
29
 
30
+ Caption Object
31
+ --------------
32
+
33
+ .. autoclass:: pytube.Caption
34
+ :members:
35
+ :inherited-members:
36
+
37
  Extract
38
  -------
39
 
pytube/__init__.py CHANGED
@@ -15,6 +15,7 @@ __license__ = 'MIT License'
15
  __copyright__ = 'Copyright 2017 Nick Ficano'
16
 
17
  from pytube.logging import create_logger
 
18
  from pytube.query import StreamQuery
19
  from pytube.streams import Stream
20
  from pytube.captions import Caption
 
15
  __copyright__ = 'Copyright 2017 Nick Ficano'
16
 
17
  from pytube.logging import create_logger
18
+ from pytube.query import CaptionQuery
19
  from pytube.query import StreamQuery
20
  from pytube.streams import Stream
21
  from pytube.captions import Caption
pytube/__main__.py CHANGED
@@ -13,6 +13,7 @@ import json
13
  import logging
14
 
15
  from pytube import Caption
 
16
  from pytube import extract
17
  from pytube import mixins
18
  from pytube import request
@@ -163,6 +164,11 @@ class YouTube(object):
163
  self.fmt_streams.append(video)
164
 
165
  def initialize_caption_objects(self):
 
 
 
 
 
166
  if 'captions' not in self.player_config['args']['player_response']:
167
  return
168
  caption_tracks = (
@@ -177,7 +183,8 @@ class YouTube(object):
177
 
178
  @property
179
  def captions(self):
180
- return self.caption_tracks
 
181
 
182
  @property
183
  def streams(self):
 
13
  import logging
14
 
15
  from pytube import Caption
16
+ from pytube import CaptionQuery
17
  from pytube import extract
18
  from pytube import mixins
19
  from pytube import request
 
164
  self.fmt_streams.append(video)
165
 
166
  def initialize_caption_objects(self):
167
+ """Populate instances of :class:`Caption <Caption>`.
168
+
169
+ Take the unscrambled player response data, and use it to initialize
170
+ instances of :class:`Caption <Caption>`.
171
+ """
172
  if 'captions' not in self.player_config['args']['player_response']:
173
  return
174
  caption_tracks = (
 
183
 
184
  @property
185
  def captions(self):
186
+ """Interface to query caption tracks."""
187
+ return CaptionQuery([c for c in self.caption_tracks])
188
 
189
  @property
190
  def streams(self):
pytube/captions.py CHANGED
@@ -1,9 +1,35 @@
1
  # -*- coding: utf-8 -*-
 
 
 
 
 
2
  class Caption:
 
 
3
  def __init__(self, caption_track):
 
 
 
 
 
4
  self.url = caption_track.get('baseUrl')
5
  self.name = caption_track['name']['simpleText']
6
  self.code = caption_track['languageCode']
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def __repr__(self):
 
9
  return'<Caption lang="{s.name}" code="{s.code}">'.format(s=self)
 
1
  # -*- coding: utf-8 -*-
2
+ """This module contains a container for caption tracks."""
3
+ from pytube import request
4
+ from pytube.helpers import xml_caption_to_srt
5
+
6
+
7
  class Caption:
8
+ """Container for caption tracks."""
9
+
10
  def __init__(self, caption_track):
11
+ """Construct a :class:`Caption <Caption>`.
12
+
13
+ :param dict caption_track:
14
+ Caption track data extracted from ``watch_html``.
15
+ """
16
  self.url = caption_track.get('baseUrl')
17
  self.name = caption_track['name']['simpleText']
18
  self.code = caption_track['languageCode']
19
 
20
+ @property
21
+ def xml_captions(self):
22
+ """Download the xml caption tracks."""
23
+ return request.get(self.url)
24
+
25
+ def generate_srt_captions(self):
26
+ """Generate "SubRip Subtitle" captions.
27
+
28
+ Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and
29
+ recompiles them into the "SubRip Subtitle" format.
30
+ """
31
+ return xml_caption_to_srt(self.xml_captions)
32
+
33
  def __repr__(self):
34
+ """Printable object representation."""
35
  return'<Caption lang="{s.name}" code="{s.code}">'.format(s=self)
pytube/compat.py CHANGED
@@ -13,6 +13,12 @@ if python_version == 2:
13
  from urllib2 import unquote
14
  from urllib2 import urlopen
15
  from urlparse import parse_qsl
 
 
 
 
 
 
16
 
17
  def unicode(s):
18
  """Encode a string to utf-8."""
@@ -25,6 +31,7 @@ elif python_version == 3:
25
  from urllib.parse import unquote
26
  from urllib.parse import urlencode
27
  from urllib.request import urlopen
 
28
 
29
  def unicode(s):
30
  """No-op."""
 
13
  from urllib2 import unquote
14
  from urllib2 import urlopen
15
  from urlparse import parse_qsl
16
+ from HTMLParser import HTMLParser
17
+
18
+ def unescape(s):
19
+ """Convert HTML entities in a string to their corresponding characters."""
20
+ html_parser = HTMLParser()
21
+ return html_parser.unescape(s)
22
 
23
  def unicode(s):
24
  """Encode a string to utf-8."""
 
31
  from urllib.parse import unquote
32
  from urllib.parse import urlencode
33
  from urllib.request import urlopen
34
+ from html import unescape
35
 
36
  def unicode(s):
37
  """No-op."""
pytube/helpers.py CHANGED
@@ -3,9 +3,13 @@
3
  from __future__ import absolute_import
4
 
5
  import logging
 
6
  import pprint
7
  import re
 
 
8
 
 
9
  from pytube.compat import unicode
10
  from pytube.exceptions import RegexMatchError
11
 
@@ -88,3 +92,50 @@ def safe_filename(s, max_length=255):
88
  regex = re.compile(pattern, re.UNICODE)
89
  filename = regex.sub('', s)
90
  return unicode(filename[:max_length].rsplit(' ', 0)[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from __future__ import absolute_import
4
 
5
  import logging
6
+ import math
7
  import pprint
8
  import re
9
+ import time
10
+ import xml.etree.ElementTree as ElementTree
11
 
12
+ from pytube.compat import unescape
13
  from pytube.compat import unicode
14
  from pytube.exceptions import RegexMatchError
15
 
 
92
  regex = re.compile(pattern, re.UNICODE)
93
  filename = regex.sub('', s)
94
  return unicode(filename[:max_length].rsplit(' ', 0)[0])
95
+
96
+
97
def float_to_srt_time_format(d):
    """Convert decimal durations into proper srt format.

    :param float d:
        Duration or timestamp expressed in seconds.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration, ``HH:MM:SS,mmm``.

    >>> float_to_srt_time_format(3.89)
    '00:00:03,890'
    """
    # Round once to whole milliseconds and do the rest in integer
    # arithmetic. This lets a fraction that rounds up (e.g. 59.9996) carry
    # into the seconds field, and lets hours grow past 23 instead of
    # wrapping. Negative inputs are clamped to zero because the format has
    # no sign.
    total_ms = max(0, int(round(d * 1000)))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '{:02d}:{:02d}:{:02d},{:03d}'.format(
        hours, minutes, seconds, millis,
    )
111
+
112
+
113
def xml_caption_to_srt(xml_captions):
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    :param str xml_captions:
        XML formatted caption tracks.
    :rtype: str
    :returns:
        One numbered SubRip entry per child element of the XML root,
        separated by blank lines, with surrounding whitespace stripped.
    """
    segments = []
    root = ElementTree.fromstring(xml_captions)
    # Iterate the element directly: ``Element.getchildren()`` was removed
    # in Python 3.9. Numbering starts at 1 as SubRip sequence numbers do.
    for sequence_number, child in enumerate(root, 1):
        text = child.text or ''
        # Flatten the caption onto one line, then collapse the double
        # spaces that the newline replacement can leave behind.
        caption = unescape(
            text
            .replace('\n', ' ')
            .replace('  ', ' '),
        )
        # A missing ``dur`` attribute is treated as a zero-length segment
        # instead of raising KeyError.
        duration = float(child.attrib.get('dur', 0.0))
        start = float(child.attrib['start'])
        end = start + duration
        line = (
            '{seq}\n{start} --> {end}\n{text}\n'.format(
                seq=sequence_number,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )
        segments.append(line)
    return '\n'.join(segments).strip()
pytube/query.py CHANGED
@@ -1,13 +1,16 @@
1
  # -*- coding: utf-8 -*-
2
- """This module provides a query interface for media streams."""
3
 
4
 
5
  class StreamQuery:
6
  """Interface for querying the available media streams."""
7
 
8
  def __init__(self, fmt_streams):
9
- """Construct a :class:`StreamQuery <StreamQuery>`."""
10
- # list of :class:`Stream <Stream>` instances.
 
 
 
11
  self.fmt_streams = fmt_streams
12
  self.itag_index = {int(s.itag): s for s in fmt_streams}
13
 
@@ -224,3 +227,33 @@ class StreamQuery:
224
  def all(self):
225
  """Get all the results represented by this query as a list."""
226
  return self.fmt_streams
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
+ """This module provides a query interface for media streams and captions."""
3
 
4
 
5
  class StreamQuery:
6
  """Interface for querying the available media streams."""
7
 
8
  def __init__(self, fmt_streams):
9
+ """Construct a :class:`StreamQuery <StreamQuery>`.
10
+
11
+ :param list fmt_streams:
12
+ list of :class:`Stream <Stream>` instances.
13
+ """
14
  self.fmt_streams = fmt_streams
15
  self.itag_index = {int(s.itag): s for s in fmt_streams}
16
 
 
227
  def all(self):
228
  """Get all the results represented by this query as a list."""
229
  return self.fmt_streams
230
+
231
+
232
class CaptionQuery:
    """Interface for querying the available captions."""

    def __init__(self, captions):
        """Construct a :class:`CaptionQuery <CaptionQuery>`.

        :param list captions:
            list of :class:`Caption <Caption>` instances.
        """
        self.captions = captions
        # Index the captions by language code for direct lookup; when two
        # captions share a code, the later one in the list wins.
        self.lang_code_index = dict(
            (track.code, track) for track in captions
        )

    def get_by_language_code(self, lang_code):
        """Get the :class:`Caption <Caption>` for a given ``lang_code``.

        :param str lang_code:
            The code that identifies the caption language.
        :rtype: :class:`Caption <Caption>` or ``None``
        :returns:
            The :class:`Caption <Caption>` matching the given ``lang_code`` or
            ``None`` if it does not exist.
        """
        try:
            return self.lang_code_index[lang_code]
        except KeyError:
            return None

    def all(self):
        """Get all the results represented by this query as a list."""
        return self.captions