File size: 5,983 Bytes
a45bd3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import json

import pprint


class Formatter(object):
    """Abstract base class for transcript formatters.

    Concrete formatters inherit from this class and override both
    .format_transcript() and .format_transcripts(), each of which should
    return a string. A transcript is represented by a List of Dictionary
    items.
    """

    def format_transcript(self, transcript, **kwargs):
        """Format a single transcript; subclasses must override this.

        :param transcript:
        :raises NotImplementedError: always, in this base class.
        """
        message = (
            'A subclass of Formatter must implement '
            'their own .format_transcript() method.'
        )
        raise NotImplementedError(message)

    def format_transcripts(self, transcripts, **kwargs):
        """Format a list of transcripts; subclasses must override this.

        :param transcripts:
        :raises NotImplementedError: always, in this base class.
        """
        message = (
            'A subclass of Formatter must implement '
            'their own .format_transcripts() method.'
        )
        raise NotImplementedError(message)


class PrettyPrintFormatter(Formatter):
    """Formatter that renders transcripts with pprint."""

    def format_transcript(self, transcript, **kwargs):
        """Render a transcript as a pretty printed string.

        :param transcript:
        :param kwargs: forwarded unchanged to pprint.pformat().
        :return: A pretty printed string representation of the transcript.
        :rtype str
        """
        pretty = pprint.pformat(transcript, **kwargs)
        return pretty

    def format_transcripts(self, transcripts, **kwargs):
        """Render a list of transcripts as a pretty printed string.

        :param transcripts:
        :param kwargs: forwarded unchanged to .format_transcript().
        :return: A pretty printed string representation of the transcripts.
        :rtype str
        """
        # The whole list is pretty printed as one object, so the
        # single-transcript path can be reused as is.
        pretty = self.format_transcript(transcripts, **kwargs)
        return pretty


class JSONFormatter(Formatter):
    """Formatter that serializes transcripts with json.dumps."""

    def format_transcript(self, transcript, **kwargs):
        """Serialize a transcript to a JSON string.

        :param transcript:
        :param kwargs: forwarded unchanged to json.dumps().
        :return: A JSON string representation of the transcript.
        :rtype str
        """
        serialized = json.dumps(transcript, **kwargs)
        return serialized

    def format_transcripts(self, transcripts, **kwargs):
        """Serialize a list of transcripts to a JSON string.

        :param transcripts:
        :param kwargs: forwarded unchanged to .format_transcript().
        :return: A JSON string representation of the transcripts.
        :rtype str
        """
        # The whole list is dumped as one JSON value, so the
        # single-transcript path can be reused as is.
        serialized = self.format_transcript(transcripts, **kwargs)
        return serialized


class TextFormatter(Formatter):
    """Formatter that keeps only the text of each transcript line."""

    def format_transcript(self, transcript, **kwargs):
        """Render a transcript as plain text with no timestamps.

        :param transcript:
        :param kwargs: accepted for interface compatibility; not used here.
        :return: all transcript text lines separated by newline breaks.
        :rtype str
        """
        texts = [entry['text'] for entry in transcript]
        return '\n'.join(texts)

    def format_transcripts(self, transcripts, **kwargs):
        """Render a list of transcripts as plain text with no timestamps.

        :param transcripts:
        :param kwargs: forwarded unchanged to .format_transcript().
        :return: the plain text of every transcript, with consecutive
            transcripts separated by three newline characters.
        :rtype str
        """
        rendered = (
            self.format_transcript(single, **kwargs) for single in transcripts
        )
        return '\n\n\n'.join(rendered)


class WebVTTFormatter(Formatter):
    """Formatter that renders transcripts as WebVTT caption text."""

    def _seconds_to_timestamp(self, time):
        """Helper that converts `time` into a transcript cue timestamp.

        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp

        :param time: a float representing time in seconds.
        :type time: float
        :return: a string formatted as a cue timestamp, 'HH:MM:SS.mmm'
        :rtype str
        :example:
        >>> self._seconds_to_timestamp(6.93)
        '00:00:06.930'
        """
        # Derive every field from one integer millisecond count. Formatting
        # the float seconds with '{:02.0f}' rounded them (6.93 -> '07',
        # 59.6 -> '60') while the millisecond field was computed separately,
        # so the two could disagree. round(..., 2) absorbs float noise
        # before int() drops any sub-millisecond fraction.
        total_ms = int(round(float(time) * 1000, 2))
        total_secs, ms = divmod(total_ms, 1000)
        total_mins, secs = divmod(total_secs, 60)
        hours, mins = divmod(total_mins, 60)
        return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)

    def format_transcript(self, transcript, **kwargs):
        """A basic implementation of WEBVTT formatting.

        :param transcript:
        :param kwargs: accepted for interface compatibility; not used here.
        :return: the 'WEBVTT' header, a blank line, then one cue per
            transcript line (timing line followed by the text), with cues
            separated by blank lines and a trailing newline.
        :rtype str
        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
        """
        cues = []
        last_index = len(transcript) - 1
        for i, line in enumerate(transcript):
            if i < last_index:
                # Looks ahead, use next start time since duration value
                # would create an overlap between start times.
                end = transcript[i + 1]['start']
            else:
                # Reached the end, cannot look ahead, use duration now.
                end = line['start'] + line['duration']
            time_text = "{} --> {}".format(
                self._seconds_to_timestamp(line['start']),
                self._seconds_to_timestamp(end)
            )
            cues.append("{}\n{}".format(time_text, line['text']))

        return "WEBVTT\n\n" + "\n\n".join(cues) + "\n"

    def format_transcripts(self, transcripts, **kwargs):
        """A basic implementation of WEBVTT formatting for a list of transcripts.

        Each transcript is rendered by .format_transcript(), so each one
        carries its own 'WEBVTT' header. The results are joined with three
        newline characters.

        :param transcripts:
        :param kwargs: forwarded unchanged to .format_transcript().
        :rtype str
        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
        """
        return '\n\n\n'.join(
            self.format_transcript(transcript, **kwargs)
            for transcript in transcripts
        )


class FormatterLoader(object):
    """Looks up and instantiates a Formatter by its short type name."""

    # Maps each supported formatter type name to its Formatter class.
    TYPES = {
        'json': JSONFormatter,
        'pretty': PrettyPrintFormatter,
        'text': TextFormatter,
        'webvtt': WebVTTFormatter,
    }

    class UnknownFormatterType(Exception):
        """Raised when a formatter type name is not a key of TYPES."""

        def __init__(self, formatter_type):
            supported = ', '.join(FormatterLoader.TYPES.keys())
            template = (
                "The format '{formatter_type}' is not supported. "
                "Choose one of the following formats: {supported_formatter_types}"
            )
            message = template.format(
                formatter_type=formatter_type,
                supported_formatter_types=supported,
            )
            super(FormatterLoader.UnknownFormatterType, self).__init__(message)

    def load(self, formatter_type='pretty'):
        """
        Loads the Formatter for the given formatter type.

        :param formatter_type: one of the keys of TYPES; defaults to 'pretty'.
        :return: Formatter object
        :raises FormatterLoader.UnknownFormatterType: if `formatter_type`
            is not a key of TYPES.
        """
        registry = FormatterLoader.TYPES
        if formatter_type in registry.keys():
            formatter_class = registry[formatter_type]
            return formatter_class()
        raise FormatterLoader.UnknownFormatterType(formatter_type)