artificialguybr commited on
Commit
9e548ce
1 Parent(s): 9d72f44

Upload 45 files

Browse files
Files changed (46) hide show
  1. .gitattributes +1 -0
  2. whisper/.flake8 +4 -0
  3. whisper/.gitattributes +3 -0
  4. whisper/.github/workflows/python-publish.yml +37 -0
  5. whisper/.github/workflows/test.yml +56 -0
  6. whisper/.gitignore +11 -0
  7. whisper/.pre-commit-config.yaml +28 -0
  8. whisper/CHANGELOG.md +69 -0
  9. whisper/LICENSE +21 -0
  10. whisper/MANIFEST.in +5 -0
  11. whisper/README.md +147 -0
  12. whisper/approach.png +0 -0
  13. whisper/data/README.md +118 -0
  14. whisper/data/meanwhile.json +322 -0
  15. whisper/language-breakdown.svg +0 -0
  16. whisper/model-card.md +69 -0
  17. whisper/notebooks/LibriSpeech.ipynb +958 -0
  18. whisper/notebooks/Multilingual_ASR.ipynb +0 -0
  19. whisper/pyproject.toml +8 -0
  20. whisper/requirements.txt +6 -0
  21. whisper/setup.py +43 -0
  22. whisper/tests/conftest.py +14 -0
  23. whisper/tests/jfk.flac +3 -0
  24. whisper/tests/test_audio.py +19 -0
  25. whisper/tests/test_normalizer.py +96 -0
  26. whisper/tests/test_timing.py +96 -0
  27. whisper/tests/test_tokenizer.py +24 -0
  28. whisper/tests/test_transcribe.py +42 -0
  29. whisper/whisper/__init__.py +154 -0
  30. whisper/whisper/__main__.py +3 -0
  31. whisper/whisper/assets/gpt2.tiktoken +0 -0
  32. whisper/whisper/assets/mel_filters.npz +3 -0
  33. whisper/whisper/assets/multilingual.tiktoken +0 -0
  34. whisper/whisper/audio.py +157 -0
  35. whisper/whisper/decoding.py +821 -0
  36. whisper/whisper/model.py +309 -0
  37. whisper/whisper/normalizers/__init__.py +2 -0
  38. whisper/whisper/normalizers/basic.py +76 -0
  39. whisper/whisper/normalizers/english.json +1741 -0
  40. whisper/whisper/normalizers/english.py +550 -0
  41. whisper/whisper/timing.py +385 -0
  42. whisper/whisper/tokenizer.py +386 -0
  43. whisper/whisper/transcribe.py +461 -0
  44. whisper/whisper/triton_ops.py +109 -0
  45. whisper/whisper/utils.py +258 -0
  46. whisper/whisper/version.py +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ whisper/tests/jfk.flac filter=lfs diff=lfs merge=lfs -text
whisper/.flake8 ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [flake8]
2
+ per-file-ignores =
3
+ */__init__.py: F401
4
+
whisper/.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Override jupyter in Github language stats for more accurate estimate of repo code languages
2
+ # reference: https://github.com/github/linguist/blob/master/docs/overrides.md#generated-code
3
+ *.ipynb linguist-generated
whisper/.github/workflows/python-publish.yml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ jobs:
8
+ deploy:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v3
12
+ - uses: actions-ecosystem/action-regex-match@v2
13
+ id: regex-match
14
+ with:
15
+ text: ${{ github.event.head_commit.message }}
16
+ regex: '^Release ([^ ]+)'
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v4
19
+ with:
20
+ python-version: '3.8'
21
+ - name: Install dependencies
22
+ run: |
23
+ python -m pip install --upgrade pip
24
+ pip install setuptools wheel twine
25
+ - name: Release
26
+ if: ${{ steps.regex-match.outputs.match != '' }}
27
+ uses: softprops/action-gh-release@v1
28
+ with:
29
+ tag_name: v${{ steps.regex-match.outputs.group1 }}
30
+ - name: Build and publish
31
+ if: ${{ steps.regex-match.outputs.match != '' }}
32
+ env:
33
+ TWINE_USERNAME: __token__
34
+ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
35
+ run: |
36
+ python setup.py sdist
37
+ twine upload dist/*
whisper/.github/workflows/test.yml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: test
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+ pull_request:
7
+ branches:
8
+ - main
9
+
10
+ jobs:
11
+ pre-commit:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v3
15
+ - name: Fetch base branch
16
+ run: git fetch origin ${{ github.base_ref }}
17
+ - uses: actions/setup-python@v4
18
+ with:
19
+ python-version: "3.8"
20
+ architecture: x64
21
+ - name: Get pip cache dir
22
+ id: pip-cache
23
+ run: |
24
+ echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
25
+ - name: pip/pre-commit cache
26
+ uses: actions/cache@v3
27
+ with:
28
+ path: |
29
+ ${{ steps.pip-cache.outputs.dir }}
30
+ ~/.cache/pre-commit
31
+ key: ${{ runner.os }}-pip-pre-commit-${{ hashFiles('**/.pre-commit-config.yaml') }}
32
+ restore-keys: |
33
+ ${{ runner.os }}-pip-pre-commit
34
+ - name: pre-commit
35
+ run: |
36
+ pip install -U pre-commit
37
+ pre-commit install --install-hooks
38
+ pre-commit run --all-files
39
+ whisper-test:
40
+ needs: pre-commit
41
+ runs-on: ubuntu-latest
42
+ strategy:
43
+ matrix:
44
+ python-version: ['3.8', '3.9', '3.10', '3.11']
45
+ pytorch-version: [1.13.1, 2.0.0]
46
+ exclude:
47
+ - python-version: '3.11'
48
+ pytorch-version: 1.13.1
49
+ steps:
50
+ - uses: conda-incubator/setup-miniconda@v2
51
+ - run: conda install -n test ffmpeg python=${{ matrix.python-version }}
52
+ - run: pip3 install torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu
53
+ - uses: actions/checkout@v3
54
+ - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH
55
+ - run: pip install .["dev"]
56
+ - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda'
whisper/.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.egg-info
5
+ .pytest_cache
6
+ .ipynb_checkpoints
7
+
8
+ thumbs.db
9
+ .DS_Store
10
+ .idea
11
+
whisper/.pre-commit-config.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.0.1
4
+ hooks:
5
+ - id: check-json
6
+ - id: end-of-file-fixer
7
+ types: [file, python]
8
+ - id: trailing-whitespace
9
+ types: [file, python]
10
+ - id: mixed-line-ending
11
+ - id: check-added-large-files
12
+ args: [--maxkb=4096]
13
+ - repo: https://github.com/psf/black
14
+ rev: 23.7.0
15
+ hooks:
16
+ - id: black
17
+ - repo: https://github.com/pycqa/isort
18
+ rev: 5.12.0
19
+ hooks:
20
+ - id: isort
21
+ name: isort (python)
22
+ args: ["--profile", "black", "-l", "88", "--trailing-comma", "--multi-line", "3"]
23
+ - repo: https://github.com/pycqa/flake8.git
24
+ rev: 6.0.0
25
+ hooks:
26
+ - id: flake8
27
+ types: [python]
28
+ args: ["--max-line-length", "88", "--ignore", "E203,E501,W503,W504"]
whisper/CHANGELOG.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CHANGELOG
2
+
3
+ ## [v20230918](https://github.com/openai/whisper/releases/tag/v20230918)
4
+
5
+ * Add .pre-commit-config.yaml ([#1528](https://github.com/openai/whisper/pull/1528))
6
+ * fix doc of TextDecoder ([#1526](https://github.com/openai/whisper/pull/1526))
7
+ * Update model-card.md ([#1643](https://github.com/openai/whisper/pull/1643))
8
+ * word timing tweaks ([#1559](https://github.com/openai/whisper/pull/1559))
9
+ * Avoid rearranging all caches ([#1483](https://github.com/openai/whisper/pull/1483))
10
+ * Improve timestamp heuristics. ([#1461](https://github.com/openai/whisper/pull/1461))
11
+ * fix condition_on_previous_text ([#1224](https://github.com/openai/whisper/pull/1224))
12
+ * Fix numba depreceation notice ([#1233](https://github.com/openai/whisper/pull/1233))
13
+ * Updated README.md to provide more insight on BLEU and specific appendices ([#1236](https://github.com/openai/whisper/pull/1236))
14
+ * Avoid computing higher temperatures on no_speech segments ([#1279](https://github.com/openai/whisper/pull/1279))
15
+ * Dropped unused execute bit from mel_filters.npz. ([#1254](https://github.com/openai/whisper/pull/1254))
16
+ * Drop ffmpeg-python dependency and call ffmpeg directly. ([#1242](https://github.com/openai/whisper/pull/1242))
17
+ * Python 3.11 ([#1171](https://github.com/openai/whisper/pull/1171))
18
+ * Update decoding.py ([#1219](https://github.com/openai/whisper/pull/1219))
19
+ * Update decoding.py ([#1155](https://github.com/openai/whisper/pull/1155))
20
+ * Update README.md to reference tiktoken ([#1105](https://github.com/openai/whisper/pull/1105))
21
+ * Implement max line width and max line count, and make word highlighting optional ([#1184](https://github.com/openai/whisper/pull/1184))
22
+ * Squash long words at window and sentence boundaries. ([#1114](https://github.com/openai/whisper/pull/1114))
23
+ * python-publish.yml: bump actions version to fix node warning ([#1211](https://github.com/openai/whisper/pull/1211))
24
+ * Update tokenizer.py ([#1163](https://github.com/openai/whisper/pull/1163))
25
+
26
+ ## [v20230314](https://github.com/openai/whisper/releases/tag/v20230314)
27
+
28
+ * abort find_alignment on empty input ([#1090](https://github.com/openai/whisper/pull/1090))
29
+ * Fix truncated words list when the replacement character is decoded ([#1089](https://github.com/openai/whisper/pull/1089))
30
+ * fix github language stats getting dominated by jupyter notebook ([#1076](https://github.com/openai/whisper/pull/1076))
31
+ * Fix alignment between the segments and the list of words ([#1087](https://github.com/openai/whisper/pull/1087))
32
+ * Use tiktoken ([#1044](https://github.com/openai/whisper/pull/1044))
33
+
34
+ ## [v20230308](https://github.com/openai/whisper/releases/tag/v20230308)
35
+
36
+ * kwargs in decode() for convenience ([#1061](https://github.com/openai/whisper/pull/1061))
37
+ * fix all_tokens handling that caused more repetitions and discrepancy in JSON ([#1060](https://github.com/openai/whisper/pull/1060))
38
+ * fix typo in CHANGELOG.md
39
+
40
+ ## [v20230307](https://github.com/openai/whisper/releases/tag/v20230307)
41
+
42
+ * Fix the repetition/hallucination issue identified in #1046 ([#1052](https://github.com/openai/whisper/pull/1052))
43
+ * Use triton==2.0.0 ([#1053](https://github.com/openai/whisper/pull/1053))
44
+ * Install triton in x86_64 linux only ([#1051](https://github.com/openai/whisper/pull/1051))
45
+ * update setup.py to specify python >= 3.8 requirement
46
+
47
+ ## [v20230306](https://github.com/openai/whisper/releases/tag/v20230306)
48
+
49
+ * remove auxiliary audio extension ([#1021](https://github.com/openai/whisper/pull/1021))
50
+ * apply formatting with `black`, `isort`, and `flake8` ([#1038](https://github.com/openai/whisper/pull/1038))
51
+ * word-level timestamps in `transcribe()` ([#869](https://github.com/openai/whisper/pull/869))
52
+ * Decoding improvements ([#1033](https://github.com/openai/whisper/pull/1033))
53
+ * Update README.md ([#894](https://github.com/openai/whisper/pull/894))
54
+ * Fix infinite loop caused by incorrect timestamp tokens prediction ([#914](https://github.com/openai/whisper/pull/914))
55
+ * drop python 3.7 support ([#889](https://github.com/openai/whisper/pull/889))
56
+
57
+ ## [v20230124](https://github.com/openai/whisper/releases/tag/v20230124)
58
+
59
+ * handle printing even if sys.stdout.buffer is not available ([#887](https://github.com/openai/whisper/pull/887))
60
+ * Add TSV formatted output in transcript, using integer start/end time in milliseconds ([#228](https://github.com/openai/whisper/pull/228))
61
+ * Added `--output_format` option ([#333](https://github.com/openai/whisper/pull/333))
62
+ * Handle `XDG_CACHE_HOME` properly for `download_root` ([#864](https://github.com/openai/whisper/pull/864))
63
+ * use stdout for printing transcription progress ([#867](https://github.com/openai/whisper/pull/867))
64
+ * Fix bug where mm is mistakenly replaced with hmm in e.g. 20mm ([#659](https://github.com/openai/whisper/pull/659))
65
+ * print '?' if a letter can't be encoded using the system default encoding ([#859](https://github.com/openai/whisper/pull/859))
66
+
67
+ ## [v20230117](https://github.com/openai/whisper/releases/tag/v20230117)
68
+
69
+ The first versioned release available on [PyPI](https://pypi.org/project/openai-whisper/)
whisper/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 OpenAI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
whisper/MANIFEST.in ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ include requirements.txt
2
+ include README.md
3
+ include LICENSE
4
+ include whisper/assets/*
5
+ include whisper/normalizers/english.json
whisper/README.md ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Whisper
2
+
3
+ [[Blog]](https://openai.com/blog/whisper)
4
+ [[Paper]](https://arxiv.org/abs/2212.04356)
5
+ [[Model card]](https://github.com/openai/whisper/blob/main/model-card.md)
6
+ [[Colab example]](https://colab.research.google.com/github/openai/whisper/blob/master/notebooks/LibriSpeech.ipynb)
7
+
8
+ Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multitasking model that can perform multilingual speech recognition, speech translation, and language identification.
9
+
10
+
11
+ ## Approach
12
+
13
+ ![Approach](https://raw.githubusercontent.com/openai/whisper/main/approach.png)
14
+
15
+ A Transformer sequence-to-sequence model is trained on various speech processing tasks, including multilingual speech recognition, speech translation, spoken language identification, and voice activity detection. These tasks are jointly represented as a sequence of tokens to be predicted by the decoder, allowing a single model to replace many stages of a traditional speech-processing pipeline. The multitask training format uses a set of special tokens that serve as task specifiers or classification targets.
16
+
17
+
18
+ ## Setup
19
+
20
+ We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.11 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation. You can download and install (or update to) the latest release of Whisper with the following command:
21
+
22
+ pip install -U openai-whisper
23
+
24
+ Alternatively, the following command will pull and install the latest commit from this repository, along with its Python dependencies:
25
+
26
+ pip install git+https://github.com/openai/whisper.git
27
+
28
+ To update the package to the latest version of this repository, please run:
29
+
30
+ pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git
31
+
32
+ It also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers:
33
+
34
+ ```bash
35
+ # on Ubuntu or Debian
36
+ sudo apt update && sudo apt install ffmpeg
37
+
38
+ # on Arch Linux
39
+ sudo pacman -S ffmpeg
40
+
41
+ # on MacOS using Homebrew (https://brew.sh/)
42
+ brew install ffmpeg
43
+
44
+ # on Windows using Chocolatey (https://chocolatey.org/)
45
+ choco install ffmpeg
46
+
47
+ # on Windows using Scoop (https://scoop.sh/)
48
+ scoop install ffmpeg
49
+ ```
50
+
51
+ You may need [`rust`](http://rust-lang.org) installed as well, in case [tiktoken](https://github.com/openai/tiktoken) does not provide a pre-built wheel for your platform. If you see installation errors during the `pip install` command above, please follow the [Getting started page](https://www.rust-lang.org/learn/get-started) to install the Rust development environment. Additionally, you may need to configure the `PATH` environment variable, e.g. `export PATH="$HOME/.cargo/bin:$PATH"`. If the installation fails with `No module named 'setuptools_rust'`, you need to install `setuptools_rust`, e.g. by running:
52
+
53
+ ```bash
54
+ pip install setuptools-rust
55
+ ```
56
+
57
+
58
+ ## Available models and languages
59
+
60
+ There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and relative speed.
61
+
62
+
63
+ | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
64
+ |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
65
+ | tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x |
66
+ | base | 74 M | `base.en` | `base` | ~1 GB | ~16x |
67
+ | small | 244 M | `small.en` | `small` | ~2 GB | ~6x |
68
+ | medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x |
69
+ | large | 1550 M | N/A | `large` | ~10 GB | 1x |
70
+
71
+ The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
72
+
73
+ Whisper's performance varies widely depending on the language. The figure below shows a WER (Word Error Rate) breakdown by languages of the Fleurs dataset using the `large-v2` model (The smaller the numbers, the better the performance). Additional WER scores corresponding to the other models and datasets can be found in Appendix D.1, D.2, and D.4. Meanwhile, more BLEU (Bilingual Evaluation Understudy) scores can be found in Appendix D.3. Both are found in [the paper](https://arxiv.org/abs/2212.04356).
74
+
75
+ ![WER breakdown by language](https://raw.githubusercontent.com/openai/whisper/main/language-breakdown.svg)
76
+
77
+
78
+
79
+ ## Command-line usage
80
+
81
+ The following command will transcribe speech in audio files, using the `medium` model:
82
+
83
+ whisper audio.flac audio.mp3 audio.wav --model medium
84
+
85
+ The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
86
+
87
+ whisper japanese.wav --language Japanese
88
+
89
+ Adding `--task translate` will translate the speech into English:
90
+
91
+ whisper japanese.wav --language Japanese --task translate
92
+
93
+ Run the following to view all available options:
94
+
95
+ whisper --help
96
+
97
+ See [tokenizer.py](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py) for the list of all available languages.
98
+
99
+
100
+ ## Python usage
101
+
102
+ Transcription can also be performed within Python:
103
+
104
+ ```python
105
+ import whisper
106
+
107
+ model = whisper.load_model("base")
108
+ result = model.transcribe("audio.mp3")
109
+ print(result["text"])
110
+ ```
111
+
112
+ Internally, the `transcribe()` method reads the entire file and processes the audio with a sliding 30-second window, performing autoregressive sequence-to-sequence predictions on each window.
113
+
114
+ Below is an example usage of `whisper.detect_language()` and `whisper.decode()` which provide lower-level access to the model.
115
+
116
+ ```python
117
+ import whisper
118
+
119
+ model = whisper.load_model("base")
120
+
121
+ # load audio and pad/trim it to fit 30 seconds
122
+ audio = whisper.load_audio("audio.mp3")
123
+ audio = whisper.pad_or_trim(audio)
124
+
125
+ # make log-Mel spectrogram and move to the same device as the model
126
+ mel = whisper.log_mel_spectrogram(audio).to(model.device)
127
+
128
+ # detect the spoken language
129
+ _, probs = model.detect_language(mel)
130
+ print(f"Detected language: {max(probs, key=probs.get)}")
131
+
132
+ # decode the audio
133
+ options = whisper.DecodingOptions()
134
+ result = whisper.decode(model, mel, options)
135
+
136
+ # print the recognized text
137
+ print(result.text)
138
+ ```
139
+
140
+ ## More examples
141
+
142
+ Please use the [🙌 Show and tell](https://github.com/openai/whisper/discussions/categories/show-and-tell) category in Discussions for sharing more example usages of Whisper and third-party extensions such as web demos, integrations with other tools, ports for different platforms, etc.
143
+
144
+
145
+ ## License
146
+
147
+ Whisper's code and model weights are released under the MIT License. See [LICENSE](https://github.com/openai/whisper/blob/main/LICENSE) for further details.
whisper/approach.png ADDED
whisper/data/README.md ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This directory supplements the paper with more details on how we prepared the data for evaluation, to help replicate our experiments.
2
+
3
+ ## Short-form English-only datasets
4
+
5
+ ### LibriSpeech
6
+
7
+ We used the test-clean and test-other splits from the [LibriSpeech ASR corpus](https://www.openslr.org/12).
8
+
9
+ ### TED-LIUM 3
10
+
11
+ We used the test split of [TED-LIUM Release 3](https://www.openslr.org/51/), using the segmented manual transcripts included in the release.
12
+
13
+ ### Common Voice 5.1
14
+
15
+ We downloaded the English subset of Common Voice Corpus 5.1 from [the official website](https://commonvoice.mozilla.org/en/datasets)
16
+
17
+ ### Artie
18
+
19
+ We used the [Artie bias corpus](https://github.com/artie-inc/artie-bias-corpus). This is a subset of the Common Voice dataset.
20
+
21
+ ### CallHome & Switchboard
22
+
23
+ We used the two corpora from [LDC2002S09](https://catalog.ldc.upenn.edu/LDC2002S09) and [LDC2002T43](https://catalog.ldc.upenn.edu/LDC2002T43) and followed the [eval2000_data_prep.sh](https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/eval2000_data_prep.sh) script for preprocessing. The `wav.scp` files can be converted to WAV files with the following bash commands:
24
+
25
+ ```bash
26
+ mkdir -p wav
27
+ while read name cmd; do
28
+ echo $name
29
+ echo ${cmd/\|/} wav/$name.wav | bash
30
+ done < wav.scp
31
+ ```
32
+
33
+
34
+ ### WSJ
35
+
36
+ We used [LDC93S6B](https://catalog.ldc.upenn.edu/LDC93S6B) and [LDC94S13B](https://catalog.ldc.upenn.edu/LDC94S13B) and followed the [s5 recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5) to preprocess the dataset.
37
+
38
+ ### CORAAL
39
+
40
+ We used the 231 interviews from [CORAAL (v. 2021.07)](https://oraal.uoregon.edu/coraal) and used the segmentations from [the FairSpeech project](https://github.com/stanford-policylab/asr-disparities/blob/master/input/CORAAL_transcripts.csv).
41
+
42
+ ### CHiME-6
43
+
44
+ We downloaded the [CHiME-5 dataset](https://spandh.dcs.shef.ac.uk//chime_challenge/CHiME5/download.html) and followed the stage 0 of the [s5_track1 recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/chime6/s5_track1) to create the CHiME-6 dataset which fixes synchronization. We then used the binaural recordings (`*_P??.wav`) and the corresponding transcripts.
45
+
46
+ ### AMI-IHM, AMI-SDM1
47
+
48
+ We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stages 0 and 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
49
+
50
+
51
+ ## Long-form English-only datasets
52
+
53
+ ### TED-LIUM 3
54
+
55
+ To create a long-form transcription dataset from the [TED-LIUM3](https://www.openslr.org/51/) dataset, we sliced the audio between the beginning of the first labeled segment and the end of the last labeled segment of each talk, and we used the concatenated text as the label. Below are the timestamps used for slicing each of the 11 TED talks in the test split.
56
+
57
+ | Filename | Begin time (s) | End time (s) |
58
+ |---------------------|----------------|--------------|
59
+ | DanBarber_2010 | 16.09 | 1116.24 |
60
+ | JaneMcGonigal_2010 | 15.476 | 1187.61 |
61
+ | BillGates_2010 | 15.861 | 1656.94 |
62
+ | TomWujec_2010U | 16.26 | 402.17 |
63
+ | GaryFlake_2010 | 16.06 | 367.14 |
64
+ | EricMead_2009P | 18.434 | 536.44 |
65
+ | MichaelSpecter_2010 | 16.11 | 979.312 |
66
+ | DanielKahneman_2010 | 15.8 | 1199.44 |
67
+ | AimeeMullins_2009P | 17.82 | 1296.59 |
68
+ | JamesCameron_2010 | 16.75 | 1010.65 |
69
+ | RobertGupta_2010U | 16.8 | 387.03 |
70
+
71
+ ### Meanwhile
72
+
73
+ This dataset consists of 64 segments from The Late Show with Stephen Colbert. The YouTube video ID, start and end timestamps, and the labels can be found in [meanwhile.json](meanwhile.json). The labels are collected from the closed-caption data for each video and corrected with manual inspection.
74
+
75
+ ### Rev16
76
+
77
+ We use a subset of 16 files from the 30 podcast episodes in [Rev.AI's Podcast Transcription Benchmark](https://www.rev.ai/blog/podcast-transcription-benchmark-part-1/), after finding that there are multiple cases where a significant portion of the audio and the labels did not match, mostly on the parts introducing the sponsors. We selected 16 episodes that do not have this error, whose "file number" are:
78
+
79
+ 3 4 9 10 11 14 17 18 20 21 23 24 26 27 29 32
80
+
81
+ ### Kincaid46
82
+
83
+ This dataset consists of 46 audio files and the corresponding transcripts compiled in the blog article [Which automatic transcription service is the most accurate - 2018](https://medium.com/descript/which-automatic-transcription-service-is-the-most-accurate-2018-2e859b23ed19) by Jason Kincaid. We used the 46 audio files and reference transcripts from the Airtable widget in the article.
84
+
85
+ For the human transcription benchmark in the paper, we use a subset of 25 examples from this data, whose "Ref ID" are:
86
+
87
+ 2 4 5 8 9 10 12 13 14 16 19 21 23 25 26 28 29 30 33 35 36 37 42 43 45
88
+
89
+ ### Earnings-21, Earnings-22
90
+
91
+ For these datasets, we used the files available in [the speech-datasets repository](https://github.com/revdotcom/speech-datasets), as of their `202206` version.
92
+
93
+ ### CORAAL
94
+
95
+ We used the 231 interviews from [CORAAL (v. 2021.07)](https://oraal.uoregon.edu/coraal) and used the full-length interview files and transcripts.
96
+
97
+
98
+ ## Multilingual datasets
99
+
100
+ ### Multilingual LibriSpeech
101
+
102
+ We used the test splits from each language in [the Multilingual LibriSpeech (MLS) corpus](https://www.openslr.org/94/).
103
+
104
+ ### Fleurs
105
+
106
+ We collected audio files and transcripts using the implementation available as [HuggingFace datasets](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py). To use as a translation dataset, we matched the numerical utterance IDs to find the corresponding transcript in English.
107
+
108
+ ### VoxPopuli
109
+
110
+ We used the `get_asr_data.py` script from [the official repository](https://github.com/facebookresearch/voxpopuli) to collect the ASR data in 14 languages.
111
+
112
+ ### Common Voice 9
113
+
114
+ We downloaded the Common Voice Corpus 9 from [the official website](https://commonvoice.mozilla.org/en/datasets)
115
+
116
+ ### CoVOST 2
117
+
118
+ We collected the `X into English` data collected using [the official repository](https://github.com/facebookresearch/covost).
whisper/data/meanwhile.json ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1YOmY-Vjy-o": {
3
+ "begin": "1:04.0",
4
+ "end": "2:11.0",
5
+ "text": "FOLKS, IF YOU WATCH THE SHOW,\nYOU KNOW I SPEND A LOT OF TIME\nRIGHT OVER THERE, PATIENTLY AND\nASTUTELY SCRUTINIZING THE\nBOXWOOD AND MAHOGANY CHESS SET\nOF THE DAY'S BIGGEST STORIES,\nDEVELOPING THE CENTRAL\nHEADLINE-PAWNS, DEFTLY\nMANEUVERING AN OH-SO-TOPICAL\nKNIGHT TO F-6, FEIGNING A\nCLASSIC SICILIAN-NAJDORF\nVARIATION ON THE NEWS, ALL THE\nWHILE, SEEING EIGHT MOVES DEEP\nAND PATIENTLY MARSHALING THE\nLATEST PRESS RELEASES INTO A\nFISCHER SOZIN LIPNITZKY ATTACK\nTHAT CULMINATES IN THE ELEGANT,\nLETHAL, SLOW-PLAYED EN PASSANT\nCHECKMATE THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nFOLKS-- I,\nSOMETIMES,\nI STARTLE AWAKE UPSIDE DOWN ON\nTHE MONKEY BARS OF A CONDEMNED\nPLAYGROUND ON A SUPERFUND SITE,\nGET ALL HEPPED UP ON GOOFBALLS,\nRUMMAGE THROUGH A DISCARDED TAG\nBAG OF DEFECTIVE TOYS, YANK\nOUT A FISTFUL OF DISEMBODIED\nDOLL LIMBS, TOSS THEM ON A\nSTAINED KID'S PLACEMAT FROM A\nDEFUNCT DENNY'S, SET UP A TABLE\nINSIDE A RUSTY CARGO CONTAINER\nDOWN BY THE WHARF, AND CHALLENGE\nTOOTHLESS DRIFTERS TO THE\nGODLESS BUGHOUSE BLITZ\nOF TOURNAMENT OF NEWS THAT IS MY\nSEGMENT:\nMEANWHILE!\n"
6
+ },
7
+ "3P_XnxdlXu0": {
8
+ "begin": "2:08.3",
9
+ "end": "3:02.3",
10
+ "text": "FOLKS, I SPEND A LOT OF TIME\nRIGHT OVER THERE, NIGHT AFTER NIGHT ACTUALLY, CAREFULLY\nSELECTING FOR YOU THE DAY'S NEWSIEST,\nMOST AERODYNAMIC HEADLINES,\nSTRESS TESTING THE MOST TOPICAL\nANTILOCK BRAKES AND POWER\nSTEERING, PAINSTAKINGLY\nSTITCHING LEATHER SEATING SO\nSOFT, IT WOULD MAKE J.D. POWER\nAND HER ASSOCIATES BLUSH, TO\nCREATE THE LUXURY SEDAN THAT IS\nMY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I LURCH TO CONSCIOUSNESS\nIN THE BACK OF AN ABANDONED\nSCHOOL BUS AND SLAP MYSELF AWAKE\nWITH A CRUSTY FLOOR MAT BEFORE\nUSING A MOUSE-BITTEN TIMING BELT\nTO STRAP SOME OLD PLYWOOD TO A\nCOUPLE OF DISCARDED OIL DRUMS.\nTHEN, BY THE LIGHT OF A HEATHEN\nMOON, RENDER A GAS TANK OUT OF\nAN EMPTY BIG GULP, FILL IT WITH\nWHITE CLAW AND DENATURED\nALCOHOL, THEN LIGHT A MATCH AND\nLET HER RIP, IN THE DEMENTED\nONE-MAN SOAP BOX DERBY OF NEWS\nTHAT IS MY SEGMENT: MEANWHILE!"
11
+ },
12
+ "3elIlQzJEQ0": {
13
+ "begin": "1:08.5",
14
+ "end": "1:58.5",
15
+ "text": "LADIES AND GENTLEMEN, YOU KNOW, I SPEND A\nLOT OF TIME RIGHT OVER THERE,\nRAISING THE FINEST HOLSTEIN NEWS\nCATTLE, FIRMLY, YET TENDERLY,\nMILKING THE LATEST HEADLINES\nFROM THEIR JOKE-SWOLLEN TEATS,\nCHURNING THE DAILY STORIES INTO\nTHE DECADENT, PROVENCAL-STYLE\nTRIPLE CREME BRIE THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nI STAGGER HOME HUNGRY AFTER\nBEING RELEASED BY THE POLICE,\nAND ROOT AROUND IN THE NEIGHBOR'S\nTRASH CAN FOR AN OLD MILK\nCARTON, SCRAPE OUT THE BLOOMING\nDAIRY RESIDUE ONTO THE REMAINS\nOF A WET CHEESE RIND I WON\nFROM A RAT IN A PRE-DAWN STREET\nFIGHT, PUT IT IN A DISCARDED\nPAINT CAN, AND LEAVE IT TO\nFERMENT NEXT TO A TRASH FIRE,\nTHEN HUNKER DOWN AND HALLUCINATE\nWHILE EATING THE LISTERIA-LADEN\nDEMON CUSTARD OF NEWS THAT IS\nMY SEGMENT: MEANWHILE!"
16
+ },
17
+ "43P4q1KGKEU": {
18
+ "begin": "0:29.3",
19
+ "end": "1:58.3",
20
+ "text": "FOLKS, IF YOU WATCH THE SHOW, YOU KNOW I SPEND MOST\nOF MY TIME, RIGHT OVER THERE.\nCAREFULLY SORTING THROUGH THE\nDAY'S BIGGEST STORIES, AND\nSELECTING ONLY THE MOST\nSUPPLE AND UNBLEMISHED OSTRICH\nAND CROCODILE NEWS LEATHER,\nWHICH I THEN ENTRUST TO ARTISAN\nGRADUATES OF THE \"ECOLE\nGREGOIRE-FERRANDI,\" WHO\nCAREFULLY DYE THEM IN A PALETTE\nOF BRIGHT, ZESTY SHADES, AND\nADORN THEM WITH THE FINEST, MOST\nTOPICAL INLAY WORK USING HAND\nTOOLS AND DOUBLE MAGNIFYING\nGLASSES, THEN ASSEMBLE THEM\nACCORDING TO NOW CLASSIC AND\nELEGANT GEOMETRY USING OUR\nSIGNATURE SADDLE STITCHING, AND\nLINE IT WITH BEESWAX-COATED\nLINEN, AND FINALLY ATTACH A\nMALLET-HAMMERED STRAP, PEARLED\nHARDWARE, AND A CLOCHETTE TO\nCREATE FOR YOU THE ONE-OF-A-KIND\nHAUTE COUTURE HERMES BIRKIN BAG\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nSOMETIMES, SOMETIMES I WAKE UP IN THE\nLAST CAR OF AN ABANDONED ROLLER\nCOASTER AT CONEY ISLAND, WHERE\nI'M HIDING FROM THE TRIADS, I\nHUFF SOME ENGINE LUBRICANTS OUT\nOF A SAFEWAY BAG AND STAGGER\nDOWN THE SHORE TO TEAR THE SAIL\nOFF A BEACHED SCHOONER, THEN I\nRIP THE CO-AXIAL CABLE OUT OF\nTHE R.V. OF AN ELDERLY COUPLE\nFROM UTAH, HANK AND MABEL,\nLOVELY FOLKS, AND USE IT TO\nSTITCH THE SAIL INTO A LOOSE,\nPOUCH-LIKE RUCKSACK, THEN I\nSTOW AWAY IN THE BACK OF A\nGARBAGE TRUCK TO THE JUNK YARD\nWHERE I PICK THROUGH THE DEBRIS\nFOR ONLY THE BROKEN TOYS THAT\nMAKE ME THE SADDEST UNTIL I HAVE\nLOADED, FOR YOU, THE HOBO\nFUGITIVE'S BUG-OUT BINDLE OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
21
+ },
22
+ "4ktyaJkLMfo": {
23
+ "begin": "0:42.5",
24
+ "end": "1:26.5",
25
+ "text": "YOU KNOW, FOLKS, I SPEND A LOT\nOF TIME CRAFTING FOR YOU A\nBESPOKE PLAYLIST OF THE DAY'S\nBIGGEST STORIES, RIGHT OVER THERE, METICULOUSLY\nSELECTING THE MOST TOPICAL\nCHAKRA-AFFIRMING SCENTED\nCANDLES, AND USING FENG SHUI TO\nPERFECTLY ALIGN THE JOKE ENERGY\nIN THE EXCLUSIVE BOUTIQUE YOGA\nRETREAT THAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nI GO TO THE DUMPSTER BEHIND THE\nWAFFLE HOUSE AT 3:00 IN THE\nMORNING, TAKE OFF MY SHIRT,\nCOVER MYSELF IN USED FRY OIL,\nWRAP MY HANDS IN SOME OLD DUCT\nTAPE I STOLE FROM A BROKEN CAR\nWINDOW, THEN POUND A SIX-PACK OF\nBLUEBERRY HARD SELTZER AND A\nSACK OF PILLS I STOLE FROM A\nPARKED AMBULANCE, THEN\nARM-WRESTLE A RACCOON IN THE\nBACK ALLEY VISION QUEST OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
26
+ },
27
+ "5Dsh9AgqRG0": {
28
+ "begin": "1:06.0",
29
+ "end": "2:34.0",
30
+ "text": "YOU KNOW, FOLKS, I SPEND MOST OF\nMY TIME RIGHT OVER THERE, MINING\nTHE DAY'S BIGGEST, MOST\nIMPORTANT STORIES, COLLECTING\nTHE FINEST, MOST TOPICAL IRON\nORE, HAND HAMMERING IT INTO JOKE\nPANELS.\nTHEN I CRAFT SHEETS OF BRONZE\nEMBLAZONED WITH PATTERNS THAT\nTELL AN EPIC TALE OF CONQUEST\nAND GLORY.\nTHEN, USING THE GERMANIC\nTRADITIONAL PRESSBLECH\nPROCESS, I PLACE THIN SHEETS OF\nFOIL AGAINST THE SCENES, AND BY\nHAMMERING OR OTHERWISE,\nAPPLYING PRESSURE FROM THE BACK,\nI PROJECT THESE SCENES INTO A\nPAIR OF CHEEK GUARDS AND A\nFACEPLATE.\nAND, FINALLY, USING FLUTED\nSTRIPS OF WHITE ALLOYED\nMOULDING, I DIVIDE THE DESIGNS\nINTO FRAMED PANELS AND HOLD IT\nALL TOGETHER USING BRONZE RIVETS\nTO CREATE THE BEAUTIFUL AND\nINTIMIDATING ANGLO-SAXON\nBATTLE HELM THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nSOMETIMES, JUST SOMETIMES, I COME TO MY SENSES FULLY NAKED\nON THE DECK OF A PIRATE-BESIEGED\nMELEE CONTAINER SHIP THAT PICKED\nME UP FLOATING ON THE DETACHED\nDOOR OF A PORT-A-POTTY IN THE\nINDIAN OCEAN.\nTHEN, AFTER A SUNSTROKE-INDUCED\nREALIZATION THAT THE CREW OF\nTHIS SHIP PLANS TO SELL ME IN\nEXCHANGE FOR A BAG OF ORANGES TO\nFIGHT OFF SCURVY, I LEAD A\nMUTINY USING ONLY A P.V.C. PIPE\nAND A POOL CHAIN.\nTHEN, ACCEPTING MY NEW ROLE AS\nCAPTAIN, AND DECLARING MYSELF\nKING OF THE WINE-DARK SEAS, I\nGRAB A DIRTY MOP BUCKET COVERED\nIN BARNACLES AND ADORN IT WITH\nTHE TEETH OF THE VANQUISHED, TO\nCREATE THE SOPPING WET PIRATE\nCROWN OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\" "
31
+ },
32
+ "748OyesQy84": {
33
+ "begin": "0:40.0",
34
+ "end": "1:41.0",
35
+ "text": "FOLKS, IF YOU WATCH THE SHOW, YOU KNOW, I SPEND MOST OF\nMY TIME, RIGHT OVER THERE,\nCAREFULLY BLENDING FOR YOU THE\nDAY'S NEWSIEST, MOST TOPICAL\nFLOUR, EGGS, MILK, AND BUTTER,\nAND STRAINING IT INTO A FINE\nBATTER TO MAKE DELICATE, YET\nINFORMATIVE COMEDY PANCAKES.\nTHEN I GLAZE THEM IN THE JUICE\nAND ZEST OF THE MOST RELEVANT\nMIDNIGHT VALENCIA ORANGES, AND\nDOUSE IT ALL IN A FINE DELAMAIN\nDE VOYAGE COGNAC, BEFORE\nFLAMBEING AND BASTING THEM TABLE\nSIDE, TO SERVE FOR YOU THE JAMES\nBEARD AWARD-WORTHY CREPES\nSUZETTE THAT IS MY NIGHTLY,\nMONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I WAKE UP IN THE\nBAGGAGE HOLD OF A GREYHOUND BUS\nAS ITS BEING HOISTED BY THE\nSCRAPYARD CLAW TOWARD THE BURN\nPIT, ESCAPE TO A NEARBY\nABANDONED PRICE CHOPPER, WHERE I\nSCROUNGE FOR OLD BREAD SCRAPS,\nBUSTED OPEN BAGS OF STAR FRUIT\nCANDIES, AND EXPIRED EGGS,\nCHUCK IT ALL IN A DIRTY HUBCAP\nAND SLAP IT OVER A TIRE FIRE\nBEFORE USING THE LEGS OF A\nSTAINED PAIR OF SWEATPANTS AS\nOVEN MITTS TO EXTRACT AND SERVE\nTHE DEMENTED TRANSIENT'S POUND\nCAKE OF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE.\""
36
+ },
37
+ "8prs9Pq5Xhk": {
38
+ "begin": "1:18.5",
39
+ "end": "2:17.5",
40
+ "text": "FOLKS, IF YOU WATCH THE SHOW,\nAND I HOPE YOU DO, I SPEND A\nLOT OF TIME RIGHT OVER THERE,\nTIRELESSLY STUDYING THE LINEAGE\nOF THE DAY'S MOST IMPORTANT\nTHOROUGHBRED STORIES AND\nHOLSTEINER HEADLINES, WORKING\nWITH THE BEST TRAINERS MONEY CAN\nBUY TO REAR THEIR COMEDY\nOFFSPRING WITH A HAND THAT IS\nSTERN, YET GENTLE, INTO THE\nTRIPLE-CROWN-WINNING EQUINE\nSPECIMEN THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS\nI BREAK INTO AN UNINCORPORATED\nVETERINARY GENETICS LAB AND GRAB\nWHATEVER TEST TUBES I CAN FIND.\nAND THEN, UNDER A GROW LIGHT I GOT\nFROM A DISCARDED CHIA PET, I MIX\nTHE PILFERED D.N.A. OF A HORSE\nAND WHATEVER WAS IN A TUBE\nLABELED \"KEITH-COLON-EXTRA,\"\nSLURRYING THE CONCOCTION WITH\nCAFFEINE PILLS AND A MICROWAVED\nRED BULL, I SCREAM-SING A PRAYER\nTO JANUS, INITIATOR OF HUMAN\nLIFE AND GOD OF TRANSFORMATION\nAS A HALF-HORSE, HALF-MAN FREAK,\nSEIZES TO LIFE BEFORE ME IN THE\nHIDEOUS COLLECTION OF LOOSE\nANIMAL PARTS AND CORRUPTED MAN\nTISSUE THAT IS MY SEGMENT:\nMEANWHILE!"
41
+ },
42
+ "9gX4kdFajqE": {
43
+ "begin": "0:44.0",
44
+ "end": "1:08.0",
45
+ "text": "FOLKS, IF YOU WATCH THE SHOW,\nYOU KNOW SOMETIMES I'M OVER\nTHERE DOING THE MONOLOGUE.\nAND THEN THERE'S A COMMERCIAL\nBREAK, AND THEN I'M SITTING\nHERE.\nAND I DO A REALLY LONG\nDESCRIPTION OF A DIFFERENT\nSEGMENT ON THE SHOW, A SEGMENT\nWE CALL... \"MEANWHILE!\""
46
+ },
47
+ "9ssGpE9oem8": {
48
+ "begin": "0:00.0",
49
+ "end": "0:58.0",
50
+ "text": "WELCOME BACK, EVERYBODY.\nYOU KNOW, FOLKS, I SPEND MOST OF\nMY TIME RIGHT OVER THERE,\nCOMBING OVER THE DAY'S NEWS,\nSELECTING ONLY THE HIGHEST\nQUALITY AND MOST TOPICAL BONDED\nCALFSKIN-LEATHER STORIES,\nCAREFULLY TANNING THEM AND CUTTING\nTHEM WITH MILLIMETER PRECISION,\nTHEN WEAVING IT TOGETHER IN\nA DOUBLE-FACED INTRECCIATO\nPATTERN TO CREATE FOR YOU THE\nEXQUISITE BOTTEGA VENETA CLUTCH\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, WHILE AT A RAVE\nIN A CONDEMNED CEMENT FACTORY, I\nGET INJECTED WITH A MYSTERY\nCOCKTAIL OF HALLUCINOGENS AND\nPAINT SOLVENTS, THEN, OBEYING\nTHE VOICES WHO WILL STEAL MY\nTEETH IF I DON'T, I STUMBLE INTO\nA SHIPYARD WHERE I RIP THE\nCANVAS TARP FROM A GRAVEL TRUCK,\nAND TIE IT OFF WITH THE ROPE FROM A\nROTTING FISHING NET, THEN WANDER\nA FOOD COURT, FILLING IT WITH\nWHAT I THINK ARE GOLD COINS BUT\nARE, IN FACT, OTHER PEOPLE'S CAR\nKEYS, TO DRAG AROUND THE\nROOTLESS TRANSIENT'S CLUSTER\nSACK OF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
51
+ },
52
+ "ARw4K9BRCAE": {
53
+ "begin": "0:26.0",
54
+ "end": "1:16.0",
55
+ "text": "YOU KNOW, FOLKS, I SPEND\nMOST OF MY TIME STANDING RIGHT OVER\nTHERE,\nGOING OVER THE DAY'S NEWS\nAND SELECTING THE FINEST,\nMOST\nTOPICAL CARBON FIBER\nSTORIES, SHAPING THEM IN DRY\nDOCK INTO A\nSLEEK AND SEXY HULL, KITTING\nIT OUT WITH THE MOST TOPICAL\nFIBERGLASS AND TEAK FITTINGS\nBRASS RAILINGS, HOT TUBS,\nAND\nNEWS HELIPADS, TO CREATE THE\nCUSTOM DESIGNED, GLEAMING\nMEGA-YACHT THAT IS MY NIGHTLY\nMONOLOGUE. BUT SOMETIMES, JUST SOMETIMES FOLKS, I\nWASH ASHORE AT\nAN ABANDONED BEACH RESORT\nAFTER A NIGHT OF BATH SALTS\nAND\nSCOPOLAMINE, LASH SOME\nROTTING PICNIC TABLES\nTOGETHER, THEN\nDREDGE THE NEWS POND TO\nHAUL UP WHATEVER DISCARDED\nTRICYCLES AND BROKEN\nFRISBEES I CAN FIND, STEAL\nAN EYE PATCH\nFROM A HOBO, STAPLE A DEAD\nPIGEON TO MY SHOULDER, AND\nSAIL\nINTO INTERNATIONAL WATERS\nON THE PIRATE GARBAGE SCOW\nOF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
56
+ },
57
+ "B1DRmrOlKtY": {
58
+ "begin": "1:34.0",
59
+ "end": "2:17.0",
60
+ "text": "FOLKS, I SPEND A LOT OF TIME\nSTANDING RIGHT OVER THERE, OKAY,\nHANDPICKING ONLY THE RIPEST\nMOST TOPICAL DAILY HEADLINES,\nSEEKING OUT THE SWEETEST, MOST\nREFINED OF JEST-JAMS AND\nJOKE-JELLIES, CURATING A PLATE\nOF LOCAL COMEDY-FED MEATS AND\nSATIRICAL CHEESES TO LAY OUT THE\nARTISANAL CHARCUTERIE BOARD\nA-NEWS BOUCHE THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, I WAKE UP IN THE\nGREASE TRAP OF AN\nUNLICENSED SLAUGHTERHOUSE,\nSPLASH MY FACE WITH SOME BEEF TALLOW,\nRENDER A RUDIMENTARY PRESERVE\nFROM BONE MARROW AND MELTED\nGUMMY WORMS, AND THROW IT\nTOGETHER WITH SOME SWEETBREADS\nAND TRIPE TO PRESENT THE\nPLOWMAN'S PLATTER OF UNCLAIMED\nOFFAL THAT IS MY SEGMENT:\nMEANWHILE!"
61
+ },
62
+ "BT950jqCCUY": {
63
+ "begin": "1:00.0",
64
+ "end": "2:09.0",
65
+ "text": "YOU KNOW, FOLKS,\nI SPEND SO MUCH OF MY TIME RIGHT\nOVER THERE, SIFTING THROUGH THE\nDAY'S BIGGEST STORIES, HAND\nSELECTING ONLY THE FINEST, MOST\nPERFECTLY AGED BURMESE NEWS\nTEAK.\nTHEN CAREFULLY CARVING AND\nSHAPING IT INTO A REFINED\nBUDDHA, DEPICTED IN THE \"CALLING\nTHE EARTH TO WITNESS\" GESTURE,\nWITH CHARACTERISTIC CIRCULAR\nPATTERNS ON THE ROBE, SHINS, AND\nKNEES, OF COURSE, WHICH I\nTHEN CAREFULLY GILD WITH THE\nMOST TOPICAL GOLD LEAF.\nTHEN I HARVEST THE SAP OF A\nUSITATA TREE TO APPLY THE DELICATE\nTAI O LACQUER, AND FINALLY HAND\nDECORATE IT WITH THE WHITE GLASS\nINLAYS TO CREATE FOR YOU THE\nGLORIOUS AMARA-PURA PERIOD\nSCULPTURE THAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS,\nI WASH UP ON A GALVESTON\nBEACH, NAKED ON A RAFT OF GAS\nCANS AFTER ESCAPING FROM A FIGHT CLUB\nIN INTERNATIONAL WATERS.\nTHEN, STILL DERANGED ON A\nCOCKTAIL OF MESCALINE AND COUGH\nSYRUP, I STEAL A CINDER BLOCK\nFROM UNDER A STRIPPED '87 FORD\nTAURUS, AND CHIP AWAY AT IT\nWITH A BROKEN UMBRELLA HANDLE I\nSCAVENGED FROM A GOODWILL\nDUMPSTER UNTIL IT VAGUELY\nRESEMBLES THE HUNGERING WOLF\nTHAT SCRATCHES AT THE DOOR\nOF MY DREAMS AND PRESENT TO YOU\nTHE TORMENTED DREAD-EFFIGY OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
66
+ },
67
+ "C0e8XM30tQI": {
68
+ "begin": "0:57.0",
69
+ "end": "2:40.0",
70
+ "text": "YOU KNOW, IF YOU WATCH THE SHOW, YOU'RE AWARE THAT I SPEND MOST OF\nMY TIME RIGHT OVER THERE,\nWANDERING THE NEWS FOREST FOR\nYOU, FELLING ONLY THE BIGGEST\nAND HEARTIEST WHITE STORY OAKS,\nCUTTING AND SHAPING THEM INTO\nTHE NEWSIEST, MOST TOPICAL\nCLEATS, CLAMPS AND PLANKS,\nKEEPING THEM AT A CONSTANT\nANGLE, GRADUALLY CREATING A\nSHELL-SHAPED, SHALLOW BOWL HULL\nUSING THE FIRE-BENDING TECHNIQUE\nINSTEAD OF STEAM-BENDING\nOBVIOUSLY.\nTHEN I LAY OUT ALL THE KEEL\nBLOCKS TO CAREFULLY SET UP THE\nSTEM, STERN AND, GARBOARD,\nATTACH THE BILGE FUTTOCKS TO THE\nTIMBER AND LOVINGLY CRAFT A FLAT\nTRANSOM STERN OUT OF\nNATURALLY-CURVED QUARTER\nCIRCLES.\nTHEN SECURE ALL THE PLANKS WITH\nTRUNNELS HANDMADE FROM THE\nFINEST LOCUST WOOD AND, FINALLY,\nADORN IT WITH A PROUD BOWSPRIT,\nFOREPEAK, AND CUSTOM GILDED\nFIGUREHEAD TO PRESENT TO YOU THE\nDUTCH GOLDEN AGE \"SPIEGEL-JACHT\"\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nFOLKS-- YOU GOT TO HYDRATE\nAFTER THAT.\n\"SPIEGEL-JACHT\"\nBUT SOMETIMES, I\nAWAKEN FROM A MEAT-SWEAT-\nINDUCED FEVER STRAPPED TO A\nBASKET ON THE WONDER WHEEL AT\nCONEY ISLAND, STUMBLE ACROSS THE\nGARBAGE-FLECKED BEACH TO THE\nSOUND OF A TERRIFYING RAGGED\nBELLOW I REALIZE IS COMING FROM\nMY OWN LUNGS, WHICH THEN SUMMONS AN\nARMY OF SEAGULLS WHOM I INSTRUCT\nTO GATHER THE HALF-EMPTIED CANS\nOF BUSCH LIGHT LEFT BY A MOB OF\nBELGIAN TOURISTS, ALL OF WHICH\nI GATHER INTO A SACK I FASHIONED\nOUT OF PANTS I STOLE FROM A\nSLEEPING COP. THEN I SWIPE A\nGIANT INFLATABLE BABY YODA FROM\nA CARNY GAME, STRAP IT TO THE\nMAST I MADE BY RIPPING A B-68\nBUS STOP SIGN OUT OF THE GROUND\nON THE CORNER OF STILLWELL, AND\nLAUNCH THE VESSEL AS CAPTAIN OF\nTHE UNREGULATED PIRATE BOOZE\nCRUISE OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE\"!"
71
+ },
72
+ "CKsASCGr_4A": {
73
+ "begin": "1:48.3",
74
+ "end": "2:37.3",
75
+ "text": "FOLKS, YOU KNOW, I SPEND A LOT\nOF TIME RIGHT OVER THERE, CARVING THE\nFINEST, MOST-TOPICAL JAPANESE\nNEWS CYPRESS INTO AN EXPRESSIVE\nHANNYA DEMON MASK, DONNING MY\nBLACK AND GOLD SHOZOKU ROBES\nMADE OF THE SMOOTHEST STORY\nSILK, AND MASTERING THE ELABORATE\nCHOREOGRAPHY OF STILLNESS AND\nMOVEMENT, TO CREATE THE JAPANESE\nNOH THEATER PRODUCTION THAT IS\nMY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I ENTER A FUGUE STATE IN\nTHE MIDDLE OF THE NIGHT, SIPHON\nA BUCKETFUL OF GASOLINE OUT OF MY\nNEIGHBOR'S MAZDA, RUN BAREFOOT\nFROM MY HOUSE TO A HOVEL UNDER\nTHE TURNPIKE WHERE I ASK A HOBO\nFOR A LIGHT AND SET A GARBAGE\nCAN ABLAZE, THEN PLACE MY\nFROSTBITTEN HANDS IN FRONT OF\nTHE DUMPSTER TO PROJECT THE\nSHADOW PUPPET WINO OPERA OF NEWS\nTHAT IS MY SEGMENT, \"MEANWHILE.\""
76
+ },
77
+ "DSc26qAJp_g": {
78
+ "begin": "0:28.0",
79
+ "end": "1:46.0",
80
+ "text": "FOLKS, I SPEND MOST OF MY TIME\nRIGHT OVER THERE, ASSEMBLING THE\nDAY'S BIGGEST, MOST-IMPORTANT\nSTORIES, THEN HAND-SHAPING THEM\nINTO SLEEK, ELEGANT BODYWORK,\nWHICH I LINE WITH ONLY THE\nFINEST, MOST TOPICAL POLISHED\nMACASSAR EBONY AND OPEN-PORE\nPALDAO VENEER, ADDING LIGHT\nMOCCASIN AND DARK SPICE LEATHER\nSEATS, AND MALABAR TEAK WOOD TO\nSET OFF A SCULPTED MINIMALIST\nSWITCHGEAR, ACCOMPANIED BY A\nSTERLING SILVER HUMIDOR AND\nCHAMPAGNE CELLARETTE, THEN\nHAND-SET 1,600 FIBER OPTIC\nLIGHTS ALIGNED WITH PINPOINT\nPERFORATIONS IN THE ROOF-LINING,\nTO CREATE FOR YOU THE BESPOKE\nCOACH BUILT ROLLS-ROYCE SWEPTAIL\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nSOMETIMES, I JUST, I JUST SHRIEK AWAKE IN THE\nSCRAPYARD OF A DERELICT MACHINE\nSHOP, SCAVENGE A\n3.6-HORSEPOWER BRIGGS AND STRATTON\nLAWN MOWER ENGINE, BULLY A\nDOG INTO GIVING ME ITS FRISBEE\nTO USE AS A STEERING WHEEL, THEN\nI BRIEFLY CONSIDER-- BUT DECIDE\nAGAINST-- SWIPING THE BRAKE PADS\nOFF AN UNATTENDED HEARSE,\nBECAUSE WHERE I'M GOING, WE DON'T\nNEED BRAKES.\nI HOOK IT ALL UP TO A RUSTY\nDOLLAR TREE SHOPPING CART,\nSHOTGUN A WHITE CLAW, NO LAW ON THE CLAW, SPRAY\nPAINT MY TEETH, AND BLAST OFF IN\nTHE \"FURY ROAD\" THUG-BUGGY OF\nNEWS THAT IS MY SEGMENT:\n\"MEANWHILE\"!"
81
+ },
82
+ "DhuCyncmFgM": {
83
+ "begin": "0:43.5",
84
+ "end": "1:30.5",
85
+ "text": "YOU KNOW, FOLKS, I\nSPEND A LOT OF TIME STANDING RIGHT\nOVER THERE, COMBING THROUGH\nTHE DAY'S\nSTORIES TO FIND AND ERECT\nFOR YOU THE FINEST GRECIAN\nNEWS\nCOLUMNS, ADORNING THEM WITH\nTHE MOST UP-TO-THE-MINUTE\nBAS-RELIEF.\nAND THEN I IMPART MY MOST\nTOPICAL\nTEACHINGS TO BE ABSORBED BY\nEAGER, SPONGE-LIKE\nMINDS IN THE\nAUGUST ATHENIAN ACADEMY THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, JUST\nSOMETIMES, FOLKS, I COME TO IN A\nDRIED-OUT BABY POOL\nON AN ABANDONED RANCH I WON\nIN A RUSSIAN ROULETTE GAME\nOFF THE COAST OF MOZAMBIQUE\nI GATHER TUMBLEWEEDS\nAND I LASH THEM TOGETHER WITH\nSOME TWINE I FOUND IN A DUMPSTER\nBY A BURNED-OUT REST STOP,\nTHEN I\nSHOTGUN A HOT MONSTER ENERGY\nDRINK AND CHEW ON MACA ROOT\nAS I\nHALLUCINATE THROUGH THE\nNIGHT IN THE VAGRANT'S HOT\n-BOX YURT OF\nNEWS THAT IS MY SEGMENT:\n\"MEANWILE!\""
86
+ },
87
+ "EnGHyZS4f-8": {
88
+ "begin": "1:33.0",
89
+ "end": "2:12.0",
90
+ "text": "YOU KNOW, FOLKS, I'VE SPENT\nDECADES CULTIVATING THE MOST\nRELEVANT RED OAK, FELLING THEM\nWITH TOPICAL HUSQVARNAS,\nMULTIGRADING THEM INTO THE MOST\nBUZZWORTHY THREE-QUARTER INCH\nFLOORSTRIPS, AND FINISHING THEM\nWITH UP-TO-THE-MINUTE HIGH-GLOSS\nPOLYURETHANE, TO LAY FOR YOU THE\nFLAWLESS PARQUET N.B.A.-QUALITY\nBASKETBALL COURT THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, I SCARE THE GEESE\nOUT OF THE BOG BEHIND MY UNCLE'S\nSHED, FILL IT WITH SAND I STOLE\nFROM AN ABANDONED PLAYGROUND,\nTHEN BLANKET IT WITH WET TRASH\nAND DISCARDED AMAZON BOXES, TO\nCREATE FOR YOU THE\nMUSKRAT-RIDDLED BOUNCELESS\nBACKYARD WALLBALL PIT OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
91
+ },
92
+ "G8ajua4Mb5I": {
93
+ "begin": "1:39.0",
94
+ "end": "2:24.0",
95
+ "text": "YOU KNOW, FOLKS,\nIF YOU WATCH THIS SHOW AND I\nHOPE YOU DO,\nI SPEND A LOT OF TIME RIGHT OVER\nTHERE, CAREFULLY ERECTING THE\nNEWSIEST, MOST TOPICAL\nCORINTHIAN COLUMNS, FOLLOWING\nTHE GOLDEN RATIO, POLISHING THE\nFINEST CARRARA MARBLE, AND\nENGRAVING ONTO IT MYTHIC TALES\nWITH FLOURISHES OF HUMOR AND\nPATHOS, TO CREATE THE\nGRECO-ROMAN ACROPOLIS THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS,\nI GRAB A COUPLE OF OFF-BRAND\nLEGOS THAT HAVE BEEN JAMMED\nBETWEEN MY TOES SINCE I STEPPED\nON THEM IN 2003, FISH THE STICKS\nOUT OF SOME MELTED POPSICLES IN\nA BROKEN FREEZER, COLLECT THE\nPRE-SOAKED SAND FROM A\nPLAYGROUND I BROKE INTO IN THE\nDEAD OF NIGHT, AND THROW IT ALL\nTOGETHER TO CONSTRUCT FOR YOU\nTHE RAMSHACKLE PARTHENON OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
96
+ },
97
+ "I4s-44cPYVE": {
98
+ "begin": "1:10.0",
99
+ "end": "2:59.0",
100
+ "text": "FOLKS, YOU KNOW, IF YOU WATCH THE SHOW, YOU KNOW, I SPEND\nMOST OF MY TIME\nRIGHT OVER THERE, SURVEYING THE\nNEWS MARKET FOR THE\nBIGGEST STORIES, THEN CAREFULLY\nSELECTING THE FINEST, MOST\nTOPICAL BUFFALO NEWS HIDE WHICH\nI THEN SOAK USING NATURAL SPRING\nAND LIMEWATER-- ONLY DURING\nCOLDER MONTHS-- AND SCRAPE IT\nUNTIL IT IS EVENLY TRANSLUCENT\nAND SUPPLE.\nAND THEN, USING THE TRADITIONAL\nPUSH-KNIFE METHOD, I DELICATELY\nMAKE MORE THAN 3,000 CUTS TO\nCREATE THE EMOTIVE AND POWERFUL\nFIGURINE WHICH I DECORATE WITH\nGRADATIONS AND CONTRAST,\nEMPLOYING THE SHAANXI REGION\nFIVE-COLOR SYSTEM.\nTHEN I CAREFULLY CONNECT THE\nFIGURINE'S JOINTS WITH COTTON\nTHREADS SO THEY CAN BE OPERATED\nFREELY, AND FIRE UP A PAPER\nLANTERN BEHIND A FINE HUANGZHOU\nSILK SCREEN, AND, BACKED BY A\nSUONA HORN, AND YUEQIN, AND\nBANHU FIDDLE, I OPERATE NO LESS\nTHAN FIVE OF THESE FIGURINES AT\nONCE, BECOMING THE LIVING\nEMBODIMENT OF THE 1,000-HAND\nGWAN-YIN, TO MOUNT FOR YOU THE\nEPIC AND MOVING TONG DYNASTY\nPI-YING SHADOW PLAY THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, FOLKS,\nSOMETIMES,\nIT'S CRAFTSMANSHIP.\nIT GOES LIKE THAT, RIGHT\nTHERE.\nSOMETIMES, FOLKS,\nI AM PECKED\nAWAKE BY A DIRTY SEAGULL ON THE\nINTRACOASTAL WATERWAY, WHILE\nSTILL LYING ON THE BACK OF A\nMANATEE WHO RESCUED ME FROM SOME\nARMS DEALERS I DOUBLE-CROSSED\nOFF THE COAST OF CAPE FEAR, AND WHO THEN\nDUMPS ME ON AN ABANDONED WHARF\nWHERE I SLIP THEIR DIRTY SOCK OFF\nA SEVERED FOOT I FISHED OUT OF A\nSTORM DRAIN, AND SLAP GOOGLY\nEYES ON IT MADE FROM TWO MENTOS\nI PRIED OUT OF THE MOUTH OF A\nMANGY COYOTE.\nTHEN I FASHION A KAZOO OUT OF A\nPOCKET COMB I STOLE FROM A\nFISHERMAN AND THE WAX PAPER FROM\nHIS MEATBALL SUB, TO HONK OUT A\nDIRGE WHILE YAMMERING A TONE\nPOEM ABOUT THE DEMONS INFESTING\nMY BEARD IN THE UNBALANCED MANIC\nSOCK PUPPET SHOW OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE!\""
101
+ },
102
+ "JAfAApqOeFU": {
103
+ "begin": "1:49.5",
104
+ "end": "2:42.5",
105
+ "text": "FOLKS.\nI SPEND A LOT OF MY TIME GATHERING FOR YOU\nTHE FINEST, MOST TOPICAL STORIES\nABOUT NATIONAL STUDIES,\nSCIENTIFIC BREAKTHROUGHS, AND\nDRUNK MOOSE BREAKING INTO ICE\nCREAM SHOPS, ONLY TO HAVE A\nPANDEMIC HIT, DURING WHICH I\nTAKE THEM INTO A SAD, EMPTY\nLITTLE ROOM WHERE MY ONLY\nFRIENDS ARE ROBOT CAMERAS AND A\nPURELL BOTTLE, AND I LET MYSELF\nGO WHILE SLOWLY DESCENDING INTO\nMADNESS AS I SHOVE MY SWEET\nINNOCENT LITTLE JOKES INTO A\nSEGMENT THAT I AM FORCED TO\nRENAME \"QUARANTINE-WHILE.\"\nBUT SOMETIMES, I CRAWL OUT OF\nTHE BROOM CLOSET AFTER 15\nMONTHS, POUR MYSELF BACK INTO A\nSUIT, ASSEMBLE THE TOP TEAM IN\nTHE BUSINESS, THE SWINGINGEST\nBAND IN LATE NIGHT, AND THE\nBEST DAMN AUDIENCE IN THE WORLD.\nSO I CAN RETURN TO YOUR LOVING\nARMS IN THE KICK-ASS,\nPROPERLY-PRESENTED\nCELEBRATION OF MARGINAL NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
106
+ },
107
+ "JfT59wBSQME": {
108
+ "begin": "0:52.0",
109
+ "end": "1:37.0",
110
+ "text": "YOU KNOW, FOLKS, I SPEND A\nLOT OF TIME HARVESTING THE DAY'S\nMOST TOPICAL MATCHA POWDER,\nCAREFULLY POLISHING THE NEWSIEST\nCHAWAN TEA BOWL WITH A HEMP\nCLOTH, AND ADDING THE PUREST\nBOILED WATER COLLECTED FROM THE\nRIVER OF STORIES, TO STAGE THE\nELEGANT JAPANESE TEA CEREMONY\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, I CONVINCE A\nTRUCK DRIVER TO GIVE ME A RIDE\nIN EXCHANGE FOR AN UNREGISTERED\nHAND GUN AND A HALF-EATEN CAN OF\nBEANS, HITCHHIKE TO THE SONORA\nDESERT WITH NOTHING BUT AN OLD\nPOT THAT I FILL WITH THE\nNEWSPAPER I'VE BEEN USING FOR A\nBLANKET, AND THE SALVAGED\nTOBACCO FROM A SIDEWALK CIGARETTE\nBUTT, TO BREW FOR YOU, THE\nNIGHTMARE HALLUCINATION-INDUCING\nFERMENTED AYAHUASCA SLURRY OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
111
+ },
112
+ "KT8pCZ5Xw9I": {
113
+ "begin": "0:37.6",
114
+ "end": "1:26.6",
115
+ "text": "FOLKS, YOU KNOW,\nIF YOU WATCH THE SHOW,\nTHAT I SPEND A LOT OF TIME RIGHT OVER\nTHERE, GATHERING THE FRESHEST,\nNEWSIEST HEADLINE FLOWERS,\nSCOURING THE FIELDS AND FORESTS\nFOR THE MOST TOPICAL AND\nFRAGRANT SYLVAN NEWS BOUGHS, THE\nJOKIEST FESTIVE GOURDS, AND THEN\nCAREFULLY ASSEMBLING AND\nARRANGING THEM ALL INTO THE\nGRAND YET TASTEFUL STATE\nDINNER-WORTHY CENTERPIECE THAT\nIS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, NOW, SOMETIMES,\nI RUB SOME LEAD PAINT CHIPS ONTO\nMY GUMS, STAGGER INTO THE WOODS\nWITH NOTHING BUT A STAPLE GUN\nAND SOME EMPTY CANS OF SPRAY\nPAINT, AND THEN, BY THE LIGHT OF THE\nTIRE-FIRE, USING SMASHED LARVAE\nAND MY OWN SALIVA AS GLUE, I\nCOBBLE TOGETHER A CRUDE PILE OF\nPUNKY WOOD AND ANIMAL SKULLS TO\nPRESENT TO YOU THE UNHINGED\nLONER'S CORNUCOPIA OF NEWS THAT\nIS MY SEGMENT:\n\"MEANWHILE!\""
116
+ },
117
+ "L-kR7UCzhTU": {
118
+ "begin": "1:36.0",
119
+ "end": "2:19.0",
120
+ "text": "YOU KNOW, I SPEND A LOT OF TIME\nRIGHT OVER THERE HAND-RAISING\nAND SELECTING THE NEWEST,\nMOST-TOPICAL SEVILLE ORANGES,\nCAREFULLY SIMMERING THEM WITH\nTURBINADO SUGAR AND PRISTINE\nFILTERED WATER TO CREATE FOR YOU\nA DOUBLE-GOLD-MEDAL-WINNING\nBITTERSWEET ENGLISH MARMALADE TO\nSPREAD ON THE PERFECTLY TOASTED\nARTISANAL BRIOCHE THAT IS\nMY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS,\nI GRAB AN EXPIRED TUB OF\nWHIPPING CREAM, TOSS IT IN A\nBLENDER WITH THE HALF OF A\nGOGURT I FOUND IN A SCHOOLYARD,\nAND LET THAT FERMENT BEHIND THE\nFURNACE WHILE I FISH A DRIED\nPITA OUT OF THE GUTTER\nUNDERNEATH THE CORN-- CORNER KEBAB\nSTAND, THEN SLATHER IT WITH THE\nUNPRESSURIZED NIGHTMARE\nDAIRY OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
121
+ },
122
+ "Lf-LkJhKVhk": {
123
+ "begin": "1:14.0",
124
+ "end": "1:53.0",
125
+ "text": "YOU KNOW, I SPEND A LOT OF TIME\nOVER THERE CAREFULLY ASSEMBLING\nTHE MOST TOPICAL VIRTUOSO WIND\nSECTION, TUNING THE VIOLAS,\nCELLOS, AND CONTRABASS TO THE\nCOUNTRY'S ZEITGEIST, AND WAVING\nMY CONDUCTOR'S BATON TO THE\nTEMPLE OF HUMOR TO PRESENT FOR\nYOU THE SPECTACULAR BRAHMS\nCONCERTO IN SATIRE MAJOR THAT IS\nMY MONOLOGUE.\nBUT SOMETIMES, I WAKE UP IN A\nFUGUE STATE BEHIND THE ABANDONED\nVIDEO STORE, STEAL A GARBAGE CAN\nLID AND A BROKEN UMBRELLA\nHANDLE, AND THEN GRAB AN EMPTY CAN\nOF P.B.R. I'VE BEEN USING AS AN\nASHTRAY TO FASHION A RUSTED\nKAZOO, ALL TO CREATE THE\n2-IN-THE-MORNING ONE-MAN-STOMP\nSHOW OF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
126
+ },
127
+ "P72uFdrkaVA": {
128
+ "begin": "0:49.3",
129
+ "end": "1:48.3",
130
+ "text": "FOLKS, IF YOU WATCH THE SHOW, YOU KNOW I SPEND A LOT OF\nTIME RIGHT OVER THERE, TENDERLY\nCLIPPING THE NEWSIEST, MOST\nFRAGRANT TEA-LEAF BUDS OF THE\nDAY, GINGERLY LAYING THEM TO DRY\nBY THE LIGHT OF THE\nROSY-FINGERED DAWN,\nPAINSTAKINGLY STEEPING THEM TO\nPERFECTION IN THE MOST TOPICAL\nOF FRESH WATER GATHERED FROM THE\nNATION'S NEWS RESERVOIR, BEFORE\nCEREMONIOUSLY SERVING TO YOU THE\nANTIOXIDANT-RICH ELIXIR OF\nBESPOKE TEA SHAN THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS, AFTER\nA BENDER ON ABSINTHE AND\nMOUNTAIN DEW CODE RED, I SWEAT\nMYSELF AWAKE HOVERED OVER A VAT\nOF BLISTERING SEWER RUNOFF.\nI ADD TO THE GURGLING POT OF\nNIGHTMARES WHATEVER THE VOICES\nDEMAND: SCRAPS OF WET TRASHCAN\nLETTUCE AND BAND-AIDS FROM THE\nCOMMUNITY POOL.\nTHEN, USING AN OLD GYM SOCK I\nFOUND AT THE HOBOKEN Y.M.C.A., I\nSTRAIN IT ALL INTO A DISUSED\nGASOLINE CONTAINER TO OFFER YOU\nTHE SCALDING-HOT DEMENTED DEMON\nTONIC OF NEWS THAT IS MY\nSEGMENT: \"MEANWHILE!\""
131
+ },
132
+ "PT5_00Bld_8": {
133
+ "begin": "0:57.0",
134
+ "end": "2:02.0",
135
+ "text": "FOLKS, YOU KNOW, I SPEND A LOT OF MY TIME,\nRIGHT OVER THERE, CRUISING THE\nVAST TSUKIJI FISH MARKET OF THE\nDAY'S BIGGEST STORIES, CAREFULLY\nSURVEYING THE FRESHEST, MOST\nTOPICAL CATCH OF THE DAY,\nCHOOSING ONLY THE HIGHEST GRADE\nAHI NEWS TUNA, AWABI, AND UNAGI,\nTHEN DELICATELY PICKING THROUGH\nTHE RICE GRAINS OF THE\nHEADLINES, GENTLY WRAPPING THE\nINGREDIENTS IN THE FRESHEST\nHAND-PICKED NORI, AND CAREFULLY\nLAYING IT ALL OUT TO PRESENT TO\nYOU THE YAMANAKA LACQUERED\nTHREE-TIER JUBAKO BENTO BOX THAT\nIS MY NIGHTLY MONOLOGUE.\nBUT -- YOU KNOW WHAT I'M SAYING.\nYOU KNOW WHAT I'M SAYING.\nYOU KNOW WHAT'S COMING.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I AM SLAPPED\nAWAKE BY A POSSUM ON A BED OF\nWET TIRES NEAR A LONG-ABANDONED\nWHARF, SELL MY LAST REMAINING\nADULT TEETH TO A VAGRANT\nFISHMONGER FOR AN EXPIRED SALMON\nCARCASS, AND GRIND THAT INTO A\nSOFT PASTE, THEN ROLL IT IN ENOUGH\nSTALE RICE KRISPIES TO MAKE\nSNAP, CRACKLE, AND POP TAKE A\nHARD LOOK IN THE MIRROR, AND SERVE\nYOU THE NIGHT-SCREAM INDUCING\nHOBO HAND-ROLL OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE!\""
136
+ },
137
+ "QPDZbNEhsuw": {
138
+ "begin": "1:31.0",
139
+ "end": "2:15.0",
140
+ "text": "YOU KNOW, FOLKS, I SPEND MOST\nOF MY TIME RIGHT OVER THERE,\nDIGGING DOWN INTO THE NEWS\nPIT AND MINING FOR YOU THE\nDAY'S\nCLEAREST STORY DIAMONDS,\nCLEAVING THEM INTO THE MOST\nTOPICAL CUTS, FACETING THEM,\nPOLISHING THEM TO A HIGH\nFINISH,\nTHEN SETTING THEM ALL IN A\nDELICATE 24 KARAT GOLD CHAIN\nTO\nCREATE THE BESPOKE CARTIER\nNECKLACE THAT IS MY NIGHTLY MONOLOGUE,\nBUT SOMETIMES, SOMETIMES FOLKS,\nI JUST HUFF A LITTLE TOO MUCH\nEPOXY\nAND STUMBLE DOWN TO AN\nABANDONED PIER, WHERE I FIND\nA PIECE OF\nDISUSED FISHING LINE AND\nSTRING IT WITH OLD BOTTLE\nCAPS, RUSTY\nPADLOCKS, AND BABY TEETH,\nTHEN RIP THE SEAT BELT OUT\nOF A\nBURNED-OUT POLICE CAR TO\nMAKE A CLASP, AND PARADE\nNAKED THROUGH\nA CHI-CHI'S WEARING THE\nPSYCHO CHOKER OF NEWS THAT\nIS MY\nSEGMENT:\n\"MEANWHILE.\""
141
+ },
142
+ "QjQbQlN9Ev8": {
143
+ "begin": "3:53.0",
144
+ "end": "4:35.0",
145
+ "text": "YOU KNOW, FOLKS, I SPEND A LOT OF\nMY TIME RIGHT OVER THERE,\nHANGING FOR YOU THE DAY'S\nHOTTEST, MOST TOPICAL NEWS\nDECORATIONS, BOOKING THE SEXIEST\nBAND, CURATING THE MOST RELEVANT\nDRINKS MENU, DISTRIBUTING\nPLAYFUL, YET TASTEFUL PARTY\nFAVORS, AND GLITTER HATS THAT\nSAY \"2022,\" AND THEN SETTING THE\nMOST AU-COURANT LIGHTING TO\nTHROW FOR YOU THE UPSCALE,\n\"PITCH PERFECT\" NEW YEAR'S EVE\nPARTY THAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I WAKE UP IN THE RAFTERS\nAFTER LOSING A BET TO A CROW.\nAND THEN I SPIKE THE PUNCH BOWL WITH\nCHLOROFORM AND MILITARY-GRADE\nHELICOPTER LUBRICANTS, MAKE A\nBUNCH OF RESOLUTIONS TO QUIT\nHUFFING WD-40, AND PUNCH A\nPOLICE HORSE DURING THE FUGITIVE\nNEW YEAR'S HO-DOWN OF NEWS THAT\nIS MY SEGMENT:\nMEANWHILE!"
146
+ },
147
+ "R6JV_I36It8": {
148
+ "begin": "0:38.0",
149
+ "end": "1:18.0",
150
+ "text": "YOU KNOW, FOLKS, I SPEND\nA LOT OF TIME\nSHUCKING FOR YOU THE DAY'S MOST\nTOPICAL CLAMS, BONING THE\nFINEST, MOST CURRENT NEWS\nCHICKENS, AND COLLECTING THE\nHIGHEST QUALITY STORY SHRIMP\nAND SAFFRON RICE, THEN GENTLY\nSIMMERING IT ALL IN A CAST-IRON\nCOMEDY PAI-YERA, TO CREATE\nTHE FRAGRANT SEAFOOD PAELLA THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I\nSHAMBLE DOWN TO THE DOCKS WITH A\nRUSTY CROWBAR, MANEUVER THROUGH\nTHE POLLUTED CANAL USING A\nMCDONALD'S STRAW AS A SNORKEL,\nAND SCRAPE THE BARNACLES OFF A\nPASSING GARBAGE SCOW, TOSS THEM\nIN A POT WITH SOME HALF-USED\nRAMEN FLAVORED PACKETS AND\nMOUNTAIN DEW, TO BREW FOR YOU\nTHE CHUNKY STEW OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE!\""
151
+ },
152
+ "RFVggCw58lo": {
153
+ "begin": "1:00.0",
154
+ "end": "1:43.0",
155
+ "text": "YOU KNOW FOLKS, I SPEND A LOT OF TIME\nSTANDING RIGHT OVER THERE, PAINSTAKINGLY\nPENCILING THE DAY'S MOST TOPICAL\nAND HEROIC STORIES, STAGING THEM\nIN METICULOUSLY PLANNED PANELS,\nHAND-INKING THEM WITH THE\nPITHIEST DIALOGUE, THEN COLOR\nBALANCING THE FINISHED PAGES TO\nCREATE FOR YOU THE GENRE-BENDING\nONCE-IN-A-GENERATION GRAPHIC\nNOVEL THAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I JUST TEAR A PAGE OUT\nOF A WET NEWSPAPER I PEELED OFF\nA SUBWAY SEAT, GRAB A BROKEN\nCRAYON I FOUND IN MY COUCH,\nCRUDELY DRAW SOME FILTHY\nCARTOONS, SCRIBBLE IN AS\nMANY CURSE WORDS AS I CAN IN THE\nPORNOGRAPHIC DRIFTER ZINE OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
156
+ },
157
+ "RHDQpOVLKeM": {
158
+ "begin": "0:39.0",
159
+ "end": "2:11.0",
160
+ "text": "YOU KNOW, FOLKS, I\nSPEND MOST OF MY TIME, RIGHT\nOVER THERE, PORING OVER THE\nDAY'S BIGGEST STORIES,\nCOLLECTING THE FINEST,\nMOST-TOPICAL NEWS CALFSKINS AND\nPAINSTAKINGLY WASHING THEM IN A\nCALCIUM HYDROXIDE SOLUTION, THEN\nSOAKING THEM IN LIME FOR DAYS TO\nREMOVE ALL NARRATIVE IMPURITIES\nAND CREATE A PALE VELLUM THAT I\nLATER PLACE ON MY SCRIPTORIUM IN\nA MONASTERY ON THE CLIFFS OF\nDOVER.\nTHERE, USING A PEN CUT FROM THE\nWING FEATHER OF A SWAN OF THE\nRIVER AVON, I DESIGN COPTIC AND\nSYRIAC ILLUSTRATIONS, ADORNED\nWITH WHIMSICAL CELTIC SPIRALS\nAND USE GERMANIC ZOOMORPHIC\nDESIGNS TO CREATE THE MARGINALIA\nSURROUNDING THE PAGES OF ELEGANT\nHALF-UNCIAL INSULAR SCRIPT THAT\nTELL THE HOLIEST OF STORIES,\nWHICH I THEN BIND WITH GOLDEN\nTHREAD UNDER A PROTECTIVE CASE\nOF CARVED OAK TO CREATE FOR YOU\nTHE GLORIOUS LATE\nANGLO-SAXON PERIOD ILLUMINATED\nMANUSCRIPT THAT IS MY MONOLOGUE.\nBUT SOMETIMES, FOLKS, SOMETIMES,\nTHESE PEOPLE KNOW, THEY KNOW, THEY KNOW\nBUT SOMETIMES, FOLKS,\nI COME TO UNDER A RAMP IN THE\nMIDDLE OF A DEMOLITION DERBY,\nHOTWIRE THE TRUCKASAURUS\nAND LEAD THE POLICE ON A CHASE\nBEFORE CRASHING INTO A SWAMP\nGATHERING JUST AS, WHAT I ASSUME\nIS A PRIEST SAYS, \"YOU MAY KISS\nTHE BRIDE,\" RIP A LEECH OFF MY\nASS, AND USE IT TO HASTILY\nDOODLE A SKETCH OF THE SCENE IN\nMY OWN BLOOD ON AN OLD DAVE AND\nBUSTERS RECEIPT, THEN STAGGER\nTOWARD THE HAPPY COUPLE\nCLUTCHING THE NIGHTMARE\nSTALKER'S WEDDING ALBUM OF NEWS\nTHAT IS MY SEGMENT:\nMEANWHILE!"
161
+ },
162
+ "TZSw9iRk03E": {
163
+ "begin": "0:22.5",
164
+ "end": "1:08.5",
165
+ "text": "FOLKS, YOU KNOW I\nSPEND A LOT\nOF RIGHT TIME OVER THERE, CAREFULLY\nSTUDYING THE LATEST, NEWSIEST\nCLINICAL STUDIES, PRACTICING AND\nTRAINING UNDER THE BEST, MOST\nTOPICAL DOCTORS, CAREFULLY\nSTERILIZING ALL MY EQUIPMENT,\nAND ASSEMBLING THE WORLD'S GREATEST\nSURGICAL TEAM TO PERFORM FOR YOU THE\nDAZZLINGLY COMPLEX AND\nGROUNDBREAKING THORACIC AORTIC\nDISSECTION REPAIR THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nI GET KICKED OUT OF MY HEALTH CARE\nPLAN FOR LISTING MY DOG BENNY AS\nA GASTROENTEROLOGIST, SO I\nSTIPPLE SOME INCISION MARKS ON\nMY ABDOMEN WITH A DRIED-OUT\nSHARPIE, SLAM A COUPLE OF RED BULLS\nIN FRONT OF A SHATTERED MIRROR,\nAND FISH A RUSTY BONING KNIFE\nOUT OF A STOLEN CRAB BOAT TO\nPERFORM THE EXPLORATORY HOBO\nAPPENDECTOMY OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
166
+ },
167
+ "VV3UJmb8kHw": {
168
+ "begin": "2:04.0",
169
+ "end": "2:54.0",
170
+ "text": "YOU KNOW, FOLKS,\nIF YOU WATCH THE SHOW, YOU KNOW\nI SPEND A LOT OF MY TIME RIGHT\nOVER THERE.\nCAREFULLY COMBING THROUGH THE\nBIGGEST STORIES OF THE DAY,\nSOURCING FOR YOU THE NEWSIEST\nMIKADO ORGANZA IN A HIGH SHEEN,\nADDING THE MOST TOPICAL IVORY\nFEATHER FRINGE AND A DIPPED\nBACK, THEN THROWING ON A DEMURE\nBUT KICKY FLORAL EMBROIDERED\nTULLE SHRUG WITH STATEMENT PEARL\nACCENTS TO PRESENT TO YOU THE\nGLORIOUS \"VOGUE\" COVER-READY\nWEDDING GOWN THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, WHILE ON A\nGLUE-HUFFING BINGE, I CRASH A\nSTOLEN HEARSE INTO AN ABANDONED\nCHILDREN'S HOSPITAL WHERE I USE\nMY TEETH TO TEAR UP SOME OLD\nCURTAINS AND STAINED CARPETING,\nAND STEAL A BUTTON OFF AN OLD\nSURGICAL APRON, AND STITCH IT\nALL TOGETHER WITH A NEEDLE MADE\nFROM A CHICKEN BONE TO THROW\nTOGETHER THE SHRIEKING CAT\nLADY'S SACK DRESS OF NEWS THAT\nIS MY SEGMENT: \"MEANWHILE.\""
171
+ },
172
+ "VYVbTzoggKc": {
173
+ "begin": "0:00.0",
174
+ "end": "0:49.0",
175
+ "text": "FOLKS, YOU KNOW, I\nSPEND A LOT OF\nMY TIME ON THE SHOW-- IF YOU\nWATCH THE SHOW YOU'D FIGURE\nTHIS OUT,\nRIGHT OVER THERE, STANDING RIGHT OVER THERE IN THE MONOLOGUE SPELUNKING\nTHROUGH THE DAY'S STORIES TO\nSELECT AND SOURCE THE\nNEWSIEST MARBLE, CHISELING\nIT INTO A\nPEDESTAL OF HUMOR AS WIDE AS\nTWO GREEK ISLES.\nTHEN I CAST THE MOST TOPICAL\nCURRENT-EVENTS-BRONZE INTO A\nFINELY CRAFTED MOULD TO\nERECT FOR YOU THE TOWERING\nGRECIAN\nCOLOSSUS THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, JUST\nSOMETIMES, FOLKS, I JOLT\nAWAKE INSIDE\nWHAT'S LEFT OF A RUSTED\nMAZDA MIATA IN A WHITE CLAW\nAND\nOVEN-CLEANER-INDUCED FUGUE\nSTATE, SHAMBLE THROUGH THE\nJUNKYARD, RANSACKING THE\nDEBRIS FOR OLD FISHING RODS,\nMELTED\nBATTERIES AND THE SHOVEL OF\nA DERELICT BACKHOE, AND THEN\nBOOST AN\nACETYLENE TORCH TO HASTILY\nWELD TOGETHER THE BOOTLEG\nTRUCKASAURUS OF NEWS THAT\nIS MY SEGMENT:\nMEANWHILE!"
176
+ },
177
+ "WWWeV8xVNtI": {
178
+ "begin": "2:00.0",
179
+ "end": "2:35.0",
180
+ "text": "YOU KNOW, FOLKS, I SPENT A LOT\nOF TIME CAREFULLY RESEARCHING\nTHE DAY'S MOST CULTURALLY\nPRECIOUS STORIES,\nCROSS-REFERENCING HISTORICAL\nACCOUNTS WITH TOPOGRAPHICAL\nMAPS, AND ASSEMBLING THE FINEST\nTEAM OF ARCHAEOLOGISTS TO\nUNEARTH THE UNESCO WORLD\nHERITAGE EXCAVATION SITE OF\nHUMOR THAT IS MY MONOLOGUE.\nBUT SOMETIMES, I DISTRACT AN\nORPHAN WITH A PIECE OF\nLINT-COVERED CANDY AND STEAL\nTHEIR BUCKET AND PAIL, THEN\nSNEAK INTO THE POTTER'S FIELD IN\nTHE DEAD OF NIGHT WITH TWO\nDRIFTERS I PICKED UP ON THE\nCOMMUTER TRAIN, AND FORCE THEM\nTO DIG FOR THE ABANDONED\nPAUPER'S GRAVE\nOF NEWS THAT IS MY SEGMENT:\nMEANWHILE!"
181
+ },
182
+ "XzJAtzdrY_w": {
183
+ "begin": "2:57.0",
184
+ "end": "4:32.0",
185
+ "text": "FOLKS, IF YOU WATCH THIS SHOW, YOU KNOW I SPEND MOST\nOF MY TIME, RIGHT OVER THERE,\nCAREFULLY COMBING THE NEWS\nLANDSCAPE AND HARVESTING THE\nFINEST, MOST BEAUTIFUL STORY\nPIGMENTS LIKE MALACHITE,\nAZURITE, AND CINNABAR, WHICH I\nSLOWLY GRIND UNDER A GLASS\nOF MULLER WITH ONLY THE MOST\nTOPICAL LINSEED OIL, WORKING\nTHEM INTO SMOOTH, BUTTERY\nVERMILLIONS, VERDIGRIS, AND NEW\nGAMBOGES, WHICH I THEN APPLY TO\nA GRISAILLE PREPARED ON A CANVAS\nOF FLAX, TOW, AND JUTE, SLOWLY\nWORKING UP THE SHADOW\nSHAPES AND MAJOR MASSES, THEN\nDELICATELY RENDERING THE\nINTERPLAY OF LIGHT AND FORM,\nBEFORE APPLYING THE FINE DAMMAR\nAND MASTIC VARNISH TO UNVEIL FOR\nYOU THE GLORIOUS REMBRANDT\nPORTRAIT OF THE DAY'S EVENTS\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES--\nSOMETIMES, FOLKS, SOMETIMES\nI'M SHAKEN AWAKE INSIDE THE\nDARKENED TRUNK OF A BULGARIAN\nMOBSTER'S VOLVO 940, I QUIETLY\nRELEASE THE SAFETY CATCH AND\nTUMBLE ONTO THE SIDE OF A DIRT\nROAD, BREAKING BOTH CLAVICLES,\nWHICH I DO NOT FEEL BECAUSE OF\nALL THE ANGEL DUST.\nI STAGGER INTO AN ABANDONED\nTANNERY WHERE I BEFRIEND AN OWL\nWHO TELLS ME TO I HAVE TO LET\nHIM SPEAK THROUGH ME OR HE'LL\nMURDER THE CLOUDS.\nAND IN HIS DIRECTION, I MIX THE\nFUN DIP I FOUND IN MY POCKET\nWITH THE FISTFULS OF HEXAVALENT\nCHROMIUM I SCOOP UP FROM THE\nDISUSED TANNING PITS, THEN HURL\nIT AT THE SIDE OF A NEARBY\nDEFUNCT DAIRY QUEEN IN A FUGUE\nSTATE OF LASHING OUT AT LIGHT\nAND COLOR, TO UNLEASH FOR YOU\nTHE ABSTRACT EXPRESSIONIST\nSPLATTER FRESCO OF NEWS THAT IS\nMY SEGMENT:\nMEANWHILE!"
186
+ },
187
+ "YyV6l8HPmdQ": {
188
+ "begin": "0:48.0",
189
+ "end": "1:32.0",
190
+ "text": "FOLKS, LADIES AND\nGENTLEMEN, YOU KNOW I SPEND A\nLOT\nOF TIME DELICATELY WHITTLING A\nMELANGE OF THE DAY'S MOST\nPRESSING STORY TIMBERS,\nPRECISELY MEASURING THE NECKS,\nRIBS, AND BACKS OF THE NEWS,\nEMPLOYING ONLY THE MOST\nSOPHISTICATED AND TOPICAL\nPURFLING, THEN LAYING 15\nEXQUISITE COATS OF INSIGHT ONTO\nTHE ORNATE YET ROBUST\nSTRADIVARIUS VIOLIN THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nI GATHER UP FRAYED ELECTRICAL\nWIRE FROM A BURNT-OUT BOWLING\nALLEY, TAPE IT TO A\nTERMITE-INFESTED 2-by-4, THEN SHOVE\nONE END TO A DISCARDED CHUM\nBUCKET TO MAKE FOR YOU THE APPALACHIAN\nDRIFTER'S BANJO OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE!\""
191
+ },
192
+ "a8DD__mRtPk": {
193
+ "begin": "1:13.0",
194
+ "end": "1:58.0",
195
+ "text": "FOLKS, YOU KNOW, I SPEND MOST OF\nMY TIME GATHERING FOR YOU THE LATEST,\nMOST CUTTING-EDGE NEWS STORIES,\nCAREFULLY EXAMINING THE DAY'S\nCT SCAN, THEN ASSEMBLING\nAMERICA'S CRACK MEDICAL TEAM,\nAND MAKING PRECISE INCISIONS\nWITH THE AID OF A STRYKER 1588\nLAPAROSCOPE IN THE\nGROUNDBREAKING SURGICAL ARTISTRY\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, WHEN I NEED A LITTLE EXTRA CASH TO\nPAY OFF MY COCK-FIGHTING DEBTS,\nI SET UP A RUSTY COT UNDER A\nTARP IN WASHINGTON SQUARE PARK,\nWHERE I PLY CURIOUS PASSERSBY\nWITH BATHTUB COUGH SYRUP TO HELP\nDULL THE PAIN WHILE I USE GARDEN\nSHEARS TO CUT OUT ANYTHING THAT\nLOOKS SUPERFLUOUS IN THE AMATEUR\nAPPENDECTOMY TENT OF NEWS\nTHAT IS MY SEGMENT...\nMEANWHILE!"
196
+ },
197
+ "cHhomJMwY1I": {
198
+ "begin": "2:42.0",
199
+ "end": "3:24.0",
200
+ "text": "FOLKS, I SPEND A LOT OF TIME\nSTANDING RIGHT OVER THERE,\nCOMBING THROUGH HOURS UPON HOURS\nOF GAME TAPE ON THE MOST PROMISING\nHEADLINES, METICULOUSLY CRAFTING\nMY BIG BOARD TO RANK STORIES\nBASED ON THEIR RAW TALENT AND\nINTANGIBLES, AND CUT DEALS FOR\nTHE MOST TOPICAL TRADES TO DRAFT\nTHE ONCE-IN-A-GENERATION,\nHEISMAN-WINNING QUARTERBACK THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES I\nWAKE UP IN AN ICE BATH AFTER\nDOING RAILS OF GATORADE POWDER,\nREALIZE IT'S DRAFT DAY, AND I\nHAVE 15 SECONDS LEFT TO MAKE A\nCHOICE AND BLURT OUT THE FIRST\nNAME I SEE TO WASTE THE NUMBER\nONE OVERALL PICK ON THE SCRAWNY,\nUNPOLISHED THIRD-STRING PUNTER\nOF NEWS THAT IS MY SEGMENT,\n\"MEANWHILE!\""
201
+ },
202
+ "hhwTiwUAaf8": {
203
+ "begin": "1:06.5",
204
+ "end": "1:54.5",
205
+ "text": "YOU KNOW, FOLKS, I SPEND MOST\nOF MY TIME SOURCING FOR YOU THE DAY'S\nFINEST HANGZHOU SILK NEWS\nSTORIES, MOUNTING THEM ON THE\nMOST TOPICAL, PREMIUM,\nPOLYHEDRAL BAMBOO JOKE FRAME,\nDECORATING IT WITH ARTISANAL ASH\nINK, INSERTING A HAND-POURED\nBEESWAX CANDLE, FILLING IT WITH\nINTENTION AND THEN SENDING IT ALOFT\nON THE UPDRAFT OF AUDIENCE\nLAUGHTER IN THE SPECTACULAR\nCHINESE LANTERN FESTIVAL THAT IS\nMY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS, I GO VISIT MY\nBUDDY, BARRACUDA AT THE\nABANDONED MALL, DROP A COUPLE OF\nHUNDOS ON SOME WET ROMAN\nCANDLES, BENT SPARKLERS SMUGGLED\nIN FROM THE PHILIPPINES, AND A\nFLARE GUN STOLEN FROM A CRASHED\nCOAST GUARD BOAT, SET IT\nALL OFF IN THE DERANGED,\nUNREGULATED FIREWORKS ACCIDENT\nOF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
206
+ },
207
+ "iB6diOGE8y4": {
208
+ "begin": "0:52.8",
209
+ "end": "1:50.8",
210
+ "text": "YOU KNOW, IF YOU WATCH THE SHOW, YOU KNOW I SPEND MOST OF MY TIME\nRIGHT OVER THERE, COMBING\nTHROUGH THE DAY'S BIGGEST NEWS,\nAND SELECTING FOR YOU THE\nFINEST, MOST TOPICAL INDIAN\nROSEWOOD, SPRUCE, AND MAHOGANY\nSTORIES.\nI THEN HAND-SHAPE AND COMBINE\nTHEM WITH AN ABALONE\nMULTI-STRIPE BACK INLAY, AND\nFORWARD-SHIFTED SCALLOPED\nBRACES, ANTIQUE WHITE BINDING,\nAND A HIGH-PERFORMANCE NECK WITH\nA HEXAGON FINGERBOARD, AND\nFINALLY LAY IN A\nTORTOISE-PATTERN PEARL PICK\nGUARD, AND A COMPENSATED\nBONE SADDLE, TO CRAFT FOR YOU\nTHE EXQUISITE MARTIN D-45\nDREADNOUGHT ACOUSTIC GUITAR\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS, I SNAP\nAWAKE IN A RUSTY COFFIN FREEZER\nBEHIND AN ABANDONED DAIRY QUEEN\nOUTSIDE OF GALVESTON.\nTHEN I NAIL A 2-BY-4 TO A CEDAR URN\nI STOLE FROM A FUNERAL PARLOR, STRING\nON SOME BRAKE CABLES I RIPPED\nOUT OF A COP CAR, THEN CUT EYE\nHOLES IN A GOODWILL BAG FOR A MASK,\nHIT A WHIPPET, AND TERRORIZE\nTHE LOCALS ON THE TEXAS CHAINSAW\nBANJO OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
211
+ },
212
+ "iQFwGF0aW-o": {
213
+ "begin": "2:10.7",
214
+ "end": "3:33.7",
215
+ "text": "I SPEND A LOT OF MY TIME, RIGHT\nOVER THERE, CULTIVATING FOR YOU THE\nDAY'S BIGGEST STORIES,\nPLUCKING THE MOST BEAUTIFUL AND\nTOPICAL NEWS VIOLETS AND\nMARIGOLDS, STRIPPING THE FRENCH\nLAVENDER FROM THE STEM AND\nLOVINGLY PRESSING THEM ALL\nBETWEEN THE PAGES OF A GILDED\nFIRST EDITION OF \"PRIDE AND\nPREJUDICE.\"\nTHEN I FOLD THEM INTO A DOUGH I\nHAND ROLLED FROM PASINI BAKERY\nFLOUR, BORDIER BUTTER, AND\nCHILLED SOMERDALE DEVON DOUBLE\nCREAM, SPRINKLE THEM WITH A\nPINCH OF MUSCOVADO SUGAR, AND\nBAKE THEM IN A \"LA CORNUE GRAND\nPALAIS\" RANGE TO PERFECTLY PREP\nTHE GOURMET FLORAL SHORTBREAD\nCOOKIE THAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, FOLKS, SOMETIMES,\nSOMETIMES, I AM NIBBLED AWAKE BY AN AMOROUS\nRACCOON IN THE ABANDONED WALK-IN\nFREEZER OF A HAUNTED BAKERY IN\nWHICH I HAVE ESTABLISHED\nSQUATTER'S RIGHTS, I SLIP INTO\nTHE TWISTED KITCHENAID PADDLES I\nCALL SHOES, AND KNIFE FIGHT A\nPOSSUM FOR AN EXPIRED BAG OF\nCRUSHED BREAKFAST CEREAL DUST\nAND A BROKEN EGG, WHICH I MIX\nWITH THREE SMUSHED RESTAURANT\nBUTTER PACKETS I STOLE FROM A\nNAPPING RETIREE'S PURSE, POUR\nTHE REST OF A SHATTERED BOTTLE\nOF RUBBING ALCOHOL I FOUND IN\nTHE DUMPSTER OUT BACK INTO A\nRUSTY BARREL TO IGNITE THE HOBO\nFIRE OVER WHICH I BAKE MY\nSLUDGE, THEN DISPLAY IT IN A\nFILTHY CHEF'S HAT TO SERVE YOU\nTHE DERANGED RAT KING BISCUIT OF\nNEWS THAT IS MY SEGMENT:\n\"MEANWHILE\"!"
216
+ },
217
+ "jIL7kvG7d10": {
218
+ "begin": "1:55.3",
219
+ "end": "2:40.3",
220
+ "text": "FOLKS, YOU KNOW, I SPEND A LOT\nOF TIME, RIGHT OVER THERE\nCAREFULLY PLANTING AND TENDING\nTO THE DAY'S BIGGEST, MOST\nIMPORTANT STORIES, TRIMMING\nTHE TOPICAL HEDGES WITH DELICATE\nEXACTITUDE, RESEARCHING AND\nSEEDING THE SOIL OF TODAY'S\nNEWS IN ORDER TO YIELD THE MOST\nBEAUTIFUL, FRAGRANT JOKE\nFLOWERS, AND PRECISELY TIMING\nTHE BLOOM IN THE EXQUISITE\nENGLISH GARDEN THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I DRIVE A 2003\nPONTIAC SUNFIRE THROUGH A HOME\nDEPOT GARDEN CENTER, DOWN A JUG\nOF MIRACLE GRO, SMEAR MY BODY IN\nMUD AND PEA GRAVEL, BUILD A FORT\nOUT OF PAVERS, PLOP A SUCCULENT\nDISH GARDEN ON MY HEAD, AND\nBARRICADE MYSELF INSIDE A\nPORTABLE TOOL SHED TO CREATE THE\nPARANOID BACKYARD STANDOFF OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
221
+ },
222
+ "jpq8eXZcvpo": {
223
+ "begin": "0:52.5",
224
+ "end": "1:37.5",
225
+ "text": "FOLKS, YOU KNOW, I SPEND A LOT OF MY TIME\nRIGHT OVER THERE, SORTING\nTHROUGH THE DAY'S TOP STORIES,\nCAREFULLY SELECTING FOR YOU THE\nFRESHEST, MOST TOPICAL\nNEWS-FRUIT, ARTFULLY CARVING IT\nINTO SATIRICAL SHAPES, DIPPING\nIT IN THE FINEST ARTISANAL\nCHOCOLATE, AND GENTLY PLACING THEM\nINTO THE FLAWLESSLY COMPOSED AND\nDELICIOUS EDIBLE ARRANGEMENT\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I WAKE UP FACE DOWN IN\nTHE RECYCLING BIN BEHIND A JAMBA\nJUICE, FIGHT A SEAGULL FOR THE\nDISCARDED CANTALOUPE RINDS\nAND PINEAPPLE STEMS, DIP THEM\nINTO A BUCKET OF DIESEL SIPHONED OFF\nFROM A SEMI FULL OF UNWASHED\nBIRD BONES, WHICH I USE TO\nSKEWER TOGETHER MY GARBAGE\nKEBABS, THEN STAB THEM ONTO A\nWATERLOGGED TEDDY BEAR TO CREATE\nTHE CRIMINALLY INSANE NIGHTMARE\nGIFT BASKET OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
226
+ },
227
+ "jq2LhJ9rMpg": {
228
+ "begin": "2:52.0",
229
+ "end": "3:34.0",
230
+ "text": "YOU KNOW, I SPEND A LOT OF TIME\nRIGHT OVER THERE, PULLING\nTOGETHER THE FINEST, NEWSIEST\nLIMESTONE, CHISELING IN THE MOST\nDELICATE AND TOPICAL OF\nBAS-RELIEF, AND THE MOST ORNATE\nARCHES, MAKING SURE THERE'S NARY\nA BAD SEAT IN THE HOUSE, THEN\nASSEMBLING THE MOST FEARSOME\nNEWS WARRIORS THE ARENA HAS EVER\nSEEN TO CONSTRUCT FOR YOU THE\nROMAN COLOSSEUM THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS, I WAKE UP MY\nNEIGHBOR AT 3:00 IN THE MORNING,\nDRAG HIM INTO MY SHED, WHERE\nI'VE SET UP A KIDDIE POOL I\nBOUGHT 20 YEARS AGO AND FILLED\nWITH EXPIRED JELL-O AND LUKEWARM\nBEER, HUFF SOME ACETONE OUT OF A\nPRICE CHOPPER BAG, THEN\nCHALLENGE HIM TO JOIN ME IN THE\nOLD MAN WRESTLING LEAGUE OF\nNEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
231
+ },
232
+ "lWyia3aF92o": {
233
+ "begin": "1:44.0",
234
+ "end": "2:30.0",
235
+ "text": "FRIENDS, EVERY NIGHT I STAND\nRIGHT OVER THERE, AND I CAREFULLY WORK ON THE LIGHTING\nAND STAGING OF THE DAY'S MOST\nTOPICAL NEWS STORIES, COMPOSING\nGROUNDBREAKING ORCHESTRAL\nARRANGEMENTS TO SUPPORT THEM,\nAND THEN METICULOUSLY CHOREOGRAPHING\nTHEM AND MYSELF INTO A DELICATE,\nHEARTBREAKING, AND YET UPLIFTING\nPAS DE DEUX, TO PRESENT FOR YOU\nTHE EPOCH-DEFINING BALLET THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I FISH A STAINED\nVELOUR JUMPSUIT OUT OF A CANAL,\nHOOK A RADIO I RIPPED OUT OF A\nGARBAGE TRUCK TO AN ABANDONED\nCAR BATTERY, AND SLAP THE DIAL\nTHROUGH FUZZ TILL IT LANDS ON A\nRANDOM A.M. OLDIES STATION, AND THEN\nSHAKE MY ASS FOR NICKELS IN THE\nDEMENTED VAGRANT MACARENA OF\nNEWS OF THAT IS MY SEGMENT:\nMEANWHILE!"
236
+ },
237
+ "ldTzn1RpsNY": {
238
+ "begin": "1:00.0",
239
+ "end": "1:48.0",
240
+ "text": "FOLKS, I SPEND A LOT OF TIME\nRIGHT OVER THERE, COMPILING THE\nMOST CURRENT GEOMETRY QUESTIONS,\nSPRINKLING IN A TOPICAL SET OF\nDATA ANALYSES, FOLDING THEM\nTOGETHER ALONG THE NEWSWORTHIEST\nWORD PROBLEMS, THEN PAIRING ALL\nOF THAT WITH THE DAY'S MOST\nPRESSING READING PASSAGES TO\nCOLLATE FOR YOU THE PERFECTLY\nCALIBRATED, BESPOKE S.A.T. TEST\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nI HUFF A PILE OF SALVIA AND\nSTAGGER INTO A LANDFILL WHERE I\nFORAGE FOR CRUSTY OLD SUDOKUS,\nGRAB A SACKFUL OF USED AND WET\nMADLIBS, AND CRAZY-GLUE THEM INTO\nTHE SPINE OF A DISCARDED\n\"READER'S DIGEST\" I FOUND IN A\nBURNT-OUT WALDENBOOKS, TO\nPRESENT TO YOU THE ILLEGIBLE\nHOBO BUZZFEED QUIZ OF NEWS THAT\nIS MY SEGMENT:\n\"MEANWHILE!\""
241
+ },
242
+ "lgH-itFA_hg": {
243
+ "begin": "1:53.2",
244
+ "end": "2:33.2",
245
+ "text": "FOLKS, I SPENT A LOT OF TIME\nSTANDING RIGHT OVER THERE, OKAY,\nSETTING\nUP MY NEWS EASEL, LAYING OUT THE\nMOST TOPICAL BRUSH STROKES,\nCHOOSING THE MOST RELEVANT\nCOLORS, ALL TO FAITHFULLY\nCAPTURE FOR YOU, THE SOUL OF THE\nSTORIES OF THE DAY IN THE\nOIL-ON-CANVAS MASTERPIECE THAT\nIS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES -- JUST SOMETIMES,\nFOLKS -- I SET A LIQUOR\nSTORE ON FIRE AND COME BACK THE\nNEXT DAY TO SCRAPE SOME CHARCOAL\nOFF THE BURNT TIMBERS, USE THE\nCARDBOARD FROM THE DISCARDED\nREFRIGERATOR BOX I'VE BEEN\nCALLING HOME FOR THE WEEKEND,\nTHEN HARASS TOURISTS TO ETCH THE\nOFFENSIVE BOARDWALK CARICATURE\nOF NEWS THAT IS MY SEGMENT:\nMEANWHILE!"
246
+ },
247
+ "ll5DeZrejsM": {
248
+ "begin": "2:02.7",
249
+ "end": "2:52.7",
250
+ "text": "YOU KNOW, FOLKS,\nIF YOU WATCH THE SHOW, YOU KNOW\nI SPEND A LOT OF TIME RIGHT OVER\nTHERE, COMBING THROUGH THE\nDAY'S BIG STORIES, SELECTING THE\nFINEST NEWS TENORS AND THE\nSILKIEST SOPRANOS.\nTHEN, I BRUSH UP ON THE WORKS OF\nCERVANTES AND FIND THE PERFECT\nSWEET SPOT BETWEEN DRAMA, HUMOR,\nAND OPERA TO COMPOSE FOR YOU THE\nTIMELESS AND SEDUCTIVE SPANISH\nZARZUELA THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nI WAKE UP IN THE FREEZER OF A\nCOMBINATION TACO BELL PIZZA\nHUT ON THE EXPRESSWAY, AND I CUT A\nPAIR OF LEG HOLES INTO A POTATO\nSACK AND RACE BAREFOOT INTO THE\nCITY TO BREAK INTO AN ABANDONED\nDOLLAR STORE, WHERE I FASHION A\nPAIR OF CASTANETS FROM DEFECTIVE\nCHATTERING TEETH TOYS.\nTHEN I DOWN A JERRY CAN FULL OF\nRED BULL AND COUGH MEDICINE\nBEFORE STAGGERING INTO A PUBLIC\nPARK TO DISTURB TOURISTS WITH\nTHE DRIFTER'S FLAMENCO SHOWCASE\nOF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
251
+ },
252
+ "lzviJMlii7A": {
253
+ "begin": "1:36.5",
254
+ "end": "3:21.5",
255
+ "text": "LADIES AND GENTLEMEN, YOU KNOW, IF YOU\nWATCH THIS SHOW, YOU KNOW I\nSPEND A LOT OF MY TIME RIGHT OVER\nTHERE, METICULOUSLY SIFTING\nTHROUGH THE DAILY NEWS\nDESERT, HARVESTING THE FINEST,\nMOST TOPICAL MINERAL SANDS--\nABOUT 65% SILICA, 10% FLUX\nOF SODIUM OXIDE, AND A\nSTABILIZER OF CALCIUM OXIDE--\nWHICH I THEN SMELT IN A\nHIGH-TEMPERATURE CALCERA FURNACE\nAT 1,200 TO 1,400 DEGREES\nCELSIUS, FUSING THEM INTO LIQUID\nGLASS, THEN CAREFULLY DROPPING\nMY FURNACE TEMPERATURE SO I CAN\nFOLD IN HAND-SELECTED CULLET AND\nCOBALT TO OBTAIN MY INTENDED\nCOLOR AND CREATE THE MOST\nPRISMATIC NEWS CRYSTALS, WHICH\nI THEN DELICATELY HANG ON A\nHAND-CRAFTED BALUSTER OF\nHEADLINES, ARRANGING THEM TO\nCATCH AND REFRACT, IN A GENTLE\nDANCE OF LIGHTS AND SHADOWS, THE\nMOST TOPICAL REFLECTIONS OF THE\nDAY, ADORNING THE ARRANGEMENT\nWITH ONE FINAL FINIAL OF QUIPS\nTO PRESENT TO YOU THE VENETIAN\nMURANO GLASS CHANDELIER THAT IS\nMY MONOLOGUE.\nBUT SOMETIMES FOLKS, SOMETIMES,\nFOLKS, SOMETIMES, I JOLT AWAKE\nNAKED IN THE BACK BOOTH OF A\nLONG-ABANDONED COUNTY FAIR, I\nPULL ON SOME OVERALLS I STOLE\nOFF A SCARECROW, AND CLAW\nTHROUGH THE GROUNDS SCRAPING THE\nLEAVINGS OFF SOME DISUSED\nSPARKLERS, PICK THROUGH BROKEN\nCOKE BOTTLES AND BIRTHDAY\nCANDLES, BOIL OFF THE REMNANTS\nIN A DISCARDED TUB OF LAUNDRY\nDETERGENT TO EXTRACT THE\nBENZENE.\nTHEN, USING THE SHOELACES I TOOK\nOFF A HOBO SLEEPING UNDER\nTHE FERRIS WHEEL AND DENTAL\nFLOSS CURRENTLY IN USE BY SAID\nHOBO, I BIND THE CONGLOMERATE\nOF SHARDS AND ACCELERANT\nTOGETHER AND HOLD IT NEAR THE\nSPUTTERING SPARK PLUG OF AN\nOLD ICE CREAM TRUCK TO IGNITE\nTHE CHAOTIC CANDELABRA OF\nFLAMING NEWS THAT IS MY SEGMENT:\n\"MEANWHILE\"!"
256
+ },
257
+ "okJDGV6Jjmc": {
258
+ "begin": "2:03.0",
259
+ "end": "2:55.0",
260
+ "text": "YOU KNOW, FOLKS, I SPEND A LOT\nOF TIME RIGHT OVER THERE, PORING\nOVER THE DAY'S NEWSIEST, MOST\nTOPICAL NAUTICAL RECORDS TO\nDETERMINE THE ROUGH POSITIONS OF\nTHE DAY'S TRENDING SHIPWRECKS.\nTHEN I USE THE LATEST SONAR TECH\nTO LOCATE AND FIND THE\nORIENTATION OF THE FINEST\nSALVAGE SITE.\nTHEN MY TEAM OF CERTIFIED AND\nLICENSED COMEDY DIVERS DESCEND\nTO THE OCEAN FLOOR AND USE\nCUTTING EDGE UNDERWATER CAMERAS\nTO STITCH TOGETHER THE DETAILED\n3D-MODELED HISTORIC ATOSHA\nSHIPWRECK SITE OF SATIRICAL\nOBSERVATIONS THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS,\nI WAKE UP NAKED BY THE DOCKS\nCOVERED IN PIRATE TATTOOS, BLAST\nA STRING OF WHIPPETS, THEN\nSTAGGER INTO THE FESTERING\nHUDSON RIVER, WHERE I SLOWLY\nSINK THROUGH THE MURK UNTIL I\nIMPALE MYSELF ON THE RUSTY AXLE\nOF A SUNKEN TAXI IN THE\nTETANUS-LACED CRIME SCENE OF\nNEWS THAT IS MY SEGMENT...\n\"MEANWHILE!\""
261
+ },
262
+ "pbR-kF0PjlA": {
263
+ "begin": "1:09.0",
264
+ "end": "2:43.0",
265
+ "text": "WELL FOLKS, I SPEND A LOT\nOF MY TIME, ON THE SHOW, RIGHT OVER THERE,\nWANDERING THROUGH THE\nFARMERS' MARKET THAT IS TODAY'S\nBIGGEST STORIES, SQUEEZING THE\nFINEST NEWS RADISHES, THE RIPEST\nSTORY PEPPERS, SNIFFING THE MOST\nTOPICAL DATES, WHICH I THEN PAIR\nWITH FRA'MANI SOPPRESSATA, AND\nTHE MOST SUCCULENT HANDRAISED\nPATA NEGRA JAMON IBERICO, BACKED\nUP BY GENEROUS HELPINGS OF\nBEEMSTER GOUDA, AND A WEDGE OF\nBRILLAT-SAVARIN TRIPLE CREAM\nBRIE, THEN I ADD FORTNUM AND MASON\nAPRICOT AND FIG SPREADS WITH\nGRISSINI BREADSTICKS AND LA\nPANZANELLA CROCCANTINI, AND\nFINALLY LIBERAL SPRINKLINGS OF\nSAN SABA ELLIOT PECANS AND\nSICILIAN CASTEL-VETRANO OLIVES\nON A RAW CARRARA MARBLE SLAB TO\nLAY OUT FOR YOU THE SPECTACULAR\nGOURMET CHARCUTERIE BOARD THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS, SOMETIMES, I AM HOSED AWAKE\nINSIDE AN EMPTY 6,000 GALLON\nDIESEL TANKER OFF OF I-24, WHERE I\nAM HIDING FROM A CULT THAT I\nSTARTED, THEN DASH, NAKED BEHIND\nA RECENTLY DEFUNCT QUIZNOS,\nWHERE I MUST WRESTLE A POSSUM\nFOR THE REMAINS OF A BAJA\nCHICKEN FOOTLONG, STAGGER INTO A\nMIDDLE SCHOOL REC. YARD AFTER\nFIGHTING A SEAGULL FOR THE LAST\nHAM CUBE IN A LUNCHABLES TRAY,\nPUNCH A RACCOON TO STEAL HIS\nPEANUT, THEN DUMP IT ALL INTO A\nHUBCAP I STRIPPED OFF AN\nABANDONED '76 CHEVY VEGA TO\nOFFER FOR YOU THE RAIL YARD BUFFET\nOF NEWS THAT IS MY SEGMENT:\nMEANWHILE!"
266
+ },
267
+ "pyhaU-_1Szk": {
268
+ "begin": "2:48.0",
269
+ "end": "4:31.0",
270
+ "text": "YOU KNOW FOLKS, I SPEND A LOT OF\nTIME RIGHT OVER THERE, ISOLATING\nTHE BIGGEST, NEWSIEST STORIES\nOF THE DAY AND CONTAINING THEM\nIN THE MOST TOPICAL CIRCULAR\nTUNNEL, WITH A CIRCUMFERENCE OF\n26.7 KILOMETERS, AND A DEPTH\nRANGING FROM 50 TO 175 METERS.\nTHEN, I ADD TWO ADJACENT\nPARALLEL BEAM-LINES, WHICH\nTRAVEL IN OPPOSITE DIRECTIONS\nAROUND THE RING, INTERSECTING AT\nFOUR POINTS.\nI ADD 1,232 DIPOLE MAGNETS TO\nKEEP THE BEAMS IN THEIR CIRCULAR\nPATH, WHILE AN ADDITIONAL 392\nQUADRUPOLE MAGNETS ARE USED TO\nKEEP THE BEAMS FOCUSED, THEN\nI ADD STRONGER QUADRUPOLE\nMAGNETS CLOSE TO THE\nINTERSECTION POINTS IN ORDER TO\nMAXIMIZE THE CHANCES OF\nINTERACTION BETWEEN THE TWO BEAMS\nCROSS, ALL SO I CAN SMASH JOKE\nPROTONS AGAINST EACH OTHER AT\nNEAR THE SPEED OF LIGHT TO\nGENERATE THE HIGGS BOSON HEAVY\nCOMEDY PARTICLES THAT MAKE UP MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, FOLKS, SOMETIMES \nSOMETIMES, SOMETIMES I WAKE UP IN AN\nABANDONED JUNKYARD, STRAPPED TO\nTHE CHASSIS OF WHAT USED TO BE A\nSCHOOL BUS.\nI GNAW MYSELF FREE OF MY\nRESTRAINTS AND CLIMB ATOP A HILL\nOF CRUSHED MAZDA MIATAS TO UTTER\nA CALL THAT CAN BE HEARD ONLY BY\nTHOSE IN THE MIDST OF A\nLIFE-CHANGING PEYOTE TRIP.\nWITH MY FREAKS GATHERED AROUND ME,\nHOTWIRE AS MANY BURNT-OUT\n'91 BUICK LESABRES AS WE CAN\nFIND TO ANIMATE A FLEET OF\nFURY-ROAD-WORTHY LEMONS,\nTHEN ROLL THEM TO THE ABANDONED\nSUBWAY STATION BELOW CITY HALL,\nWHERE I LAUNCH THEM HEAD ON AT\nTOP SPEED IN THE UNREGULATED\nHOTWHEELS COLOSSAL CRASH TRACK\nOF NEWS THAT IS MY SEGMENT:\nMEANWHILE!"
271
+ },
272
+ "q8zlh8XKfLc": {
273
+ "begin": "1:00.0",
274
+ "end": "1:54.0",
275
+ "text": "FOLKS, YOU KNOW, I\nSPEND MOST OF MY TIME, RIGHT\nOVER THERE, WITH MY EARS, MY MIND,\nAND MY HEART OPEN TO THE DAY'S\nBIGGEST STORIES, AUDITIONING AND\nSELECTING ONLY THE MOST TOPICAL\nNEWS-OBOES, THE MOST RELEVANT\nAND LILTING VIOLAS, ROUNDING IT\nOUT WITH SOME NOBLE FRENCH\nHORNS, AND INSOUCIANT\nBASSOONS, THEN COMPOSING AND\nARRANGING THE NEWSIEST, MOST\nUPLIFTING YET BITTERSWEET\nRONDOS, ALLEGROS, SCHERZOS, AND\nSONATAS TO PRESENT TO YOU THE\nTIMELESSLY MOVING YET\nINFORMATIVE POST-MODERN OPUS\nNUMBER ONE SYMPHONY THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS, I WAKE UP AT THE\nWHEEL OF A STOLEN CEMENT TRUCK,\nSNORT ANOTHER RAIL OF KETAMINE\nAND BATH SALTS, THEN I STRIP\nDOWN AND SCAMPER THROUGH A\nCEMETERY TRAPPING RATS UNDER\nRUSTY COFFEE CANS.\nAFTER AN IMPASSIONED SPEECH TO THEM\nABOUT THEIR NEED\nTO HELP ME SAVE AN OLD THEATER, THEY\nACCOMPANY ME ON A RAID TO A\nPRESCHOOL MUSIC ROOM TO STEAL\nITS FLUTES, RECORDERS, AND\nKAZOOS, WHERE I CONDUCT THE\nFUGITIVE VERMIN PHILHARMONIC OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!\n"
276
+ },
277
+ "qEY5SUevhgU": {
278
+ "begin": "1:59.0",
279
+ "end": "3:07.0",
280
+ "text": "FOLKS, IF YOU\nWATCH THIS SHOW, YOU KNOW I\nSPEND MUCH OF MY TIME,\nRIGHT OVER THERE, PLANTING AND\nGROWING THE DAY'S BIGGEST NEWS\nIN A PARCELED TERROIR AT\nPRECISELY 80 METERS, ON A\nNORTH-FACING SLOPE, WITH JUST\nTHE RIGHT MICROCLIMATE, THEN\nHAND-PICKING ONLY THE RIPEST,\nMOST TOPICAL BOTRYTIS-PRUNED\nSTORY GRAPES.\nAFTER THREE PRESSINGS, I THEN\nCAREFULLY BARREL-AGE THEIR\nNOBLE-ROTTED NECTAR FOR 30\nMONTHS EXCLUSIVELY IN NEW OAK BARRELS TO\nBRING OUT THE AROMAS OF TROPICAL\nFRUITS, HONEYED PEARS, AND\nROASTED NUTS IN THE CHATEAU\nD'YQUEM SAUTERNES THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nI WAKE UP IN A BULGARIAN PRISON,\nCONVICTED OF WHAT MY NON-ENGLISH\nSPEAKING, COURT-APPOINTED LAWYER\nONLY CALLS \"ANIMAL WRONGS.\"\nI TRADE THE CIGARETTES I WON IN\nA BARE-KNUCKLE MATCH WITH A\nGUARD FOR SOME FIG MARMALADE,\nAPPLE CORES, AND DISCARDED\nKETCHUP PACKETS, TOSS IT ALL IN\nTHE PLASTIC BAG I STOLE OFF A\nCELLMATE DRAGOMIR'S FOOT WHILE\nHE SLEPT, LEAVE IT UNDER A\nFERMENTING PIPE\nOVERNIGHT, TO SERVE UP THE\nSOUR-MASHED GOON PLONK OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
281
+ },
282
+ "r7NnpAGIkEY": {
283
+ "begin": "1:46.5",
284
+ "end": "2:32.5",
285
+ "text": "AND, YOU KNOW,\nFOLKS, I SPEND A LOT OF TIME ON THIS SHOW, RIGHT\nOVER THERE, CAREFULLY HARVESTING\nTHE HIGHEST-QUALITY ORGANIC ACAI\nNEWS BERRIES, PUTTING THEM INTO\nMY CURRENT EVENTS BLENDER, THEN\nPULSING ON HIGH UNTIL THEY'VE\nBECOME THE SMOOTH PURPLE PUREE OF\nSTORIES TO BE PILED WITH\nGRANOLA, CHIA SEEDS, AND SLICED\nJOKE BANANA, TO MAKE THE\nHIGH-PRICED, ARTISANAL SMOOTHIE\nBOWL OF NEWS THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES\nFOLKS, I LIKE TO SCROUNGE\nTOGETHER SOME EXPIRED KALE FROM\nTHE BACK OF THE FRIDGE, MIX IT\nWITH THE FERMENTING ORANGE\nSLICES LEFT IN THE BACK SEAT\nAFTER LAST WEEK'S LITTLE LEAGUE\nGAME, AND AN APPLE CORE I FOUND\nINSIDE A COFFEE CUP, THEN\nPULVERIZE IT ALL IN A LEAKY\nNUTRI-BULLET TO MAKE THE PRISON\nTOILET GREEN JUICE OF NEWS THAT\nIS MY SEGMENT:\n\"MEANWHILE!\""
286
+ },
287
+ "sKCeqiWA-gQ": {
288
+ "begin": "0:43.5",
289
+ "end": "1:36.5",
290
+ "text": "FOLKS, I SPEND A LOT OF TIME\nRIGHT OVER THERE, NIGHT AFTER NIGHT, COMBING\nTHROUGH THE DAY'S NEWS,\nCAREFULLY SELECTING THE MOST\nTOPICAL, FRAGRANT HERBS AND\nJOKE-RICH ALLIUM, DELICATELY\nSTIRRING THEM INTO A SATIRICAL\nSTOCK, BRINGING THE CONCOCTION\nTO A BREAKING NEWS BOIL BEFORE\nPAINSTAKINGLY EDITING AWAY THE\nSCRAPS, LEAVING ONLY THE PUREST,\nNUTRIENT-RICH CONSOMME OF COMEDY\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, I SWEAT MYSELF\nAWAKE INSIDE A DEFLATED BOUNCY\nCASTLE AT A DEFUNCT AMUSEMENT\nPARK, BREAK INTO A COMBINATION\nGAS STATION PIZZA HUT WHERE I\nTHROW TOGETHER WHATEVER OLD HOT\nDOGS AND REPURPOSED CHEESE\nPRODUCT I CAN GET MY CHAPPED AND\nCRACKING HANDS ON.\nAND THERE, BY THE CRUEL LIGHT OF\nA PIZZA WARMING DRAWER, I DROWN\nTHE MIXTURE IN A CAN OF\nDISCONTINUED SURGE FROM 2002\nBEFORE STRAINING IT THROUGH THE\nGREASE-SOILED BEARD NET TO\nCREATE THE FESTERING MOP BUCKET\nSOUR MASH OF NEWS THAT IS MY\nSEGMENT:\nMEANWHILE!"
291
+ },
292
+ "tSdWz6CvpIc": {
293
+ "begin": "1:50.0",
294
+ "end": "2:39.0",
295
+ "text": "FOLKS, YOU KNOW, IF YOU WATCH\nTHE SHOW, YOU KNOW I SPEND A\nLOT OF MY TIME, RIGHT OVER THERE,\nCAREFULLY WELDING TOGETHER THE\nDAY'S TOP STORIES, FORGED FROM\nTHE FINEST NEWS METALS, WIRING\nIN THE MOST EFFICIENT, HIGH-\nSPEED ENGINE.\nTHEN I COMBINE THE MOST TOPICAL\nTITANIUM ACCENTS WITH ITALIAN\nCURRENT-EVENTS CRAFTSMANSHIP TO\nCREATE THE BESPOKE TRIUMPH\nMOTORCYCLE THAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS,\nAFTER STANDING TOO LONG OVER AN\nEPHEDRINE BARREL FIRE, I STUMBLE\nINTO A DEFUNCT BODY SHOP, SLAP A\nHALF-EMPTY CANISTER OF PROPANE\nONTO A STOLEN HUFFY, WRAP IT IN\nNEWSPAPER AND BITS OF CAUTION\nTAPE THAT I SWIPED FROM A\nSTILL-ACTIVE CRIME SCENE, AND\nHOOK IT UP TO AN OLD MILK JUG\nFULL OF NITROUS I STOLE FROM A\nBLACK MARKET ORTHODONTIST, IN\nORDER TO MAKE THE FLAMING DEATH\nROCKET OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
296
+ },
297
+ "thFDua8MF_w": {
298
+ "begin": "1:21.5",
299
+ "end": "2:18.5",
300
+ "text": "FOLKS, I SPEND A\nLOT OF TIME\nRIGHT OVER THERE, COMBING\nTHROUGH THE DAY'S NEWS AND\nCAREFULLY SELECTING THE MOST\nPRISTINE OPALESCENT GLASS\nSTORIES, ORNATELY FUSING THE\nPIECES USING ONLY THE MOST\nTOPICAL COPPER WIRE AND LEAD\nCASING BEFORE COLORING THEM WITH\nTHE MOST PIGMENT-RICH JOKES\nAVAILABLE TO CONSTRUCT FOR YOU AND YOU ALONE\nTHE ELEGANT STAINED-GLASS\nTIFFANY DOME THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I JOLT AWAKE\nBEHIND THE WHEEL OF A '79 BUICK\nREGAL LOWRIDER WHILE DOING\nDONUTS IN THE PARKING LOT OF A\nBOARDED UP JOANNE FABRICS, WHEN\nI CLIP A BARREL FIRE AND I'M\nTHROWN FROM THE CAR INTO THE\nDUMPSTERS.\nTHERE, I RUMMAGE THROUGH THE\nBITS OF BROKEN FANTA BOTTLES,\nAND GLUE THEM TOGETHER WITH\nSTILL-WARM CHEWING GUM, AND\nSTAIN THEM WITH WHATEVER\nREMNANTS I CAN SCRAPE FROM OLD\nKETCHUP AND FUN-DIP PACKETS.\nTHEN I DOUSE MY PANTS IN\nKEROSENE AND LET HER BLAZE TO\nPROJECT THE DEMENTED NIGHTMARE\nKALEIDOSCOPE OF NEWS THAT IS MY\nSEGMENT:\nMEANWHILE"
301
+ },
302
+ "u9oMwS3I12s": {
303
+ "begin": "0:53.0",
304
+ "end": "1:47.0",
305
+ "text": "FOLKS, YOU KNOW, IF YOU WATCH THE SHOW, YOU KNOW I SPEND A LOT OF\nMY TIME RIGHT OVER THERE,\nWORKING THE OLD MIRTH KILN,\nMELTING DOWN THE DAY'S MOST\nIMPORTANT GOLDEN STORY INGOTS\nTO MAKE AN ORNATE SET OF CUSTOM\nNEWS VAMBRACES.\nTHEN CARVE A MOULDED CUIRASS\nINTO THE MOST TOPICAL\nANIMAL-THEMED CHEST PLATE THAT I\nDECORATE WITH FINE FILIGREE AND\nORNATE PRE-COLUMBIAN PATTERNS TO\nCREATE THE BESPOKE SET OF GOLD\nMUISCA ARMOR THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I WAKE UP IN THE BASEMENT\nOF A DERELICT ROW HOUSE DURING A\nFULL MOON, RIFLE THROUGH A\nDISCARDED BOX OF ELBOW PASTA AND\nSTRING SOME NOODLES TOGETHER\nINTO CRUDE SHIN GUARDS WITH\nCOPPER WIRE I STRIPPED OUT OF\nTHE FUNERAL HOME I ROBBED\nEARLIER.\nTHEN, I FASHION A HAT BY\nSTAPLING OLD NEWSPAPER CLIPPINGS\nTO A BIKE HELMET AND WRAP MYSELF\nIN A TARP I SWIPED FROM THE\nRETIREMENT HOME'S HOT TUB TO\nFROLIC BEFORE YOU IN THE\nMADMAN'S HAZMAT SUIT OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
306
+ },
307
+ "z2dPp5yM-NA": {
308
+ "begin": "1:34.3",
309
+ "end": "2:34.3",
310
+ "text": "YOU KNOW, FOLKS, IF YOU WATCH\nTHIS SHOW, AND I HOPE YOU DO...\nTHEN YOU KNOW I SPEND MOST OF MY TIME\nRIGHT OVER THERE, CAREFULLY\nUNWRAPPING THE DAY'S NEWS,\nPLACING THE FINEST, MOST TOPICAL\nORNAMENTS UPON THE HAND-CUT\nDOUGLAS FIR OF THE DAY'S TOP\nSTORIES, SPRINKLING\nBIODEGRADABLE TINSEL ON THE\nBOUGHS WITH PRECISION AND\nDECADES OF TRAINING THAT COMES\nACROSS AS EFFORTLESS, CHECKING\nEACH JOKE BULB FOR THE OPTIMAL\nTWINKLE, AND FINALLY TOPPING IT\nOFF WITH A FAMILY HEIRLOOM STAR\nTO CREATE THE MAGICAL CHRISTMAS\nMEMORY THAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nI BREAK INTO A HOME DEPOT, HUFF\nA BOTTLE OF GOO GONE, STEAL A\nPALLET OF TWO BY FOURS, A PILE\nOF RUSTY NAILS, A BUCKET OF\nDISCONTINUED FAUCET PARTS, SLAP\nTHEM TOGETHER WITH A RECEIPT PAPER\nAND INDUSTRIAL ADHESIVE, PUT IT\nOUTSIDE THE LIVING ROOM WINDOW\nOF THE RETIREMENT HOME I WAS\nKICKED OUT OF FOR A VERY GOOD\nREASON, AND THROW IN A COUPLE OF\nMANNEQUINS STOLEN FROM A BURNED\nOUT FOREVER 21 TO CREATE THE\nHELLSCAPE CRECHE OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE\"!"
311
+ },
312
+ "zFRXCwdPD-M": {
313
+ "begin": "1:31.6",
314
+ "end": "2:10.6",
315
+ "text": "YOU KNOW, FOLKS, I SPEND A LOT\nOF MY TIME ON THE SHOW RIGHT OVER THERE PRECISELY\nMEASURING THE NEWS' INSEAM,\nSELECTING THE FINEST, MOST\nTOPICAL IMPORTED MERINO STORY\nWOOL, THEN HAND-STITCHING IT\nWITH JOKES TO CREATE FOR YOU THE\nBESPOKE, DOUBLE-BREASTED SAVILE\nROW CURRENT EVENT SUIT THAT IS\nMY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nI LIKE TO GATHER UP SOME USED\nBURLAP FROM BEHIND THE\nMEAT-PACKING PLANT, DRAPE IT\nOVER AN ABANDONED MANNEQUIN AT\nOLD MAN JENKINS' BURNED-DOWN\nDRESS FACTORY, AND SEW IT\nTOGETHER WITH SHOESTRINGS AND A\nSTAPLE GUN, TO CREATE FOR YOU\nTHE HAUNTED POTATO-SACK\nSCARECROW OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
316
+ },
317
+ "zIS1lp9CS-E": {
318
+ "begin": "1:16.5",
319
+ "end": "2:04.5",
320
+ "text": "FOLKS, I SPEND A\nLOT OF TIME RIGHT OVER THERE,\nSELECTING THE FINEST GRAINS OF\nNEWS, HAND SIFTING THROUGH\nBARRELS OF STEELCUT JOKE OATS,\nSELECTING THE RIPEST SEASONAL\nSTORY BERRIES, AND\nHOME-FERMENTING MACROBIOTIC\nALMOND MILK INTO THE UPSCALE ORGANIC\nYOGURT TO LOVINGLY FOLD TOGETHER\nTHE BUZZWORTHY BREAKFAST PARFAIT\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES I SWEAT MYSELF\nAWAKE IN THE MIDDLE OF THE\nNIGHT, CREEP OUT TO AN ABANDONED\nSCHOOLYARD, SCRAPE A BUNCH OF\nSPILLED CHEERIOS AND DISCARDED\nGOGURT WRAPPERS INTO A BIG GULP\nTRAVEL MUG I WON IN THE GREAT\nTRUCKER-DRIFTER WARS OF 2019,\nADD SOME FERMENTED CRABAPPLES\nFROM BEHIND THE SWING SET, AND\nCHUG BACK THE FETID PRISON\nPORRIDGE OF NEWS THAT IS MY\nSEGMENT:\nMEANWHILE!"
321
+ }
322
+ }
whisper/language-breakdown.svg ADDED
whisper/model-card.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Card: Whisper
2
+
3
+ This is the official codebase for running the automatic speech recognition (ASR) models (Whisper models) trained and released by OpenAI.
4
+
5
+ Following [Model Cards for Model Reporting (Mitchell et al.)](https://arxiv.org/abs/1810.03993), we're providing some information about the automatic speech recognition model. More information on how these models were trained and evaluated can be found [in the paper](https://arxiv.org/abs/2212.04356).
6
+
7
+
8
+ ## Model Details
9
+
10
+ The Whisper models are trained for speech recognition and translation tasks, capable of transcribing speech audio into text in the language it is spoken (ASR) as well as translating it into English (speech translation). Researchers at OpenAI developed the models to study the robustness of speech processing systems trained under large-scale weak supervision. There are 9 models of different sizes and capabilities, summarized in the following table.
11
+
12
+ | Size | Parameters | English-only model | Multilingual model |
13
+ |:------:|:----------:|:------------------:|:------------------:|
14
+ | tiny | 39 M | ✓ | ✓ |
15
+ | base | 74 M | ✓ | ✓ |
16
+ | small | 244 M | ✓ | ✓ |
17
+ | medium | 769 M | ✓ | ✓ |
18
+ | large | 1550 M | | ✓ |
19
+
20
+ In December 2022, we [released an improved large model named `large-v2`](https://github.com/openai/whisper/discussions/661).
21
+
22
+
23
+ ### Release date
24
+
25
+ September 2022 (original series) and December 2022 (`large-v2`)
26
+
27
+ ### Model type
28
+
29
+ Sequence-to-sequence ASR (automatic speech recognition) and speech translation model
30
+
31
+ ### Paper & samples
32
+
33
+ [Paper](https://arxiv.org/abs/2212.04356) / [Blog](https://openai.com/blog/whisper)
34
+
35
+
36
+ ## Model Use
37
+
38
+ ### Evaluated Use
39
+
40
+ The primary intended users of these models are AI researchers studying the robustness, generalization, capabilities, biases, and constraints of the current model. However, Whisper is also potentially quite useful as an ASR solution for developers, especially for English speech recognition. We recognize that once models are released, it is impossible to restrict access to only “intended” uses or to draw reasonable guidelines around what is or is not research.
41
+
42
+ The models are primarily trained and evaluated on ASR and speech translation to English tasks. They show strong ASR results in ~10 languages. They may exhibit additional capabilities, particularly if fine-tuned on certain tasks like voice activity detection, speaker classification, or speaker diarization but have not been robustly evaluated in these areas. We strongly recommend that users perform robust evaluations of the models in a particular context and domain before deploying them.
43
+
44
+ In particular, we caution against using Whisper models to transcribe recordings of individuals taken without their consent or purporting to use these models for any kind of subjective classification. We recommend against use in high-risk domains like decision-making contexts, where flaws in accuracy can lead to pronounced flaws in outcomes. The models are intended to transcribe and translate speech; use of the model for classification is not only not evaluated but also not appropriate, particularly to infer human attributes.
45
+
46
+
47
+ ## Training Data
48
+
49
+ The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
50
+
51
+ As discussed in [the accompanying paper](https://arxiv.org/abs/2212.04356), we see that performance on transcription in a given language is directly correlated with the amount of training data we employ in that language.
52
+
53
+
54
+ ## Performance and Limitations
55
+
56
+ Our studies show that, over many existing ASR systems, the models exhibit improved robustness to accents, background noise, and technical language, as well as zero-shot translation from multiple languages into English; and that accuracy on speech recognition and translation is near the state-of-the-art level.
57
+
58
+ However, because the models are trained in a weakly supervised manner using large-scale noisy data, the predictions may include texts that are not actually spoken in the audio input (i.e. hallucination). We hypothesize that this happens because, given their general knowledge of language, the models combine trying to predict the next word in audio with trying to transcribe the audio itself.
59
+
60
+ Our models perform unevenly across languages, and we observe lower accuracy on low-resource and/or low-discoverability languages or languages where we have less training data. The models also exhibit disparate performance on different accents and dialects of particular languages, which may include a higher word error rate across speakers of different genders, races, ages, or other demographic criteria. Our full evaluation results are presented in [the paper accompanying this release](https://arxiv.org/abs/2212.04356).
61
+
62
+ In addition, the sequence-to-sequence architecture of the model makes it prone to generating repetitive texts, which can be mitigated to some degree by beam search and temperature scheduling but not perfectly. Further analysis of these limitations is provided in [the paper](https://arxiv.org/abs/2212.04356). It is likely that this behavior and hallucinations may be worse in lower-resource and/or lower-discoverability languages.
63
+
64
+
65
+ ## Broader Implications
66
+
67
+ We anticipate that Whisper models’ transcription capabilities may be used for improving accessibility tools. While Whisper models cannot be used for real-time transcription out of the box, their speed and size suggest that others may be able to build applications on top of them that allow for near-real-time speech recognition and translation. The real value of beneficial applications built on top of Whisper models suggests that the disparate performance of these models may have real economic implications.
68
+
69
+ There are also potential dual-use concerns that come with releasing Whisper. While we hope the technology will be used primarily for beneficial purposes, making ASR technology more accessible could enable more actors to build capable surveillance technologies or scale up existing surveillance efforts, as the speed and accuracy allow for affordable automatic transcription and translation of large volumes of audio communication. Moreover, these models may have some capabilities to recognize specific individuals out of the box, which in turn presents safety concerns related both to dual use and disparate performance. In practice, we expect that the cost of transcription is not the limiting factor of scaling up surveillance projects.
whisper/notebooks/LibriSpeech.ipynb ADDED
@@ -0,0 +1,958 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "v5hvo8QWN-a9"
7
+ },
8
+ "source": [
9
+ "# Installing Whisper\n",
10
+ "\n",
11
+ "The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results."
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 1,
17
+ "metadata": {
18
+ "id": "ZsJUxc0aRsAf"
19
+ },
20
+ "outputs": [],
21
+ "source": [
22
+ "! pip install git+https://github.com/openai/whisper.git\n",
23
+ "! pip install jiwer"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "metadata": {
29
+ "id": "1IMEkgyagYto"
30
+ },
31
+ "source": [
32
+ "# Loading the LibriSpeech dataset\n",
33
+ "\n",
34
+ "The following will load the test-clean split of the LibriSpeech corpus using torchaudio."
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 2,
40
+ "metadata": {
41
+ "id": "3CqtR2Fi5-vP"
42
+ },
43
+ "outputs": [],
44
+ "source": [
45
+ "import os\n",
46
+ "import numpy as np\n",
47
+ "\n",
48
+ "try:\n",
49
+ " import tensorflow # required in Colab to avoid protobuf compatibility issues\n",
50
+ "except ImportError:\n",
51
+ " pass\n",
52
+ "\n",
53
+ "import torch\n",
54
+ "import pandas as pd\n",
55
+ "import whisper\n",
56
+ "import torchaudio\n",
57
+ "\n",
58
+ "from tqdm.notebook import tqdm\n",
59
+ "\n",
60
+ "\n",
61
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\""
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 3,
67
+ "metadata": {
68
+ "id": "GuCCB2KYOJCE"
69
+ },
70
+ "outputs": [],
71
+ "source": [
72
+ "class LibriSpeech(torch.utils.data.Dataset):\n",
73
+ " \"\"\"\n",
74
+ " A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.\n",
75
+ " It will drop the last few seconds of a very small portion of the utterances.\n",
76
+ " \"\"\"\n",
77
+ " def __init__(self, split=\"test-clean\", device=DEVICE):\n",
78
+ " self.dataset = torchaudio.datasets.LIBRISPEECH(\n",
79
+ " root=os.path.expanduser(\"~/.cache\"),\n",
80
+ " url=split,\n",
81
+ " download=True,\n",
82
+ " )\n",
83
+ " self.device = device\n",
84
+ "\n",
85
+ " def __len__(self):\n",
86
+ " return len(self.dataset)\n",
87
+ "\n",
88
+ " def __getitem__(self, item):\n",
89
+ " audio, sample_rate, text, _, _, _ = self.dataset[item]\n",
90
+ " assert sample_rate == 16000\n",
91
+ " audio = whisper.pad_or_trim(audio.flatten()).to(self.device)\n",
92
+ " mel = whisper.log_mel_spectrogram(audio)\n",
93
+ " \n",
94
+ " return (mel, text)"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 4,
100
+ "metadata": {
101
+ "id": "-YcRU5jqNqo2"
102
+ },
103
+ "outputs": [],
104
+ "source": [
105
+ "dataset = LibriSpeech(\"test-clean\")\n",
106
+ "loader = torch.utils.data.DataLoader(dataset, batch_size=16)"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "markdown",
111
+ "metadata": {
112
+ "id": "0ljocCNuUAde"
113
+ },
114
+ "source": [
115
+ "# Running inference on the dataset using a base Whisper model\n",
116
+ "\n",
117
+ "The following will take a few minutes to transcribe all utterances in the dataset."
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 5,
123
+ "metadata": {
124
+ "colab": {
125
+ "base_uri": "https://localhost:8080/"
126
+ },
127
+ "id": "_PokfNJtOYNu",
128
+ "outputId": "2c53ec44-bc93-4107-b4fa-214e3f71fe8e"
129
+ },
130
+ "outputs": [
131
+ {
132
+ "name": "stdout",
133
+ "output_type": "stream",
134
+ "text": [
135
+ "Model is English-only and has 71,825,408 parameters.\n"
136
+ ]
137
+ }
138
+ ],
139
+ "source": [
140
+ "model = whisper.load_model(\"base.en\")\n",
141
+ "print(\n",
142
+ " f\"Model is {'multilingual' if model.is_multilingual else 'English-only'} \"\n",
143
+ " f\"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters.\"\n",
144
+ ")"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 6,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "# predict without timestamps for short-form transcription\n",
154
+ "options = whisper.DecodingOptions(language=\"en\", without_timestamps=True)"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 7,
160
+ "metadata": {
161
+ "colab": {
162
+ "base_uri": "https://localhost:8080/",
163
+ "height": 49,
164
+ "referenced_widgets": [
165
+ "09a29a91f58d4462942505a3cc415801",
166
+ "83391f98a240490987c397048fc1a0d4",
167
+ "06b9aa5f49fa44ba8c93b647dc7db224",
168
+ "da9c231ee67047fb89073c95326b72a5",
169
+ "48da931ebe7f4fd299f8c98c7d2460ff",
170
+ "7a901f447c1d477bb49f954e0feacedd",
171
+ "39f5a6ae8ba74c8598f9c6d5b8ad2d65",
172
+ "a0d10a42c753453283e5219c22239337",
173
+ "09f4cb79ff86465aaf48b0de24869af9",
174
+ "1b9cecf5b3584fba8258a81d4279a25b",
175
+ "039b53f2702c4179af7e0548018d0588"
176
+ ]
177
+ },
178
+ "id": "7OWTn_KvNk59",
179
+ "outputId": "a813a792-3c91-4144-f11f-054fd6778023"
180
+ },
181
+ "outputs": [
182
+ {
183
+ "data": {
184
+ "application/vnd.jupyter.widget-view+json": {
185
+ "model_id": "9df048b46f764cf68cbe0045b8ff73a8",
186
+ "version_major": 2,
187
+ "version_minor": 0
188
+ },
189
+ "text/plain": [
190
+ " 0%| | 0/164 [00:00<?, ?it/s]"
191
+ ]
192
+ },
193
+ "metadata": {},
194
+ "output_type": "display_data"
195
+ }
196
+ ],
197
+ "source": [
198
+ "hypotheses = []\n",
199
+ "references = []\n",
200
+ "\n",
201
+ "for mels, texts in tqdm(loader):\n",
202
+ " results = model.decode(mels, options)\n",
203
+ " hypotheses.extend([result.text for result in results])\n",
204
+ " references.extend(texts)"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 8,
210
+ "metadata": {
211
+ "colab": {
212
+ "base_uri": "https://localhost:8080/",
213
+ "height": 424
214
+ },
215
+ "id": "4nTyynELQ42j",
216
+ "outputId": "1c72d25a-3e87-4c60-a8d1-1da9d2f73bd7"
217
+ },
218
+ "outputs": [
219
+ {
220
+ "data": {
221
+ "text/html": [
222
+ "<div>\n",
223
+ "<style scoped>\n",
224
+ " .dataframe tbody tr th:only-of-type {\n",
225
+ " vertical-align: middle;\n",
226
+ " }\n",
227
+ "\n",
228
+ " .dataframe tbody tr th {\n",
229
+ " vertical-align: top;\n",
230
+ " }\n",
231
+ "\n",
232
+ " .dataframe thead th {\n",
233
+ " text-align: right;\n",
234
+ " }\n",
235
+ "</style>\n",
236
+ "<table border=\"1\" class=\"dataframe\">\n",
237
+ " <thead>\n",
238
+ " <tr style=\"text-align: right;\">\n",
239
+ " <th></th>\n",
240
+ " <th>hypothesis</th>\n",
241
+ " <th>reference</th>\n",
242
+ " </tr>\n",
243
+ " </thead>\n",
244
+ " <tbody>\n",
245
+ " <tr>\n",
246
+ " <th>0</th>\n",
247
+ " <td>He hoped there would be stew for dinner, turni...</td>\n",
248
+ " <td>HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...</td>\n",
249
+ " </tr>\n",
250
+ " <tr>\n",
251
+ " <th>1</th>\n",
252
+ " <td>Stuffered into you, his belly counseled him.</td>\n",
253
+ " <td>STUFF IT INTO YOU HIS BELLY COUNSELLED HIM</td>\n",
254
+ " </tr>\n",
255
+ " <tr>\n",
256
+ " <th>2</th>\n",
257
+ " <td>After early nightfall the yellow lamps would l...</td>\n",
258
+ " <td>AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...</td>\n",
259
+ " </tr>\n",
260
+ " <tr>\n",
261
+ " <th>3</th>\n",
262
+ " <td>Hello Bertie, any good in your mind?</td>\n",
263
+ " <td>HELLO BERTIE ANY GOOD IN YOUR MIND</td>\n",
264
+ " </tr>\n",
265
+ " <tr>\n",
266
+ " <th>4</th>\n",
267
+ " <td>Number 10. Fresh Nelly is waiting on you. Good...</td>\n",
268
+ " <td>NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...</td>\n",
269
+ " </tr>\n",
270
+ " <tr>\n",
271
+ " <th>...</th>\n",
272
+ " <td>...</td>\n",
273
+ " <td>...</td>\n",
274
+ " </tr>\n",
275
+ " <tr>\n",
276
+ " <th>2615</th>\n",
277
+ " <td>Oh, to shoot my soul's full meaning into futur...</td>\n",
278
+ " <td>OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...</td>\n",
279
+ " </tr>\n",
280
+ " <tr>\n",
281
+ " <th>2616</th>\n",
282
+ " <td>Then I, long tried by natural ills, received t...</td>\n",
283
+ " <td>THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...</td>\n",
284
+ " </tr>\n",
285
+ " <tr>\n",
286
+ " <th>2617</th>\n",
287
+ " <td>I love thee freely as men strive for right. I ...</td>\n",
288
+ " <td>I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...</td>\n",
289
+ " </tr>\n",
290
+ " <tr>\n",
291
+ " <th>2618</th>\n",
292
+ " <td>I love thee with the passion put to use, in my...</td>\n",
293
+ " <td>I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...</td>\n",
294
+ " </tr>\n",
295
+ " <tr>\n",
296
+ " <th>2619</th>\n",
297
+ " <td>I love thee with the love I seemed to lose wit...</td>\n",
298
+ " <td>I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ...</td>\n",
299
+ " </tr>\n",
300
+ " </tbody>\n",
301
+ "</table>\n",
302
+ "<p>2620 rows × 2 columns</p>\n",
303
+ "</div>"
304
+ ],
305
+ "text/plain": [
306
+ " hypothesis \\\n",
307
+ "0 He hoped there would be stew for dinner, turni... \n",
308
+ "1 Stuffered into you, his belly counseled him. \n",
309
+ "2 After early nightfall the yellow lamps would l... \n",
310
+ "3 Hello Bertie, any good in your mind? \n",
311
+ "4 Number 10. Fresh Nelly is waiting on you. Good... \n",
312
+ "... ... \n",
313
+ "2615 Oh, to shoot my soul's full meaning into futur... \n",
314
+ "2616 Then I, long tried by natural ills, received t... \n",
315
+ "2617 I love thee freely as men strive for right. I ... \n",
316
+ "2618 I love thee with the passion put to use, in my... \n",
317
+ "2619 I love thee with the love I seemed to lose wit... \n",
318
+ "\n",
319
+ " reference \n",
320
+ "0 HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP... \n",
321
+ "1 STUFF IT INTO YOU HIS BELLY COUNSELLED HIM \n",
322
+ "2 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L... \n",
323
+ "3 HELLO BERTIE ANY GOOD IN YOUR MIND \n",
324
+ "4 NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ... \n",
325
+ "... ... \n",
326
+ "2615 OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE... \n",
327
+ "2616 THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE... \n",
328
+ "2617 I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L... \n",
329
+ "2618 I LOVE THEE WITH THE PASSION PUT TO USE IN MY ... \n",
330
+ "2619 I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ... \n",
331
+ "\n",
332
+ "[2620 rows x 2 columns]"
333
+ ]
334
+ },
335
+ "execution_count": 8,
336
+ "metadata": {},
337
+ "output_type": "execute_result"
338
+ }
339
+ ],
340
+ "source": [
341
+ "data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))\n",
342
+ "data"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "markdown",
347
+ "metadata": {
348
+ "id": "HPppEJRXX4ox"
349
+ },
350
+ "source": [
351
+ "# Calculating the word error rate\n",
352
+ "\n",
353
+ "Now, we use our English normalizer implementation to standardize the transcription and calculate the WER."
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": 9,
359
+ "metadata": {
360
+ "id": "dl-KBDflMhrg"
361
+ },
362
+ "outputs": [],
363
+ "source": [
364
+ "import jiwer\n",
365
+ "from whisper.normalizers import EnglishTextNormalizer\n",
366
+ "\n",
367
+ "normalizer = EnglishTextNormalizer()"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": 10,
373
+ "metadata": {
374
+ "colab": {
375
+ "base_uri": "https://localhost:8080/",
376
+ "height": 641
377
+ },
378
+ "id": "6-O048q4WI4o",
379
+ "outputId": "f2089bc9-f535-441e-f192-26e52ae82b5e"
380
+ },
381
+ "outputs": [
382
+ {
383
+ "data": {
384
+ "text/html": [
385
+ "<div>\n",
386
+ "<style scoped>\n",
387
+ " .dataframe tbody tr th:only-of-type {\n",
388
+ " vertical-align: middle;\n",
389
+ " }\n",
390
+ "\n",
391
+ " .dataframe tbody tr th {\n",
392
+ " vertical-align: top;\n",
393
+ " }\n",
394
+ "\n",
395
+ " .dataframe thead th {\n",
396
+ " text-align: right;\n",
397
+ " }\n",
398
+ "</style>\n",
399
+ "<table border=\"1\" class=\"dataframe\">\n",
400
+ " <thead>\n",
401
+ " <tr style=\"text-align: right;\">\n",
402
+ " <th></th>\n",
403
+ " <th>hypothesis</th>\n",
404
+ " <th>reference</th>\n",
405
+ " <th>hypothesis_clean</th>\n",
406
+ " <th>reference_clean</th>\n",
407
+ " </tr>\n",
408
+ " </thead>\n",
409
+ " <tbody>\n",
410
+ " <tr>\n",
411
+ " <th>0</th>\n",
412
+ " <td>He hoped there would be stew for dinner, turni...</td>\n",
413
+ " <td>HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...</td>\n",
414
+ " <td>he hoped there would be stew for dinner turnip...</td>\n",
415
+ " <td>he hoped there would be stew for dinner turnip...</td>\n",
416
+ " </tr>\n",
417
+ " <tr>\n",
418
+ " <th>1</th>\n",
419
+ " <td>Stuffered into you, his belly counseled him.</td>\n",
420
+ " <td>STUFF IT INTO YOU HIS BELLY COUNSELLED HIM</td>\n",
421
+ " <td>stuffered into you his belly counseled him</td>\n",
422
+ " <td>stuff it into you his belly counseled him</td>\n",
423
+ " </tr>\n",
424
+ " <tr>\n",
425
+ " <th>2</th>\n",
426
+ " <td>After early nightfall the yellow lamps would l...</td>\n",
427
+ " <td>AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...</td>\n",
428
+ " <td>after early nightfall the yellow lamps would l...</td>\n",
429
+ " <td>after early nightfall the yellow lamps would l...</td>\n",
430
+ " </tr>\n",
431
+ " <tr>\n",
432
+ " <th>3</th>\n",
433
+ " <td>Hello Bertie, any good in your mind?</td>\n",
434
+ " <td>HELLO BERTIE ANY GOOD IN YOUR MIND</td>\n",
435
+ " <td>hello bertie any good in your mind</td>\n",
436
+ " <td>hello bertie any good in your mind</td>\n",
437
+ " </tr>\n",
438
+ " <tr>\n",
439
+ " <th>4</th>\n",
440
+ " <td>Number 10. Fresh Nelly is waiting on you. Good...</td>\n",
441
+ " <td>NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...</td>\n",
442
+ " <td>number 10 fresh nelly is waiting on you good n...</td>\n",
443
+ " <td>number 10 fresh nelly is waiting on you good n...</td>\n",
444
+ " </tr>\n",
445
+ " <tr>\n",
446
+ " <th>...</th>\n",
447
+ " <td>...</td>\n",
448
+ " <td>...</td>\n",
449
+ " <td>...</td>\n",
450
+ " <td>...</td>\n",
451
+ " </tr>\n",
452
+ " <tr>\n",
453
+ " <th>2615</th>\n",
454
+ " <td>Oh, to shoot my soul's full meaning into futur...</td>\n",
455
+ " <td>OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...</td>\n",
456
+ " <td>0 to shoot my soul is full meaning into future...</td>\n",
457
+ " <td>0 to shoot my soul is full meaning into future...</td>\n",
458
+ " </tr>\n",
459
+ " <tr>\n",
460
+ " <th>2616</th>\n",
461
+ " <td>Then I, long tried by natural ills, received t...</td>\n",
462
+ " <td>THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...</td>\n",
463
+ " <td>then i long tried by natural ills received the...</td>\n",
464
+ " <td>then i long tried by natural ills received the...</td>\n",
465
+ " </tr>\n",
466
+ " <tr>\n",
467
+ " <th>2617</th>\n",
468
+ " <td>I love thee freely as men strive for right. I ...</td>\n",
469
+ " <td>I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...</td>\n",
470
+ " <td>i love thee freely as men strive for right i l...</td>\n",
471
+ " <td>i love thee freely as men strive for right i l...</td>\n",
472
+ " </tr>\n",
473
+ " <tr>\n",
474
+ " <th>2618</th>\n",
475
+ " <td>I love thee with the passion put to use, in my...</td>\n",
476
+ " <td>I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...</td>\n",
477
+ " <td>i love thee with the passion put to use in my ...</td>\n",
478
+ " <td>i love thee with the passion put to use in my ...</td>\n",
479
+ " </tr>\n",
480
+ " <tr>\n",
481
+ " <th>2619</th>\n",
482
+ " <td>I love thee with the love I seemed to lose wit...</td>\n",
483
+ " <td>I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ...</td>\n",
484
+ " <td>i love thee with the love i seemed to lose wit...</td>\n",
485
+ " <td>i love thee with a love i seemed to lose with ...</td>\n",
486
+ " </tr>\n",
487
+ " </tbody>\n",
488
+ "</table>\n",
489
+ "<p>2620 rows × 4 columns</p>\n",
490
+ "</div>"
491
+ ],
492
+ "text/plain": [
493
+ " hypothesis \\\n",
494
+ "0 He hoped there would be stew for dinner, turni... \n",
495
+ "1 Stuffered into you, his belly counseled him. \n",
496
+ "2 After early nightfall the yellow lamps would l... \n",
497
+ "3 Hello Bertie, any good in your mind? \n",
498
+ "4 Number 10. Fresh Nelly is waiting on you. Good... \n",
499
+ "... ... \n",
500
+ "2615 Oh, to shoot my soul's full meaning into futur... \n",
501
+ "2616 Then I, long tried by natural ills, received t... \n",
502
+ "2617 I love thee freely as men strive for right. I ... \n",
503
+ "2618 I love thee with the passion put to use, in my... \n",
504
+ "2619 I love thee with the love I seemed to lose wit... \n",
505
+ "\n",
506
+ " reference \\\n",
507
+ "0 HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP... \n",
508
+ "1 STUFF IT INTO YOU HIS BELLY COUNSELLED HIM \n",
509
+ "2 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L... \n",
510
+ "3 HELLO BERTIE ANY GOOD IN YOUR MIND \n",
511
+ "4 NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ... \n",
512
+ "... ... \n",
513
+ "2615 OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE... \n",
514
+ "2616 THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE... \n",
515
+ "2617 I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L... \n",
516
+ "2618 I LOVE THEE WITH THE PASSION PUT TO USE IN MY ... \n",
517
+ "2619 I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ... \n",
518
+ "\n",
519
+ " hypothesis_clean \\\n",
520
+ "0 he hoped there would be stew for dinner turnip... \n",
521
+ "1 stuffered into you his belly counseled him \n",
522
+ "2 after early nightfall the yellow lamps would l... \n",
523
+ "3 hello bertie any good in your mind \n",
524
+ "4 number 10 fresh nelly is waiting on you good n... \n",
525
+ "... ... \n",
526
+ "2615 0 to shoot my soul is full meaning into future... \n",
527
+ "2616 then i long tried by natural ills received the... \n",
528
+ "2617 i love thee freely as men strive for right i l... \n",
529
+ "2618 i love thee with the passion put to use in my ... \n",
530
+ "2619 i love thee with the love i seemed to lose wit... \n",
531
+ "\n",
532
+ " reference_clean \n",
533
+ "0 he hoped there would be stew for dinner turnip... \n",
534
+ "1 stuff it into you his belly counseled him \n",
535
+ "2 after early nightfall the yellow lamps would l... \n",
536
+ "3 hello bertie any good in your mind \n",
537
+ "4 number 10 fresh nelly is waiting on you good n... \n",
538
+ "... ... \n",
539
+ "2615 0 to shoot my soul is full meaning into future... \n",
540
+ "2616 then i long tried by natural ills received the... \n",
541
+ "2617 i love thee freely as men strive for right i l... \n",
542
+ "2618 i love thee with the passion put to use in my ... \n",
543
+ "2619 i love thee with a love i seemed to lose with ... \n",
544
+ "\n",
545
+ "[2620 rows x 4 columns]"
546
+ ]
547
+ },
548
+ "execution_count": 10,
549
+ "metadata": {},
550
+ "output_type": "execute_result"
551
+ }
552
+ ],
553
+ "source": [
554
+ "data[\"hypothesis_clean\"] = [normalizer(text) for text in data[\"hypothesis\"]]\n",
555
+ "data[\"reference_clean\"] = [normalizer(text) for text in data[\"reference\"]]\n",
556
+ "data"
557
+ ]
558
+ },
559
+ {
560
+ "cell_type": "code",
561
+ "execution_count": 11,
562
+ "metadata": {
563
+ "colab": {
564
+ "base_uri": "https://localhost:8080/"
565
+ },
566
+ "id": "EBGSITeBYPTT",
567
+ "outputId": "7b3dbe7c-a37e-4a07-a50a-b27d5f88b68f"
568
+ },
569
+ "outputs": [
570
+ {
571
+ "name": "stdout",
572
+ "output_type": "stream",
573
+ "text": [
574
+ "WER: 4.26 %\n"
575
+ ]
576
+ }
577
+ ],
578
+ "source": [
579
+ "wer = jiwer.wer(list(data[\"reference_clean\"]), list(data[\"hypothesis_clean\"]))\n",
580
+ "\n",
581
+ "print(f\"WER: {wer * 100:.2f} %\")"
582
+ ]
583
+ }
584
+ ],
585
+ "metadata": {
586
+ "accelerator": "GPU",
587
+ "colab": {
588
+ "collapsed_sections": [],
589
+ "provenance": []
590
+ },
591
+ "gpuClass": "standard",
592
+ "kernelspec": {
593
+ "display_name": "Python 3 (ipykernel)",
594
+ "language": "python",
595
+ "name": "python3"
596
+ },
597
+ "language_info": {
598
+ "codemirror_mode": {
599
+ "name": "ipython",
600
+ "version": 3
601
+ },
602
+ "file_extension": ".py",
603
+ "mimetype": "text/x-python",
604
+ "name": "python",
605
+ "nbconvert_exporter": "python",
606
+ "pygments_lexer": "ipython3",
607
+ "version": "3.9.9"
608
+ },
609
+ "widgets": {
610
+ "application/vnd.jupyter.widget-state+json": {
611
+ "039b53f2702c4179af7e0548018d0588": {
612
+ "model_module": "@jupyter-widgets/controls",
613
+ "model_module_version": "1.5.0",
614
+ "model_name": "DescriptionStyleModel",
615
+ "state": {
616
+ "_model_module": "@jupyter-widgets/controls",
617
+ "_model_module_version": "1.5.0",
618
+ "_model_name": "DescriptionStyleModel",
619
+ "_view_count": null,
620
+ "_view_module": "@jupyter-widgets/base",
621
+ "_view_module_version": "1.2.0",
622
+ "_view_name": "StyleView",
623
+ "description_width": ""
624
+ }
625
+ },
626
+ "06b9aa5f49fa44ba8c93b647dc7db224": {
627
+ "model_module": "@jupyter-widgets/controls",
628
+ "model_module_version": "1.5.0",
629
+ "model_name": "FloatProgressModel",
630
+ "state": {
631
+ "_dom_classes": [],
632
+ "_model_module": "@jupyter-widgets/controls",
633
+ "_model_module_version": "1.5.0",
634
+ "_model_name": "FloatProgressModel",
635
+ "_view_count": null,
636
+ "_view_module": "@jupyter-widgets/controls",
637
+ "_view_module_version": "1.5.0",
638
+ "_view_name": "ProgressView",
639
+ "bar_style": "success",
640
+ "description": "",
641
+ "description_tooltip": null,
642
+ "layout": "IPY_MODEL_a0d10a42c753453283e5219c22239337",
643
+ "max": 164,
644
+ "min": 0,
645
+ "orientation": "horizontal",
646
+ "style": "IPY_MODEL_09f4cb79ff86465aaf48b0de24869af9",
647
+ "value": 164
648
+ }
649
+ },
650
+ "09a29a91f58d4462942505a3cc415801": {
651
+ "model_module": "@jupyter-widgets/controls",
652
+ "model_module_version": "1.5.0",
653
+ "model_name": "HBoxModel",
654
+ "state": {
655
+ "_dom_classes": [],
656
+ "_model_module": "@jupyter-widgets/controls",
657
+ "_model_module_version": "1.5.0",
658
+ "_model_name": "HBoxModel",
659
+ "_view_count": null,
660
+ "_view_module": "@jupyter-widgets/controls",
661
+ "_view_module_version": "1.5.0",
662
+ "_view_name": "HBoxView",
663
+ "box_style": "",
664
+ "children": [
665
+ "IPY_MODEL_83391f98a240490987c397048fc1a0d4",
666
+ "IPY_MODEL_06b9aa5f49fa44ba8c93b647dc7db224",
667
+ "IPY_MODEL_da9c231ee67047fb89073c95326b72a5"
668
+ ],
669
+ "layout": "IPY_MODEL_48da931ebe7f4fd299f8c98c7d2460ff"
670
+ }
671
+ },
672
+ "09f4cb79ff86465aaf48b0de24869af9": {
673
+ "model_module": "@jupyter-widgets/controls",
674
+ "model_module_version": "1.5.0",
675
+ "model_name": "ProgressStyleModel",
676
+ "state": {
677
+ "_model_module": "@jupyter-widgets/controls",
678
+ "_model_module_version": "1.5.0",
679
+ "_model_name": "ProgressStyleModel",
680
+ "_view_count": null,
681
+ "_view_module": "@jupyter-widgets/base",
682
+ "_view_module_version": "1.2.0",
683
+ "_view_name": "StyleView",
684
+ "bar_color": null,
685
+ "description_width": ""
686
+ }
687
+ },
688
+ "1b9cecf5b3584fba8258a81d4279a25b": {
689
+ "model_module": "@jupyter-widgets/base",
690
+ "model_module_version": "1.2.0",
691
+ "model_name": "LayoutModel",
692
+ "state": {
693
+ "_model_module": "@jupyter-widgets/base",
694
+ "_model_module_version": "1.2.0",
695
+ "_model_name": "LayoutModel",
696
+ "_view_count": null,
697
+ "_view_module": "@jupyter-widgets/base",
698
+ "_view_module_version": "1.2.0",
699
+ "_view_name": "LayoutView",
700
+ "align_content": null,
701
+ "align_items": null,
702
+ "align_self": null,
703
+ "border": null,
704
+ "bottom": null,
705
+ "display": null,
706
+ "flex": null,
707
+ "flex_flow": null,
708
+ "grid_area": null,
709
+ "grid_auto_columns": null,
710
+ "grid_auto_flow": null,
711
+ "grid_auto_rows": null,
712
+ "grid_column": null,
713
+ "grid_gap": null,
714
+ "grid_row": null,
715
+ "grid_template_areas": null,
716
+ "grid_template_columns": null,
717
+ "grid_template_rows": null,
718
+ "height": null,
719
+ "justify_content": null,
720
+ "justify_items": null,
721
+ "left": null,
722
+ "margin": null,
723
+ "max_height": null,
724
+ "max_width": null,
725
+ "min_height": null,
726
+ "min_width": null,
727
+ "object_fit": null,
728
+ "object_position": null,
729
+ "order": null,
730
+ "overflow": null,
731
+ "overflow_x": null,
732
+ "overflow_y": null,
733
+ "padding": null,
734
+ "right": null,
735
+ "top": null,
736
+ "visibility": null,
737
+ "width": null
738
+ }
739
+ },
740
+ "39f5a6ae8ba74c8598f9c6d5b8ad2d65": {
741
+ "model_module": "@jupyter-widgets/controls",
742
+ "model_module_version": "1.5.0",
743
+ "model_name": "DescriptionStyleModel",
744
+ "state": {
745
+ "_model_module": "@jupyter-widgets/controls",
746
+ "_model_module_version": "1.5.0",
747
+ "_model_name": "DescriptionStyleModel",
748
+ "_view_count": null,
749
+ "_view_module": "@jupyter-widgets/base",
750
+ "_view_module_version": "1.2.0",
751
+ "_view_name": "StyleView",
752
+ "description_width": ""
753
+ }
754
+ },
755
+ "48da931ebe7f4fd299f8c98c7d2460ff": {
756
+ "model_module": "@jupyter-widgets/base",
757
+ "model_module_version": "1.2.0",
758
+ "model_name": "LayoutModel",
759
+ "state": {
760
+ "_model_module": "@jupyter-widgets/base",
761
+ "_model_module_version": "1.2.0",
762
+ "_model_name": "LayoutModel",
763
+ "_view_count": null,
764
+ "_view_module": "@jupyter-widgets/base",
765
+ "_view_module_version": "1.2.0",
766
+ "_view_name": "LayoutView",
767
+ "align_content": null,
768
+ "align_items": null,
769
+ "align_self": null,
770
+ "border": null,
771
+ "bottom": null,
772
+ "display": null,
773
+ "flex": null,
774
+ "flex_flow": null,
775
+ "grid_area": null,
776
+ "grid_auto_columns": null,
777
+ "grid_auto_flow": null,
778
+ "grid_auto_rows": null,
779
+ "grid_column": null,
780
+ "grid_gap": null,
781
+ "grid_row": null,
782
+ "grid_template_areas": null,
783
+ "grid_template_columns": null,
784
+ "grid_template_rows": null,
785
+ "height": null,
786
+ "justify_content": null,
787
+ "justify_items": null,
788
+ "left": null,
789
+ "margin": null,
790
+ "max_height": null,
791
+ "max_width": null,
792
+ "min_height": null,
793
+ "min_width": null,
794
+ "object_fit": null,
795
+ "object_position": null,
796
+ "order": null,
797
+ "overflow": null,
798
+ "overflow_x": null,
799
+ "overflow_y": null,
800
+ "padding": null,
801
+ "right": null,
802
+ "top": null,
803
+ "visibility": null,
804
+ "width": null
805
+ }
806
+ },
807
+ "7a901f447c1d477bb49f954e0feacedd": {
808
+ "model_module": "@jupyter-widgets/base",
809
+ "model_module_version": "1.2.0",
810
+ "model_name": "LayoutModel",
811
+ "state": {
812
+ "_model_module": "@jupyter-widgets/base",
813
+ "_model_module_version": "1.2.0",
814
+ "_model_name": "LayoutModel",
815
+ "_view_count": null,
816
+ "_view_module": "@jupyter-widgets/base",
817
+ "_view_module_version": "1.2.0",
818
+ "_view_name": "LayoutView",
819
+ "align_content": null,
820
+ "align_items": null,
821
+ "align_self": null,
822
+ "border": null,
823
+ "bottom": null,
824
+ "display": null,
825
+ "flex": null,
826
+ "flex_flow": null,
827
+ "grid_area": null,
828
+ "grid_auto_columns": null,
829
+ "grid_auto_flow": null,
830
+ "grid_auto_rows": null,
831
+ "grid_column": null,
832
+ "grid_gap": null,
833
+ "grid_row": null,
834
+ "grid_template_areas": null,
835
+ "grid_template_columns": null,
836
+ "grid_template_rows": null,
837
+ "height": null,
838
+ "justify_content": null,
839
+ "justify_items": null,
840
+ "left": null,
841
+ "margin": null,
842
+ "max_height": null,
843
+ "max_width": null,
844
+ "min_height": null,
845
+ "min_width": null,
846
+ "object_fit": null,
847
+ "object_position": null,
848
+ "order": null,
849
+ "overflow": null,
850
+ "overflow_x": null,
851
+ "overflow_y": null,
852
+ "padding": null,
853
+ "right": null,
854
+ "top": null,
855
+ "visibility": null,
856
+ "width": null
857
+ }
858
+ },
859
+ "83391f98a240490987c397048fc1a0d4": {
860
+ "model_module": "@jupyter-widgets/controls",
861
+ "model_module_version": "1.5.0",
862
+ "model_name": "HTMLModel",
863
+ "state": {
864
+ "_dom_classes": [],
865
+ "_model_module": "@jupyter-widgets/controls",
866
+ "_model_module_version": "1.5.0",
867
+ "_model_name": "HTMLModel",
868
+ "_view_count": null,
869
+ "_view_module": "@jupyter-widgets/controls",
870
+ "_view_module_version": "1.5.0",
871
+ "_view_name": "HTMLView",
872
+ "description": "",
873
+ "description_tooltip": null,
874
+ "layout": "IPY_MODEL_7a901f447c1d477bb49f954e0feacedd",
875
+ "placeholder": "​",
876
+ "style": "IPY_MODEL_39f5a6ae8ba74c8598f9c6d5b8ad2d65",
877
+ "value": "100%"
878
+ }
879
+ },
880
+ "a0d10a42c753453283e5219c22239337": {
881
+ "model_module": "@jupyter-widgets/base",
882
+ "model_module_version": "1.2.0",
883
+ "model_name": "LayoutModel",
884
+ "state": {
885
+ "_model_module": "@jupyter-widgets/base",
886
+ "_model_module_version": "1.2.0",
887
+ "_model_name": "LayoutModel",
888
+ "_view_count": null,
889
+ "_view_module": "@jupyter-widgets/base",
890
+ "_view_module_version": "1.2.0",
891
+ "_view_name": "LayoutView",
892
+ "align_content": null,
893
+ "align_items": null,
894
+ "align_self": null,
895
+ "border": null,
896
+ "bottom": null,
897
+ "display": null,
898
+ "flex": null,
899
+ "flex_flow": null,
900
+ "grid_area": null,
901
+ "grid_auto_columns": null,
902
+ "grid_auto_flow": null,
903
+ "grid_auto_rows": null,
904
+ "grid_column": null,
905
+ "grid_gap": null,
906
+ "grid_row": null,
907
+ "grid_template_areas": null,
908
+ "grid_template_columns": null,
909
+ "grid_template_rows": null,
910
+ "height": null,
911
+ "justify_content": null,
912
+ "justify_items": null,
913
+ "left": null,
914
+ "margin": null,
915
+ "max_height": null,
916
+ "max_width": null,
917
+ "min_height": null,
918
+ "min_width": null,
919
+ "object_fit": null,
920
+ "object_position": null,
921
+ "order": null,
922
+ "overflow": null,
923
+ "overflow_x": null,
924
+ "overflow_y": null,
925
+ "padding": null,
926
+ "right": null,
927
+ "top": null,
928
+ "visibility": null,
929
+ "width": null
930
+ }
931
+ },
932
+ "da9c231ee67047fb89073c95326b72a5": {
933
+ "model_module": "@jupyter-widgets/controls",
934
+ "model_module_version": "1.5.0",
935
+ "model_name": "HTMLModel",
936
+ "state": {
937
+ "_dom_classes": [],
938
+ "_model_module": "@jupyter-widgets/controls",
939
+ "_model_module_version": "1.5.0",
940
+ "_model_name": "HTMLModel",
941
+ "_view_count": null,
942
+ "_view_module": "@jupyter-widgets/controls",
943
+ "_view_module_version": "1.5.0",
944
+ "_view_name": "HTMLView",
945
+ "description": "",
946
+ "description_tooltip": null,
947
+ "layout": "IPY_MODEL_1b9cecf5b3584fba8258a81d4279a25b",
948
+ "placeholder": "​",
949
+ "style": "IPY_MODEL_039b53f2702c4179af7e0548018d0588",
950
+ "value": " 164/164 [05:08&lt;00:00, 1.86s/it]"
951
+ }
952
+ }
953
+ }
954
+ }
955
+ },
956
+ "nbformat": 4,
957
+ "nbformat_minor": 1
958
+ }
whisper/notebooks/Multilingual_ASR.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
whisper/pyproject.toml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [tool.black]
2
+
3
+ [tool.isort]
4
+ profile = "black"
5
+ include_trailing_comma = true
6
+ line_length = 88
7
+ multi_line_output = 3
8
+
whisper/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ numba
2
+ numpy
3
+ torch
4
+ tqdm
5
+ more-itertools
6
+ tiktoken==0.3.3
whisper/setup.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import platform
3
+ import sys
4
+
5
+ import pkg_resources
6
+ from setuptools import find_packages, setup
7
+
8
+
9
+ def read_version(fname="whisper/version.py"):
10
+ exec(compile(open(fname, encoding="utf-8").read(), fname, "exec"))
11
+ return locals()["__version__"]
12
+
13
+
14
+ requirements = []
15
+ if sys.platform.startswith("linux") and platform.machine() == "x86_64":
16
+ requirements.append("triton==2.0.0")
17
+
18
+ setup(
19
+ name="openai-whisper",
20
+ py_modules=["whisper"],
21
+ version=read_version(),
22
+ description="Robust Speech Recognition via Large-Scale Weak Supervision",
23
+ long_description=open("README.md", encoding="utf-8").read(),
24
+ long_description_content_type="text/markdown",
25
+ readme="README.md",
26
+ python_requires=">=3.8",
27
+ author="OpenAI",
28
+ url="https://github.com/openai/whisper",
29
+ license="MIT",
30
+ packages=find_packages(exclude=["tests*"]),
31
+ install_requires=requirements
32
+ + [
33
+ str(r)
34
+ for r in pkg_resources.parse_requirements(
35
+ open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
36
+ )
37
+ ],
38
+ entry_points={
39
+ "console_scripts": ["whisper=whisper.transcribe:cli"],
40
+ },
41
+ include_package_data=True,
42
+ extras_require={"dev": ["pytest", "scipy", "black", "flake8", "isort"]},
43
+ )
whisper/tests/conftest.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random as rand
2
+
3
+ import numpy
4
+ import pytest
5
+
6
+
7
+ def pytest_configure(config):
8
+ config.addinivalue_line("markers", "requires_cuda")
9
+
10
+
11
+ @pytest.fixture
12
+ def random():
13
+ rand.seed(42)
14
+ numpy.random.seed(42)
whisper/tests/jfk.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63a4b1e4c1dc655ac70961ffbf518acd249df237e5a0152faae9a4a836949715
3
+ size 1152693
whisper/tests/test_audio.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+
3
+ import numpy as np
4
+
5
+ from whisper.audio import SAMPLE_RATE, load_audio, log_mel_spectrogram
6
+
7
+
8
+ def test_audio():
9
+ audio_path = os.path.join(os.path.dirname(__file__), "jfk.flac")
10
+ audio = load_audio(audio_path)
11
+ assert audio.ndim == 1
12
+ assert SAMPLE_RATE * 10 < audio.shape[0] < SAMPLE_RATE * 12
13
+ assert 0 < audio.std() < 1
14
+
15
+ mel_from_audio = log_mel_spectrogram(audio)
16
+ mel_from_file = log_mel_spectrogram(audio_path)
17
+
18
+ assert np.allclose(mel_from_audio, mel_from_file)
19
+ assert mel_from_audio.max() - mel_from_audio.min() <= 2.0
whisper/tests/test_normalizer.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from whisper.normalizers import EnglishTextNormalizer
4
+ from whisper.normalizers.english import (
5
+ EnglishNumberNormalizer,
6
+ EnglishSpellingNormalizer,
7
+ )
8
+
9
+
10
+ @pytest.mark.parametrize("std", [EnglishNumberNormalizer(), EnglishTextNormalizer()])
11
+ def test_number_normalizer(std):
12
+ assert std("two") == "2"
13
+ assert std("thirty one") == "31"
14
+ assert std("five twenty four") == "524"
15
+ assert std("nineteen ninety nine") == "1999"
16
+ assert std("twenty nineteen") == "2019"
17
+
18
+ assert std("two point five million") == "2500000"
19
+ assert std("four point two billions") == "4200000000s"
20
+ assert std("200 thousand") == "200000"
21
+ assert std("200 thousand dollars") == "$200000"
22
+ assert std("$20 million") == "$20000000"
23
+ assert std("€52.4 million") == "€52400000"
24
+ assert std("£77 thousands") == "£77000s"
25
+
26
+ assert std("two double o eight") == "2008"
27
+
28
+ assert std("three thousand twenty nine") == "3029"
29
+ assert std("forty three thousand two hundred sixty") == "43260"
30
+ assert std("forty three thousand two hundred and sixty") == "43260"
31
+
32
+ assert std("nineteen fifties") == "1950s"
33
+ assert std("thirty first") == "31st"
34
+ assert std("thirty three thousand and three hundred and thirty third") == "33333rd"
35
+
36
+ assert std("three billion") == "3000000000"
37
+ assert std("millions") == "1000000s"
38
+
39
+ assert std("july third twenty twenty") == "july 3rd 2020"
40
+ assert std("august twenty sixth twenty twenty one") == "august 26th 2021"
41
+ assert std("3 14") == "3 14"
42
+ assert std("3.14") == "3.14"
43
+ assert std("3 point 2") == "3.2"
44
+ assert std("3 point 14") == "3.14"
45
+ assert std("fourteen point 4") == "14.4"
46
+ assert std("two point two five dollars") == "$2.25"
47
+ assert std("two hundred million dollars") == "$200000000"
48
+ assert std("$20.1 million") == "$20100000"
49
+
50
+ assert std("ninety percent") == "90%"
51
+ assert std("seventy six per cent") == "76%"
52
+
53
+ assert std("double oh seven") == "007"
54
+ assert std("double zero seven") == "007"
55
+ assert std("nine one one") == "911"
56
+ assert std("nine double one") == "911"
57
+ assert std("one triple oh one") == "10001"
58
+
59
+ assert std("two thousandth") == "2000th"
60
+ assert std("thirty two thousandth") == "32000th"
61
+
62
+ assert std("minus 500") == "-500"
63
+ assert std("positive twenty thousand") == "+20000"
64
+
65
+ assert std("two dollars and seventy cents") == "$2.70"
66
+ assert std("3 cents") == "¢3"
67
+ assert std("$0.36") == "¢36"
68
+ assert std("three euros and sixty five cents") == "€3.65"
69
+
70
+ assert std("three and a half million") == "3500000"
71
+ assert std("forty eight and a half dollars") == "$48.5"
72
+ assert std("b747") == "b 747"
73
+ assert std("10 th") == "10th"
74
+ assert std("10th") == "10th"
75
+
76
+
77
+ def test_spelling_normalizer():
78
+ std = EnglishSpellingNormalizer()
79
+
80
+ assert std("mobilisation") == "mobilization"
81
+ assert std("cancelation") == "cancellation"
82
+
83
+
84
+ def test_text_normalizer():
85
+ std = EnglishTextNormalizer()
86
+ assert std("Let's") == "let us"
87
+ assert std("he's like") == "he is like"
88
+ assert std("she's been like") == "she has been like"
89
+ assert std("10km") == "10 km"
90
+ assert std("10mm") == "10 mm"
91
+ assert std("RC232") == "rc 232"
92
+
93
+ assert (
94
+ std("Mr. Park visited Assoc. Prof. Kim Jr.")
95
+ == "mister park visited associate professor kim junior"
96
+ )
whisper/tests/test_timing.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+ import scipy.ndimage
4
+ import torch
5
+
6
+ from whisper.timing import dtw_cpu, dtw_cuda, median_filter
7
+
8
+ sizes = [
9
+ (10, 20),
10
+ (32, 16),
11
+ (123, 1500),
12
+ (234, 189),
13
+ ]
14
+ shapes = [
15
+ (10,),
16
+ (1, 15),
17
+ (4, 5, 345),
18
+ (6, 12, 240, 512),
19
+ ]
20
+
21
+
22
+ @pytest.mark.parametrize("N, M", sizes)
23
+ def test_dtw(N: int, M: int):
24
+ steps = np.concatenate([np.zeros(N - 1), np.ones(M - 1)])
25
+ np.random.shuffle(steps)
26
+ x = np.random.random((N, M)).astype(np.float32)
27
+
28
+ i, j, k = 0, 0, 0
29
+ trace = []
30
+ while True:
31
+ x[i, j] -= 1
32
+ trace.append((i, j))
33
+
34
+ if k == len(steps):
35
+ break
36
+
37
+ if k + 1 < len(steps) and steps[k] != steps[k + 1]:
38
+ i += 1
39
+ j += 1
40
+ k += 2
41
+ continue
42
+
43
+ if steps[k] == 0:
44
+ i += 1
45
+ if steps[k] == 1:
46
+ j += 1
47
+ k += 1
48
+
49
+ trace = np.array(trace).T
50
+ dtw_trace = dtw_cpu(x)
51
+
52
+ assert np.allclose(trace, dtw_trace)
53
+
54
+
55
+ @pytest.mark.requires_cuda
56
+ @pytest.mark.parametrize("N, M", sizes)
57
+ def test_dtw_cuda_equivalence(N: int, M: int):
58
+ x_numpy = np.random.randn(N, M).astype(np.float32)
59
+ x_cuda = torch.from_numpy(x_numpy).cuda()
60
+
61
+ trace_cpu = dtw_cpu(x_numpy)
62
+ trace_cuda = dtw_cuda(x_cuda)
63
+
64
+ assert np.allclose(trace_cpu, trace_cuda)
65
+
66
+
67
+ @pytest.mark.parametrize("shape", shapes)
68
+ def test_median_filter(shape):
69
+ x = torch.randn(*shape)
70
+
71
+ for filter_width in [3, 5, 7, 13]:
72
+ filtered = median_filter(x, filter_width)
73
+
74
+ # using np.pad to reflect-pad, because Scipy's behavior is different near the edges.
75
+ pad_width = filter_width // 2
76
+ padded_x = np.pad(
77
+ x, [(0, 0)] * (x.ndim - 1) + [(pad_width, pad_width)], mode="reflect"
78
+ )
79
+ scipy_filtered = scipy.ndimage.median_filter(
80
+ padded_x, [1] * (x.ndim - 1) + [filter_width]
81
+ )
82
+ scipy_filtered = scipy_filtered[..., pad_width:-pad_width]
83
+
84
+ assert np.allclose(filtered, scipy_filtered)
85
+
86
+
87
+ @pytest.mark.requires_cuda
88
+ @pytest.mark.parametrize("shape", shapes)
89
+ def test_median_filter_equivalence(shape):
90
+ x = torch.randn(*shape)
91
+
92
+ for filter_width in [3, 5, 7, 13]:
93
+ filtered_cpu = median_filter(x, filter_width)
94
+ filtered_gpu = median_filter(x.cuda(), filter_width).cpu()
95
+
96
+ assert np.allclose(filtered_cpu, filtered_gpu)
whisper/tests/test_tokenizer.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from whisper.tokenizer import get_tokenizer
2
+
3
+
4
+ def test_tokenizer():
5
+ gpt2_tokenizer = get_tokenizer(multilingual=False)
6
+ multilingual_tokenizer = get_tokenizer(multilingual=True)
7
+
8
+ text = "다람쥐 헌 쳇바퀴에 타고파"
9
+ gpt2_tokens = gpt2_tokenizer.encode(text)
10
+ multilingual_tokens = multilingual_tokenizer.encode(text)
11
+
12
+ assert gpt2_tokenizer.decode(gpt2_tokens) == text
13
+ assert multilingual_tokenizer.decode(multilingual_tokens) == text
14
+ assert len(gpt2_tokens) > len(multilingual_tokens)
15
+
16
+
17
+ def test_split_on_unicode():
18
+ multilingual_tokenizer = get_tokenizer(multilingual=True)
19
+
20
+ tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
21
+ words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)
22
+
23
+ assert words == [" elle", " est", " l", "'", "�", "é", "rit", "oire"]
24
+ assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
whisper/tests/test_transcribe.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pytest
4
+ import torch
5
+
6
+ import whisper
7
+ from whisper.tokenizer import get_tokenizer
8
+
9
+
10
+ @pytest.mark.parametrize("model_name", whisper.available_models())
11
+ def test_transcribe(model_name: str):
12
+ device = "cuda" if torch.cuda.is_available() else "cpu"
13
+ model = whisper.load_model(model_name).to(device)
14
+ audio_path = os.path.join(os.path.dirname(__file__), "jfk.flac")
15
+
16
+ language = "en" if model_name.endswith(".en") else None
17
+ result = model.transcribe(
18
+ audio_path, language=language, temperature=0.0, word_timestamps=True
19
+ )
20
+ assert result["language"] == "en"
21
+ assert result["text"] == "".join([s["text"] for s in result["segments"]])
22
+
23
+ transcription = result["text"].lower()
24
+ assert "my fellow americans" in transcription
25
+ assert "your country" in transcription
26
+ assert "do for you" in transcription
27
+
28
+ tokenizer = get_tokenizer(model.is_multilingual)
29
+ all_tokens = [t for s in result["segments"] for t in s["tokens"]]
30
+ assert tokenizer.decode(all_tokens) == result["text"]
31
+ assert tokenizer.decode_with_timestamps(all_tokens).startswith("<|0.00|>")
32
+
33
+ timing_checked = False
34
+ for segment in result["segments"]:
35
+ for timing in segment["words"]:
36
+ assert timing["start"] < timing["end"]
37
+ if timing["word"].strip(" ,") == "Americans":
38
+ assert timing["start"] <= 1.8
39
+ assert timing["end"] >= 1.8
40
+ timing_checked = True
41
+
42
+ assert timing_checked
whisper/whisper/__init__.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import io
3
+ import os
4
+ import urllib
5
+ import warnings
6
+ from typing import List, Optional, Union
7
+
8
+ import torch
9
+ from tqdm import tqdm
10
+
11
+ from .audio import load_audio, log_mel_spectrogram, pad_or_trim
12
+ from .decoding import DecodingOptions, DecodingResult, decode, detect_language
13
+ from .model import ModelDimensions, Whisper
14
+ from .transcribe import transcribe
15
+ from .version import __version__
16
+
17
+ _MODELS = {
18
+ "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
19
+ "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
20
+ "base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
21
+ "base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
22
+ "small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
23
+ "small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
24
+ "medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
25
+ "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
26
+ "large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
27
+ "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
28
+ "large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
29
+ }
30
+
31
+ # base85-encoded (n_layers, n_heads) boolean arrays indicating the cross-attention heads that are
32
+ # highly correlated to the word-level timing, i.e. the alignment between audio and text tokens.
33
+ _ALIGNMENT_HEADS = {
34
+ "tiny.en": b"ABzY8J1N>@0{>%R00Bk>$p{7v037`oCl~+#00",
35
+ "tiny": b"ABzY8bu8Lr0{>%RKn9Fp%m@SkK7Kt=7ytkO",
36
+ "base.en": b"ABzY8;40c<0{>%RzzG;p*o+Vo09|#PsxSZm00",
37
+ "base": b"ABzY8KQ!870{>%RzyTQH3`Q^yNP!>##QT-<FaQ7m",
38
+ "small.en": b"ABzY8>?_)10{>%RpeA61k&I|OI3I$65C{;;pbCHh0B{qLQ;+}v00",
39
+ "small": b"ABzY8DmU6=0{>%Rpa?J`kvJ6qF(V^F86#Xh7JUGMK}P<N0000",
40
+ "medium.en": b"ABzY8usPae0{>%R7<zz_OvQ{)4kMa0BMw6u5rT}kRKX;$NfYBv00*Hl@qhsU00",
41
+ "medium": b"ABzY8B0Jh+0{>%R7}kK1fFL7w6%<-Pf*t^=N)Qr&0RR9",
42
+ "large-v1": b"ABzY8r9j$a0{>%R7#4sLmoOs{s)o3~84-RPdcFk!JR<kSfC2yj",
43
+ "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
44
+ "large": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
45
+ }
46
+
47
+
48
+ def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
49
+ os.makedirs(root, exist_ok=True)
50
+
51
+ expected_sha256 = url.split("/")[-2]
52
+ download_target = os.path.join(root, os.path.basename(url))
53
+
54
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
55
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
56
+
57
+ if os.path.isfile(download_target):
58
+ with open(download_target, "rb") as f:
59
+ model_bytes = f.read()
60
+ if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
61
+ return model_bytes if in_memory else download_target
62
+ else:
63
+ warnings.warn(
64
+ f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file"
65
+ )
66
+
67
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
68
+ with tqdm(
69
+ total=int(source.info().get("Content-Length")),
70
+ ncols=80,
71
+ unit="iB",
72
+ unit_scale=True,
73
+ unit_divisor=1024,
74
+ ) as loop:
75
+ while True:
76
+ buffer = source.read(8192)
77
+ if not buffer:
78
+ break
79
+
80
+ output.write(buffer)
81
+ loop.update(len(buffer))
82
+
83
+ model_bytes = open(download_target, "rb").read()
84
+ if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
85
+ raise RuntimeError(
86
+ "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
87
+ )
88
+
89
+ return model_bytes if in_memory else download_target
90
+
91
+
92
+ def available_models() -> List[str]:
93
+ """Returns the names of available models"""
94
+ return list(_MODELS.keys())
95
+
96
+
97
+ def load_model(
98
+ name: str,
99
+ device: Optional[Union[str, torch.device]] = None,
100
+ download_root: str = None,
101
+ in_memory: bool = False,
102
+ ) -> Whisper:
103
+ """
104
+ Load a Whisper ASR model
105
+
106
+ Parameters
107
+ ----------
108
+ name : str
109
+ one of the official model names listed by `whisper.available_models()`, or
110
+ path to a model checkpoint containing the model dimensions and the model state_dict.
111
+ device : Union[str, torch.device]
112
+ the PyTorch device to put the model into
113
+ download_root: str
114
+ path to download the model files; by default, it uses "~/.cache/whisper"
115
+ in_memory: bool
116
+ whether to preload the model weights into host memory
117
+
118
+ Returns
119
+ -------
120
+ model : Whisper
121
+ The Whisper ASR model instance
122
+ """
123
+
124
+ if device is None:
125
+ device = "cuda" if torch.cuda.is_available() else "cpu"
126
+ if download_root is None:
127
+ default = os.path.join(os.path.expanduser("~"), ".cache")
128
+ download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default), "whisper")
129
+
130
+ if name in _MODELS:
131
+ checkpoint_file = _download(_MODELS[name], download_root, in_memory)
132
+ alignment_heads = _ALIGNMENT_HEADS[name]
133
+ elif os.path.isfile(name):
134
+ checkpoint_file = open(name, "rb").read() if in_memory else name
135
+ alignment_heads = None
136
+ else:
137
+ raise RuntimeError(
138
+ f"Model {name} not found; available models = {available_models()}"
139
+ )
140
+
141
+ with (
142
+ io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")
143
+ ) as fp:
144
+ checkpoint = torch.load(fp, map_location=device)
145
+ del checkpoint_file
146
+
147
+ dims = ModelDimensions(**checkpoint["dims"])
148
+ model = Whisper(dims)
149
+ model.load_state_dict(checkpoint["model_state_dict"])
150
+
151
+ if alignment_heads is not None:
152
+ model.set_alignment_heads(alignment_heads)
153
+
154
+ return model.to(device)
whisper/whisper/__main__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .transcribe import cli
2
+
3
+ cli()
whisper/whisper/assets/gpt2.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
whisper/whisper/assets/mel_filters.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd2cc75e70e36fcbdd8ffbc2499062f30094093e6bf2cbafa9859f59972b420b
3
+ size 2048
whisper/whisper/assets/multilingual.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
whisper/whisper/audio.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from functools import lru_cache
3
+ from subprocess import CalledProcessError, run
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ from .utils import exact_div
11
+
12
+ # hard-coded audio hyperparameters
13
+ SAMPLE_RATE = 16000
14
+ N_FFT = 400
15
+ N_MELS = 80
16
+ HOP_LENGTH = 160
17
+ CHUNK_LENGTH = 30
18
+ N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
19
+ N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000 frames in a mel spectrogram input
20
+
21
+ N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2 # the initial convolutions has stride 2
22
+ FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame
23
+ TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token
24
+
25
+
26
+ def load_audio(file: str, sr: int = SAMPLE_RATE):
27
+ """
28
+ Open an audio file and read as mono waveform, resampling as necessary
29
+
30
+ Parameters
31
+ ----------
32
+ file: str
33
+ The audio file to open
34
+
35
+ sr: int
36
+ The sample rate to resample the audio if necessary
37
+
38
+ Returns
39
+ -------
40
+ A NumPy array containing the audio waveform, in float32 dtype.
41
+ """
42
+
43
+ # This launches a subprocess to decode audio while down-mixing
44
+ # and resampling as necessary. Requires the ffmpeg CLI in PATH.
45
+ # fmt: off
46
+ cmd = [
47
+ "ffmpeg",
48
+ "-nostdin",
49
+ "-threads", "0",
50
+ "-i", file,
51
+ "-f", "s16le",
52
+ "-ac", "1",
53
+ "-acodec", "pcm_s16le",
54
+ "-ar", str(sr),
55
+ "-"
56
+ ]
57
+ # fmt: on
58
+ try:
59
+ out = run(cmd, capture_output=True, check=True).stdout
60
+ except CalledProcessError as e:
61
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
62
+
63
+ return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
64
+
65
+
66
+ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
67
+ """
68
+ Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
69
+ """
70
+ if torch.is_tensor(array):
71
+ if array.shape[axis] > length:
72
+ array = array.index_select(
73
+ dim=axis, index=torch.arange(length, device=array.device)
74
+ )
75
+
76
+ if array.shape[axis] < length:
77
+ pad_widths = [(0, 0)] * array.ndim
78
+ pad_widths[axis] = (0, length - array.shape[axis])
79
+ array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
80
+ else:
81
+ if array.shape[axis] > length:
82
+ array = array.take(indices=range(length), axis=axis)
83
+
84
+ if array.shape[axis] < length:
85
+ pad_widths = [(0, 0)] * array.ndim
86
+ pad_widths[axis] = (0, length - array.shape[axis])
87
+ array = np.pad(array, pad_widths)
88
+
89
+ return array
90
+
91
+
92
+ @lru_cache(maxsize=None)
93
+ def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
94
+ """
95
+ load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
96
+ Allows decoupling librosa dependency; saved using:
97
+
98
+ np.savez_compressed(
99
+ "mel_filters.npz",
100
+ mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
101
+ )
102
+ """
103
+ assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
104
+ with np.load(
105
+ os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
106
+ ) as f:
107
+ return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
108
+
109
+
110
+ def log_mel_spectrogram(
111
+ audio: Union[str, np.ndarray, torch.Tensor],
112
+ n_mels: int = N_MELS,
113
+ padding: int = 0,
114
+ device: Optional[Union[str, torch.device]] = None,
115
+ ):
116
+ """
117
+ Compute the log-Mel spectrogram of
118
+
119
+ Parameters
120
+ ----------
121
+ audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
122
+ The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
123
+
124
+ n_mels: int
125
+ The number of Mel-frequency filters, only 80 is supported
126
+
127
+ padding: int
128
+ Number of zero samples to pad to the right
129
+
130
+ device: Optional[Union[str, torch.device]]
131
+ If given, the audio tensor is moved to this device before STFT
132
+
133
+ Returns
134
+ -------
135
+ torch.Tensor, shape = (80, n_frames)
136
+ A Tensor that contains the Mel spectrogram
137
+ """
138
+ if not torch.is_tensor(audio):
139
+ if isinstance(audio, str):
140
+ audio = load_audio(audio)
141
+ audio = torch.from_numpy(audio)
142
+
143
+ if device is not None:
144
+ audio = audio.to(device)
145
+ if padding > 0:
146
+ audio = F.pad(audio, (0, padding))
147
+ window = torch.hann_window(N_FFT).to(audio.device)
148
+ stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
149
+ magnitudes = stft[..., :-1].abs() ** 2
150
+
151
+ filters = mel_filters(audio.device, n_mels)
152
+ mel_spec = filters @ magnitudes
153
+
154
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
155
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
156
+ log_spec = (log_spec + 4.0) / 4.0
157
+ return log_spec
whisper/whisper/decoding.py ADDED
@@ -0,0 +1,821 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field, replace
2
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+ from torch.distributions import Categorical
9
+
10
+ from .audio import CHUNK_LENGTH
11
+ from .tokenizer import Tokenizer, get_tokenizer
12
+ from .utils import compression_ratio
13
+
14
+ if TYPE_CHECKING:
15
+ from .model import Whisper
16
+
17
+
18
@torch.no_grad()
def detect_language(
    model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None
) -> Tuple[Tensor, List[dict]]:
    """
    Detect the spoken language in the audio, and return them as list of strings, along with the ids
    of the most probable language tokens and the probability distribution over all language tokens.
    This is performed outside the main decode loop in order to not interfere with kv-caching.

    Returns
    -------
    language_tokens : Tensor, shape = (n_audio,)
        ids of the most probable language tokens, which appears after the startoftranscript token.
    language_probs : List[Dict[str, float]], length = n_audio
        list of dictionaries containing the probability distribution over all languages.
    """
    if tokenizer is None:
        tokenizer = get_tokenizer(model.is_multilingual)
    # an English-only tokenizer carries no language token, so lang id is impossible
    if (
        tokenizer.language is None
        or tokenizer.language_token not in tokenizer.sot_sequence
    ):
        raise ValueError(
            "This model doesn't have language tokens so it can't perform lang id"
        )

    # accept a single unbatched spectrogram; restore its shape before returning
    single = mel.ndim == 2
    if single:
        mel = mel.unsqueeze(0)

    # skip encoder forward pass if already-encoded audio features were given
    if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
        mel = model.encoder(mel)

    # forward pass using a single token, startoftranscript
    n_audio = mel.shape[0]
    x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
    logits = model.logits(x, mel)[:, 0]

    # collect detected languages; suppress all non-language tokens
    mask = torch.ones(logits.shape[-1], dtype=torch.bool)
    mask[list(tokenizer.all_language_tokens)] = False
    logits[:, mask] = -np.inf
    language_tokens = logits.argmax(dim=-1)
    language_token_probs = logits.softmax(dim=-1).cpu()
    # map each language token id to its language code, paired with its probability
    language_probs = [
        {
            c: language_token_probs[i, j].item()
            for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes)
        }
        for i in range(n_audio)
    ]

    if single:
        language_tokens = language_tokens[0]
        language_probs = language_probs[0]

    return language_tokens, language_probs
76
+
77
+
78
@dataclass(frozen=True)
class DecodingOptions:
    """Immutable bundle of options controlling a single decoding pass."""

    # "transcribe" keeps the source language; "translate" targets English
    task: str = "transcribe"

    # language spoken in the audio; None triggers automatic language detection
    language: Optional[str] = None

    # sampling controls
    temperature: float = 0.0
    sample_len: Optional[int] = None  # cap on the number of tokens to sample
    best_of: Optional[int] = None  # independent samples to draw when temperature > 0
    beam_size: Optional[int] = None  # beam count for beam search when temperature == 0
    patience: Optional[float] = None  # beam-search patience (arxiv:2204.05424)

    # length-penalty exponent ("alpha" in the Google NMT paper) used when ranking
    # beams / best-of-N samples; None means plain length normalization
    length_penalty: Optional[float] = None

    # text or tokens injected before sampling; for details see
    # https://github.com/openai/whisper/discussions/117#discussioncomment-3727051
    prompt: Optional[Union[str, List[int]]] = None  # conditioning from previous context
    prefix: Optional[Union[str, List[int]]] = None  # forced start of the current text

    # token ids to suppress, as an iterable or a comma-separated string;
    # "-1" expands to the tokenizer's `non_speech_tokens` set
    suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1"
    suppress_blank: bool = True  # forbid a blank output at the first sampled position

    # timestamp behavior
    without_timestamps: bool = False  # use <|notimestamps|> to sample text tokens only
    max_initial_timestamp: Optional[float] = 1.0

    # numerical precision: run most of the calculation in half precision
    fp16: bool = True
113
+
114
+
115
@dataclass(frozen=True)
class DecodingResult:
    """Outcome of decoding a single audio segment."""

    audio_features: Tensor  # encoder output for this segment
    language: str  # detected or requested language code
    language_probs: Optional[Dict[str, float]] = None  # per-language probabilities, if detected
    tokens: List[int] = field(default_factory=list)  # sampled token ids
    text: str = ""  # decoded text, stripped
    avg_logprob: float = np.nan  # mean log-probability per sampled token
    no_speech_prob: float = np.nan  # probability of the no-speech token at SOT
    temperature: float = np.nan  # temperature that produced this result
    compression_ratio: float = np.nan  # gzip compression ratio of `text`
126
+
127
+
128
class Inference:
    """Abstract wrapper around the decoder forward pass used during sampling."""

    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
        """Run the decoder once and return per-token logits."""
        raise NotImplementedError

    def rearrange_kv_cache(self, source_indices) -> None:
        """Reorder any cached key/value state to follow re-ranked beams."""
        raise NotImplementedError

    def cleanup_caching(self) -> None:
        """Release caching resources or hooks once decoding is done (no-op by default)."""
        pass
140
+
141
+
142
class PyTorchInference(Inference):
    """Decoder forward passes for a PyTorch Whisper model, with kv caching."""

    def __init__(self, model: "Whisper", initial_token_length: int):
        self.model: "Whisper" = model
        self.initial_token_length = initial_token_length
        self.kv_cache = {}
        self.hooks = []

        blocks = self.model.decoder.blocks
        # track both the key and the value projection module of every decoder block
        self.kv_modules = [b.attn.key for b in blocks] + [b.attn.value for b in blocks]

    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
        """Run the decoder on `tokens`, installing kv-cache hooks on first use."""
        if not self.kv_cache:
            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()

        # after the first forward pass, only the most recent token is needed
        if tokens.shape[-1] > self.initial_token_length:
            tokens = tokens[:, -1:]

        return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)

    def cleanup_caching(self):
        """Remove the installed hooks and drop all cached activations."""
        for hook in self.hooks:
            hook.remove()

        self.hooks = []
        self.kv_cache = {}

    def rearrange_kv_cache(self, source_indices):
        """Reindex cached keys/values so they follow the surviving beams."""
        if source_indices != list(range(len(source_indices))):
            for module in self.kv_modules:
                cached = self.kv_cache[module]
                self.kv_cache[module] = cached[source_indices].detach()
175
+
176
+
177
class SequenceRanker:
    """Abstract strategy for picking the best candidate in each sample group."""

    def rank(
        self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]
    ) -> List[int]:
        """Given groups of candidate sequences and their cumulative log
        probabilities, return the index of the winner within each group."""
        raise NotImplementedError
186
+
187
+
188
class MaximumLikelihoodRanker(SequenceRanker):
    """
    Select the sample with the highest log probabilities, penalized using either
    a simple length normalization or Google NMT paper's length penalty
    """

    def __init__(self, length_penalty: Optional[float]):
        self.length_penalty = length_penalty

    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]):
        """Return, for each audio input, the index of the best-scoring candidate."""

        def _penalty(length):
            if self.length_penalty is None:
                # no alpha: plain length normalization
                return length
            # from the Google NMT paper
            return ((5 + length) / 6) ** self.length_penalty

        winners = []
        for group_logprobs, group_tokens in zip(sum_logprobs, tokens):
            scores = [
                lp / _penalty(len(seq))
                for lp, seq in zip(group_logprobs, group_tokens)
            ]
            winners.append(np.argmax(scores))
        return winners
212
+
213
+
214
class TokenDecoder:
    """Abstract strategy for choosing the next token at each sampling step."""

    def reset(self):
        """Initialize any stateful variables for decoding a new sequence"""

    def update(
        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
    ) -> Tuple[Tensor, bool]:
        """Append one token per sequence, chosen from the current logits.

        Parameters
        ----------
        tokens : Tensor, shape = (n_batch, current_sequence_length)
            the full context so far, including prefix and sot_sequence tokens

        logits : Tensor, shape = (n_batch, vocab_size)
            distribution over the vocabulary at the current step

        sum_logprobs : Tensor, shape = (n_batch)
            running cumulative log probability of each sequence

        Returns
        -------
        tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
            the context with the selected next token appended

        completed : bool
            True once every sequence has reached end-of-text
        """
        raise NotImplementedError

    def finalize(
        self, tokens: Tensor, sum_logprobs: Tensor
    ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]:
        """Finish the search and return the candidate sequences.

        Parameters
        ----------
        tokens : Tensor, shape = (n_audio, n_group, current_sequence_length)
            all sampled contexts, including prefix and sot_sequence tokens

        sum_logprobs : Tensor, shape = (n_audio, n_group)
            cumulative log probability of each sequence

        Returns
        -------
        tokens : Sequence[Sequence[Tensor]], length = n_audio
            candidate token sequences for each audio input

        sum_logprobs : List[List[float]], length = n_audio
            cumulative log probabilities matching the candidates above
        """
        raise NotImplementedError
268
+
269
+
270
class GreedyDecoder(TokenDecoder):
    """Pick tokens greedily (temperature 0) or sample from a scaled Categorical."""

    def __init__(self, temperature: float, eot: int):
        self.temperature = temperature
        self.eot = eot

    def update(
        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
    ) -> Tuple[Tensor, bool]:
        temperature = self.temperature
        if temperature == 0:
            next_tokens = logits.argmax(dim=-1)
        else:
            next_tokens = Categorical(logits=logits / temperature).sample()

        logprobs = F.log_softmax(logits.float(), dim=-1)
        current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
        # only accumulate logprob for sequences that have not yet emitted EOT
        sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)

        # once a sequence ends, it keeps emitting EOT
        next_tokens[tokens[:, -1] == self.eot] = self.eot
        tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)

        completed = (tokens[:, -1] == self.eot).all()
        return tokens, completed

    def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
        # guarantee every sequence ends with at least one EOT token
        tokens = F.pad(tokens, (0, 1), value=self.eot)
        return tokens, sum_logprobs.tolist()
297
+
298
+
299
class BeamSearchDecoder(TokenDecoder):
    """Beam search with the "patience" extension from arXiv:2204.05424."""

    def __init__(
        self,
        beam_size: int,
        eot: int,
        inference: Inference,
        patience: Optional[float] = None,
    ):
        self.beam_size = beam_size
        self.eot = eot
        # needed to reorder the kv cache when beams are re-ranked
        self.inference = inference
        # patience = 1.0 reproduces vanilla beam search
        self.patience = patience or 1.0
        # search stops once this many finished candidates exist per audio input
        self.max_candidates: int = round(beam_size * self.patience)
        self.finished_sequences = None

        assert (
            self.max_candidates > 0
        ), f"Invalid beam size ({beam_size}) or patience ({patience})"

    def reset(self):
        """Clear finished-candidate state before decoding a new batch."""
        self.finished_sequences = None

    def update(
        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
    ) -> Tuple[Tensor, bool]:
        """Advance every beam one token; collects finished (EOT) candidates.

        Note: `sum_logprobs` is overwritten in place with the scores of the
        surviving beams, in the same order as the returned `tokens`.
        """
        if tokens.shape[0] % self.beam_size != 0:
            raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")

        n_audio = tokens.shape[0] // self.beam_size
        if self.finished_sequences is None:  # for the first update
            self.finished_sequences = [{} for _ in range(n_audio)]

        logprobs = F.log_softmax(logits.float(), dim=-1)
        next_tokens, source_indices, finished_sequences = [], [], []
        for i in range(n_audio):
            scores, sources, finished = {}, {}, {}

            # STEP 1: calculate the cumulative log probabilities for possible candidates
            for j in range(self.beam_size):
                idx = i * self.beam_size + j
                prefix = tokens[idx].tolist()
                # beam_size + 1 candidates, so beam_size survive even if one is EOT
                for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)):
                    new_logprob = (sum_logprobs[idx] + logprob).item()
                    sequence = tuple(prefix + [token.item()])
                    scores[sequence] = new_logprob
                    sources[sequence] = idx

            # STEP 2: rank the candidates and keep the top beam_size sequences for each audio
            saved = 0
            for sequence in sorted(scores, key=scores.get, reverse=True):
                if sequence[-1] == self.eot:
                    finished[sequence] = scores[sequence]
                else:
                    sum_logprobs[len(next_tokens)] = scores[sequence]
                    next_tokens.append(sequence)
                    source_indices.append(sources[sequence])

                    saved += 1
                    if saved == self.beam_size:
                        break

            finished_sequences.append(finished)

        tokens = torch.tensor(next_tokens, device=tokens.device)
        # keep the kv cache aligned with the surviving beams
        self.inference.rearrange_kv_cache(source_indices)

        # add newly finished sequences to self.finished_sequences
        assert len(self.finished_sequences) == len(finished_sequences)
        for previously_finished, newly_finished in zip(
            self.finished_sequences, finished_sequences
        ):
            for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
                if len(previously_finished) >= self.max_candidates:
                    break  # the candidate list is full
                previously_finished[seq] = newly_finished[seq]

        # mark as completed if all audio has enough number of samples
        completed = all(
            len(sequences) >= self.max_candidates
            for sequences in self.finished_sequences
        )
        return tokens, completed

    def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
        # collect all finished sequences, including patience, and add unfinished ones if not enough
        sum_logprobs = sum_logprobs.cpu()
        for i, sequences in enumerate(self.finished_sequences):
            if (
                len(sequences) < self.beam_size
            ):  # when not enough sequences are finished
                # pad out with the best unfinished beams, forced to end with EOT
                for j in list(np.argsort(sum_logprobs[i]))[::-1]:
                    sequence = preceding_tokens[i, j].tolist() + [self.eot]
                    sequences[tuple(sequence)] = sum_logprobs[i][j].item()
                    if len(sequences) >= self.beam_size:
                        break

        tokens: List[List[Tensor]] = [
            [torch.tensor(seq) for seq in sequences.keys()]
            for sequences in self.finished_sequences
        ]
        sum_logprobs: List[List[float]] = [
            list(sequences.values()) for sequences in self.finished_sequences
        ]
        return tokens, sum_logprobs
403
+
404
+
405
class LogitFilter:
    """Abstract rule that masks or penalizes logits in place before sampling."""

    def apply(self, logits: Tensor, tokens: Tensor) -> None:
        """Mutate `logits` in place given the context decoded so far.

        Parameters
        ----------
        logits : Tensor, shape = (n_batch, vocab_size)
            distribution over the vocabulary at the current step

        tokens : Tensor, shape = (n_batch, current_sequence_length)
            the full context, including prefix and sot_sequence tokens
        """
        raise NotImplementedError
419
+
420
+
421
class SuppressBlank(LogitFilter):
    """Forbid a blank (" ") or immediate EOT as the very first sampled token."""

    def __init__(self, tokenizer: Tokenizer, sample_begin: int):
        self.tokenizer = tokenizer
        self.sample_begin = sample_begin

    def apply(self, logits: Tensor, tokens: Tensor):
        # only active at the first sampling position
        if tokens.shape[1] == self.sample_begin:
            forbidden = self.tokenizer.encode(" ") + [self.tokenizer.eot]
            logits[:, forbidden] = -np.inf
429
+
430
+
431
class SuppressTokens(LogitFilter):
    """Unconditionally mask out a fixed set of token ids at every step."""

    def __init__(self, suppress_tokens: Sequence[int]):
        self.suppress_tokens = list(suppress_tokens)

    def apply(self, logits: Tensor, tokens: Tensor):
        logits[:, self.suppress_tokens] = -np.inf
437
+
438
+
439
class ApplyTimestampRules(LogitFilter):
    """Enforce Whisper's timestamp grammar on the sampled token stream."""

    def __init__(
        self,
        tokenizer: Tokenizer,
        sample_begin: int,
        max_initial_timestamp_index: Optional[int],
    ):
        self.tokenizer = tokenizer
        self.sample_begin = sample_begin
        # largest timestamp-token offset allowed as the first timestamp; None = no limit
        self.max_initial_timestamp_index = max_initial_timestamp_index

    def apply(self, logits: Tensor, tokens: Tensor):
        # suppress <|notimestamps|> which is handled by without_timestamps
        if self.tokenizer.no_timestamps is not None:
            logits[:, self.tokenizer.no_timestamps] = -np.inf

        # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
        for k in range(tokens.shape[0]):
            sampled_tokens = tokens[k, self.sample_begin :]
            seq = [t for t in sampled_tokens.tolist()]
            last_was_timestamp = (
                len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
            )
            penultimate_was_timestamp = (
                len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin
            )

            if last_was_timestamp:
                if penultimate_was_timestamp:  # has to be non-timestamp
                    logits[k, self.tokenizer.timestamp_begin :] = -np.inf
                else:  # cannot be normal text tokens
                    logits[k, : self.tokenizer.eot] = -np.inf

            timestamps = sampled_tokens[
                sampled_tokens.ge(self.tokenizer.timestamp_begin)
            ]
            if timestamps.numel() > 0:
                # timestamps shouldn't decrease; forbid timestamp tokens smaller than the last
                # also force each segment to have a nonzero length, to prevent infinite looping
                if last_was_timestamp and not penultimate_was_timestamp:
                    timestamp_last = timestamps[-1]
                else:
                    timestamp_last = timestamps[-1] + 1
                logits[k, self.tokenizer.timestamp_begin : timestamp_last] = -np.inf

        if tokens.shape[1] == self.sample_begin:
            # suppress generating non-timestamp tokens at the beginning
            logits[:, : self.tokenizer.timestamp_begin] = -np.inf

            # apply the `max_initial_timestamp` option
            if self.max_initial_timestamp_index is not None:
                last_allowed = (
                    self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
                )
                logits[:, last_allowed + 1 :] = -np.inf

        # if sum of probability over timestamps is above any other token, sample timestamp
        logprobs = F.log_softmax(logits.float(), dim=-1)
        for k in range(tokens.shape[0]):
            timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(
                dim=-1
            )
            max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
            if timestamp_logprob > max_text_token_logprob:
                logits[k, : self.tokenizer.timestamp_begin] = -np.inf
504
+
505
+
506
class DecodingTask:
    """Runs one full decoding pass over a batch of 30-second mel segments.

    Wires together the `Inference` wrapper (kv caching), a `TokenDecoder`
    (greedy or beam search), a `SequenceRanker`, and a list of `LogitFilter`s,
    all configured from the given `DecodingOptions`.
    """

    inference: Inference
    sequence_ranker: SequenceRanker
    decoder: TokenDecoder
    logit_filters: List[LogitFilter]

    def __init__(self, model: "Whisper", options: DecodingOptions):
        self.model = model

        language = options.language or "en"
        tokenizer = get_tokenizer(
            model.is_multilingual, language=language, task=options.task
        )
        self.tokenizer: Tokenizer = tokenizer
        self.options: DecodingOptions = self._verify_options(options)

        self.n_group: int = options.beam_size or options.best_of or 1
        self.n_ctx: int = model.dims.n_text_ctx
        self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2

        self.sot_sequence: Tuple[int] = tokenizer.sot_sequence
        if self.options.without_timestamps:
            self.sot_sequence = tokenizer.sot_sequence_including_notimestamps

        self.initial_tokens: Tuple[int] = self._get_initial_tokens()
        self.sample_begin: int = len(self.initial_tokens)
        self.sot_index: int = self.initial_tokens.index(tokenizer.sot)

        # inference: implements the forward pass through the decoder, including kv caching
        self.inference = PyTorchInference(model, len(self.initial_tokens))

        # sequence ranker: implements how to rank a group of sampled sequences
        self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)

        # decoder: implements how to select the next tokens, given the autoregressive distribution
        if options.beam_size is not None:
            self.decoder = BeamSearchDecoder(
                options.beam_size, tokenizer.eot, self.inference, options.patience
            )
        else:
            self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)

        # logit filters: applies various rules to suppress or penalize certain tokens
        self.logit_filters = []
        if self.options.suppress_blank:
            self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin))
        if self.options.suppress_tokens:
            self.logit_filters.append(SuppressTokens(self._get_suppress_tokens()))
        if not options.without_timestamps:
            precision = CHUNK_LENGTH / model.dims.n_audio_ctx  # usually 0.02 seconds
            max_initial_timestamp_index = None
            if options.max_initial_timestamp:
                max_initial_timestamp_index = round(
                    self.options.max_initial_timestamp / precision
                )
            self.logit_filters.append(
                ApplyTimestampRules(
                    tokenizer, self.sample_begin, max_initial_timestamp_index
                )
            )

    def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
        """Reject mutually-incompatible or out-of-range option combinations."""
        if options.beam_size is not None and options.best_of is not None:
            raise ValueError("beam_size and best_of can't be given together")
        if options.temperature == 0:
            if options.best_of is not None:
                raise ValueError("best_of with greedy sampling (T=0) is not compatible")
        if options.patience is not None and options.beam_size is None:
            raise ValueError("patience requires beam_size to be given")
        if options.length_penalty is not None and not (
            0 <= options.length_penalty <= 1
        ):
            raise ValueError("length_penalty (alpha) should be a value between 0 and 1")

        return options

    def _get_initial_tokens(self) -> Tuple[int]:
        """Build the initial context: [sot_prev + prompt] + sot_sequence + prefix."""
        tokens = list(self.sot_sequence)

        if prefix := self.options.prefix:
            prefix_tokens = (
                self.tokenizer.encode(" " + prefix.strip())
                if isinstance(prefix, str)
                else prefix
            )
            if self.sample_len is not None:
                # leave enough room in the context for `sample_len` new tokens
                max_prefix_len = self.n_ctx // 2 - self.sample_len
                prefix_tokens = prefix_tokens[-max_prefix_len:]
            tokens = tokens + prefix_tokens

        if prompt := self.options.prompt:
            prompt_tokens = (
                self.tokenizer.encode(" " + prompt.strip())
                if isinstance(prompt, str)
                else prompt
            )
            tokens = (
                [self.tokenizer.sot_prev]
                + prompt_tokens[-(self.n_ctx // 2 - 1) :]
                + tokens
            )

        return tuple(tokens)

    def _get_suppress_tokens(self) -> Tuple[int]:
        """Resolve the `suppress_tokens` option into a sorted tuple of token ids."""
        suppress_tokens = self.options.suppress_tokens

        if isinstance(suppress_tokens, str):
            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]

        if -1 in suppress_tokens:
            # "-1" is a placeholder for the tokenizer's non-speech symbol set
            suppress_tokens = [t for t in suppress_tokens if t >= 0]
            suppress_tokens.extend(self.tokenizer.non_speech_tokens)
        elif suppress_tokens is None or len(suppress_tokens) == 0:
            suppress_tokens = []  # interpret empty string as an empty list
        else:
            assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"

        # special tokens are always suppressed so they never get sampled as text
        suppress_tokens.extend(
            [
                self.tokenizer.transcribe,
                self.tokenizer.translate,
                self.tokenizer.sot,
                self.tokenizer.sot_prev,
                self.tokenizer.sot_lm,
            ]
        )
        if self.tokenizer.no_speech is not None:
            # no-speech probability is collected separately
            suppress_tokens.append(self.tokenizer.no_speech)

        return tuple(sorted(set(suppress_tokens)))

    def _get_audio_features(self, mel: Tensor):
        """Encode `mel` (unless it is already encoder output) and validate its dtype."""
        if self.options.fp16:
            mel = mel.half()

        if mel.shape[-2:] == (
            self.model.dims.n_audio_ctx,
            self.model.dims.n_audio_state,
        ):
            # encoded audio features are given; skip audio encoding
            audio_features = mel
        else:
            audio_features = self.model.encoder(mel)

        if audio_features.dtype != (
            torch.float16 if self.options.fp16 else torch.float32
        ):
            # BUGFIX: this previously did `return TypeError(...)`, silently handing
            # the exception object to callers as if it were audio features
            raise TypeError(
                f"audio_features has an incorrect dtype: {audio_features.dtype}"
            )

        return audio_features

    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
        """Detect languages when needed, writing language tokens into `tokens` in place."""
        languages = [self.options.language] * audio_features.shape[0]
        lang_probs = None

        if self.options.language is None or self.options.task == "lang_id":
            lang_tokens, lang_probs = self.model.detect_language(
                audio_features, self.tokenizer
            )
            languages = [max(probs, key=probs.get) for probs in lang_probs]
            if self.options.language is None:
                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens

        return languages, lang_probs

    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
        """Autoregressive sampling loop; returns tokens, logprob sums, no-speech probs."""
        n_batch = tokens.shape[0]
        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
        no_speech_probs = [np.nan] * n_batch

        try:
            for i in range(self.sample_len):
                logits = self.inference.logits(tokens, audio_features)

                if (
                    i == 0 and self.tokenizer.no_speech is not None
                ):  # save no_speech_probs
                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()

                # now we need to consider the logits at the last token only
                logits = logits[:, -1]

                # apply the logit filters, e.g. for suppressing or applying penalty to
                for logit_filter in self.logit_filters:
                    logit_filter.apply(logits, tokens)

                # expand the tokens tensor with the selected next tokens
                tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)

                if completed or tokens.shape[-1] > self.n_ctx:
                    break
        finally:
            # always remove the kv-cache hooks, even if sampling raised
            self.inference.cleanup_caching()

        return tokens, sum_logprobs, no_speech_probs

    @torch.no_grad()
    def run(self, mel: Tensor) -> List[DecodingResult]:
        """Decode every segment in `mel` and return one `DecodingResult` per segment."""
        self.decoder.reset()
        tokenizer: Tokenizer = self.tokenizer
        n_audio: int = mel.shape[0]

        audio_features: Tensor = self._get_audio_features(mel)  # encoder forward pass
        tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)

        # detect language if requested, overwriting the language token
        languages, language_probs = self._detect_language(audio_features, tokens)
        if self.options.task == "lang_id":
            return [
                DecodingResult(
                    audio_features=features, language=language, language_probs=probs
                )
                for features, language, probs in zip(
                    audio_features, languages, language_probs
                )
            ]

        # repeat text tensors by the group size, for beam search or best-of-n sampling
        tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)

        # call the main sampling loop
        tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)

        # reshape the tensors to have (n_audio, n_group) as the first two dimensions
        audio_features = audio_features[:: self.n_group]
        no_speech_probs = no_speech_probs[:: self.n_group]
        assert audio_features.shape[0] == len(no_speech_probs) == n_audio

        tokens = tokens.reshape(n_audio, self.n_group, -1)
        sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group)

        # get the final candidates for each group, and slice between the first sampled token and EOT
        tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs)
        tokens: List[List[Tensor]] = [
            [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s]
            for s in tokens
        ]

        # select the top-ranked sample in each group
        selected = self.sequence_ranker.rank(tokens, sum_logprobs)
        tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)]
        texts: List[str] = [tokenizer.decode(t).strip() for t in tokens]

        sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)]
        avg_logprobs: List[float] = [
            lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)
        ]

        fields = (
            texts,
            languages,
            tokens,
            audio_features,
            avg_logprobs,
            no_speech_probs,
        )
        if len(set(map(len, fields))) != 1:
            raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}")

        return [
            DecodingResult(
                audio_features=features,
                language=language,
                tokens=tokens,
                text=text,
                avg_logprob=avg_logprob,
                no_speech_prob=no_speech_prob,
                temperature=self.options.temperature,
                compression_ratio=compression_ratio(text),
            )
            for text, language, tokens, features, avg_logprob, no_speech_prob in zip(
                *fields
            )
        ]
785
+
786
+
787
@torch.no_grad()
def decode(
    model: "Whisper",
    mel: Tensor,
    options: DecodingOptions = DecodingOptions(),
    **kwargs,
) -> Union[DecodingResult, List[DecodingResult]]:
    """
    Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).

    Parameters
    ----------
    model: Whisper
        the Whisper model instance

    mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
        A tensor containing the Mel spectrogram(s)

    options: DecodingOptions
        A dataclass that contains all necessary options for decoding 30-second segments

    Returns
    -------
    result: Union[DecodingResult, List[DecodingResult]]
        The result(s) of decoding contained in `DecodingResult` dataclass instance(s)
    """
    # remember whether a single unbatched spectrogram was given
    single = mel.ndim == 2
    if single:
        mel = mel.unsqueeze(0)

    # keyword overrides are merged into a fresh (frozen) options instance
    if kwargs:
        options = replace(options, **kwargs)

    results = DecodingTask(model, options).run(mel)
    return results[0] if single else results
whisper/whisper/model.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import gzip
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Iterable, Optional
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import Tensor, nn
10
+
11
+ from .decoding import decode as decode_function
12
+ from .decoding import detect_language as detect_language_function
13
+ from .transcribe import transcribe as transcribe_function
14
+
15
+
16
@dataclass
class ModelDimensions:
    """Sizes of a Whisper model's audio encoder and text decoder."""

    # --- audio encoder ---
    n_mels: int  # Mel filterbank channels in the input spectrogram
    n_audio_ctx: int  # number of audio positions (frames after conv downsampling)
    n_audio_state: int  # hidden-state width of the audio encoder
    n_audio_head: int  # attention heads per audio encoder block
    n_audio_layer: int  # number of audio encoder transformer blocks
    # --- text decoder ---
    n_vocab: int  # token vocabulary size
    n_text_ctx: int  # maximum number of text token positions
    n_text_state: int  # hidden-state width of the text decoder
    n_text_head: int  # attention heads per text decoder block
    n_text_layer: int  # number of text decoder transformer blocks
28
+
29
+
30
class LayerNorm(nn.LayerNorm):
    """LayerNorm that normalizes in float32 and casts back to the input dtype."""

    def forward(self, x: Tensor) -> Tensor:
        # Compute in full precision (stable under fp16), then restore dtype.
        normalized = super().forward(x.float())
        return normalized.type(x.dtype)
33
+
34
+
35
class Linear(nn.Linear):
    """Linear layer that casts its parameters to the input's dtype on the fly."""

    def forward(self, x: Tensor) -> Tensor:
        weight = self.weight.to(x.dtype)
        bias = self.bias
        if bias is not None:
            bias = bias.to(x.dtype)
        return F.linear(x, weight, bias)
42
+
43
+
44
class Conv1d(nn.Conv1d):
    """Conv1d that casts weight and bias to the input's dtype before convolving."""

    def _conv_forward(
        self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
    ) -> Tensor:
        weight = weight.to(x.dtype)
        if bias is not None:
            bias = bias.to(x.dtype)
        return super()._conv_forward(x, weight, bias)
51
+
52
+
53
def sinusoids(length, channels, max_timescale=10000):
    """Returns sinusoids for positional embedding"""
    assert channels % 2 == 0
    half = channels // 2
    # Geometrically spaced timescales from 1 up to max_timescale.
    log_timescale_increment = np.log(max_timescale) / (half - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(half))
    positions = torch.arange(length)
    scaled_time = positions.unsqueeze(1) * inv_timescales.unsqueeze(0)
    # First half of the channels holds sines, second half cosines.
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
60
+
61
+
62
class MultiHeadAttention(nn.Module):
    """Multi-head attention supporting self- or cross-attention and a key/value cache."""

    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = Linear(n_state, n_state)
        # no bias on the key projection (matches the pretrained checkpoint layout)
        self.key = Linear(n_state, n_state, bias=False)
        self.value = Linear(n_state, n_state)
        self.out = Linear(n_state, n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        """
        Attend from `x` to itself (self-attention) or to `xa` (cross-attention).

        x : query input, shape (batch, n_ctx, n_state)
        xa : optional cross-attention input; if None, keys/values come from `x`
        mask : optional additive attention mask (e.g. a causal mask)
        kv_cache : optional dict mapping the key/value projection modules to cached
            tensors; populated by forward hooks (see `Whisper.install_kv_cache_hooks`)

        Returns (output, qk) where `qk` is the detached pre-softmax score matrix.
        """
        q = self.query(x)

        if kv_cache is None or xa is None or self.key not in kv_cache:
            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
            # otherwise, perform key/value projections for self- or cross-attention as usual.
            k = self.key(x if xa is None else xa)
            v = self.value(x if xa is None else xa)
        else:
            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
            k = kv_cache[self.key]
            v = kv_cache[self.value]

        wv, qk = self.qkv_attention(q, k, v, mask)
        return self.out(wv), qk

    def qkv_attention(
        self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None
    ):
        """Scaled dot-product attention; returns (weighted values, detached raw scores)."""
        n_batch, n_ctx, n_state = q.shape
        # split the 1/sqrt(d) scaling between q and k to reduce overflow risk in fp16
        scale = (n_state // self.n_head) ** -0.25
        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)

        qk = q @ k
        if mask is not None:
            # mask is sliced so cached-key decoding with short q still lines up
            qk = qk + mask[:n_ctx, :n_ctx]
        # softmax in float32 for numerical stability
        qk = qk.float()

        w = F.softmax(qk, dim=-1).to(q.dtype)
        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
109
+
110
+
111
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention, optional cross-attention, MLP."""

    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__()

        self.attn = MultiHeadAttention(n_state, n_head)
        self.attn_ln = LayerNorm(n_state)

        if cross_attention:
            self.cross_attn = MultiHeadAttention(n_state, n_head)
            self.cross_attn_ln = LayerNorm(n_state)
        else:
            self.cross_attn = None
            self.cross_attn_ln = None

        # feed-forward network with the conventional 4x expansion
        hidden = n_state * 4
        self.mlp = nn.Sequential(
            Linear(n_state, hidden), nn.GELU(), Linear(hidden, n_state)
        )
        self.mlp_ln = LayerNorm(n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        # Each sub-layer reads a layer-normalized input and is added residually.
        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0]
        if self.cross_attn:
            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0]
        x = x + self.mlp(self.mlp_ln(x))
        return x
141
+
142
+
143
class AudioEncoder(nn.Module):
    """Encodes a log-Mel spectrogram into a sequence of audio feature vectors."""

    def __init__(
        self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
    ):
        super().__init__()
        # Two convolutions over the frame axis; the second halves the time resolution.
        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        # Fixed (non-learned) sinusoidal positional embeddings.
        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        )
        self.ln_post = LayerNorm(n_state)

    def forward(self, x: Tensor):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))
        # (batch, n_state, n_ctx) -> (batch, n_ctx, n_state) for the attention blocks
        x = x.permute(0, 2, 1)

        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
        x = (x + self.positional_embedding).to(x.dtype)

        for block in self.blocks:
            x = block(x)

        return self.ln_post(x)
174
+
175
+
176
class TextDecoder(nn.Module):
    """Transformer decoder over text tokens, cross-attending to audio features."""

    def __init__(
        self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
    ):
        super().__init__()

        self.token_embedding = nn.Embedding(n_vocab, n_state)
        # learned positional embedding; values come from the loaded checkpoint
        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [
                ResidualAttentionBlock(n_state, n_head, cross_attention=True)
                for _ in range(n_layer)
            ]
        )
        self.ln = LayerNorm(n_state)

        # causal mask: position i cannot attend to positions > i (upper triangle = -inf)
        mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
        self.register_buffer("mask", mask, persistent=False)

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
        """
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
            the encoded audio features to be attended on
        """
        # With an active kv cache only the new token suffix is passed in; the
        # cached sequence length gives the positional offset for those tokens.
        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
        x = (
            self.token_embedding(x)
            + self.positional_embedding[offset : offset + x.shape[-1]]
        )
        x = x.to(xa.dtype)

        for block in self.blocks:
            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)

        x = self.ln(x)
        # output projection is tied to the token embedding (weight sharing)
        logits = (
            x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
        ).float()

        return logits
219
+
220
+
221
class Whisper(nn.Module):
    """The full Whisper model: an audio encoder plus a text decoder."""

    def __init__(self, dims: ModelDimensions):
        super().__init__()
        self.dims = dims
        self.encoder = AudioEncoder(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        )
        self.decoder = TextDecoder(
            self.dims.n_vocab,
            self.dims.n_text_ctx,
            self.dims.n_text_state,
            self.dims.n_text_head,
            self.dims.n_text_layer,
        )
        # use the last half layers for alignment by default; see `set_alignment_heads()` below
        all_heads = torch.zeros(
            self.dims.n_text_layer, self.dims.n_text_head, dtype=torch.bool
        )
        all_heads[self.dims.n_text_layer // 2 :] = True
        self.register_buffer("alignment_heads", all_heads.to_sparse(), persistent=False)

    def set_alignment_heads(self, dump: bytes):
        """Replace the alignment-head mask from a base85-encoded, gzipped bool array."""
        array = np.frombuffer(
            gzip.decompress(base64.b85decode(dump)), dtype=bool
        ).copy()
        mask = torch.from_numpy(array).reshape(
            self.dims.n_text_layer, self.dims.n_text_head
        )
        self.register_buffer("alignment_heads", mask.to_sparse(), persistent=False)

    def embed_audio(self, mel: torch.Tensor):
        """Run only the audio encoder on a mel spectrogram."""
        return self.encoder(mel)

    def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
        """Run only the text decoder given precomputed audio features."""
        return self.decoder(tokens, audio_features)

    def forward(
        self, mel: torch.Tensor, tokens: torch.Tensor
    ) -> Dict[str, torch.Tensor]:
        # encode the audio, then decode the text tokens against it
        return self.decoder(tokens, self.encoder(mel))

    @property
    def device(self):
        # device of the first parameter; assumes all parameters share one device
        return next(self.parameters()).device

    @property
    def is_multilingual(self):
        # multilingual checkpoints are identified by their 51865-token vocabulary
        return self.dims.n_vocab == 51865

    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
        """
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.

        Returns
        -------
        cache : Dict[nn.Module, torch.Tensor]
            A dictionary object mapping the key/value projection modules to its cache
        hooks : List[RemovableHandle]
            List of PyTorch RemovableHandle objects to stop the hooks to be called
        """
        cache = {**cache} if cache is not None else {}
        hooks = []

        def save_to_cache(module, _, output):
            # Cross-attention outputs exceed n_text_ctx along dim 1, so they are
            # stored as-is; self-attention outputs are appended along time.
            if module not in cache or output.shape[1] > self.dims.n_text_ctx:
                # save as-is, for the first token or cross attention
                cache[module] = output
            else:
                cache[module] = torch.cat([cache[module], output], dim=1).detach()
            return cache[module]

        def install_hooks(layer: nn.Module):
            # hook only the key/value projections of attention modules in the decoder
            if isinstance(layer, MultiHeadAttention):
                hooks.append(layer.key.register_forward_hook(save_to_cache))
                hooks.append(layer.value.register_forward_hook(save_to_cache))

        self.decoder.apply(install_hooks)
        return cache, hooks

    detect_language = detect_language_function
    transcribe = transcribe_function
    decode = decode_function
whisper/whisper/normalizers/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .basic import BasicTextNormalizer as BasicTextNormalizer
2
+ from .english import EnglishTextNormalizer as EnglishTextNormalizer
whisper/whisper/normalizers/basic.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+
4
+ import regex
5
+
6
# non-ASCII letters that are not separated by "NFKD" normalization
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}


def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Replace any other markers, symbols, and punctuations with a space,
    and drop any diacritics (category 'Mn' and some manual mappings)
    """

    def _map_char(char):
        # explicitly kept characters pass through untouched
        if char in keep:
            return char
        # letters NFKD cannot decompose get a manual ASCII transliteration
        if char in ADDITIONAL_DIACRITICS:
            return ADDITIONAL_DIACRITICS[char]
        category = unicodedata.category(char)
        # combining marks (diacritics) are dropped entirely
        if category == "Mn":
            return ""
        # other marks, symbols, and punctuation become spaces
        if category[0] in "MSP":
            return " "
        return char

    return "".join(_map_char(c) for c in unicodedata.normalize("NFKD", s))
44
+
45
+
46
def remove_symbols(s: str):
    """
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    """
    normalized = unicodedata.normalize("NFKC", s)
    # marks (M), symbols (S), and punctuation (P) are replaced by single spaces
    cleaned = [" " if unicodedata.category(c)[0] in "MSP" else c for c in normalized]
    return "".join(cleaned)
54
+
55
+
56
class BasicTextNormalizer:
    """Lowercases text, strips bracketed spans and symbols, and collapses whitespace."""

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        # choose the cleaning function once, at construction time
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = self.clean(s).lower()

        if self.split_letters:
            # put a space between every extended grapheme cluster
            s = " ".join(regex.findall(r"\X", s, regex.U))

        # replace any successive whitespace characters with a space
        s = re.sub(r"\s+", " ", s)

        return s
whisper/whisper/normalizers/english.json ADDED
@@ -0,0 +1,1741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accessorise": "accessorize",
3
+ "accessorised": "accessorized",
4
+ "accessorises": "accessorizes",
5
+ "accessorising": "accessorizing",
6
+ "acclimatisation": "acclimatization",
7
+ "acclimatise": "acclimatize",
8
+ "acclimatised": "acclimatized",
9
+ "acclimatises": "acclimatizes",
10
+ "acclimatising": "acclimatizing",
11
+ "accoutrements": "accouterments",
12
+ "aeon": "eon",
13
+ "aeons": "eons",
14
+ "aerogramme": "aerogram",
15
+ "aerogrammes": "aerograms",
16
+ "aeroplane": "airplane",
17
+ "aeroplanes": "airplanes",
18
+ "aesthete": "esthete",
19
+ "aesthetes": "esthetes",
20
+ "aesthetic": "esthetic",
21
+ "aesthetically": "esthetically",
22
+ "aesthetics": "esthetics",
23
+ "aetiology": "etiology",
24
+ "ageing": "aging",
25
+ "aggrandisement": "aggrandizement",
26
+ "agonise": "agonize",
27
+ "agonised": "agonized",
28
+ "agonises": "agonizes",
29
+ "agonising": "agonizing",
30
+ "agonisingly": "agonizingly",
31
+ "almanack": "almanac",
32
+ "almanacks": "almanacs",
33
+ "aluminium": "aluminum",
34
+ "amortisable": "amortizable",
35
+ "amortisation": "amortization",
36
+ "amortisations": "amortizations",
37
+ "amortise": "amortize",
38
+ "amortised": "amortized",
39
+ "amortises": "amortizes",
40
+ "amortising": "amortizing",
41
+ "amphitheatre": "amphitheater",
42
+ "amphitheatres": "amphitheaters",
43
+ "anaemia": "anemia",
44
+ "anaemic": "anemic",
45
+ "anaesthesia": "anesthesia",
46
+ "anaesthetic": "anesthetic",
47
+ "anaesthetics": "anesthetics",
48
+ "anaesthetise": "anesthetize",
49
+ "anaesthetised": "anesthetized",
50
+ "anaesthetises": "anesthetizes",
51
+ "anaesthetising": "anesthetizing",
52
+ "anaesthetist": "anesthetist",
53
+ "anaesthetists": "anesthetists",
54
+ "anaesthetize": "anesthetize",
55
+ "anaesthetized": "anesthetized",
56
+ "anaesthetizes": "anesthetizes",
57
+ "anaesthetizing": "anesthetizing",
58
+ "analogue": "analog",
59
+ "analogues": "analogs",
60
+ "analyse": "analyze",
61
+ "analysed": "analyzed",
62
+ "analyses": "analyzes",
63
+ "analysing": "analyzing",
64
+ "anglicise": "anglicize",
65
+ "anglicised": "anglicized",
66
+ "anglicises": "anglicizes",
67
+ "anglicising": "anglicizing",
68
+ "annualised": "annualized",
69
+ "antagonise": "antagonize",
70
+ "antagonised": "antagonized",
71
+ "antagonises": "antagonizes",
72
+ "antagonising": "antagonizing",
73
+ "apologise": "apologize",
74
+ "apologised": "apologized",
75
+ "apologises": "apologizes",
76
+ "apologising": "apologizing",
77
+ "appal": "appall",
78
+ "appals": "appalls",
79
+ "appetiser": "appetizer",
80
+ "appetisers": "appetizers",
81
+ "appetising": "appetizing",
82
+ "appetisingly": "appetizingly",
83
+ "arbour": "arbor",
84
+ "arbours": "arbors",
85
+ "archeological": "archaeological",
86
+ "archaeologically": "archeologically",
87
+ "archaeologist": "archeologist",
88
+ "archaeologists": "archeologists",
89
+ "archaeology": "archeology",
90
+ "ardour": "ardor",
91
+ "armour": "armor",
92
+ "armoured": "armored",
93
+ "armourer": "armorer",
94
+ "armourers": "armorers",
95
+ "armouries": "armories",
96
+ "armoury": "armory",
97
+ "artefact": "artifact",
98
+ "artefacts": "artifacts",
99
+ "authorise": "authorize",
100
+ "authorised": "authorized",
101
+ "authorises": "authorizes",
102
+ "authorising": "authorizing",
103
+ "axe": "ax",
104
+ "backpedalled": "backpedaled",
105
+ "backpedalling": "backpedaling",
106
+ "bannister": "banister",
107
+ "bannisters": "banisters",
108
+ "baptise": "baptize",
109
+ "baptised": "baptized",
110
+ "baptises": "baptizes",
111
+ "baptising": "baptizing",
112
+ "bastardise": "bastardize",
113
+ "bastardised": "bastardized",
114
+ "bastardises": "bastardizes",
115
+ "bastardising": "bastardizing",
116
+ "battleax": "battleaxe",
117
+ "baulk": "balk",
118
+ "baulked": "balked",
119
+ "baulking": "balking",
120
+ "baulks": "balks",
121
+ "bedevilled": "bedeviled",
122
+ "bedevilling": "bedeviling",
123
+ "behaviour": "behavior",
124
+ "behavioural": "behavioral",
125
+ "behaviourism": "behaviorism",
126
+ "behaviourist": "behaviorist",
127
+ "behaviourists": "behaviorists",
128
+ "behaviours": "behaviors",
129
+ "behove": "behoove",
130
+ "behoved": "behooved",
131
+ "behoves": "behooves",
132
+ "bejewelled": "bejeweled",
133
+ "belabour": "belabor",
134
+ "belaboured": "belabored",
135
+ "belabouring": "belaboring",
136
+ "belabours": "belabors",
137
+ "bevelled": "beveled",
138
+ "bevvies": "bevies",
139
+ "bevvy": "bevy",
140
+ "biassed": "biased",
141
+ "biassing": "biasing",
142
+ "bingeing": "binging",
143
+ "bougainvillaea": "bougainvillea",
144
+ "bougainvillaeas": "bougainvilleas",
145
+ "bowdlerise": "bowdlerize",
146
+ "bowdlerised": "bowdlerized",
147
+ "bowdlerises": "bowdlerizes",
148
+ "bowdlerising": "bowdlerizing",
149
+ "breathalyse": "breathalyze",
150
+ "breathalysed": "breathalyzed",
151
+ "breathalyser": "breathalyzer",
152
+ "breathalysers": "breathalyzers",
153
+ "breathalyses": "breathalyzes",
154
+ "breathalysing": "breathalyzing",
155
+ "brutalise": "brutalize",
156
+ "brutalised": "brutalized",
157
+ "brutalises": "brutalizes",
158
+ "brutalising": "brutalizing",
159
+ "busses": "buses",
160
+ "bussing": "busing",
161
+ "caesarean": "cesarean",
162
+ "caesareans": "cesareans",
163
+ "calibre": "caliber",
164
+ "calibres": "calibers",
165
+ "calliper": "caliper",
166
+ "callipers": "calipers",
167
+ "callisthenics": "calisthenics",
168
+ "canalise": "canalize",
169
+ "canalised": "canalized",
170
+ "canalises": "canalizes",
171
+ "canalising": "canalizing",
172
+ "cancelation": "cancellation",
173
+ "cancelations": "cancellations",
174
+ "cancelled": "canceled",
175
+ "cancelling": "canceling",
176
+ "candour": "candor",
177
+ "cannibalise": "cannibalize",
178
+ "cannibalised": "cannibalized",
179
+ "cannibalises": "cannibalizes",
180
+ "cannibalising": "cannibalizing",
181
+ "canonise": "canonize",
182
+ "canonised": "canonized",
183
+ "canonises": "canonizes",
184
+ "canonising": "canonizing",
185
+ "capitalise": "capitalize",
186
+ "capitalised": "capitalized",
187
+ "capitalises": "capitalizes",
188
+ "capitalising": "capitalizing",
189
+ "caramelise": "caramelize",
190
+ "caramelised": "caramelized",
191
+ "caramelises": "caramelizes",
192
+ "caramelising": "caramelizing",
193
+ "carbonise": "carbonize",
194
+ "carbonised": "carbonized",
195
+ "carbonises": "carbonizes",
196
+ "carbonising": "carbonizing",
197
+ "carolled": "caroled",
198
+ "carolling": "caroling",
199
+ "catalogue": "catalog",
200
+ "catalogued": "cataloged",
201
+ "catalogues": "catalogs",
202
+ "cataloguing": "cataloging",
203
+ "catalyse": "catalyze",
204
+ "catalysed": "catalyzed",
205
+ "catalyses": "catalyzes",
206
+ "catalysing": "catalyzing",
207
+ "categorise": "categorize",
208
+ "categorised": "categorized",
209
+ "categorises": "categorizes",
210
+ "categorising": "categorizing",
211
+ "cauterise": "cauterize",
212
+ "cauterised": "cauterized",
213
+ "cauterises": "cauterizes",
214
+ "cauterising": "cauterizing",
215
+ "cavilled": "caviled",
216
+ "cavilling": "caviling",
217
+ "centigramme": "centigram",
218
+ "centigrammes": "centigrams",
219
+ "centilitre": "centiliter",
220
+ "centilitres": "centiliters",
221
+ "centimetre": "centimeter",
222
+ "centimetres": "centimeters",
223
+ "centralise": "centralize",
224
+ "centralised": "centralized",
225
+ "centralises": "centralizes",
226
+ "centralising": "centralizing",
227
+ "centre": "center",
228
+ "centred": "centered",
229
+ "centrefold": "centerfold",
230
+ "centrefolds": "centerfolds",
231
+ "centrepiece": "centerpiece",
232
+ "centrepieces": "centerpieces",
233
+ "centres": "centers",
234
+ "channelled": "channeled",
235
+ "channelling": "channeling",
236
+ "characterise": "characterize",
237
+ "characterised": "characterized",
238
+ "characterises": "characterizes",
239
+ "characterising": "characterizing",
240
+ "cheque": "check",
241
+ "chequebook": "checkbook",
242
+ "chequebooks": "checkbooks",
243
+ "chequered": "checkered",
244
+ "cheques": "checks",
245
+ "chilli": "chili",
246
+ "chimaera": "chimera",
247
+ "chimaeras": "chimeras",
248
+ "chiselled": "chiseled",
249
+ "chiselling": "chiseling",
250
+ "circularise": "circularize",
251
+ "circularised": "circularized",
252
+ "circularises": "circularizes",
253
+ "circularising": "circularizing",
254
+ "civilise": "civilize",
255
+ "civilised": "civilized",
256
+ "civilises": "civilizes",
257
+ "civilising": "civilizing",
258
+ "clamour": "clamor",
259
+ "clamoured": "clamored",
260
+ "clamouring": "clamoring",
261
+ "clamours": "clamors",
262
+ "clangour": "clangor",
263
+ "clarinettist": "clarinetist",
264
+ "clarinettists": "clarinetists",
265
+ "collectivise": "collectivize",
266
+ "collectivised": "collectivized",
267
+ "collectivises": "collectivizes",
268
+ "collectivising": "collectivizing",
269
+ "colonisation": "colonization",
270
+ "colonise": "colonize",
271
+ "colonised": "colonized",
272
+ "coloniser": "colonizer",
273
+ "colonisers": "colonizers",
274
+ "colonises": "colonizes",
275
+ "colonising": "colonizing",
276
+ "colour": "color",
277
+ "colourant": "colorant",
278
+ "colourants": "colorants",
279
+ "coloured": "colored",
280
+ "coloureds": "coloreds",
281
+ "colourful": "colorful",
282
+ "colourfully": "colorfully",
283
+ "colouring": "coloring",
284
+ "colourize": "colorize",
285
+ "colourized": "colorized",
286
+ "colourizes": "colorizes",
287
+ "colourizing": "colorizing",
288
+ "colourless": "colorless",
289
+ "colours": "colors",
290
+ "commercialise": "commercialize",
291
+ "commercialised": "commercialized",
292
+ "commercialises": "commercializes",
293
+ "commercialising": "commercializing",
294
+ "compartmentalise": "compartmentalize",
295
+ "compartmentalised": "compartmentalized",
296
+ "compartmentalises": "compartmentalizes",
297
+ "compartmentalising": "compartmentalizing",
298
+ "computerise": "computerize",
299
+ "computerised": "computerized",
300
+ "computerises": "computerizes",
301
+ "computerising": "computerizing",
302
+ "conceptualise": "conceptualize",
303
+ "conceptualised": "conceptualized",
304
+ "conceptualises": "conceptualizes",
305
+ "conceptualising": "conceptualizing",
306
+ "connexion": "connection",
307
+ "connexions": "connections",
308
+ "contextualise": "contextualize",
309
+ "contextualised": "contextualized",
310
+ "contextualises": "contextualizes",
311
+ "contextualising": "contextualizing",
312
+ "cosier": "cozier",
313
+ "cosies": "cozies",
314
+ "cosiest": "coziest",
315
+ "cosily": "cozily",
316
+ "cosiness": "coziness",
317
+ "cosy": "cozy",
318
+ "councillor": "councilor",
319
+ "councillors": "councilors",
320
+ "counselled": "counseled",
321
+ "counselling": "counseling",
322
+ "counsellor": "counselor",
323
+ "counsellors": "counselors",
324
+ "crenelated": "crenellated",
325
+ "criminalise": "criminalize",
326
+ "criminalised": "criminalized",
327
+ "criminalises": "criminalizes",
328
+ "criminalising": "criminalizing",
329
+ "criticise": "criticize",
330
+ "criticised": "criticized",
331
+ "criticises": "criticizes",
332
+ "criticising": "criticizing",
333
+ "crueller": "crueler",
334
+ "cruellest": "cruelest",
335
+ "crystallisation": "crystallization",
336
+ "crystallise": "crystallize",
337
+ "crystallised": "crystallized",
338
+ "crystallises": "crystallizes",
339
+ "crystallising": "crystallizing",
340
+ "cudgelled": "cudgeled",
341
+ "cudgelling": "cudgeling",
342
+ "customise": "customize",
343
+ "customised": "customized",
344
+ "customises": "customizes",
345
+ "customising": "customizing",
346
+ "cypher": "cipher",
347
+ "cyphers": "ciphers",
348
+ "decentralisation": "decentralization",
349
+ "decentralise": "decentralize",
350
+ "decentralised": "decentralized",
351
+ "decentralises": "decentralizes",
352
+ "decentralising": "decentralizing",
353
+ "decriminalisation": "decriminalization",
354
+ "decriminalise": "decriminalize",
355
+ "decriminalised": "decriminalized",
356
+ "decriminalises": "decriminalizes",
357
+ "decriminalising": "decriminalizing",
358
+ "defence": "defense",
359
+ "defenceless": "defenseless",
360
+ "defences": "defenses",
361
+ "dehumanisation": "dehumanization",
362
+ "dehumanise": "dehumanize",
363
+ "dehumanised": "dehumanized",
364
+ "dehumanises": "dehumanizes",
365
+ "dehumanising": "dehumanizing",
366
+ "demeanour": "demeanor",
367
+ "demilitarisation": "demilitarization",
368
+ "demilitarise": "demilitarize",
369
+ "demilitarised": "demilitarized",
370
+ "demilitarises": "demilitarizes",
371
+ "demilitarising": "demilitarizing",
372
+ "demobilisation": "demobilization",
373
+ "demobilise": "demobilize",
374
+ "demobilised": "demobilized",
375
+ "demobilises": "demobilizes",
376
+ "demobilising": "demobilizing",
377
+ "democratisation": "democratization",
378
+ "democratise": "democratize",
379
+ "democratised": "democratized",
380
+ "democratises": "democratizes",
381
+ "democratising": "democratizing",
382
+ "demonise": "demonize",
383
+ "demonised": "demonized",
384
+ "demonises": "demonizes",
385
+ "demonising": "demonizing",
386
+ "demoralisation": "demoralization",
387
+ "demoralise": "demoralize",
388
+ "demoralised": "demoralized",
389
+ "demoralises": "demoralizes",
390
+ "demoralising": "demoralizing",
391
+ "denationalisation": "denationalization",
392
+ "denationalise": "denationalize",
393
+ "denationalised": "denationalized",
394
+ "denationalises": "denationalizes",
395
+ "denationalising": "denationalizing",
396
+ "deodorise": "deodorize",
397
+ "deodorised": "deodorized",
398
+ "deodorises": "deodorizes",
399
+ "deodorising": "deodorizing",
400
+ "depersonalise": "depersonalize",
401
+ "depersonalised": "depersonalized",
402
+ "depersonalises": "depersonalizes",
403
+ "depersonalising": "depersonalizing",
404
+ "deputise": "deputize",
405
+ "deputised": "deputized",
406
+ "deputises": "deputizes",
407
+ "deputising": "deputizing",
408
+ "desensitisation": "desensitization",
409
+ "desensitise": "desensitize",
410
+ "desensitised": "desensitized",
411
+ "desensitises": "desensitizes",
412
+ "desensitising": "desensitizing",
413
+ "destabilisation": "destabilization",
414
+ "destabilise": "destabilize",
415
+ "destabilised": "destabilized",
416
+ "destabilises": "destabilizes",
417
+ "destabilising": "destabilizing",
418
+ "dialled": "dialed",
419
+ "dialling": "dialing",
420
+ "dialogue": "dialog",
421
+ "dialogues": "dialogs",
422
+ "diarrhoea": "diarrhea",
423
+ "digitise": "digitize",
424
+ "digitised": "digitized",
425
+ "digitises": "digitizes",
426
+ "digitising": "digitizing",
427
+ "disc": "disk",
428
+ "discolour": "discolor",
429
+ "discoloured": "discolored",
430
+ "discolouring": "discoloring",
431
+ "discolours": "discolors",
432
+ "discs": "disks",
433
+ "disembowelled": "disemboweled",
434
+ "disembowelling": "disemboweling",
435
+ "disfavour": "disfavor",
436
+ "dishevelled": "disheveled",
437
+ "dishonour": "dishonor",
438
+ "dishonourable": "dishonorable",
439
+ "dishonourably": "dishonorably",
440
+ "dishonoured": "dishonored",
441
+ "dishonouring": "dishonoring",
442
+ "dishonours": "dishonors",
443
+ "disorganisation": "disorganization",
444
+ "disorganised": "disorganized",
445
+ "distil": "distill",
446
+ "distils": "distills",
447
+ "dramatisation": "dramatization",
448
+ "dramatisations": "dramatizations",
449
+ "dramatise": "dramatize",
450
+ "dramatised": "dramatized",
451
+ "dramatises": "dramatizes",
452
+ "dramatising": "dramatizing",
453
+ "draught": "draft",
454
+ "draughtboard": "draftboard",
455
+ "draughtboards": "draftboards",
456
+ "draughtier": "draftier",
457
+ "draughtiest": "draftiest",
458
+ "draughts": "drafts",
459
+ "draughtsman": "draftsman",
460
+ "draughtsmanship": "draftsmanship",
461
+ "draughtsmen": "draftsmen",
462
+ "draughtswoman": "draftswoman",
463
+ "draughtswomen": "draftswomen",
464
+ "draughty": "drafty",
465
+ "drivelled": "driveled",
466
+ "drivelling": "driveling",
467
+ "duelled": "dueled",
468
+ "duelling": "dueling",
469
+ "economise": "economize",
470
+ "economised": "economized",
471
+ "economises": "economizes",
472
+ "economising": "economizing",
473
+ "edoema": "edema",
474
+ "editorialise": "editorialize",
475
+ "editorialised": "editorialized",
476
+ "editorialises": "editorializes",
477
+ "editorialising": "editorializing",
478
+ "empathise": "empathize",
479
+ "empathised": "empathized",
480
+ "empathises": "empathizes",
481
+ "empathising": "empathizing",
482
+ "emphasise": "emphasize",
483
+ "emphasised": "emphasized",
484
+ "emphasises": "emphasizes",
485
+ "emphasising": "emphasizing",
486
+ "enamelled": "enameled",
487
+ "enamelling": "enameling",
488
+ "enamoured": "enamored",
489
+ "encyclopaedia": "encyclopedia",
490
+ "encyclopaedias": "encyclopedias",
491
+ "encyclopaedic": "encyclopedic",
492
+ "endeavour": "endeavor",
493
+ "endeavoured": "endeavored",
494
+ "endeavouring": "endeavoring",
495
+ "endeavours": "endeavors",
496
+ "energise": "energize",
497
+ "energised": "energized",
498
+ "energises": "energizes",
499
+ "energising": "energizing",
500
+ "enrol": "enroll",
501
+ "enrols": "enrolls",
502
+ "enthral": "enthrall",
503
+ "enthrals": "enthralls",
504
+ "epaulette": "epaulet",
505
+ "epaulettes": "epaulets",
506
+ "epicentre": "epicenter",
507
+ "epicentres": "epicenters",
508
+ "epilogue": "epilog",
509
+ "epilogues": "epilogs",
510
+ "epitomise": "epitomize",
511
+ "epitomised": "epitomized",
512
+ "epitomises": "epitomizes",
513
+ "epitomising": "epitomizing",
514
+ "equalisation": "equalization",
515
+ "equalise": "equalize",
516
+ "equalised": "equalized",
517
+ "equaliser": "equalizer",
518
+ "equalisers": "equalizers",
519
+ "equalises": "equalizes",
520
+ "equalising": "equalizing",
521
+ "eulogise": "eulogize",
522
+ "eulogised": "eulogized",
523
+ "eulogises": "eulogizes",
524
+ "eulogising": "eulogizing",
525
+ "evangelise": "evangelize",
526
+ "evangelised": "evangelized",
527
+ "evangelises": "evangelizes",
528
+ "evangelising": "evangelizing",
529
+ "exorcise": "exorcize",
530
+ "exorcised": "exorcized",
531
+ "exorcises": "exorcizes",
532
+ "exorcising": "exorcizing",
533
+ "extemporisation": "extemporization",
534
+ "extemporise": "extemporize",
535
+ "extemporised": "extemporized",
536
+ "extemporises": "extemporizes",
537
+ "extemporising": "extemporizing",
538
+ "externalisation": "externalization",
539
+ "externalisations": "externalizations",
540
+ "externalise": "externalize",
541
+ "externalised": "externalized",
542
+ "externalises": "externalizes",
543
+ "externalising": "externalizing",
544
+ "factorise": "factorize",
545
+ "factorised": "factorized",
546
+ "factorises": "factorizes",
547
+ "factorising": "factorizing",
548
+ "faecal": "fecal",
549
+ "faeces": "feces",
550
+ "familiarisation": "familiarization",
551
+ "familiarise": "familiarize",
552
+ "familiarised": "familiarized",
553
+ "familiarises": "familiarizes",
554
+ "familiarising": "familiarizing",
555
+ "fantasise": "fantasize",
556
+ "fantasised": "fantasized",
557
+ "fantasises": "fantasizes",
558
+ "fantasising": "fantasizing",
559
+ "favour": "favor",
560
+ "favourable": "favorable",
561
+ "favourably": "favorably",
562
+ "favoured": "favored",
563
+ "favouring": "favoring",
564
+ "favourite": "favorite",
565
+ "favourites": "favorites",
566
+ "favouritism": "favoritism",
567
+ "favours": "favors",
568
+ "feminise": "feminize",
569
+ "feminised": "feminized",
570
+ "feminises": "feminizes",
571
+ "feminising": "feminizing",
572
+ "fertilisation": "fertilization",
573
+ "fertilise": "fertilize",
574
+ "fertilised": "fertilized",
575
+ "fertiliser": "fertilizer",
576
+ "fertilisers": "fertilizers",
577
+ "fertilises": "fertilizes",
578
+ "fertilising": "fertilizing",
579
+ "fervour": "fervor",
580
+ "fibre": "fiber",
581
+ "fibreglass": "fiberglass",
582
+ "fibres": "fibers",
583
+ "fictionalisation": "fictionalization",
584
+ "fictionalisations": "fictionalizations",
585
+ "fictionalise": "fictionalize",
586
+ "fictionalised": "fictionalized",
587
+ "fictionalises": "fictionalizes",
588
+ "fictionalising": "fictionalizing",
589
+ "fillet": "filet",
590
+ "filleted": "fileted",
591
+ "filleting": "fileting",
592
+ "fillets": "filets",
593
+ "finalisation": "finalization",
594
+ "finalise": "finalize",
595
+ "finalised": "finalized",
596
+ "finalises": "finalizes",
597
+ "finalising": "finalizing",
598
+ "flautist": "flutist",
599
+ "flautists": "flutists",
600
+ "flavour": "flavor",
601
+ "flavoured": "flavored",
602
+ "flavouring": "flavoring",
603
+ "flavourings": "flavorings",
604
+ "flavourless": "flavorless",
605
+ "flavours": "flavors",
606
+ "flavoursome": "flavorsome",
607
+ "flyer / flier": "flier / flyer",
608
+ "foetal": "fetal",
609
+ "foetid": "fetid",
610
+ "foetus": "fetus",
611
+ "foetuses": "fetuses",
612
+ "formalisation": "formalization",
613
+ "formalise": "formalize",
614
+ "formalised": "formalized",
615
+ "formalises": "formalizes",
616
+ "formalising": "formalizing",
617
+ "fossilisation": "fossilization",
618
+ "fossilise": "fossilize",
619
+ "fossilised": "fossilized",
620
+ "fossilises": "fossilizes",
621
+ "fossilising": "fossilizing",
622
+ "fraternisation": "fraternization",
623
+ "fraternise": "fraternize",
624
+ "fraternised": "fraternized",
625
+ "fraternises": "fraternizes",
626
+ "fraternising": "fraternizing",
627
+ "fulfil": "fulfill",
628
+ "fulfilment": "fulfillment",
629
+ "fulfils": "fulfills",
630
+ "funnelled": "funneled",
631
+ "funnelling": "funneling",
632
+ "galvanise": "galvanize",
633
+ "galvanised": "galvanized",
634
+ "galvanises": "galvanizes",
635
+ "galvanising": "galvanizing",
636
+ "gambolled": "gamboled",
637
+ "gambolling": "gamboling",
638
+ "gaol": "jail",
639
+ "gaolbird": "jailbird",
640
+ "gaolbirds": "jailbirds",
641
+ "gaolbreak": "jailbreak",
642
+ "gaolbreaks": "jailbreaks",
643
+ "gaoled": "jailed",
644
+ "gaoler": "jailer",
645
+ "gaolers": "jailers",
646
+ "gaoling": "jailing",
647
+ "gaols": "jails",
648
+ "gasses": "gases",
649
+ "gage": "gauge",
650
+ "gaged": "gauged",
651
+ "gages": "gauges",
652
+ "gaging": "gauging",
653
+ "generalisation": "generalization",
654
+ "generalisations": "generalizations",
655
+ "generalise": "generalize",
656
+ "generalised": "generalized",
657
+ "generalises": "generalizes",
658
+ "generalising": "generalizing",
659
+ "ghettoise": "ghettoize",
660
+ "ghettoised": "ghettoized",
661
+ "ghettoises": "ghettoizes",
662
+ "ghettoising": "ghettoizing",
663
+ "gipsies": "gypsies",
664
+ "glamorise": "glamorize",
665
+ "glamorised": "glamorized",
666
+ "glamorises": "glamorizes",
667
+ "glamorising": "glamorizing",
668
+ "glamor": "glamour",
669
+ "globalisation": "globalization",
670
+ "globalise": "globalize",
671
+ "globalised": "globalized",
672
+ "globalises": "globalizes",
673
+ "globalising": "globalizing",
674
+ "glueing": "gluing",
675
+ "goitre": "goiter",
676
+ "goitres": "goiters",
677
+ "gonorrhoea": "gonorrhea",
678
+ "gramme": "gram",
679
+ "grammes": "grams",
680
+ "gravelled": "graveled",
681
+ "grey": "gray",
682
+ "greyed": "grayed",
683
+ "greying": "graying",
684
+ "greyish": "grayish",
685
+ "greyness": "grayness",
686
+ "greys": "grays",
687
+ "grovelled": "groveled",
688
+ "grovelling": "groveling",
689
+ "groyne": "groin",
690
+ "groynes": "groins",
691
+ "gruelling": "grueling",
692
+ "gruellingly": "gruelingly",
693
+ "gryphon": "griffin",
694
+ "gryphons": "griffins",
695
+ "gynaecological": "gynecological",
696
+ "gynaecologist": "gynecologist",
697
+ "gynaecologists": "gynecologists",
698
+ "gynaecology": "gynecology",
699
+ "haematological": "hematological",
700
+ "haematologist": "hematologist",
701
+ "haematologists": "hematologists",
702
+ "haematology": "hematology",
703
+ "haemoglobin": "hemoglobin",
704
+ "haemophilia": "hemophilia",
705
+ "haemophiliac": "hemophiliac",
706
+ "haemophiliacs": "hemophiliacs",
707
+ "haemorrhage": "hemorrhage",
708
+ "haemorrhaged": "hemorrhaged",
709
+ "haemorrhages": "hemorrhages",
710
+ "haemorrhaging": "hemorrhaging",
711
+ "haemorrhoids": "hemorrhoids",
712
+ "harbour": "harbor",
713
+ "harboured": "harbored",
714
+ "harbouring": "harboring",
715
+ "harbours": "harbors",
716
+ "harmonisation": "harmonization",
717
+ "harmonise": "harmonize",
718
+ "harmonised": "harmonized",
719
+ "harmonises": "harmonizes",
720
+ "harmonising": "harmonizing",
721
+ "homoeopath": "homeopath",
722
+ "homoeopathic": "homeopathic",
723
+ "homoeopaths": "homeopaths",
724
+ "homoeopathy": "homeopathy",
725
+ "homogenise": "homogenize",
726
+ "homogenised": "homogenized",
727
+ "homogenises": "homogenizes",
728
+ "homogenising": "homogenizing",
729
+ "honour": "honor",
730
+ "honourable": "honorable",
731
+ "honourably": "honorably",
732
+ "honoured": "honored",
733
+ "honouring": "honoring",
734
+ "honours": "honors",
735
+ "hospitalisation": "hospitalization",
736
+ "hospitalise": "hospitalize",
737
+ "hospitalised": "hospitalized",
738
+ "hospitalises": "hospitalizes",
739
+ "hospitalising": "hospitalizing",
740
+ "humanise": "humanize",
741
+ "humanised": "humanized",
742
+ "humanises": "humanizes",
743
+ "humanising": "humanizing",
744
+ "humour": "humor",
745
+ "humoured": "humored",
746
+ "humouring": "humoring",
747
+ "humourless": "humorless",
748
+ "humours": "humors",
749
+ "hybridise": "hybridize",
750
+ "hybridised": "hybridized",
751
+ "hybridises": "hybridizes",
752
+ "hybridising": "hybridizing",
753
+ "hypnotise": "hypnotize",
754
+ "hypnotised": "hypnotized",
755
+ "hypnotises": "hypnotizes",
756
+ "hypnotising": "hypnotizing",
757
+ "hypothesise": "hypothesize",
758
+ "hypothesised": "hypothesized",
759
+ "hypothesises": "hypothesizes",
760
+ "hypothesising": "hypothesizing",
761
+ "idealisation": "idealization",
762
+ "idealise": "idealize",
763
+ "idealised": "idealized",
764
+ "idealises": "idealizes",
765
+ "idealising": "idealizing",
766
+ "idolise": "idolize",
767
+ "idolised": "idolized",
768
+ "idolises": "idolizes",
769
+ "idolising": "idolizing",
770
+ "immobilisation": "immobilization",
771
+ "immobilise": "immobilize",
772
+ "immobilised": "immobilized",
773
+ "immobiliser": "immobilizer",
774
+ "immobilisers": "immobilizers",
775
+ "immobilises": "immobilizes",
776
+ "immobilising": "immobilizing",
777
+ "immortalise": "immortalize",
778
+ "immortalised": "immortalized",
779
+ "immortalises": "immortalizes",
780
+ "immortalising": "immortalizing",
781
+ "immunisation": "immunization",
782
+ "immunise": "immunize",
783
+ "immunised": "immunized",
784
+ "immunises": "immunizes",
785
+ "immunising": "immunizing",
786
+ "impanelled": "impaneled",
787
+ "impanelling": "impaneling",
788
+ "imperilled": "imperiled",
789
+ "imperilling": "imperiling",
790
+ "individualise": "individualize",
791
+ "individualised": "individualized",
792
+ "individualises": "individualizes",
793
+ "individualising": "individualizing",
794
+ "industrialise": "industrialize",
795
+ "industrialised": "industrialized",
796
+ "industrialises": "industrializes",
797
+ "industrialising": "industrializing",
798
+ "inflexion": "inflection",
799
+ "inflexions": "inflections",
800
+ "initialise": "initialize",
801
+ "initialised": "initialized",
802
+ "initialises": "initializes",
803
+ "initialising": "initializing",
804
+ "initialled": "initialed",
805
+ "initialling": "initialing",
806
+ "instal": "install",
807
+ "instalment": "installment",
808
+ "instalments": "installments",
809
+ "instals": "installs",
810
+ "instil": "instill",
811
+ "instils": "instills",
812
+ "institutionalisation": "institutionalization",
813
+ "institutionalise": "institutionalize",
814
+ "institutionalised": "institutionalized",
815
+ "institutionalises": "institutionalizes",
816
+ "institutionalising": "institutionalizing",
817
+ "intellectualise": "intellectualize",
818
+ "intellectualised": "intellectualized",
819
+ "intellectualises": "intellectualizes",
820
+ "intellectualising": "intellectualizing",
821
+ "internalisation": "internalization",
822
+ "internalise": "internalize",
823
+ "internalised": "internalized",
824
+ "internalises": "internalizes",
825
+ "internalising": "internalizing",
826
+ "internationalisation": "internationalization",
827
+ "internationalise": "internationalize",
828
+ "internationalised": "internationalized",
829
+ "internationalises": "internationalizes",
830
+ "internationalising": "internationalizing",
831
+ "ionisation": "ionization",
832
+ "ionise": "ionize",
833
+ "ionised": "ionized",
834
+ "ioniser": "ionizer",
835
+ "ionisers": "ionizers",
836
+ "ionises": "ionizes",
837
+ "ionising": "ionizing",
838
+ "italicise": "italicize",
839
+ "italicised": "italicized",
840
+ "italicises": "italicizes",
841
+ "italicising": "italicizing",
842
+ "itemise": "itemize",
843
+ "itemised": "itemized",
844
+ "itemises": "itemizes",
845
+ "itemising": "itemizing",
846
+ "jeopardise": "jeopardize",
847
+ "jeopardised": "jeopardized",
848
+ "jeopardises": "jeopardizes",
849
+ "jeopardising": "jeopardizing",
850
+ "jewelled": "jeweled",
851
+ "jeweller": "jeweler",
852
+ "jewellers": "jewelers",
853
+ "jewellery": "jewelry",
854
+ "judgement": "judgment",
855
+ "kilogramme": "kilogram",
856
+ "kilogrammes": "kilograms",
857
+ "kilometre": "kilometer",
858
+ "kilometres": "kilometers",
859
+ "labelled": "labeled",
860
+ "labelling": "labeling",
861
+ "labour": "labor",
862
+ "laboured": "labored",
863
+ "labourer": "laborer",
864
+ "labourers": "laborers",
865
+ "labouring": "laboring",
866
+ "labours": "labors",
867
+ "lacklustre": "lackluster",
868
+ "legalisation": "legalization",
869
+ "legalise": "legalize",
870
+ "legalised": "legalized",
871
+ "legalises": "legalizes",
872
+ "legalising": "legalizing",
873
+ "legitimise": "legitimize",
874
+ "legitimised": "legitimized",
875
+ "legitimises": "legitimizes",
876
+ "legitimising": "legitimizing",
877
+ "leukaemia": "leukemia",
878
+ "levelled": "leveled",
879
+ "leveller": "leveler",
880
+ "levellers": "levelers",
881
+ "levelling": "leveling",
882
+ "libelled": "libeled",
883
+ "libelling": "libeling",
884
+ "libellous": "libelous",
885
+ "liberalisation": "liberalization",
886
+ "liberalise": "liberalize",
887
+ "liberalised": "liberalized",
888
+ "liberalises": "liberalizes",
889
+ "liberalising": "liberalizing",
890
+ "licence": "license",
891
+ "licenced": "licensed",
892
+ "licences": "licenses",
893
+ "licencing": "licensing",
894
+ "likeable": "likable",
895
+ "lionisation": "lionization",
896
+ "lionise": "lionize",
897
+ "lionised": "lionized",
898
+ "lionises": "lionizes",
899
+ "lionising": "lionizing",
900
+ "liquidise": "liquidize",
901
+ "liquidised": "liquidized",
902
+ "liquidiser": "liquidizer",
903
+ "liquidisers": "liquidizers",
904
+ "liquidises": "liquidizes",
905
+ "liquidising": "liquidizing",
906
+ "litre": "liter",
907
+ "litres": "liters",
908
+ "localise": "localize",
909
+ "localised": "localized",
910
+ "localises": "localizes",
911
+ "localising": "localizing",
912
+ "louvre": "louver",
913
+ "louvred": "louvered",
914
+ "louvres": "louvers",
915
+ "lustre": "luster",
916
+ "magnetise": "magnetize",
917
+ "magnetised": "magnetized",
918
+ "magnetises": "magnetizes",
919
+ "magnetising": "magnetizing",
920
+ "manoeuvrability": "maneuverability",
921
+ "manoeuvrable": "maneuverable",
922
+ "manoeuvre": "maneuver",
923
+ "manoeuvred": "maneuvered",
924
+ "manoeuvres": "maneuvers",
925
+ "manoeuvring": "maneuvering",
926
+ "manoeuvrings": "maneuverings",
927
+ "marginalisation": "marginalization",
928
+ "marginalise": "marginalize",
929
+ "marginalised": "marginalized",
930
+ "marginalises": "marginalizes",
931
+ "marginalising": "marginalizing",
932
+ "marshalled": "marshaled",
933
+ "marshalling": "marshaling",
934
+ "marvelled": "marveled",
935
+ "marvelling": "marveling",
936
+ "marvellous": "marvelous",
937
+ "marvellously": "marvelously",
938
+ "materialisation": "materialization",
939
+ "materialise": "materialize",
940
+ "materialised": "materialized",
941
+ "materialises": "materializes",
942
+ "materialising": "materializing",
943
+ "maximisation": "maximization",
944
+ "maximise": "maximize",
945
+ "maximised": "maximized",
946
+ "maximises": "maximizes",
947
+ "maximising": "maximizing",
948
+ "meagre": "meager",
949
+ "mechanisation": "mechanization",
950
+ "mechanise": "mechanize",
951
+ "mechanised": "mechanized",
952
+ "mechanises": "mechanizes",
953
+ "mechanising": "mechanizing",
954
+ "mediaeval": "medieval",
955
+ "memorialise": "memorialize",
956
+ "memorialised": "memorialized",
957
+ "memorialises": "memorializes",
958
+ "memorialising": "memorializing",
959
+ "memorise": "memorize",
960
+ "memorised": "memorized",
961
+ "memorises": "memorizes",
962
+ "memorising": "memorizing",
963
+ "mesmerise": "mesmerize",
964
+ "mesmerised": "mesmerized",
965
+ "mesmerises": "mesmerizes",
966
+ "mesmerising": "mesmerizing",
967
+ "metabolise": "metabolize",
968
+ "metabolised": "metabolized",
969
+ "metabolises": "metabolizes",
970
+ "metabolising": "metabolizing",
971
+ "metre": "meter",
972
+ "metres": "meters",
973
+ "micrometre": "micrometer",
974
+ "micrometres": "micrometers",
975
+ "militarise": "militarize",
976
+ "militarised": "militarized",
977
+ "militarises": "militarizes",
978
+ "militarising": "militarizing",
979
+ "milligramme": "milligram",
980
+ "milligrammes": "milligrams",
981
+ "millilitre": "milliliter",
982
+ "millilitres": "milliliters",
983
+ "millimetre": "millimeter",
984
+ "millimetres": "millimeters",
985
+ "miniaturisation": "miniaturization",
986
+ "miniaturise": "miniaturize",
987
+ "miniaturised": "miniaturized",
988
+ "miniaturises": "miniaturizes",
989
+ "miniaturising": "miniaturizing",
990
+ "minibusses": "minibuses",
991
+ "minimise": "minimize",
992
+ "minimised": "minimized",
993
+ "minimises": "minimizes",
994
+ "minimising": "minimizing",
995
+ "misbehaviour": "misbehavior",
996
+ "misdemeanour": "misdemeanor",
997
+ "misdemeanours": "misdemeanors",
998
+ "misspelt": "misspelled",
999
+ "mitre": "miter",
1000
+ "mitres": "miters",
1001
+ "mobilisation": "mobilization",
1002
+ "mobilise": "mobilize",
1003
+ "mobilised": "mobilized",
1004
+ "mobilises": "mobilizes",
1005
+ "mobilising": "mobilizing",
1006
+ "modelled": "modeled",
1007
+ "modeller": "modeler",
1008
+ "modellers": "modelers",
1009
+ "modelling": "modeling",
1010
+ "modernise": "modernize",
1011
+ "modernised": "modernized",
1012
+ "modernises": "modernizes",
1013
+ "modernising": "modernizing",
1014
+ "moisturise": "moisturize",
1015
+ "moisturised": "moisturized",
1016
+ "moisturiser": "moisturizer",
1017
+ "moisturisers": "moisturizers",
1018
+ "moisturises": "moisturizes",
1019
+ "moisturising": "moisturizing",
1020
+ "monologue": "monolog",
1021
+ "monologues": "monologs",
1022
+ "monopolisation": "monopolization",
1023
+ "monopolise": "monopolize",
1024
+ "monopolised": "monopolized",
1025
+ "monopolises": "monopolizes",
1026
+ "monopolising": "monopolizing",
1027
+ "moralise": "moralize",
1028
+ "moralised": "moralized",
1029
+ "moralises": "moralizes",
1030
+ "moralising": "moralizing",
1031
+ "motorised": "motorized",
1032
+ "mould": "mold",
1033
+ "moulded": "molded",
1034
+ "moulder": "molder",
1035
+ "mouldered": "moldered",
1036
+ "mouldering": "moldering",
1037
+ "moulders": "molders",
1038
+ "mouldier": "moldier",
1039
+ "mouldiest": "moldiest",
1040
+ "moulding": "molding",
1041
+ "mouldings": "moldings",
1042
+ "moulds": "molds",
1043
+ "mouldy": "moldy",
1044
+ "moult": "molt",
1045
+ "moulted": "molted",
1046
+ "moulting": "molting",
1047
+ "moults": "molts",
1048
+ "moustache": "mustache",
1049
+ "moustached": "mustached",
1050
+ "moustaches": "mustaches",
1051
+ "moustachioed": "mustachioed",
1052
+ "multicoloured": "multicolored",
1053
+ "nationalisation": "nationalization",
1054
+ "nationalisations": "nationalizations",
1055
+ "nationalise": "nationalize",
1056
+ "nationalised": "nationalized",
1057
+ "nationalises": "nationalizes",
1058
+ "nationalising": "nationalizing",
1059
+ "naturalisation": "naturalization",
1060
+ "naturalise": "naturalize",
1061
+ "naturalised": "naturalized",
1062
+ "naturalises": "naturalizes",
1063
+ "naturalising": "naturalizing",
1064
+ "neighbour": "neighbor",
1065
+ "neighbourhood": "neighborhood",
1066
+ "neighbourhoods": "neighborhoods",
1067
+ "neighbouring": "neighboring",
1068
+ "neighbourliness": "neighborliness",
1069
+ "neighbourly": "neighborly",
1070
+ "neighbours": "neighbors",
1071
+ "neutralisation": "neutralization",
1072
+ "neutralise": "neutralize",
1073
+ "neutralised": "neutralized",
1074
+ "neutralises": "neutralizes",
1075
+ "neutralising": "neutralizing",
1076
+ "normalisation": "normalization",
1077
+ "normalise": "normalize",
1078
+ "normalised": "normalized",
1079
+ "normalises": "normalizes",
1080
+ "normalising": "normalizing",
1081
+ "odour": "odor",
1082
+ "odourless": "odorless",
1083
+ "odours": "odors",
1084
+ "oesophagus": "esophagus",
1085
+ "oesophaguses": "esophaguses",
1086
+ "oestrogen": "estrogen",
1087
+ "offence": "offense",
1088
+ "offences": "offenses",
1089
+ "omelette": "omelet",
1090
+ "omelettes": "omelets",
1091
+ "optimise": "optimize",
1092
+ "optimised": "optimized",
1093
+ "optimises": "optimizes",
1094
+ "optimising": "optimizing",
1095
+ "organisation": "organization",
1096
+ "organisational": "organizational",
1097
+ "organisations": "organizations",
1098
+ "organise": "organize",
1099
+ "organised": "organized",
1100
+ "organiser": "organizer",
1101
+ "organisers": "organizers",
1102
+ "organises": "organizes",
1103
+ "organising": "organizing",
1104
+ "orthopaedic": "orthopedic",
1105
+ "orthopaedics": "orthopedics",
1106
+ "ostracise": "ostracize",
1107
+ "ostracised": "ostracized",
1108
+ "ostracises": "ostracizes",
1109
+ "ostracising": "ostracizing",
1110
+ "outmanoeuvre": "outmaneuver",
1111
+ "outmanoeuvred": "outmaneuvered",
1112
+ "outmanoeuvres": "outmaneuvers",
1113
+ "outmanoeuvring": "outmaneuvering",
1114
+ "overemphasise": "overemphasize",
1115
+ "overemphasised": "overemphasized",
1116
+ "overemphasises": "overemphasizes",
1117
+ "overemphasising": "overemphasizing",
1118
+ "oxidisation": "oxidization",
1119
+ "oxidise": "oxidize",
1120
+ "oxidised": "oxidized",
1121
+ "oxidises": "oxidizes",
1122
+ "oxidising": "oxidizing",
1123
+ "paederast": "pederast",
1124
+ "paederasts": "pederasts",
1125
+ "paediatric": "pediatric",
1126
+ "paediatrician": "pediatrician",
1127
+ "paediatricians": "pediatricians",
1128
+ "paediatrics": "pediatrics",
1129
+ "paedophile": "pedophile",
1130
+ "paedophiles": "pedophiles",
1131
+ "paedophilia": "pedophilia",
1132
+ "palaeolithic": "paleolithic",
1133
+ "palaeontologist": "paleontologist",
1134
+ "palaeontologists": "paleontologists",
1135
+ "palaeontology": "paleontology",
1136
+ "panelled": "paneled",
1137
+ "panelling": "paneling",
1138
+ "panellist": "panelist",
1139
+ "panellists": "panelists",
1140
+ "paralyse": "paralyze",
1141
+ "paralysed": "paralyzed",
1142
+ "paralyses": "paralyzes",
1143
+ "paralysing": "paralyzing",
1144
+ "parcelled": "parceled",
1145
+ "parcelling": "parceling",
1146
+ "parlour": "parlor",
1147
+ "parlours": "parlors",
1148
+ "particularise": "particularize",
1149
+ "particularised": "particularized",
1150
+ "particularises": "particularizes",
1151
+ "particularising": "particularizing",
1152
+ "passivisation": "passivization",
1153
+ "passivise": "passivize",
1154
+ "passivised": "passivized",
1155
+ "passivises": "passivizes",
1156
+ "passivising": "passivizing",
1157
+ "pasteurisation": "pasteurization",
1158
+ "pasteurise": "pasteurize",
1159
+ "pasteurised": "pasteurized",
1160
+ "pasteurises": "pasteurizes",
1161
+ "pasteurising": "pasteurizing",
1162
+ "patronise": "patronize",
1163
+ "patronised": "patronized",
1164
+ "patronises": "patronizes",
1165
+ "patronising": "patronizing",
1166
+ "patronisingly": "patronizingly",
1167
+ "pedalled": "pedaled",
1168
+ "pedalling": "pedaling",
1169
+ "pedestrianisation": "pedestrianization",
1170
+ "pedestrianise": "pedestrianize",
1171
+ "pedestrianised": "pedestrianized",
1172
+ "pedestrianises": "pedestrianizes",
1173
+ "pedestrianising": "pedestrianizing",
1174
+ "penalise": "penalize",
1175
+ "penalised": "penalized",
1176
+ "penalises": "penalizes",
1177
+ "penalising": "penalizing",
1178
+ "pencilled": "penciled",
1179
+ "pencilling": "penciling",
1180
+ "personalise": "personalize",
1181
+ "personalised": "personalized",
1182
+ "personalises": "personalizes",
1183
+ "personalising": "personalizing",
1184
+ "pharmacopoeia": "pharmacopeia",
1185
+ "pharmacopoeias": "pharmacopeias",
1186
+ "philosophise": "philosophize",
1187
+ "philosophised": "philosophized",
1188
+ "philosophises": "philosophizes",
1189
+ "philosophising": "philosophizing",
1190
+ "philtre": "filter",
1191
+ "philtres": "filters",
1192
+ "phoney": "phony",
1193
+ "plagiarise": "plagiarize",
1194
+ "plagiarised": "plagiarized",
1195
+ "plagiarises": "plagiarizes",
1196
+ "plagiarising": "plagiarizing",
1197
+ "plough": "plow",
1198
+ "ploughed": "plowed",
1199
+ "ploughing": "plowing",
1200
+ "ploughman": "plowman",
1201
+ "ploughmen": "plowmen",
1202
+ "ploughs": "plows",
1203
+ "ploughshare": "plowshare",
1204
+ "ploughshares": "plowshares",
1205
+ "polarisation": "polarization",
1206
+ "polarise": "polarize",
1207
+ "polarised": "polarized",
1208
+ "polarises": "polarizes",
1209
+ "polarising": "polarizing",
1210
+ "politicisation": "politicization",
1211
+ "politicise": "politicize",
1212
+ "politicised": "politicized",
1213
+ "politicises": "politicizes",
1214
+ "politicising": "politicizing",
1215
+ "popularisation": "popularization",
1216
+ "popularise": "popularize",
1217
+ "popularised": "popularized",
1218
+ "popularises": "popularizes",
1219
+ "popularising": "popularizing",
1220
+ "pouffe": "pouf",
1221
+ "pouffes": "poufs",
1222
+ "practise": "practice",
1223
+ "practised": "practiced",
1224
+ "practises": "practices",
1225
+ "practising": "practicing",
1226
+ "praesidium": "presidium",
1227
+ "praesidiums": "presidiums",
1228
+ "pressurisation": "pressurization",
1229
+ "pressurise": "pressurize",
1230
+ "pressurised": "pressurized",
1231
+ "pressurises": "pressurizes",
1232
+ "pressurising": "pressurizing",
1233
+ "pretence": "pretense",
1234
+ "pretences": "pretenses",
1235
+ "primaeval": "primeval",
1236
+ "prioritisation": "prioritization",
1237
+ "prioritise": "prioritize",
1238
+ "prioritised": "prioritized",
1239
+ "prioritises": "prioritizes",
1240
+ "prioritising": "prioritizing",
1241
+ "privatisation": "privatization",
1242
+ "privatisations": "privatizations",
1243
+ "privatise": "privatize",
1244
+ "privatised": "privatized",
1245
+ "privatises": "privatizes",
1246
+ "privatising": "privatizing",
1247
+ "professionalisation": "professionalization",
1248
+ "professionalise": "professionalize",
1249
+ "professionalised": "professionalized",
1250
+ "professionalises": "professionalizes",
1251
+ "professionalising": "professionalizing",
1252
+ "programme": "program",
1253
+ "programmes": "programs",
1254
+ "prologue": "prolog",
1255
+ "prologues": "prologs",
1256
+ "propagandise": "propagandize",
1257
+ "propagandised": "propagandized",
1258
+ "propagandises": "propagandizes",
1259
+ "propagandising": "propagandizing",
1260
+ "proselytise": "proselytize",
1261
+ "proselytised": "proselytized",
1262
+ "proselytiser": "proselytizer",
1263
+ "proselytisers": "proselytizers",
1264
+ "proselytises": "proselytizes",
1265
+ "proselytising": "proselytizing",
1266
+ "psychoanalyse": "psychoanalyze",
1267
+ "psychoanalysed": "psychoanalyzed",
1268
+ "psychoanalyses": "psychoanalyzes",
1269
+ "psychoanalysing": "psychoanalyzing",
1270
+ "publicise": "publicize",
1271
+ "publicised": "publicized",
1272
+ "publicises": "publicizes",
1273
+ "publicising": "publicizing",
1274
+ "pulverisation": "pulverization",
1275
+ "pulverise": "pulverize",
1276
+ "pulverised": "pulverized",
1277
+ "pulverises": "pulverizes",
1278
+ "pulverising": "pulverizing",
1279
+ "pummelled": "pummel",
1280
+ "pummelling": "pummeled",
1281
+ "pyjama": "pajama",
1282
+ "pyjamas": "pajamas",
1283
+ "pzazz": "pizzazz",
1284
+ "quarrelled": "quarreled",
1285
+ "quarrelling": "quarreling",
1286
+ "radicalise": "radicalize",
1287
+ "radicalised": "radicalized",
1288
+ "radicalises": "radicalizes",
1289
+ "radicalising": "radicalizing",
1290
+ "rancour": "rancor",
1291
+ "randomise": "randomize",
1292
+ "randomised": "randomized",
1293
+ "randomises": "randomizes",
1294
+ "randomising": "randomizing",
1295
+ "rationalisation": "rationalization",
1296
+ "rationalisations": "rationalizations",
1297
+ "rationalise": "rationalize",
1298
+ "rationalised": "rationalized",
1299
+ "rationalises": "rationalizes",
1300
+ "rationalising": "rationalizing",
1301
+ "ravelled": "raveled",
1302
+ "ravelling": "raveling",
1303
+ "realisable": "realizable",
1304
+ "realisation": "realization",
1305
+ "realisations": "realizations",
1306
+ "realise": "realize",
1307
+ "realised": "realized",
1308
+ "realises": "realizes",
1309
+ "realising": "realizing",
1310
+ "recognisable": "recognizable",
1311
+ "recognisably": "recognizably",
1312
+ "recognisance": "recognizance",
1313
+ "recognise": "recognize",
1314
+ "recognised": "recognized",
1315
+ "recognises": "recognizes",
1316
+ "recognising": "recognizing",
1317
+ "reconnoitre": "reconnoiter",
1318
+ "reconnoitred": "reconnoitered",
1319
+ "reconnoitres": "reconnoiters",
1320
+ "reconnoitring": "reconnoitering",
1321
+ "refuelled": "refueled",
1322
+ "refuelling": "refueling",
1323
+ "regularisation": "regularization",
1324
+ "regularise": "regularize",
1325
+ "regularised": "regularized",
1326
+ "regularises": "regularizes",
1327
+ "regularising": "regularizing",
1328
+ "remodelled": "remodeled",
1329
+ "remodelling": "remodeling",
1330
+ "remould": "remold",
1331
+ "remoulded": "remolded",
1332
+ "remoulding": "remolding",
1333
+ "remoulds": "remolds",
1334
+ "reorganisation": "reorganization",
1335
+ "reorganisations": "reorganizations",
1336
+ "reorganise": "reorganize",
1337
+ "reorganised": "reorganized",
1338
+ "reorganises": "reorganizes",
1339
+ "reorganising": "reorganizing",
1340
+ "revelled": "reveled",
1341
+ "reveller": "reveler",
1342
+ "revellers": "revelers",
1343
+ "revelling": "reveling",
1344
+ "revitalise": "revitalize",
1345
+ "revitalised": "revitalized",
1346
+ "revitalises": "revitalizes",
1347
+ "revitalising": "revitalizing",
1348
+ "revolutionise": "revolutionize",
1349
+ "revolutionised": "revolutionized",
1350
+ "revolutionises": "revolutionizes",
1351
+ "revolutionising": "revolutionizing",
1352
+ "rhapsodise": "rhapsodize",
1353
+ "rhapsodised": "rhapsodized",
1354
+ "rhapsodises": "rhapsodizes",
1355
+ "rhapsodising": "rhapsodizing",
1356
+ "rigour": "rigor",
1357
+ "rigours": "rigors",
1358
+ "ritualised": "ritualized",
1359
+ "rivalled": "rivaled",
1360
+ "rivalling": "rivaling",
1361
+ "romanticise": "romanticize",
1362
+ "romanticised": "romanticized",
1363
+ "romanticises": "romanticizes",
1364
+ "romanticising": "romanticizing",
1365
+ "rumour": "rumor",
1366
+ "rumoured": "rumored",
1367
+ "rumours": "rumors",
1368
+ "sabre": "saber",
1369
+ "sabres": "sabers",
1370
+ "saltpetre": "saltpeter",
1371
+ "sanitise": "sanitize",
1372
+ "sanitised": "sanitized",
1373
+ "sanitises": "sanitizes",
1374
+ "sanitising": "sanitizing",
1375
+ "satirise": "satirize",
1376
+ "satirised": "satirized",
1377
+ "satirises": "satirizes",
1378
+ "satirising": "satirizing",
1379
+ "saviour": "savior",
1380
+ "saviours": "saviors",
1381
+ "savour": "savor",
1382
+ "savoured": "savored",
1383
+ "savouries": "savories",
1384
+ "savouring": "savoring",
1385
+ "savours": "savors",
1386
+ "savoury": "savory",
1387
+ "scandalise": "scandalize",
1388
+ "scandalised": "scandalized",
1389
+ "scandalises": "scandalizes",
1390
+ "scandalising": "scandalizing",
1391
+ "sceptic": "skeptic",
1392
+ "sceptical": "skeptical",
1393
+ "sceptically": "skeptically",
1394
+ "scepticism": "skepticism",
1395
+ "sceptics": "skeptics",
1396
+ "sceptre": "scepter",
1397
+ "sceptres": "scepters",
1398
+ "scrutinise": "scrutinize",
1399
+ "scrutinised": "scrutinized",
1400
+ "scrutinises": "scrutinizes",
1401
+ "scrutinising": "scrutinizing",
1402
+ "secularisation": "secularization",
1403
+ "secularise": "secularize",
1404
+ "secularised": "secularized",
1405
+ "secularises": "secularizes",
1406
+ "secularising": "secularizing",
1407
+ "sensationalise": "sensationalize",
1408
+ "sensationalised": "sensationalized",
1409
+ "sensationalises": "sensationalizes",
1410
+ "sensationalising": "sensationalizing",
1411
+ "sensitise": "sensitize",
1412
+ "sensitised": "sensitized",
1413
+ "sensitises": "sensitizes",
1414
+ "sensitising": "sensitizing",
1415
+ "sentimentalise": "sentimentalize",
1416
+ "sentimentalised": "sentimentalized",
1417
+ "sentimentalises": "sentimentalizes",
1418
+ "sentimentalising": "sentimentalizing",
1419
+ "sepulchre": "sepulcher",
1420
+ "sepulchres": "sepulchers",
1421
+ "serialisation": "serialization",
1422
+ "serialisations": "serializations",
1423
+ "serialise": "serialize",
1424
+ "serialised": "serialized",
1425
+ "serialises": "serializes",
1426
+ "serialising": "serializing",
1427
+ "sermonise": "sermonize",
1428
+ "sermonised": "sermonized",
1429
+ "sermonises": "sermonizes",
1430
+ "sermonising": "sermonizing",
1431
+ "sheikh": "sheik",
1432
+ "shovelled": "shoveled",
1433
+ "shovelling": "shoveling",
1434
+ "shrivelled": "shriveled",
1435
+ "shrivelling": "shriveling",
1436
+ "signalise": "signalize",
1437
+ "signalised": "signalized",
1438
+ "signalises": "signalizes",
1439
+ "signalising": "signalizing",
1440
+ "signalled": "signaled",
1441
+ "signalling": "signaling",
1442
+ "smoulder": "smolder",
1443
+ "smouldered": "smoldered",
1444
+ "smouldering": "smoldering",
1445
+ "smoulders": "smolders",
1446
+ "snivelled": "sniveled",
1447
+ "snivelling": "sniveling",
1448
+ "snorkelled": "snorkeled",
1449
+ "snorkelling": "snorkeling",
1450
+ "snowplough": "snowplow",
1451
+ "snowploughs": "snowplow",
1452
+ "socialisation": "socialization",
1453
+ "socialise": "socialize",
1454
+ "socialised": "socialized",
1455
+ "socialises": "socializes",
1456
+ "socialising": "socializing",
1457
+ "sodomise": "sodomize",
1458
+ "sodomised": "sodomized",
1459
+ "sodomises": "sodomizes",
1460
+ "sodomising": "sodomizing",
1461
+ "solemnise": "solemnize",
1462
+ "solemnised": "solemnized",
1463
+ "solemnises": "solemnizes",
1464
+ "solemnising": "solemnizing",
1465
+ "sombre": "somber",
1466
+ "specialisation": "specialization",
1467
+ "specialisations": "specializations",
1468
+ "specialise": "specialize",
1469
+ "specialised": "specialized",
1470
+ "specialises": "specializes",
1471
+ "specialising": "specializing",
1472
+ "spectre": "specter",
1473
+ "spectres": "specters",
1474
+ "spiralled": "spiraled",
1475
+ "spiralling": "spiraling",
1476
+ "splendour": "splendor",
1477
+ "splendours": "splendors",
1478
+ "squirrelled": "squirreled",
1479
+ "squirrelling": "squirreling",
1480
+ "stabilisation": "stabilization",
1481
+ "stabilise": "stabilize",
1482
+ "stabilised": "stabilized",
1483
+ "stabiliser": "stabilizer",
1484
+ "stabilisers": "stabilizers",
1485
+ "stabilises": "stabilizes",
1486
+ "stabilising": "stabilizing",
1487
+ "standardisation": "standardization",
1488
+ "standardise": "standardize",
1489
+ "standardised": "standardized",
1490
+ "standardises": "standardizes",
1491
+ "standardising": "standardizing",
1492
+ "stencilled": "stenciled",
1493
+ "stencilling": "stenciling",
1494
+ "sterilisation": "sterilization",
1495
+ "sterilisations": "sterilizations",
1496
+ "sterilise": "sterilize",
1497
+ "sterilised": "sterilized",
1498
+ "steriliser": "sterilizer",
1499
+ "sterilisers": "sterilizers",
1500
+ "sterilises": "sterilizes",
1501
+ "sterilising": "sterilizing",
1502
+ "stigmatisation": "stigmatization",
1503
+ "stigmatise": "stigmatize",
1504
+ "stigmatised": "stigmatized",
1505
+ "stigmatises": "stigmatizes",
1506
+ "stigmatising": "stigmatizing",
1507
+ "storey": "story",
1508
+ "storeys": "stories",
1509
+ "subsidisation": "subsidization",
1510
+ "subsidise": "subsidize",
1511
+ "subsidised": "subsidized",
1512
+ "subsidiser": "subsidizer",
1513
+ "subsidisers": "subsidizers",
1514
+ "subsidises": "subsidizes",
1515
+ "subsidising": "subsidizing",
1516
+ "succour": "succor",
1517
+ "succoured": "succored",
1518
+ "succouring": "succoring",
1519
+ "succours": "succors",
1520
+ "sulphate": "sulfate",
1521
+ "sulphates": "sulfates",
1522
+ "sulphide": "sulfide",
1523
+ "sulphides": "sulfides",
1524
+ "sulphur": "sulfur",
1525
+ "sulphurous": "sulfurous",
1526
+ "summarise": "summarize",
1527
+ "summarised": "summarized",
1528
+ "summarises": "summarizes",
1529
+ "summarising": "summarizing",
1530
+ "swivelled": "swiveled",
1531
+ "swivelling": "swiveling",
1532
+ "symbolise": "symbolize",
1533
+ "symbolised": "symbolized",
1534
+ "symbolises": "symbolizes",
1535
+ "symbolising": "symbolizing",
1536
+ "sympathise": "sympathize",
1537
+ "sympathised": "sympathized",
1538
+ "sympathiser": "sympathizer",
1539
+ "sympathisers": "sympathizers",
1540
+ "sympathises": "sympathizes",
1541
+ "sympathising": "sympathizing",
1542
+ "synchronisation": "synchronization",
1543
+ "synchronise": "synchronize",
1544
+ "synchronised": "synchronized",
1545
+ "synchronises": "synchronizes",
1546
+ "synchronising": "synchronizing",
1547
+ "synthesise": "synthesize",
1548
+ "synthesised": "synthesized",
1549
+ "synthesiser": "synthesizer",
1550
+ "synthesisers": "synthesizers",
1551
+ "synthesises": "synthesizes",
1552
+ "synthesising": "synthesizing",
1553
+ "syphon": "siphon",
1554
+ "syphoned": "siphoned",
1555
+ "syphoning": "siphoning",
1556
+ "syphons": "siphons",
1557
+ "systematisation": "systematization",
1558
+ "systematise": "systematize",
1559
+ "systematised": "systematized",
1560
+ "systematises": "systematizes",
1561
+ "systematising": "systematizing",
1562
+ "tantalise": "tantalize",
1563
+ "tantalised": "tantalized",
1564
+ "tantalises": "tantalizes",
1565
+ "tantalising": "tantalizing",
1566
+ "tantalisingly": "tantalizingly",
1567
+ "tasselled": "tasseled",
1568
+ "technicolour": "technicolor",
1569
+ "temporise": "temporize",
1570
+ "temporised": "temporized",
1571
+ "temporises": "temporizes",
1572
+ "temporising": "temporizing",
1573
+ "tenderise": "tenderize",
1574
+ "tenderised": "tenderized",
1575
+ "tenderises": "tenderizes",
1576
+ "tenderising": "tenderizing",
1577
+ "terrorise": "terrorize",
1578
+ "terrorised": "terrorized",
1579
+ "terrorises": "terrorizes",
1580
+ "terrorising": "terrorizing",
1581
+ "theatre": "theater",
1582
+ "theatregoer": "theatergoer",
1583
+ "theatregoers": "theatergoers",
1584
+ "theatres": "theaters",
1585
+ "theorise": "theorize",
1586
+ "theorised": "theorized",
1587
+ "theorises": "theorizes",
1588
+ "theorising": "theorizing",
1589
+ "tonne": "ton",
1590
+ "tonnes": "tons",
1591
+ "towelled": "toweled",
1592
+ "towelling": "toweling",
1593
+ "toxaemia": "toxemia",
1594
+ "tranquillise": "tranquilize",
1595
+ "tranquillised": "tranquilized",
1596
+ "tranquilliser": "tranquilizer",
1597
+ "tranquillisers": "tranquilizers",
1598
+ "tranquillises": "tranquilizes",
1599
+ "tranquillising": "tranquilizing",
1600
+ "tranquillity": "tranquility",
1601
+ "tranquillize": "tranquilize",
1602
+ "tranquillized": "tranquilized",
1603
+ "tranquillizer": "tranquilizer",
1604
+ "tranquillizers": "tranquilizers",
1605
+ "tranquillizes": "tranquilizes",
1606
+ "tranquillizing": "tranquilizing",
1607
+ "tranquilly": "tranquility",
1608
+ "transistorised": "transistorized",
1609
+ "traumatise": "traumatize",
1610
+ "traumatised": "traumatized",
1611
+ "traumatises": "traumatizes",
1612
+ "traumatising": "traumatizing",
1613
+ "travelled": "traveled",
1614
+ "traveller": "traveler",
1615
+ "travellers": "travelers",
1616
+ "travelling": "traveling",
1617
+ "travelog": "travelogue",
1618
+ "travelogs": "travelogues",
1619
+ "trialled": "trialed",
1620
+ "trialling": "trialing",
1621
+ "tricolour": "tricolor",
1622
+ "tricolours": "tricolors",
1623
+ "trivialise": "trivialize",
1624
+ "trivialised": "trivialized",
1625
+ "trivialises": "trivializes",
1626
+ "trivialising": "trivializing",
1627
+ "tumour": "tumor",
1628
+ "tumours": "tumors",
1629
+ "tunnelled": "tunneled",
1630
+ "tunnelling": "tunneling",
1631
+ "tyrannise": "tyrannize",
1632
+ "tyrannised": "tyrannized",
1633
+ "tyrannises": "tyrannizes",
1634
+ "tyrannising": "tyrannizing",
1635
+ "tyre": "tire",
1636
+ "tyres": "tires",
1637
+ "unauthorised": "unauthorized",
1638
+ "uncivilised": "uncivilized",
1639
+ "underutilised": "underutilized",
1640
+ "unequalled": "unequaled",
1641
+ "unfavourable": "unfavorable",
1642
+ "unfavourably": "unfavorably",
1643
+ "unionisation": "unionization",
1644
+ "unionise": "unionize",
1645
+ "unionised": "unionized",
1646
+ "unionises": "unionizes",
1647
+ "unionising": "unionizing",
1648
+ "unorganised": "unorganized",
1649
+ "unravelled": "unraveled",
1650
+ "unravelling": "unraveling",
1651
+ "unrecognisable": "unrecognizable",
1652
+ "unrecognised": "unrecognized",
1653
+ "unrivalled": "unrivaled",
1654
+ "unsavoury": "unsavory",
1655
+ "untrammelled": "untrammeled",
1656
+ "urbanisation": "urbanization",
1657
+ "urbanise": "urbanize",
1658
+ "urbanised": "urbanized",
1659
+ "urbanises": "urbanizes",
1660
+ "urbanising": "urbanizing",
1661
+ "utilisable": "utilizable",
1662
+ "utilisation": "utilization",
1663
+ "utilise": "utilize",
1664
+ "utilised": "utilized",
1665
+ "utilises": "utilizes",
1666
+ "utilising": "utilizing",
1667
+ "valour": "valor",
1668
+ "vandalise": "vandalize",
1669
+ "vandalised": "vandalized",
1670
+ "vandalises": "vandalizes",
1671
+ "vandalising": "vandalizing",
1672
+ "vaporisation": "vaporization",
1673
+ "vaporise": "vaporize",
1674
+ "vaporised": "vaporized",
1675
+ "vaporises": "vaporizes",
1676
+ "vaporising": "vaporizing",
1677
+ "vapour": "vapor",
1678
+ "vapours": "vapors",
1679
+ "verbalise": "verbalize",
1680
+ "verbalised": "verbalized",
1681
+ "verbalises": "verbalizes",
1682
+ "verbalising": "verbalizing",
1683
+ "victimisation": "victimization",
1684
+ "victimise": "victimize",
1685
+ "victimised": "victimized",
1686
+ "victimises": "victimizes",
1687
+ "victimising": "victimizing",
1688
+ "videodisc": "videodisk",
1689
+ "videodiscs": "videodisks",
1690
+ "vigour": "vigor",
1691
+ "visualisation": "visualization",
1692
+ "visualisations": "visualizations",
1693
+ "visualise": "visualize",
1694
+ "visualised": "visualized",
1695
+ "visualises": "visualizes",
1696
+ "visualising": "visualizing",
1697
+ "vocalisation": "vocalization",
1698
+ "vocalisations": "vocalizations",
1699
+ "vocalise": "vocalize",
1700
+ "vocalised": "vocalized",
1701
+ "vocalises": "vocalizes",
1702
+ "vocalising": "vocalizing",
1703
+ "vulcanised": "vulcanized",
1704
+ "vulgarisation": "vulgarization",
1705
+ "vulgarise": "vulgarize",
1706
+ "vulgarised": "vulgarized",
1707
+ "vulgarises": "vulgarizes",
1708
+ "vulgarising": "vulgarizing",
1709
+ "waggon": "wagon",
1710
+ "waggons": "wagons",
1711
+ "watercolour": "watercolor",
1712
+ "watercolours": "watercolors",
1713
+ "weaselled": "weaseled",
1714
+ "weaselling": "weaseling",
1715
+ "westernisation": "westernization",
1716
+ "westernise": "westernize",
1717
+ "westernised": "westernized",
1718
+ "westernises": "westernizes",
1719
+ "westernising": "westernizing",
1720
+ "womanise": "womanize",
1721
+ "womanised": "womanized",
1722
+ "womaniser": "womanizer",
1723
+ "womanisers": "womanizers",
1724
+ "womanises": "womanizes",
1725
+ "womanising": "womanizing",
1726
+ "woollen": "woolen",
1727
+ "woollens": "woolens",
1728
+ "woollies": "woolies",
1729
+ "woolly": "wooly",
1730
+ "worshipped": "worshiped",
1731
+ "worshipping": "worshiping",
1732
+ "worshipper": "worshiper",
1733
+ "yodelled": "yodeled",
1734
+ "yodelling": "yodeling",
1735
+ "yoghourt": "yogurt",
1736
+ "yoghourts": "yogurts",
1737
+ "yoghurt": "yogurt",
1738
+ "yoghurts": "yogurts",
1739
+ "mhm": "hmm",
1740
+ "mmm": "hmm"
1741
+ }
whisper/whisper/normalizers/english.py ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from fractions import Fraction
5
+ from typing import Iterator, List, Match, Optional, Union
6
+
7
+ from more_itertools import windowed
8
+
9
+ from .basic import remove_symbols_and_diacritics
10
+
11
+
12
class EnglishNumberNormalizer:
    """
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    """

    def __init__(self):
        super().__init__()

        # Vocabulary tables. Plain tables map word -> int value; the
        # "*_suffixed" tables map word -> (value, suffix) where the suffix
        # ("s", "st", "nd", "rd", "th") is re-attached to the emitted digits.
        self.zeros = {"o", "oh", "zero"}
        # "one" -> 1 ... "nineteen" -> 19
        self.ones = {
            name: i
            for i, name in enumerate(
                [
                    "one",
                    "two",
                    "three",
                    "four",
                    "five",
                    "six",
                    "seven",
                    "eight",
                    "nine",
                    "ten",
                    "eleven",
                    "twelve",
                    "thirteen",
                    "fourteen",
                    "fifteen",
                    "sixteen",
                    "seventeen",
                    "eighteen",
                    "nineteen",
                ],
                start=1,
            )
        }
        # plural forms, e.g. "twos" -> (2, "s"); "six" pluralizes irregularly
        self.ones_plural = {
            "sixes" if name == "six" else name + "s": (value, "s")
            for name, value in self.ones.items()
        }
        # ordinal forms, e.g. "fourth" -> (4, "th"); irregulars listed explicitly
        self.ones_ordinal = {
            "zeroth": (0, "th"),
            "first": (1, "st"),
            "second": (2, "nd"),
            "third": (3, "rd"),
            "fifth": (5, "th"),
            "twelfth": (12, "th"),
            **{
                name + ("h" if name.endswith("t") else "th"): (value, "th")
                for name, value in self.ones.items()
                if value > 3 and value != 5 and value != 12
            },
        }
        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}

        self.tens = {
            "twenty": 20,
            "thirty": 30,
            "forty": 40,
            "fifty": 50,
            "sixty": 60,
            "seventy": 70,
            "eighty": 80,
            "ninety": 90,
        }
        # "twenties" -> (20, "s"), "twentieth" -> (20, "th"), etc.
        self.tens_plural = {
            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
        }
        self.tens_ordinal = {
            name.replace("y", "ieth"): (value, "th")
            for name, value in self.tens.items()
        }
        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}

        self.multipliers = {
            "hundred": 100,
            "thousand": 1_000,
            "million": 1_000_000,
            "billion": 1_000_000_000,
            "trillion": 1_000_000_000_000,
            "quadrillion": 1_000_000_000_000_000,
            "quintillion": 1_000_000_000_000_000_000,
            "sextillion": 1_000_000_000_000_000_000_000,
            "septillion": 1_000_000_000_000_000_000_000_000,
            "octillion": 1_000_000_000_000_000_000_000_000_000,
            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
        }
        self.multipliers_plural = {
            name + "s": (value, "s") for name, value in self.multipliers.items()
        }
        self.multipliers_ordinal = {
            name + "th": (value, "th") for name, value in self.multipliers.items()
        }
        self.multipliers_suffixed = {
            **self.multipliers_plural,
            **self.multipliers_ordinal,
        }
        # words that may follow "point" as spelled-out decimal digits
        self.decimals = {*self.ones, *self.tens, *self.zeros}

        # sign words that come before a number ("minus five" -> "-5")
        self.preceding_prefixers = {
            "minus": "-",
            "negative": "-",
            "plus": "+",
            "positive": "+",
        }
        # currency words that come after a number ("five dollars" -> "$5")
        self.following_prefixers = {
            "pound": "£",
            "pounds": "£",
            "euro": "€",
            "euros": "€",
            "dollar": "$",
            "dollars": "$",
            "cent": "¢",
            "cents": "¢",
        }
        # the symbol characters that may already be attached to a token
        self.prefixes = set(
            list(self.preceding_prefixers.values())
            + list(self.following_prefixers.values())
        )
        # suffix words; "per" only acts when followed by "cent" (two-token form)
        self.suffixers = {
            "per": {"cent": "%"},
            "percent": "%",
        }
        self.specials = {"and", "double", "triple", "point"}

        # the full set of words this normalizer knows how to consume
        self.words = set(
            [
                key
                for mapping in [
                    self.zeros,
                    self.ones,
                    self.ones_suffixed,
                    self.tens,
                    self.tens_suffixed,
                    self.multipliers,
                    self.multipliers_suffixed,
                    self.preceding_prefixers,
                    self.following_prefixers,
                    self.suffixers,
                    self.specials,
                ]
                for key in mapping
            ]
        )
        self.literal_words = {"one", "ones"}

    def process_words(self, words: List[str]) -> Iterator[str]:
        """
        Walk `words` with a 3-token sliding window (prev, current, next) and
        yield normalized tokens. A pending number accumulates in `value`
        (int while it is a pure cardinal, str once digits are concatenated
        textually) and a pending sign/currency symbol in `prefix`, which
        `output()` attaches when the number is flushed.
        """
        prefix: Optional[str] = None
        value: Optional[Union[str, int]] = None
        skip = False  # set when the lookahead token was already consumed (e.g. "per cent", "double five")

        def to_fraction(s: str):
            # None (rather than an exception) signals "not a parseable number"
            try:
                return Fraction(s)
            except ValueError:
                return None

        def output(result: Union[str, int]):
            # flush helper: attach any pending prefix and reset the state
            nonlocal prefix, value
            result = str(result)
            if prefix is not None:
                result = prefix + result
            value = None
            prefix = None
            return result

        if len(words) == 0:
            return

        # pad with None so every real token has a prev and a next
        for prev, current, next in windowed([None] + words + [None], 3):
            if skip:
                skip = False
                continue

            next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
            has_prefix = current[0] in self.prefixes
            current_without_prefix = current[1:] if has_prefix else current
            if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
                # arabic numbers (potentially with signs and fractions)
                f = to_fraction(current_without_prefix)
                assert f is not None
                if value is not None:
                    if isinstance(value, str) and value.endswith("."):
                        # concatenate decimals / ip address components
                        value = str(value) + str(current)
                        continue
                    else:
                        yield output(value)

                prefix = current[0] if has_prefix else prefix
                if f.denominator == 1:
                    value = f.numerator  # store integers as int
                else:
                    value = current_without_prefix
            elif current not in self.words:
                # non-numeric words
                if value is not None:
                    yield output(value)
                yield output(current)
            elif current in self.zeros:
                value = str(value or "") + "0"
            elif current in self.ones:
                ones = self.ones[current]

                if value is None:
                    value = ones
                elif isinstance(value, str) or prev in self.ones:
                    if (
                        prev in self.tens and ones < 10
                    ):  # replace the last zero with the digit
                        assert value[-1] == "0"
                        value = value[:-1] + str(ones)
                    else:
                        value = str(value) + str(ones)
                elif ones < 10:
                    if value % 10 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
            elif current in self.ones_suffixed:
                # ordinal or cardinal; yield the number right away
                ones, suffix = self.ones_suffixed[current]
                if value is None:
                    yield output(str(ones) + suffix)
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:
                        assert value[-1] == "0"
                        yield output(value[:-1] + str(ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                elif ones < 10:
                    if value % 10 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                value = None
            elif current in self.tens:
                tens = self.tens[current]
                if value is None:
                    value = tens
                elif isinstance(value, str):
                    value = str(value) + str(tens)
                else:
                    if value % 100 == 0:
                        value += tens
                    else:
                        value = str(value) + str(tens)
            elif current in self.tens_suffixed:
                # ordinal or cardinal; yield the number right away
                tens, suffix = self.tens_suffixed[current]
                if value is None:
                    yield output(str(tens) + suffix)
                elif isinstance(value, str):
                    yield output(str(value) + str(tens) + suffix)
                else:
                    if value % 100 == 0:
                        yield output(str(value + tens) + suffix)
                    else:
                        yield output(str(value) + str(tens) + suffix)
            elif current in self.multipliers:
                multiplier = self.multipliers[current]
                if value is None:
                    value = multiplier
                elif isinstance(value, str) or value == 0:
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        value = p.numerator
                    else:
                        yield output(value)
                        value = multiplier
                else:
                    # scale only the portion below 1000 ("one hundred thousand")
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
            elif current in self.multipliers_suffixed:
                multiplier, suffix = self.multipliers_suffixed[current]
                if value is None:
                    yield output(str(multiplier) + suffix)
                elif isinstance(value, str):
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        yield output(str(p.numerator) + suffix)
                    else:
                        yield output(value)
                        yield output(str(multiplier) + suffix)
                else:  # int
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
                    yield output(str(value) + suffix)
                value = None
            elif current in self.preceding_prefixers:
                # apply prefix (positive, minus, etc.) if it precedes a number
                if value is not None:
                    yield output(value)

                if next in self.words or next_is_numeric:
                    prefix = self.preceding_prefixers[current]
                else:
                    yield output(current)
            elif current in self.following_prefixers:
                # apply prefix (dollars, cents, etc.) only after a number
                if value is not None:
                    prefix = self.following_prefixers[current]
                    yield output(value)
                else:
                    yield output(current)
            elif current in self.suffixers:
                # apply suffix symbols (percent -> '%')
                if value is not None:
                    suffix = self.suffixers[current]
                    if isinstance(suffix, dict):
                        # two-token suffix such as "per cent"
                        if next in suffix:
                            yield output(str(value) + suffix[next])
                            skip = True
                        else:
                            yield output(value)
                            yield output(current)
                    else:
                        yield output(str(value) + suffix)
                else:
                    yield output(current)
            elif current in self.specials:
                if next not in self.words and not next_is_numeric:
                    # apply special handling only if the next word can be numeric
                    if value is not None:
                        yield output(value)
                    yield output(current)
                elif current == "and":
                    # ignore "and" after hundreds, thousands, etc.
                    if prev not in self.multipliers:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "double" or current == "triple":
                    if next in self.ones or next in self.zeros:
                        repeats = 2 if current == "double" else 3
                        ones = self.ones.get(next, 0)
                        value = str(value or "") + str(ones) * repeats
                        skip = True
                    else:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "point":
                    if next in self.decimals or next_is_numeric:
                        value = str(value or "") + "."
                else:
                    # should all have been covered at this point
                    raise ValueError(f"Unexpected token: {current}")
            else:
                # all should have been covered at this point
                raise ValueError(f"Unexpected token: {current}")

        # flush any number still pending at end of input
        if value is not None:
            yield output(value)

    def preprocess(self, s: str):
        """Rewrite phrasings that `process_words` cannot handle token-by-token."""
        # replace "<number> and a half" with "<number> point five"
        results = []

        segments = re.split(r"\band\s+a\s+half\b", s)
        for i, segment in enumerate(segments):
            if len(segment.strip()) == 0:
                continue
            if i == len(segments) - 1:
                results.append(segment)
            else:
                results.append(segment)
                last_word = segment.rsplit(maxsplit=2)[-1]
                if last_word in self.decimals or last_word in self.multipliers:
                    results.append("point five")
                else:
                    # not numeric context; restore the removed phrase verbatim
                    results.append("and a half")

        s = " ".join(results)

        # put a space at number/letter boundary
        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)

        # but remove spaces which could be a suffix
        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)

        return s

    def postprocess(self, s: str):
        """Merge currency/cent amounts and restore the literal word "one"."""
        def combine_cents(m: Match):
            try:
                currency = m.group(1)
                integer = m.group(2)
                cents = int(m.group(3))
                return f"{currency}{integer}.{cents:02d}"
            except ValueError:
                return m.string

        def extract_cents(m: Match):
            try:
                return f"¢{int(m.group(1))}"
            except ValueError:
                return m.string

        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
        # NOTE(review): the "." below is an unescaped regex dot (matches any
        # character); likely intended as a literal r"0\." — confirm upstream
        s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)

        # write "one(s)" instead of "1(s)", just for the readability
        s = re.sub(r"\b1(s?)\b", r"one\1", s)

        return s

    def __call__(self, s: str):
        """Run the full pipeline: preprocess -> per-word pass -> postprocess."""
        s = self.preprocess(s)
        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
        s = self.postprocess(s)

        return s
448
+
449
+
450
class EnglishSpellingNormalizer:
    """
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    """

    def __init__(self):
        # british-spelling -> american-spelling map shipped next to this module
        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
        # use a context manager (and explicit encoding) so the file handle is
        # closed deterministically instead of leaking until GC
        with open(mapping_path, encoding="utf-8") as f:
            self.mapping = json.load(f)

    def __call__(self, s: str):
        """Return `s` with each whitespace-separated word mapped if known."""
        return " ".join(self.mapping.get(word, word) for word in s.split())
463
+
464
+
465
class EnglishTextNormalizer:
    """
    Full English normalization pipeline: lowercases, strips bracketed spans
    and filler words, expands contractions/abbreviations via `replacers`,
    then applies number normalization and British->American spelling.
    """

    def __init__(self):
        # filler words removed entirely from the transcript
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
        # regex -> replacement pairs, applied in insertion order in __call__
        self.replacers = {
            # common contractions
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
            # contractions in titles/prefixes
            r"\bmr\b": "mister ",
            r"\bmrs\b": "missus ",
            r"\bst\b": "saint ",
            r"\bdr\b": "doctor ",
            r"\bprof\b": "professor ",
            r"\bcapt\b": "captain ",
            r"\bgov\b": "governor ",
            r"\bald\b": "alderman ",
            r"\bgen\b": "general ",
            r"\bsen\b": "senator ",
            r"\brep\b": "representative ",
            r"\bpres\b": "president ",
            r"\brev\b": "reverend ",
            r"\bhon\b": "honorable ",
            r"\basst\b": "assistant ",
            r"\bassoc\b": "associate ",
            r"\blt\b": "lieutenant ",
            r"\bcol\b": "colonel ",
            r"\bjr\b": "junior ",
            r"\bsr\b": "senior ",
            r"\besq\b": "esquire ",
            # prefect tenses, ideally it should be any past participles, but it's harder..
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",  # "'s done" is ambiguous
            r"'s got\b": " has got",
            # general contractions
            r"n't\b": " not",
            r"'re\b": " are",
            r"'s\b": " is",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        }
        self.standardize_numbers = EnglishNumberNormalizer()
        self.standardize_spellings = EnglishSpellingNormalizer()

    def __call__(self, s: str):
        """Normalize `s` and return the cleaned, single-spaced string."""
        s = s.lower()

        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = re.sub(self.ignore_patterns, "", s)
        s = re.sub(r"\s+'", "'", s)  # when there's a space before an apostrophe

        # expand contractions/abbreviations; dict order is significant (the
        # specific patterns above run before the general 's / 'd fallbacks)
        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")  # keep numeric symbols

        s = self.standardize_numbers(s)
        s = self.standardize_spellings(s)

        # now remove prefix/suffix symbols that are not preceded/followed by numbers
        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
        s = re.sub(r"([^0-9])%", r"\1 ", s)

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespaces with a space

        return s
whisper/whisper/timing.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import subprocess
3
+ import warnings
4
+ from dataclasses import dataclass
5
+ from typing import TYPE_CHECKING, List
6
+
7
+ import numba
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+
12
+ from .audio import HOP_LENGTH, SAMPLE_RATE, TOKENS_PER_SECOND
13
+ from .tokenizer import Tokenizer
14
+
15
+ if TYPE_CHECKING:
16
+ from .model import Whisper
17
+
18
+
19
def median_filter(x: torch.Tensor, filter_width: int):
    """Apply a median filter of width `filter_width` along the last dimension of `x`"""
    pad_width = filter_width // 2
    if x.shape[-1] <= pad_width:
        # F.pad requires the padding width to be smaller than the input dimension
        return x

    ndim = x.ndim
    if ndim <= 2:
        # reflect-mode `F.pad` only supports 3D/4D inputs, so add two leading dims
        x = x[None, None, :]

    assert (
        filter_width > 0 and filter_width % 2 == 1
    ), "`filter_width` should be an odd number"

    x = F.pad(x, (pad_width, pad_width, 0, 0), mode="reflect")

    filtered = None
    if x.is_cuda:
        # prefer the Triton kernel on GPU; fall back on launch failure
        try:
            from .triton_ops import median_filter_cuda

            filtered = median_filter_cuda(x, filter_width)
        except (RuntimeError, subprocess.CalledProcessError):
            warnings.warn(
                "Failed to launch Triton kernels, likely due to missing CUDA toolkit; "
                "falling back to a slower median kernel implementation..."
            )

    if filtered is None:
        # sort() is faster than torch.median (https://github.com/pytorch/pytorch/issues/51450)
        filtered = x.unfold(-1, filter_width, 1).sort()[0][..., pad_width]

    # undo the dimension expansion for 1D/2D inputs
    return filtered[0, 0] if ndim <= 2 else filtered
55
+
56
+
57
@numba.jit(nopython=True)
def backtrace(trace: np.ndarray):
    # Walk the DTW trace matrix from the bottom-right corner back to the origin,
    # collecting the (text_index, time_index) pairs along the optimal path.
    i = trace.shape[0] - 1
    j = trace.shape[1] - 1
    # force the boundary row/column to move along a single axis so the walk
    # always reaches (0, 0)
    trace[0, :] = 2
    trace[:, 0] = 1

    result = []
    while i > 0 or j > 0:
        # indices are offset by 1 because trace has an extra boundary row/column
        result.append((i - 1, j - 1))

        if trace[i, j] == 0:
            # diagonal move: advance both text and time
            i -= 1
            j -= 1
        elif trace[i, j] == 1:
            i -= 1
        elif trace[i, j] == 2:
            j -= 1
        else:
            raise ValueError("Unexpected trace[i, j]")

    result = np.array(result)
    # reverse into forward order; returns a (2, path_length) array of indices
    return result[::-1, :].T
80
+
81
+
82
@numba.jit(nopython=True, parallel=True)
def dtw_cpu(x: np.ndarray):
    # Dynamic time warping over the (N tokens x M frames) cost matrix `x`.
    # `cost` and `trace` carry an extra leading row/column as the boundary.
    N, M = x.shape
    cost = np.ones((N + 1, M + 1), dtype=np.float32) * np.inf
    trace = -np.ones((N + 1, M + 1), dtype=np.float32)

    cost[0, 0] = 0
    for j in range(1, M + 1):
        for i in range(1, N + 1):
            # predecessors: diagonal, above, and left
            c0 = cost[i - 1, j - 1]
            c1 = cost[i - 1, j]
            c2 = cost[i, j - 1]

            # pick the cheapest predecessor; ties fall through to the last case
            if c0 < c1 and c0 < c2:
                c, t = c0, 0
            elif c1 < c0 and c1 < c2:
                c, t = c1, 1
            else:
                c, t = c2, 2

            cost[i, j] = x[i - 1, j - 1] + c
            trace[i, j] = t

    # recover the optimal alignment path from the trace matrix
    return backtrace(trace)
106
+
107
+
108
def dtw_cuda(x, BLOCK_SIZE=1024):
    """Dynamic time warping on the GPU via a Triton kernel.

    The cost matrix is skewed so each anti-diagonal becomes one row, which lets
    the kernel update a full anti-diagonal in parallel per step.
    """
    from .triton_ops import dtw_kernel

    M, N = x.shape
    assert M < BLOCK_SIZE, f"M should be smaller than {BLOCK_SIZE=}"

    # skew `x`: row i is shifted i columns to the right, padded with +inf
    x_skew = (
        F.pad(x, (0, M + 1), value=np.inf).flatten()[: M * (N + M)].reshape(M, N + M)
    )
    x_skew = x_skew.T.contiguous()
    cost = torch.ones(N + M + 2, M + 2) * np.inf
    cost[0, 0] = 0
    cost = cost.cuda()
    trace = torch.zeros_like(cost, dtype=torch.int32)

    # single program instance; the kernel loops over anti-diagonals internally
    dtw_kernel[(1,)](
        cost,
        trace,
        x_skew,
        x_skew.stride(0),
        cost.stride(0),
        trace.stride(0),
        N,
        M,
        BLOCK_SIZE=BLOCK_SIZE,
    )

    # un-skew the trace matrix back to (M + 1) x (N + 1) before backtracking
    trace = trace.T.flatten()[: (M + 1) * (M + N + 3)].reshape(M + 1, M + N + 3)[
        :, : N + 1
    ]
    return backtrace(trace.cpu().numpy())
139
+
140
+
141
def dtw(x: torch.Tensor) -> np.ndarray:
    """Find the lowest-cost alignment path through cost matrix `x`.

    Uses the Triton CUDA kernel when `x` lives on the GPU; falls back to the
    numba CPU implementation if the kernel cannot be launched.
    """
    if x.is_cuda:
        try:
            return dtw_cuda(x)
        except (RuntimeError, subprocess.CalledProcessError):
            warnings.warn(
                "Failed to launch Triton kernels, likely due to missing CUDA toolkit; "
                "falling back to a slower DTW implementation..."
            )

    return dtw_cpu(x.double().cpu().numpy())
152
+
153
+
154
@dataclass
class WordTiming:
    """A single word aligned to the audio, with timing and confidence."""

    word: str  # decoded word text (may include leading space / merged punctuation)
    tokens: List[int]  # token ids that make up this word
    start: float  # start time in seconds
    end: float  # end time in seconds
    probability: float  # mean probability of the word's tokens
161
+
162
+
163
def find_alignment(
    model: "Whisper",
    tokenizer: Tokenizer,
    text_tokens: List[int],
    mel: torch.Tensor,
    num_frames: int,
    *,
    medfilt_width: int = 7,
    qk_scale: float = 1.0,
) -> List[WordTiming]:
    """Align `text_tokens` to the audio in `mel` using cross-attention weights.

    Runs one forward pass with hooks on the decoder's cross-attention layers,
    aggregates the model's designated alignment heads, and applies dynamic time
    warping to derive per-word start/end times and probabilities.
    """
    if len(text_tokens) == 0:
        return []

    tokens = torch.tensor(
        [
            *tokenizer.sot_sequence,
            tokenizer.no_timestamps,
            *text_tokens,
            tokenizer.eot,
        ]
    ).to(model.device)

    # install hooks on the cross attention layers to retrieve the attention weights
    QKs = [None] * model.dims.n_text_layer
    hooks = [
        block.cross_attn.register_forward_hook(
            # `index=i` binds the loop variable at definition time (late-binding fix)
            lambda _, ins, outs, index=i: QKs.__setitem__(index, outs[-1][0])
        )
        for i, block in enumerate(model.decoder.blocks)
    ]

    with torch.no_grad():
        logits = model(mel.unsqueeze(0), tokens.unsqueeze(0))[0]
        # probabilities of the sampled text tokens (excluding special tokens)
        sampled_logits = logits[len(tokenizer.sot_sequence) :, : tokenizer.eot]
        token_probs = sampled_logits.softmax(dim=-1)
        text_token_probs = token_probs[np.arange(len(text_tokens)), text_tokens]
        text_token_probs = text_token_probs.tolist()

    for hook in hooks:
        hook.remove()

    # heads * tokens * frames
    weights = torch.stack([QKs[_l][_h] for _l, _h in model.alignment_heads.indices().T])
    weights = weights[:, :, : num_frames // 2]
    weights = (weights * qk_scale).softmax(dim=-1)
    # normalize each head's attention over the token axis, then smooth over time
    std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False)
    weights = (weights - mean) / std
    weights = median_filter(weights, medfilt_width)

    matrix = weights.mean(axis=0)
    # keep only the rows for the actual text tokens (drop SOT prefix and EOT)
    matrix = matrix[len(tokenizer.sot_sequence) : -1]
    text_indices, time_indices = dtw(-matrix)

    words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot])
    if len(word_tokens) <= 1:
        # return on eot only
        # >>> np.pad([], (1, 0))
        # array([0.])
        # This results in crashes when we lookup jump_times with float, like
        # IndexError: arrays used as indices must be of integer (or boolean) type
        return []
    word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))

    # a "jump" is where the DTW path advances to the next text token; its time
    # index marks that token's start
    jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
    jump_times = time_indices[jumps] / TOKENS_PER_SECOND
    start_times = jump_times[word_boundaries[:-1]]
    end_times = jump_times[word_boundaries[1:]]
    word_probabilities = [
        np.mean(text_token_probs[i:j])
        for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
    ]

    return [
        WordTiming(word, tokens, start, end, probability)
        for word, tokens, start, end, probability in zip(
            words, word_tokens, start_times, end_times, word_probabilities
        )
    ]
241
+
242
+
243
def merge_punctuations(alignment: List[WordTiming], prepended: str, appended: str):
    """Fold punctuation-only entries of `alignment` into their neighboring words.

    Modifies the list in place: an absorbed entry keeps its position but is
    left with an empty `word` and an empty `tokens` list.
    """
    # pass 1 (right to left): attach prepended punctuation (e.g. opening
    # quotes) to the word that follows it
    anchor = len(alignment) - 1
    for idx in range(len(alignment) - 2, -1, -1):
        current, successor = alignment[idx], alignment[anchor]
        if current.word.startswith(" ") and current.word.strip() in prepended:
            # prepend it to the following word
            successor.word = current.word + successor.word
            successor.tokens = current.tokens + successor.tokens
            current.word = ""
            current.tokens = []
        else:
            anchor = idx

    # pass 2 (left to right): attach appended punctuation (e.g. commas,
    # periods) to the word that precedes it
    anchor = 0
    for idx in range(1, len(alignment)):
        predecessor, current = alignment[anchor], alignment[idx]
        if not predecessor.word.endswith(" ") and current.word in appended:
            # append it to the previous word
            predecessor.word = predecessor.word + current.word
            predecessor.tokens = predecessor.tokens + current.tokens
            current.word = ""
            current.tokens = []
        else:
            anchor = idx
275
+
276
+
277
def add_word_timestamps(
    *,
    segments: List[dict],
    model: "Whisper",
    tokenizer: Tokenizer,
    mel: torch.Tensor,
    num_frames: int,
    prepend_punctuations: str = "\"'“¿([{-",
    append_punctuations: str = "\"'.。,,!!??::”)]}、",
    last_speech_timestamp: float,
    **kwargs,
):
    """Attach a "words" list (word, start, end, probability dicts) to each segment.

    Modifies `segments` in place; segment-level "start"/"end" may also be
    adjusted to agree with the word-level timestamps. Extra keyword arguments
    are forwarded to `find_alignment`.
    """
    if len(segments) == 0:
        return

    # text tokens only (ids below eot), kept per segment for re-slicing later
    text_tokens_per_segment = [
        [token for token in segment["tokens"] if token < tokenizer.eot]
        for segment in segments
    ]

    # align all segments' tokens against the audio in a single pass
    text_tokens = list(itertools.chain.from_iterable(text_tokens_per_segment))
    alignment = find_alignment(model, tokenizer, text_tokens, mel, num_frames, **kwargs)
    word_durations = np.array([t.end - t.start for t in alignment])
    word_durations = word_durations[word_durations.nonzero()]
    median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0
    max_duration = median_duration * 2

    # hack: truncate long words at sentence boundaries.
    # a better segmentation algorithm based on VAD should be able to replace this.
    if len(word_durations) > 0:
        sentence_end_marks = ".。!!??"
        # ensure words at sentence boundaries are not longer than twice the median word duration.
        for i in range(1, len(alignment)):
            if alignment[i].end - alignment[i].start > max_duration:
                if alignment[i].word in sentence_end_marks:
                    alignment[i].end = alignment[i].start + max_duration
                elif alignment[i - 1].word in sentence_end_marks:
                    alignment[i].start = alignment[i].end - max_duration

    merge_punctuations(alignment, prepend_punctuations, append_punctuations)

    # word times are relative to the current window; offset by the seek position
    time_offset = segments[0]["seek"] * HOP_LENGTH / SAMPLE_RATE
    word_index = 0

    for segment, text_tokens in zip(segments, text_tokens_per_segment):
        saved_tokens = 0
        words = []

        # consume alignment entries until this segment's tokens are covered
        while word_index < len(alignment) and saved_tokens < len(text_tokens):
            timing = alignment[word_index]

            if timing.word:
                words.append(
                    dict(
                        word=timing.word,
                        start=round(time_offset + timing.start, 2),
                        end=round(time_offset + timing.end, 2),
                        probability=timing.probability,
                    )
                )

            saved_tokens += len(timing.tokens)
            word_index += 1

        # hack: truncate long words at segment boundaries.
        # a better segmentation algorithm based on VAD should be able to replace this.
        if len(words) > 0:
            # ensure the first and second word after a pause is not longer than
            # twice the median word duration.
            if words[0]["end"] - last_speech_timestamp > median_duration * 4 and (
                words[0]["end"] - words[0]["start"] > max_duration
                or (
                    len(words) > 1
                    and words[1]["end"] - words[0]["start"] > max_duration * 2
                )
            ):
                if (
                    len(words) > 1
                    and words[1]["end"] - words[1]["start"] > max_duration
                ):
                    boundary = max(words[1]["end"] / 2, words[1]["end"] - max_duration)
                    words[0]["end"] = words[1]["start"] = boundary
                words[0]["start"] = max(0, words[0]["end"] - max_duration)

            # prefer the segment-level start timestamp if the first word is too long.
            if (
                segment["start"] < words[0]["end"]
                and segment["start"] - 0.5 > words[0]["start"]
            ):
                words[0]["start"] = max(
                    0, min(words[0]["end"] - median_duration, segment["start"])
                )
            else:
                segment["start"] = words[0]["start"]

            # prefer the segment-level end timestamp if the last word is too long.
            if (
                segment["end"] > words[-1]["start"]
                and segment["end"] + 0.5 < words[-1]["end"]
            ):
                words[-1]["end"] = max(
                    words[-1]["start"] + median_duration, segment["end"]
                )
            else:
                segment["end"] = words[-1]["end"]

            last_speech_timestamp = segment["end"]

        segment["words"] = words
whisper/whisper/tokenizer.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ import string
4
+ from dataclasses import dataclass, field
5
+ from functools import cached_property, lru_cache
6
+ from typing import Dict, List, Optional, Tuple
7
+
8
+ import tiktoken
9
+
10
# Languages supported by the multilingual Whisper models, keyed by ISO 639-1
# code (plus "haw" and "jw"). NOTE: dict order matters — language token ids are
# assigned from this order in Tokenizer.__post_init__ and get_encoding.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}

# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}
127
+
128
+
129
@dataclass
class Tokenizer:
    """A thin wrapper around `tiktoken` providing quick access to special tokens"""

    encoding: tiktoken.Encoding
    language: Optional[str] = None  # ISO 639-1 code, or None for English-only models
    task: Optional[str] = None  # "transcribe", "translate", or None
    sot_sequence: Tuple[int] = ()  # computed in __post_init__
    special_tokens: Dict[str, int] = field(default_factory=dict)

    def __post_init__(self):
        # cache every special token's id for O(1) lookup by surface form
        for special in self.encoding.special_tokens_set:
            special_token = self.encoding.encode_single_token(special)
            self.special_tokens[special] = special_token

        sot: int = self.special_tokens["<|startoftranscript|>"]
        translate: int = self.special_tokens["<|translate|>"]
        transcribe: int = self.special_tokens["<|transcribe|>"]

        # language tokens immediately follow <|startoftranscript|>, in the
        # same order as the LANGUAGES dict
        langs = tuple(LANGUAGES.keys())
        sot_sequence = [sot]
        if self.language is not None:
            sot_sequence.append(sot + 1 + langs.index(self.language))
        if self.task is not None:
            task_token: int = transcribe if self.task == "transcribe" else translate
            sot_sequence.append(task_token)

        self.sot_sequence = tuple(sot_sequence)

    def encode(self, text, **kwargs):
        """Encode `text` into token ids (delegates to the tiktoken encoding)."""
        return self.encoding.encode(text, **kwargs)

    def decode(self, token_ids: List[int], **kwargs) -> str:
        # drop timestamp tokens (ids at or above <|0.00|>) before decoding
        token_ids = [t for t in token_ids if t < self.timestamp_begin]
        return self.encoding.decode(token_ids, **kwargs)

    def decode_with_timestamps(self, token_ids: List[int], **kwargs) -> str:
        """
        Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        """
        return self.encoding.decode(token_ids, **kwargs)

    @cached_property
    def eot(self) -> int:
        # end-of-transcript token id (tiktoken's end-of-text token)
        return self.encoding.eot_token

    @cached_property
    def transcribe(self) -> int:
        return self.special_tokens["<|transcribe|>"]

    @cached_property
    def translate(self) -> int:
        return self.special_tokens["<|translate|>"]

    @cached_property
    def sot(self) -> int:
        return self.special_tokens["<|startoftranscript|>"]

    @cached_property
    def sot_lm(self) -> int:
        return self.special_tokens["<|startoflm|>"]

    @cached_property
    def sot_prev(self) -> int:
        return self.special_tokens["<|startofprev|>"]

    @cached_property
    def no_speech(self) -> int:
        return self.special_tokens["<|nospeech|>"]

    @cached_property
    def no_timestamps(self) -> int:
        return self.special_tokens["<|notimestamps|>"]

    @cached_property
    def timestamp_begin(self) -> int:
        # id of <|0.00|>; all timestamp tokens are >= this id
        return self.special_tokens["<|0.00|>"]

    @cached_property
    def language_token(self) -> int:
        """Returns the token id corresponding to the value of the `language` field"""
        if self.language is None:
            raise ValueError("This tokenizer does not have language token configured")

        if token := self.special_tokens.get(f"<|{self.language}|>", None):
            return token

        raise KeyError(f"Language {self.language} not found in tokenizer.")

    @cached_property
    def all_language_tokens(self) -> Tuple[int]:
        # ids of every <|xx|> language token present in the encoding
        result = []
        for token, token_id in self.special_tokens.items():
            if token.strip("<|>") in LANGUAGES:
                result.append(token_id)
        return tuple(result)

    @cached_property
    def all_language_codes(self) -> Tuple[str]:
        return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens)

    @cached_property
    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
        return tuple(list(self.sot_sequence) + [self.no_timestamps])

    @cached_property
    def non_speech_tokens(self) -> Tuple[int]:
        """
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        """
        symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
        symbols += (
            "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
        )

        # symbols that may be a single token or multiple tokens depending on the tokenizer.
        # In case they're multiple tokens, suppress the first token, which is safe because:
        # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
        miscellaneous = set("♩♪♫♬♭♮♯")
        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)

        # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
        result = {self.encoding.encode(" -")[0], self.encoding.encode(" '")[0]}
        for symbol in symbols + list(miscellaneous):
            for tokens in [
                self.encoding.encode(symbol),
                self.encoding.encode(" " + symbol),
            ]:
                if len(tokens) == 1 or symbol in miscellaneous:
                    result.add(tokens[0])

        return tuple(sorted(result))

    def split_to_word_tokens(self, tokens: List[int]):
        """Split `tokens` into (words, word_tokens) using a language-aware strategy."""
        if self.language in {"zh", "ja", "th", "lo", "my"}:
            # These languages don't typically use spaces, so it is difficult to split words
            # without morpheme analysis. Here, we instead split words at any
            # position where the tokens are decoded as valid unicode points
            return self.split_tokens_on_unicode(tokens)

        return self.split_tokens_on_spaces(tokens)

    def split_tokens_on_unicode(self, tokens: List[int]):
        # group tokens into the smallest runs that decode to valid unicode
        # (no U+FFFD replacement char, unless the full text itself contains one)
        decoded_full = self.decode_with_timestamps(tokens)
        replacement_char = "\ufffd"

        words = []
        word_tokens = []
        current_tokens = []
        unicode_offset = 0

        for token in tokens:
            current_tokens.append(token)
            decoded = self.decode_with_timestamps(current_tokens)

            if (
                replacement_char not in decoded
                or decoded_full[unicode_offset + decoded.index(replacement_char)]
                == replacement_char
            ):
                words.append(decoded)
                word_tokens.append(current_tokens)
                current_tokens = []
                unicode_offset += len(decoded)

        return words, word_tokens

    def split_tokens_on_spaces(self, tokens: List[int]):
        # merge unicode-level subwords into space-delimited words, keeping
        # special tokens and bare punctuation as their own entries
        subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens)
        words = []
        word_tokens = []

        for subword, subword_tokens in zip(subwords, subword_tokens_list):
            special = subword_tokens[0] >= self.eot
            with_space = subword.startswith(" ")
            punctuation = subword.strip() in string.punctuation
            if special or with_space or punctuation or len(words) == 0:
                words.append(subword)
                word_tokens.append(subword_tokens)
            else:
                words[-1] = words[-1] + subword
                word_tokens[-1].extend(subword_tokens)

        return words, word_tokens
322
+
323
+
324
@lru_cache(maxsize=None)
def get_encoding(name: str = "gpt2"):
    """Build (and cache) the tiktoken Encoding for the given vocabulary name.

    Loads the base64-encoded BPE ranks from the bundled
    `assets/{name}.tiktoken` file and appends Whisper's special tokens
    (language tags, task tokens, and 1501 timestamp tokens at 0.02 s
    increments) after the base vocabulary.
    """
    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
    # fix: close the vocab file deterministically instead of leaking the
    # handle from a bare `open()` inside the comprehension
    with open(vocab_path) as vocab_file:
        ranks = {
            base64.b64decode(token): int(rank)
            for token, rank in (line.split() for line in vocab_file if line)
        }
    n_vocab = len(ranks)
    special_tokens = {}

    specials = [
        "<|endoftext|>",
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
    ]

    # special tokens get ids immediately after the base vocabulary, in order
    for token in specials:
        special_tokens[token] = n_vocab
        n_vocab += 1

    return tiktoken.Encoding(
        name=os.path.basename(vocab_path),
        explicit_n_vocab=n_vocab,
        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        mergeable_ranks=ranks,
        special_tokens=special_tokens,
    )
+ )
358
+
359
+
360
@lru_cache(maxsize=None)
def get_tokenizer(
    multilingual: bool,
    *,
    language: Optional[str] = None,
    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
) -> Tokenizer:
    """Return a cached Tokenizer configured for the given language and task."""
    if language is not None:
        # accept either an ISO code or a (possibly aliased) language name
        language = language.lower()
        if language not in LANGUAGES:
            try:
                language = TO_LANGUAGE_CODE[language]
            except KeyError:
                raise ValueError(f"Unsupported language: {language}") from None

    if not multilingual:
        # English-only models use the plain GPT-2 vocabulary and carry no
        # language or task tokens
        encoding_name, language, task = "gpt2", None, None
    else:
        encoding_name = "multilingual"
        language = language or "en"
        task = task or "transcribe"

    encoding = get_encoding(name=encoding_name)

    return Tokenizer(encoding=encoding, language=language, task=task)
whisper/whisper/transcribe.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import warnings
4
+ from typing import TYPE_CHECKING, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ import tqdm
9
+
10
+ from .audio import (
11
+ FRAMES_PER_SECOND,
12
+ HOP_LENGTH,
13
+ N_FRAMES,
14
+ N_SAMPLES,
15
+ SAMPLE_RATE,
16
+ log_mel_spectrogram,
17
+ pad_or_trim,
18
+ )
19
+ from .decoding import DecodingOptions, DecodingResult
20
+ from .timing import add_word_timestamps
21
+ from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
22
+ from .utils import (
23
+ exact_div,
24
+ format_timestamp,
25
+ get_writer,
26
+ make_safe,
27
+ optional_float,
28
+ optional_int,
29
+ str2bool,
30
+ )
31
+
32
+ if TYPE_CHECKING:
33
+ from .model import Whisper
34
+
35
+
36
+ def transcribe(
37
+ model: "Whisper",
38
+ audio: Union[str, np.ndarray, torch.Tensor],
39
+ *,
40
+ verbose: Optional[bool] = None,
41
+ temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
42
+ compression_ratio_threshold: Optional[float] = 2.4,
43
+ logprob_threshold: Optional[float] = -1.0,
44
+ no_speech_threshold: Optional[float] = 0.6,
45
+ condition_on_previous_text: bool = True,
46
+ initial_prompt: Optional[str] = None,
47
+ word_timestamps: bool = False,
48
+ prepend_punctuations: str = "\"'“¿([{-",
49
+ append_punctuations: str = "\"'.。,,!!??::”)]}、",
50
+ **decode_options,
51
+ ):
52
+ """
53
+ Transcribe an audio file using Whisper
54
+
55
+ Parameters
56
+ ----------
57
+ model: Whisper
58
+ The Whisper model instance
59
+
60
+ audio: Union[str, np.ndarray, torch.Tensor]
61
+ The path to the audio file to open, or the audio waveform
62
+
63
+ verbose: bool
64
+ Whether to display the text being decoded to the console. If True, displays all the details,
65
+ If False, displays minimal details. If None, does not display anything
66
+
67
+ temperature: Union[float, Tuple[float, ...]]
68
+ Temperature for sampling. It can be a tuple of temperatures, which will be successively used
69
+ upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
70
+
71
+ compression_ratio_threshold: float
72
+ If the gzip compression ratio is above this value, treat as failed
73
+
74
+ logprob_threshold: float
75
+ If the average log probability over sampled tokens is below this value, treat as failed
76
+
77
+ no_speech_threshold: float
78
+ If the no_speech probability is higher than this value AND the average log probability
79
+ over sampled tokens is below `logprob_threshold`, consider the segment as silent
80
+
81
+ condition_on_previous_text: bool
82
+ if True, the previous output of the model is provided as a prompt for the next window;
83
+ disabling may make the text inconsistent across windows, but the model becomes less prone to
84
+ getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
85
+
86
+ word_timestamps: bool
87
+ Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
88
+ and include the timestamps for each word in each segment.
89
+
90
+ prepend_punctuations: str
91
+ If word_timestamps is True, merge these punctuation symbols with the next word
92
+
93
+ append_punctuations: str
94
+ If word_timestamps is True, merge these punctuation symbols with the previous word
95
+
96
+ initial_prompt: Optional[str]
97
+ Optional text to provide as a prompt for the first window. This can be used to provide, or
98
+ "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
99
+ to make it more likely to predict those word correctly.
100
+
101
+ decode_options: dict
102
+ Keyword arguments to construct `DecodingOptions` instances
103
+
104
+ Returns
105
+ -------
106
+ A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
107
+ the spoken language ("language"), which is detected when `decode_options["language"]` is None.
108
+ """
109
+ dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
110
+ if model.device == torch.device("cpu"):
111
+ if torch.cuda.is_available():
112
+ warnings.warn("Performing inference on CPU when CUDA is available")
113
+ if dtype == torch.float16:
114
+ warnings.warn("FP16 is not supported on CPU; using FP32 instead")
115
+ dtype = torch.float32
116
+
117
+ if dtype == torch.float32:
118
+ decode_options["fp16"] = False
119
+
120
+ # Pad 30-seconds of silence to the input audio, for slicing
121
+ mel = log_mel_spectrogram(audio, padding=N_SAMPLES)
122
+ content_frames = mel.shape[-1] - N_FRAMES
123
+
124
+ if decode_options.get("language", None) is None:
125
+ if not model.is_multilingual:
126
+ decode_options["language"] = "en"
127
+ else:
128
+ if verbose:
129
+ print(
130
+ "Detecting language using up to the first 30 seconds. Use `--language` to specify the language"
131
+ )
132
+ mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
133
+ _, probs = model.detect_language(mel_segment)
134
+ decode_options["language"] = max(probs, key=probs.get)
135
+ if verbose is not None:
136
+ print(
137
+ f"Detected language: {LANGUAGES[decode_options['language']].title()}"
138
+ )
139
+
140
+ language: str = decode_options["language"]
141
+ task: str = decode_options.get("task", "transcribe")
142
+ tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task)
143
+
144
+ if word_timestamps and task == "translate":
145
+ warnings.warn("Word-level timestamps on translations may not be reliable.")
146
+
147
+ def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
148
+ temperatures = (
149
+ [temperature] if isinstance(temperature, (int, float)) else temperature
150
+ )
151
+ decode_result = None
152
+
153
+ for t in temperatures:
154
+ kwargs = {**decode_options}
155
+ if t > 0:
156
+ # disable beam_size and patience when t > 0
157
+ kwargs.pop("beam_size", None)
158
+ kwargs.pop("patience", None)
159
+ else:
160
+ # disable best_of when t == 0
161
+ kwargs.pop("best_of", None)
162
+
163
+ options = DecodingOptions(**kwargs, temperature=t)
164
+ decode_result = model.decode(segment, options)
165
+
166
+ needs_fallback = False
167
+ if (
168
+ compression_ratio_threshold is not None
169
+ and decode_result.compression_ratio > compression_ratio_threshold
170
+ ):
171
+ needs_fallback = True # too repetitive
172
+ if (
173
+ logprob_threshold is not None
174
+ and decode_result.avg_logprob < logprob_threshold
175
+ ):
176
+ needs_fallback = True # average log probability is too low
177
+ if (
178
+ no_speech_threshold is not None
179
+ and decode_result.no_speech_prob > no_speech_threshold
180
+ ):
181
+ needs_fallback = False # silence
182
+ if not needs_fallback:
183
+ break
184
+
185
+ return decode_result
186
+
187
+ seek = 0
188
+ input_stride = exact_div(
189
+ N_FRAMES, model.dims.n_audio_ctx
190
+ ) # mel frames per output token: 2
191
+ time_precision = (
192
+ input_stride * HOP_LENGTH / SAMPLE_RATE
193
+ ) # time per output token: 0.02 (seconds)
194
+ all_tokens = []
195
+ all_segments = []
196
+ prompt_reset_since = 0
197
+
198
+ if initial_prompt is not None:
199
+ initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
200
+ all_tokens.extend(initial_prompt_tokens)
201
+ else:
202
+ initial_prompt_tokens = []
203
+
204
+ def new_segment(
205
+ *, start: float, end: float, tokens: torch.Tensor, result: DecodingResult
206
+ ):
207
+ tokens = tokens.tolist()
208
+ text_tokens = [token for token in tokens if token < tokenizer.eot]
209
+ return {
210
+ "seek": seek,
211
+ "start": start,
212
+ "end": end,
213
+ "text": tokenizer.decode(text_tokens),
214
+ "tokens": tokens,
215
+ "temperature": result.temperature,
216
+ "avg_logprob": result.avg_logprob,
217
+ "compression_ratio": result.compression_ratio,
218
+ "no_speech_prob": result.no_speech_prob,
219
+ }
220
+
221
+ # show the progress bar when verbose is False (if True, transcribed text will be printed)
222
+ with tqdm.tqdm(
223
+ total=content_frames, unit="frames", disable=verbose is not False
224
+ ) as pbar:
225
+ last_speech_timestamp = 0.0
226
+ while seek < content_frames:
227
+ time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
228
+ mel_segment = mel[:, seek : seek + N_FRAMES]
229
+ segment_size = min(N_FRAMES, content_frames - seek)
230
+ segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
231
+ mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)
232
+
233
+ decode_options["prompt"] = all_tokens[prompt_reset_since:]
234
+ result: DecodingResult = decode_with_fallback(mel_segment)
235
+ tokens = torch.tensor(result.tokens)
236
+
237
+ if no_speech_threshold is not None:
238
+ # no voice activity check
239
+ should_skip = result.no_speech_prob > no_speech_threshold
240
+ if (
241
+ logprob_threshold is not None
242
+ and result.avg_logprob > logprob_threshold
243
+ ):
244
+ # don't skip if the logprob is high enough, despite the no_speech_prob
245
+ should_skip = False
246
+
247
+ if should_skip:
248
+ seek += segment_size # fast-forward to the next segment boundary
249
+ continue
250
+
251
+ previous_seek = seek
252
+ current_segments = []
253
+
254
+ timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
255
+ single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
256
+
257
+ consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
258
+ consecutive.add_(1)
259
+ if len(consecutive) > 0:
260
+ # if the output contains two consecutive timestamp tokens
261
+ slices = consecutive.tolist()
262
+ if single_timestamp_ending:
263
+ slices.append(len(tokens))
264
+
265
+ last_slice = 0
266
+ for current_slice in slices:
267
+ sliced_tokens = tokens[last_slice:current_slice]
268
+ start_timestamp_pos = (
269
+ sliced_tokens[0].item() - tokenizer.timestamp_begin
270
+ )
271
+ end_timestamp_pos = (
272
+ sliced_tokens[-1].item() - tokenizer.timestamp_begin
273
+ )
274
+ current_segments.append(
275
+ new_segment(
276
+ start=time_offset + start_timestamp_pos * time_precision,
277
+ end=time_offset + end_timestamp_pos * time_precision,
278
+ tokens=sliced_tokens,
279
+ result=result,
280
+ )
281
+ )
282
+ last_slice = current_slice
283
+
284
+ if single_timestamp_ending:
285
+ # single timestamp at the end means no speech after the last timestamp.
286
+ seek += segment_size
287
+ else:
288
+ # otherwise, ignore the unfinished segment and seek to the last timestamp
289
+ last_timestamp_pos = (
290
+ tokens[last_slice - 1].item() - tokenizer.timestamp_begin
291
+ )
292
+ seek += last_timestamp_pos * input_stride
293
+ else:
294
+ duration = segment_duration
295
+ timestamps = tokens[timestamp_tokens.nonzero().flatten()]
296
+ if (
297
+ len(timestamps) > 0
298
+ and timestamps[-1].item() != tokenizer.timestamp_begin
299
+ ):
300
+ # no consecutive timestamps but it has a timestamp; use the last one.
301
+ last_timestamp_pos = (
302
+ timestamps[-1].item() - tokenizer.timestamp_begin
303
+ )
304
+ duration = last_timestamp_pos * time_precision
305
+
306
+ current_segments.append(
307
+ new_segment(
308
+ start=time_offset,
309
+ end=time_offset + duration,
310
+ tokens=tokens,
311
+ result=result,
312
+ )
313
+ )
314
+ seek += segment_size
315
+
316
+ if word_timestamps:
317
+ add_word_timestamps(
318
+ segments=current_segments,
319
+ model=model,
320
+ tokenizer=tokenizer,
321
+ mel=mel_segment,
322
+ num_frames=segment_size,
323
+ prepend_punctuations=prepend_punctuations,
324
+ append_punctuations=append_punctuations,
325
+ last_speech_timestamp=last_speech_timestamp,
326
+ )
327
+ word_end_timestamps = [
328
+ w["end"] for s in current_segments for w in s["words"]
329
+ ]
330
+ if len(word_end_timestamps) > 0:
331
+ last_speech_timestamp = word_end_timestamps[-1]
332
+ if not single_timestamp_ending and len(word_end_timestamps) > 0:
333
+ seek_shift = round(
334
+ (word_end_timestamps[-1] - time_offset) * FRAMES_PER_SECOND
335
+ )
336
+ if seek_shift > 0:
337
+ seek = previous_seek + seek_shift
338
+
339
+ if verbose:
340
+ for segment in current_segments:
341
+ start, end, text = segment["start"], segment["end"], segment["text"]
342
+ line = f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}"
343
+ print(make_safe(line))
344
+
345
+ # if a segment is instantaneous or does not contain text, clear it
346
+ for i, segment in enumerate(current_segments):
347
+ if segment["start"] == segment["end"] or segment["text"].strip() == "":
348
+ segment["text"] = ""
349
+ segment["tokens"] = []
350
+ segment["words"] = []
351
+
352
+ all_segments.extend(
353
+ [
354
+ {"id": i, **segment}
355
+ for i, segment in enumerate(
356
+ current_segments, start=len(all_segments)
357
+ )
358
+ ]
359
+ )
360
+ all_tokens.extend(
361
+ [token for segment in current_segments for token in segment["tokens"]]
362
+ )
363
+
364
+ if not condition_on_previous_text or result.temperature > 0.5:
365
+ # do not feed the prompt tokens if a high temperature was used
366
+ prompt_reset_since = len(all_tokens)
367
+
368
+ # update progress bar
369
+ pbar.update(min(content_frames, seek) - previous_seek)
370
+
371
+ return dict(
372
+ text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]),
373
+ segments=all_segments,
374
+ language=language,
375
+ )
376
+
377
+
378
def cli():
    """Command-line entry point: parse arguments, load a Whisper model, and
    transcribe each input audio file, writing results in the requested format(s).

    Fixes two user-visible string typos: the English-only-model warning said
    "receipted" instead of "received", and the --threads help said "supercedes"
    instead of "supersedes".
    """
    from . import available_models

    # fmt: off
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "tsv", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")

    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")

    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")

    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
    parser.add_argument("--word_timestamps", type=str2bool, default=False, help="(experimental) extract word-level timestamps and refine the results based on them")
    parser.add_argument("--prepend_punctuations", type=str, default="\"\'“¿([{-", help="if word_timestamps is True, merge these punctuation symbols with the next word")
    parser.add_argument("--append_punctuations", type=str, default="\"\'.。,,!!??::”)]}、", help="if word_timestamps is True, merge these punctuation symbols with the previous word")
    parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
    parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line")
    parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment")
    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supersedes MKL_NUM_THREADS/OMP_NUM_THREADS")
    # fmt: on

    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    model_dir: str = args.pop("model_dir")
    output_dir: str = args.pop("output_dir")
    output_format: str = args.pop("output_format")
    device: str = args.pop("device")
    os.makedirs(output_dir, exist_ok=True)

    # English-only (".en") models cannot handle other languages; warn and override
    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
        if args["language"] is not None:
            warnings.warn(
                f"{model_name} is an English-only model but received '{args['language']}'; using English instead."
            )
        args["language"] = "en"

    # build the temperature fallback schedule, e.g. (0.0, 0.2, 0.4, ..., 1.0)
    temperature = args.pop("temperature")
    if (increment := args.pop("temperature_increment_on_fallback")) is not None:
        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
    else:
        temperature = [temperature]

    if (threads := args.pop("threads")) > 0:
        torch.set_num_threads(threads)

    from . import load_model

    model = load_model(model_name, device=device, download_root=model_dir)

    writer = get_writer(output_format, output_dir)
    word_options = ["highlight_words", "max_line_count", "max_line_width"]
    if not args["word_timestamps"]:
        # the subtitle layout options are only meaningful with word timestamps
        for option in word_options:
            if args[option]:
                parser.error(f"--{option} requires --word_timestamps True")
    if args["max_line_count"] and not args["max_line_width"]:
        warnings.warn("--max_line_count has no effect without --max_line_width")
    writer_args = {arg: args.pop(arg) for arg in word_options}
    for audio_path in args.pop("audio"):
        result = transcribe(model, audio_path, temperature=temperature, **args)
        writer(result, audio_path, writer_args)
458
+
459
+
460
# Allow running this module directly (e.g. `python -m whisper.transcribe`).
if __name__ == "__main__":
    cli()
whisper/whisper/triton_ops.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from functools import lru_cache

import numpy as np
import torch

# Triton is an optional GPU-only dependency of this module; fail fast at import
# time with an actionable message instead of a bare ImportError later.
try:
    import triton
    import triton.language as tl
except ImportError:
    raise RuntimeError("triton import failed; try `pip install --pre triton`")
11
+
12
+
13
# Fills the DTW cost table and backtrace-direction table on the GPU. Cells are
# processed one anti-diagonal (k = i + j) at a time: every cell on diagonal
# k + 1 depends only on the three neighbors loaded from diagonals k - 1 and k,
# so all BLOCK_SIZE lanes can update their column in parallel each iteration.
@triton.jit
def dtw_kernel(
    cost, trace, x, x_stride, cost_stride, trace_stride, N, M, BLOCK_SIZE: tl.constexpr
):
    # each lane handles one column; lanes past the M valid columns are masked off
    offsets = tl.arange(0, BLOCK_SIZE)
    mask = offsets < M

    for k in range(1, N + M + 1):  # k = i + j
        tl.debug_barrier()  # ensure the previous diagonal's stores are visible

        # pointers to the three candidate predecessors: row k-1 (diagonal step)
        # and row k at two adjacent column offsets
        p0 = cost + (k - 1) * cost_stride
        p1 = cost + k * cost_stride
        p2 = cost + k * cost_stride + 1

        c0 = tl.load(p0 + offsets, mask=mask)
        c1 = tl.load(p1 + offsets, mask=mask)
        c2 = tl.load(p2 + offsets, mask=mask)

        # local cost for this diagonal plus the cheapest of the three predecessors
        x_row = tl.load(x + (k - 1) * x_stride + offsets, mask=mask, other=0)
        cost_row = x_row + tl.minimum(tl.minimum(c0, c1), c2)

        cost_ptr = cost + (k + 1) * cost_stride + 1
        tl.store(cost_ptr + offsets, cost_row, mask=mask)

        # record which predecessor won (2, 1, or 0); the three stores overlap,
        # and later stores overwrite earlier ones, so ties resolve toward 0
        trace_ptr = trace + (k + 1) * trace_stride + 1
        tl.store(trace_ptr + offsets, 2, mask=mask & (c2 <= c0) & (c2 <= c1))
        tl.store(trace_ptr + offsets, 1, mask=mask & (c1 <= c0) & (c1 <= c2))
        tl.store(trace_ptr + offsets, 0, mask=mask & (c0 <= c1) & (c0 <= c2))
41
+
42
+
43
@lru_cache(maxsize=None)
def median_kernel(filter_width: int):
    """Build (and cache, one kernel per width) a Triton kernel that writes the
    median of each `filter_width`-wide window of `x` into `y`.

    The kernel body is specialized for the given width by textual substitution:
    the placeholder identifiers below are replaced with generated source (one
    `tl.load` per window element, then a partial bubble sort), and the edited
    source is re-JITted via `triton.JITFunction`.
    """

    @triton.jit
    def kernel(
        y, x, x_stride, y_stride, BLOCK_SIZE: tl.constexpr
    ):  # x.shape[-1] == filter_width
        row_idx = tl.program_id(0)
        offsets = tl.arange(0, BLOCK_SIZE)
        mask = offsets < y_stride

        x_ptr = x + row_idx * x_stride  # noqa: F841
        y_ptr = y + row_idx * y_stride

        LOAD_ALL_ROWS_HERE  # noqa: F821

        BUBBLESORT_HERE  # noqa: F821

        tl.store(y_ptr + offsets, MIDDLE_ROW_HERE, mask=mask)  # noqa: F821

    # rebuild the JIT function from source so the substitutions below take effect
    kernel = triton.JITFunction(kernel.fn)
    # expand the load placeholder into one tl.load per tap of the filter window
    kernel.src = kernel.src.replace(
        " LOAD_ALL_ROWS_HERE",
        "\n".join(
            [
                f" row{i} = tl.load(x_ptr + offsets + {i}, mask=mask)"
                for i in range(filter_width)
            ]
        ),
    )
    # expand the sort placeholder into filter_width // 2 + 1 bubble-sort passes;
    # that many passes are enough to settle the middle (median) element
    kernel.src = kernel.src.replace(
        " BUBBLESORT_HERE",
        "\n\n".join(
            [
                "\n\n".join(
                    [
                        "\n".join(
                            [
                                f" smaller = tl.where(row{j} < row{j + 1}, row{j}, row{j + 1})",
                                f" larger = tl.where(row{j} > row{j + 1}, row{j}, row{j + 1})",
                                f" row{j} = smaller",
                                f" row{j + 1} = larger",
                            ]
                        )
                        for j in range(filter_width - i - 1)
                    ]
                )
                for i in range(filter_width // 2 + 1)
            ]
        ),
    )
    kernel.src = kernel.src.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}")

    return kernel
96
+
97
+
98
def median_filter_cuda(x: torch.Tensor, filter_width: int):
    """Apply a median filter of given width along the last dimension of x."""
    # unfold produces one sliding window of length `filter_width` per output element
    windows = x.contiguous().unfold(-1, filter_width, 1)
    num_rows = np.prod(windows.shape[:-2])

    out = torch.empty_like(windows[..., 0])
    jit_kernel = median_kernel(filter_width)

    # block size: the row stride rounded up to the next power of two
    block_size = 1 << (out.stride(-2) - 1).bit_length()
    jit_kernel[(num_rows,)](out, x, x.stride(-2), out.stride(-2), BLOCK_SIZE=block_size)

    return out
whisper/whisper/utils.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ import sys
5
+ import zlib
6
+ from typing import Callable, Optional, TextIO
7
+
8
system_encoding = sys.getdefaultencoding()

if system_encoding == "utf-8":

    def make_safe(string):
        # utf-8 can represent any Unicode code point, so no conversion is needed
        return string

else:

    def make_safe(string):
        # round-trip through the system encoding, replacing any character it
        # cannot represent with '?' to avoid UnicodeEncodeError
        # (https://github.com/openai/whisper/discussions/729)
        return string.encode(system_encoding, errors="replace").decode(system_encoding)
22
+
23
+
24
def exact_div(x, y):
    """Integer-divide x by y, asserting that the division is exact."""
    quotient, remainder = divmod(x, y)
    assert remainder == 0
    return quotient
27
+
28
+
29
def str2bool(string):
    """Parse the literal strings "True"/"False" into booleans; raise ValueError otherwise."""
    mapping = {"True": True, "False": False}
    if string not in mapping:
        raise ValueError(f"Expected one of {set(mapping.keys())}, got {string}")
    return mapping[string]
35
+
36
+
37
def optional_int(string):
    """Parse a string as an int, treating the literal "None" as None."""
    if string == "None":
        return None
    return int(string)
39
+
40
+
41
def optional_float(string):
    """Parse a string as a float, treating the literal "None" as None."""
    if string == "None":
        return None
    return float(string)
43
+
44
+
45
def compression_ratio(text) -> float:
    """Return the ratio of the UTF-8 byte length of `text` to its zlib-compressed
    length; highly repetitive text yields a high ratio."""
    raw = text.encode("utf-8")
    compressed = zlib.compress(raw)
    return len(raw) / len(compressed)
48
+
49
+
50
def format_timestamp(
    seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
):
    """Render a non-negative time in seconds as '[HH:]MM:SS<marker>mmm',
    rounding to the nearest millisecond; the hours field appears only when
    nonzero unless `always_include_hours` is set."""
    assert seconds >= 0, "non-negative timestamp expected"
    total_ms = round(seconds * 1000.0)

    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1_000)

    prefix = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{prefix}{minutes:02d}:{whole_seconds:02d}{decimal_marker}{milliseconds:03d}"
69
+
70
+
71
class ResultWriter:
    """Base class for transcript writers.

    Subclasses set `extension` and implement `write_result`; calling the
    instance writes the result next to `output_dir` using the audio file's
    basename plus that extension.
    """

    extension: str

    def __init__(self, output_dir: str):
        self.output_dir = output_dir

    def __call__(self, result: dict, audio_path: str, options: dict):
        # output filename = <audio basename without extension>.<writer extension>
        stem = os.path.splitext(os.path.basename(audio_path))[0]
        output_path = os.path.join(self.output_dir, f"{stem}.{self.extension}")

        with open(output_path, "w", encoding="utf-8") as f:
            self.write_result(result, file=f, options=options)

    def write_result(self, result: dict, file: TextIO, options: dict):
        raise NotImplementedError
89
+
90
+
91
class WriteTXT(ResultWriter):
    """Write the transcript as plain text, one segment per line."""

    extension: str = "txt"

    def write_result(self, result: dict, file: TextIO, options: dict):
        for segment in result["segments"]:
            line = segment["text"].strip()
            print(line, file=file, flush=True)
97
+
98
+
99
class SubtitlesWriter(ResultWriter):
    """Shared logic for cue-based subtitle formats (VTT/SRT): turns a transcript
    result into (start, end, text) cues, optionally re-flowing lines by width /
    line count and underlining each word as it is spoken."""

    # subclasses configure how timestamps are rendered
    always_include_hours: bool
    decimal_marker: str

    def iterate_result(self, result: dict, options: dict):
        # Yields (start_timestamp, end_timestamp, text) triples for each cue.
        raw_max_line_width: Optional[int] = options["max_line_width"]
        max_line_count: Optional[int] = options["max_line_count"]
        highlight_words: bool = options["highlight_words"]
        max_line_width = 1000 if raw_max_line_width is None else raw_max_line_width
        # only re-flow across segment boundaries when both layout limits are given
        preserve_segments = max_line_count is None or raw_max_line_width is None

        def iterate_subtitles():
            # groups word timings into subtitles, inserting "\n" line breaks;
            # breaks on line width, line count, >3s pauses, or segment edges
            line_len = 0
            line_count = 1
            # the next subtitle to yield (a list of word timings with whitespace)
            subtitle: list[dict] = []
            last = result["segments"][0]["words"][0]["start"]
            for segment in result["segments"]:
                for i, original_timing in enumerate(segment["words"]):
                    timing = original_timing.copy()
                    long_pause = not preserve_segments and timing["start"] - last > 3.0
                    has_room = line_len + len(timing["word"]) <= max_line_width
                    seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
                    if line_len > 0 and has_room and not long_pause and not seg_break:
                        # line continuation
                        line_len += len(timing["word"])
                    else:
                        # new line
                        timing["word"] = timing["word"].strip()
                        if (
                            len(subtitle) > 0
                            and max_line_count is not None
                            and (long_pause or line_count >= max_line_count)
                            or seg_break
                        ):
                            # subtitle break
                            yield subtitle
                            subtitle = []
                            line_count = 1
                        elif line_len > 0:
                            # line break
                            line_count += 1
                            timing["word"] = "\n" + timing["word"]
                        line_len = len(timing["word"].strip())
                    subtitle.append(timing)
                    last = timing["start"]
            if len(subtitle) > 0:
                yield subtitle

        if "words" in result["segments"][0]:
            # word-level timestamps are available: emit re-flowed cues
            for subtitle in iterate_subtitles():
                subtitle_start = self.format_timestamp(subtitle[0]["start"])
                subtitle_end = self.format_timestamp(subtitle[-1]["end"])
                subtitle_text = "".join([word["word"] for word in subtitle])
                if highlight_words:
                    last = subtitle_start
                    all_words = [timing["word"] for timing in subtitle]
                    for i, this_word in enumerate(subtitle):
                        start = self.format_timestamp(this_word["start"])
                        end = self.format_timestamp(this_word["end"])
                        if last != start:
                            # fill the gap before this word with an un-highlighted cue
                            yield last, start, subtitle_text

                        # one cue per word, with the current word wrapped in <u>…</u>
                        # (the regex keeps any leading whitespace outside the tag)
                        yield start, end, "".join(
                            [
                                re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
                                if j == i
                                else word
                                for j, word in enumerate(all_words)
                            ]
                        )
                        last = end
                else:
                    yield subtitle_start, subtitle_end, subtitle_text
        else:
            # no word timings: one cue per segment, escaping the cue separator
            for segment in result["segments"]:
                segment_start = self.format_timestamp(segment["start"])
                segment_end = self.format_timestamp(segment["end"])
                segment_text = segment["text"].strip().replace("-->", "->")
                yield segment_start, segment_end, segment_text

    def format_timestamp(self, seconds: float):
        # delegate to the module-level formatter with this format's conventions
        return format_timestamp(
            seconds=seconds,
            always_include_hours=self.always_include_hours,
            decimal_marker=self.decimal_marker,
        )
186
+
187
+
188
class WriteVTT(SubtitlesWriter):
    """Write subtitles in WebVTT format."""

    extension: str = "vtt"
    always_include_hours: bool = False
    decimal_marker: str = "."

    def write_result(self, result: dict, file: TextIO, options: dict):
        # every WebVTT file begins with the "WEBVTT" magic line
        print("WEBVTT\n", file=file)
        for cue_start, cue_end, cue_text in self.iterate_result(result, options):
            print(f"{cue_start} --> {cue_end}\n{cue_text}\n", file=file, flush=True)
197
+
198
+
199
class WriteSRT(SubtitlesWriter):
    """Write subtitles in SubRip (SRT) format, with 1-based cue numbers."""

    extension: str = "srt"
    always_include_hours: bool = True
    decimal_marker: str = ","

    def write_result(self, result: dict, file: TextIO, options: dict):
        index = 1
        for cue_start, cue_end, cue_text in self.iterate_result(result, options):
            print(f"{index}\n{cue_start} --> {cue_end}\n{cue_text}\n", file=file, flush=True)
            index += 1
209
+
210
+
211
class WriteTSV(ResultWriter):
    """
    Write a transcript to a file in TSV (tab-separated values) format containing lines like:
    <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>

    Using integer milliseconds as start and end times means there's no chance of interference from
    an environment setting a language encoding that causes the decimal in a floating point number
    to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
    """

    extension: str = "tsv"

    def write_result(self, result: dict, file: TextIO, options: dict):
        print("start", "end", "text", sep="\t", file=file)
        for segment in result["segments"]:
            start_ms = round(1000 * segment["start"])
            end_ms = round(1000 * segment["end"])
            # tabs inside the text would corrupt the column layout
            text = segment["text"].strip().replace("\t", " ")
            print(start_ms, end_ms, text, sep="\t", file=file, flush=True)
229
+
230
+
231
class WriteJSON(ResultWriter):
    # Dump the full result dict (text, segments, language) as one JSON object.
    extension: str = "json"

    def write_result(self, result: dict, file: TextIO, options: dict):
        json.dump(result, file)
236
+
237
+
238
def get_writer(
    output_format: str, output_dir: str
) -> Callable[[dict, TextIO, dict], None]:
    """Return a writer callable for `output_format`; "all" returns a callable
    that fans out to every supported format."""
    writers = {
        "txt": WriteTXT,
        "vtt": WriteVTT,
        "srt": WriteSRT,
        "tsv": WriteTSV,
        "json": WriteJSON,
    }

    if output_format != "all":
        return writers[output_format](output_dir)

    # instantiate every writer once and invoke each in turn
    all_writers = [writer_cls(output_dir) for writer_cls in writers.values()]

    def write_all(result: dict, file: TextIO, options: dict):
        for writer in all_writers:
            writer(result, file, options)

    return write_all
whisper/whisper/version.py ADDED
@@ -0,0 +1 @@
 
 
1
# Calendar-versioned release identifier (YYYYMMDD).
__version__ = "20230918"