{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vK6B3ubS21Ky", "outputId": "45514cb3-fbc5-4156-b58a-8fd89900fbd5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting datasets==1.18.3\n", " Downloading datasets-1.18.3-py3-none-any.whl (311 kB)\n", "\u001b[?25l\r", "\u001b[K |█ | 10 kB 27.7 MB/s eta 0:00:01\r", "\u001b[K |██ | 20 kB 17.0 MB/s eta 0:00:01\r", "\u001b[K |███▏ | 30 kB 10.7 MB/s eta 0:00:01\r", "\u001b[K |████▏ | 40 kB 8.9 MB/s eta 0:00:01\r", "\u001b[K |█████▎ | 51 kB 8.5 MB/s eta 0:00:01\r", "\u001b[K |██████▎ | 61 kB 8.8 MB/s eta 0:00:01\r", "\u001b[K |███████▍ | 71 kB 6.3 MB/s eta 0:00:01\r", "\u001b[K |████████▍ | 81 kB 7.0 MB/s eta 0:00:01\r", "\u001b[K |█████████▌ | 92 kB 7.6 MB/s eta 0:00:01\r", "\u001b[K |██████████▌ | 102 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |███████████▋ | 112 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |████████████▋ | 122 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |█████████████▊ | 133 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |██████████████▊ | 143 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |███████████████▊ | 153 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |████████████████▉ | 163 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |█████████████████▉ | 174 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |███████████████████ | 184 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |████████████████████ | 194 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |█████████████████████ | 204 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |██████████████████████ | 215 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |███████████████████████▏ | 225 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |████████████████████████▏ | 235 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |█████████████████████████▎ | 245 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |██████████████████████████▎ | 256 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |███████████████████████████▍ | 266 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |████████████████████████████▍ | 276 kB 
8.2 MB/s eta 0:00:01\r", "\u001b[K |█████████████████████████████▍ | 286 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |██████████████████████████████▌ | 296 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |███████████████████████████████▌| 307 kB 8.2 MB/s eta 0:00:01\r", "\u001b[K |████████████████████████████████| 311 kB 8.2 MB/s \n", "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets==1.18.3) (21.3)\n", "Collecting xxhash\n", " Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)\n", "\u001b[K |████████████████████████████████| 243 kB 63.1 MB/s \n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets==1.18.3) (1.3.5)\n", "Collecting huggingface-hub<1.0.0,>=0.1.0\n", " Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)\n", "\u001b[K |████████████████████████████████| 67 kB 7.6 MB/s \n", "\u001b[?25hRequirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets==1.18.3) (0.70.12.2)\n", "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets==1.18.3) (0.3.4)\n", "Collecting fsspec[http]>=2021.05.0\n", " Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)\n", "\u001b[K |████████████████████████████████| 133 kB 75.2 MB/s \n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets==1.18.3) (1.19.5)\n", "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets==1.18.3) (4.10.1)\n", "Collecting aiohttp\n", " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", "\u001b[K |████████████████████████████████| 1.1 MB 45.3 MB/s \n", "\u001b[?25hRequirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets==1.18.3) (6.0.1)\n", "Requirement already 
satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets==1.18.3) (4.62.3)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets==1.18.3) (2.23.0)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==1.18.3) (3.13)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==1.18.3) (3.10.0.2)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==1.18.3) (3.4.2)\n", "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets==1.18.3) (3.0.7)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==1.18.3) (2.10)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==1.18.3) (2021.10.8)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==1.18.3) (3.0.4)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==1.18.3) (1.24.3)\n", "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==1.18.3) (2.0.11)\n", "Collecting async-timeout<5.0,>=4.0.0a3\n", " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", "Collecting aiosignal>=1.1.2\n", " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", "Collecting yarl<2.0,>=1.0\n", " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", "\u001b[K 
|████████████████████████████████| 271 kB 73.6 MB/s \n", "\u001b[?25hCollecting frozenlist>=1.1.1\n", " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", "\u001b[K |████████████████████████████████| 144 kB 75.8 MB/s \n", "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==1.18.3) (21.4.0)\n", "Collecting multidict<7.0,>=4.5\n", " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", "\u001b[K |████████████████████████████████| 94 kB 4.6 MB/s \n", "\u001b[?25hCollecting asynctest==0.13.0\n", " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets==1.18.3) (3.7.0)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==1.18.3) (2.8.2)\n", "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==1.18.3) (2018.9)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets==1.18.3) (1.15.0)\n", "Installing collected packages: multidict, frozenlist, yarl, asynctest, async-timeout, aiosignal, fsspec, aiohttp, xxhash, huggingface-hub, datasets\n", "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-1.18.3 frozenlist-1.3.0 fsspec-2022.1.0 huggingface-hub-0.4.0 multidict-6.0.2 xxhash-2.0.2 yarl-1.7.2\n", "Collecting transformers\n", " Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)\n", "\u001b[K |████████████████████████████████| 3.5 MB 7.7 MB/s \n", "\u001b[?25hCollecting pyyaml>=5.1\n", " Downloading 
PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", "\u001b[K |████████████████████████████████| 596 kB 69.9 MB/s \n", "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.3)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.4.2)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.4.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.3)\n", "Collecting tokenizers!=0.11.3,>=0.10.1\n", " Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)\n", "\u001b[K |████████████████████████████████| 6.8 MB 56.8 MB/s \n", "\u001b[?25hCollecting sacremoses\n", " Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)\n", "\u001b[K |████████████████████████████████| 895 kB 63.1 MB/s \n", "\u001b[?25hRequirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.10.1)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers) (3.10.0.2)\n", "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (3.0.7)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from 
importlib-metadata->transformers) (3.7.0)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.10.8)\n", "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.1.0)\n", "Installing collected packages: pyyaml, tokenizers, sacremoses, transformers\n", " Attempting uninstall: pyyaml\n", " Found existing installation: PyYAML 3.13\n", " Uninstalling PyYAML-3.13:\n", " Successfully uninstalled PyYAML-3.13\n", "Successfully installed pyyaml-6.0 sacremoses-0.0.47 tokenizers-0.11.4 transformers-4.16.2\n", "Collecting pyctcdecode\n", " Downloading pyctcdecode-0.3.0-py2.py3-none-any.whl (43 kB)\n", "\u001b[K |████████████████████████████████| 43 kB 1.6 MB/s \n", "\u001b[?25hCollecting pygtrie<3.0,>=2.1\n", " Downloading pygtrie-2.4.2.tar.gz (35 kB)\n", "Requirement already satisfied: numpy<2.0.0,>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from pyctcdecode) (1.19.5)\n", "Collecting hypothesis<7,>=6.14\n", " Downloading hypothesis-6.36.1-py3-none-any.whl (376 kB)\n", "\u001b[K |████████████████████████████████| 376 kB 15.6 MB/s \n", "\u001b[?25hRequirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /usr/local/lib/python3.7/dist-packages (from 
hypothesis<7,>=6.14->pyctcdecode) (2.4.0)\n", "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.7/dist-packages (from hypothesis<7,>=6.14->pyctcdecode) (21.4.0)\n", "Building wheels for collected packages: pygtrie\n", " Building wheel for pygtrie (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for pygtrie: filename=pygtrie-2.4.2-py3-none-any.whl size=19063 sha256=3e1760cae3e3e4b932ad5b40f9521fc521cfb4264fc18c6dad6eb32cef06dd6a\n", " Stored in directory: /root/.cache/pip/wheels/d3/f8/ba/1d828b1603ea422686eb694253a43cb3a5901ea4696c1e0603\n", "Successfully built pygtrie\n", "Installing collected packages: pygtrie, hypothesis, pyctcdecode\n", "Successfully installed hypothesis-6.36.1 pyctcdecode-0.3.0 pygtrie-2.4.2\n", "Collecting jiwer\n", " Downloading jiwer-2.3.0-py3-none-any.whl (15 kB)\n", "Collecting python-Levenshtein==0.12.2\n", " Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)\n", "\u001b[K |████████████████████████████████| 50 kB 4.9 MB/s \n", "\u001b[?25hRequirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from python-Levenshtein==0.12.2->jiwer) (57.4.0)\n", "Building wheels for collected packages: python-Levenshtein\n", " Building wheel for python-Levenshtein (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149867 sha256=d73a420a77fb8a29d4d2d2a66007f7304e6a3220c58a8cd107b90776e066ca95\n", " Stored in directory: /root/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d\n", "Successfully built python-Levenshtein\n", "Installing collected packages: python-Levenshtein, jiwer\n", "Successfully installed jiwer-2.3.0 python-Levenshtein-0.12.2\n", "Collecting https://github.com/kpu/kenlm/archive/master.zip\n", " Downloading https://github.com/kpu/kenlm/archive/master.zip\n", "\u001b[K / 541 kB 2.6 MB/s\n", "\u001b[?25hBuilding wheels for collected packages: kenlm\n", " Building wheel for kenlm (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for kenlm: filename=kenlm-0.0.0-cp37-cp37m-linux_x86_64.whl size=2336650 sha256=f922b3c4b1f55c42d03c7f220392afc2f3083bfaf5c7efddae493f021199a0a9\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-xu_qizqg/wheels/3d/aa/02/7b4a2eab5d7a2a9391bd9680dbad6270808a147bc3b7047e4e\n", "Successfully built kenlm\n", "Installing collected packages: kenlm\n", "Successfully installed kenlm-0.0.0\n", "\n", " _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|\n", " _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n", " _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|\n", " _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n", " _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|\n", "\n", " To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/token.\n", " (Deprecated, will be removed in v0.3.0) To login with username and password instead, interrupt with Ctrl+C.\n", " \n", "Token: \n", "Login successful\n", "Your token has been saved to /root/.huggingface/token\n", "\u001b[1m\u001b[31mAuthenticated through git-credential store but this isn't the helper defined on 
your machine.\n", "You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default\n", "\n", "git config --global credential.helper store\u001b[0m\n" ] } ], "source": [ "# Install dependencies, pinned to the versions the recorded run of this cell resolved,\n", "# so that Restart & Run All reproduces the same environment.\n", "!pip install datasets==1.18.3\n", "!pip install transformers==4.16.2\n", "!pip install pyctcdecode==0.3.0\n", "!pip install jiwer==2.3.0\n", "# NOTE(review): kenlm is built from the moving `master` archive and is therefore not\n", "# reproducible; consider replacing `master` with a specific commit archive.\n", "!pip install https://github.com/kpu/kenlm/archive/master.zip\n", "# Interactive login to the Hugging Face Hub; prompts for an access token.\n", "!huggingface-cli login" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 81, "referenced_widgets": [ "97811a06efab4df29d3ef838767a6da9", "ab02ea54b8e9400589dca75f7e96b24f", "5ac151e924344e468277e37b485bd9b6", "09a3df22649449c4b4d07c48e42cf878", "c256502ba22443bc827eeff817ee0946", "24b43add057d46c7aa17a47d1a42c4b2", "9b3168c0a1a14ae38009988aab6edced", "d0d16c12b6324ddeb2a2225405a66788", "66b6209125cb447f9f80fec90a90c992", "05f8011d09e7468bb9b05ab6fa4511b0", "eefc1a67bb5540d2a2798c1d422c7682", "c0cfd62da1fe4902b3ae977edbbfabd1", "4d46e6ca685649609379c8f1ff12ad76", "559392b1540d43e698c5b9e6a3553a64", "3f8c60d5962348be9659800019aaebba", "24d4432d875142fdaca84e4f7c2e02df", "6caca32fab474cf281201a52a7197b3a", "8c9476b5571f4bdc97d20c80c1425795", "83f66c1c61b646408635e4bd76eb63b6", "2bc9920b48614148b6cd2b6bf077dda6", "bcd6df19df36463e836bab2010eeba2c", "4f3b1f3b4e374dadaddd2852255a2f7e" ] }, "id": "wUoMGUOH24Sv", "outputId": "bba7b310-5ed2-4936-fee2-57a495ac3aaa" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "97811a06efab4df29d3ef838767a6da9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/1.90k [00:00