{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b7523cd66cf343f98fd3006be918a3b6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/10.1k [00:00=1.10\n", " Downloading wrapt-1.13.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (84 kB)\n", " |████████████████████████████████| 84 kB 12.8 MB/s \n", "\u001b[?25hBuilding wheels for collected packages: jaconv\n", " Building wheel for jaconv (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for jaconv: filename=jaconv-0.3-py3-none-any.whl size=15553 sha256=fd764f215e4d567cb60062a7052497b66729e9e2190e2e00153e0d19734088e7\n", " Stored in directory: /workspace/.cache/pip/wheels/73/e8/fb/b4ad8117719f79ac73bc05406d1768f845688cdbeed7aad87e\n", "Successfully built jaconv\n", "Installing collected packages: wrapt, jaconv, deprecated, pykakasi\n", "Successfully installed deprecated-1.2.13 jaconv-0.3 pykakasi-2.2.1 wrapt-1.13.3\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n", "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install pykakasi" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "にんじゃ ひらがな kana\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_2159/3076271513.py:4: DeprecationWarning: Call to deprecated method setMode. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n", " kakasi.setMode('J', 'H') #Convert from kanji to hiragana\n", "/tmp/ipykernel_2159/3076271513.py:6: DeprecationWarning: Call to deprecated method getConverter. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n", " conv = kakasi.getConverter()\n", "/tmp/ipykernel_2159/3076271513.py:10: DeprecationWarning: Call to deprecated method do. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n", " print(conv.do(str))\n" ] } ], "source": [ "from pykakasi import kakasi\n", "\n", "kakasi = kakasi()\n", "kakasi.setMode('J', 'H') #Convert from kanji to hiragana\n", "# kakasi.setMode(\"K\", \"H\") #Convert from katakana to hiragana\n", "conv = kakasi.getConverter()\n", "\n", "str = 'にんじゃ 平仮名 kana'\n", "\n", "print(conv.do(str))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "repo_name = 'https://huggingface.co/AndrewMcDowell/wav2vec2-xls-r-1B-german'\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n", "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n", "\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ad26c4d7d02948a3bc30d86a0f3527c8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_2159/322450745.py:5: DeprecationWarning: Call to deprecated method do. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n", " batch[\"sentence\"] = conv.do(re.sub(chars_to_remove_regex, '', batch[\"sentence\"]))\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "93295f1cd50f4557a96ff1bf139c9a37", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import re\n", "chars_to_remove_regex = '[\\,\\?\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�\\—\\’\\…\\–\\(\\,\\[\\]\\)\\(\\!]'\n", "# \\.\n", "def remove_special_characters(batch):\n", " batch[\"sentence\"] = conv.do(re.sub(chars_to_remove_regex, '', batch[\"sentence\"]))\n", " return batch\n", "\n", "common_voice_train = common_voice_train.map(remove_special_characters)\n", "common_voice_test = common_voice_test.map(remove_special_characters)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'client_id': '02a8841a00d762472a4797b56ee01643e8d9ece5a225f2e91c007ab1f94c49c99e50d19986ff3fefb18190257323f34238828114aa607f84fbe9764ecf5aaeaa',\n", " 'path': 'cv-corpus-8.0-2022-01-19/ja/clips/common_voice_ja_25467658.mp3',\n", " 'audio': {'path': 'cv-corpus-8.0-2022-01-19/ja/clips/common_voice_ja_25467658.mp3',\n", " 'array': array([0. , 0. , 0. , ..., 0.00026336, 0.00038834,\n", " 0.00026771], dtype=float32),\n", " 'sampling_rate': 48000},\n", " 'sentence': 'ちょっとがっこうでトラブルがありまして。',\n", " 'up_votes': 2,\n", " 'down_votes': 0,\n", " 'age': 'fourties',\n", " 'gender': 'female',\n", " 'accent': '',\n", " 'locale': 'ja',\n", " 'segment': ''}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "common_voice_train[1]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: datasets in /opt/conda/lib/python3.8/site-packages (1.18.2.dev0)\n", "Collecting datasets\n", " Downloading datasets-1.18.3-py3-none-any.whl (311 kB)\n", " |████████████████████████████████| 311 kB 11.0 MB/s \n", "\u001b[?25hRequirement already satisfied: aiohttp in /opt/conda/lib/python3.8/site-packages (from datasets) (3.8.1)\n", "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.4.0)\n", "Requirement already satisfied: dill in /opt/conda/lib/python3.8/site-packages (from datasets) (0.3.4)\n", "Requirement already satisfied: xxhash in /opt/conda/lib/python3.8/site-packages (from datasets) (2.0.2)\n", "Requirement already satisfied: multiprocess in /opt/conda/lib/python3.8/site-packages (from datasets) (0.70.12.2)\n", "Requirement already satisfied: pandas in /opt/conda/lib/python3.8/site-packages (from datasets) (1.4.0)\n", "Requirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (6.0.1)\n", "Requirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2.24.0)\n", "Requirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2022.1.0)\n", "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.8/site-packages (from datasets) (1.19.2)\n", "Requirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.8/site-packages (from datasets) (4.62.3)\n", "Requirement already satisfied: packaging in /opt/conda/lib/python3.8/site-packages (from datasets) (21.3)\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.0.12)\n", "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (5.4.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.0.1)\n", "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.8/site-packages (from packaging->datasets) (3.0.7)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (1.25.11)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2020.12.5)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2.10)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.0)\n", "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (2.0.10)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.7.2)\n", "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (21.4.0)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.2.0)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2021.1)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.8/site-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.15.0)\n", "Installing collected packages: datasets\n", " Attempting uninstall: datasets\n", " Found existing installation: datasets 1.18.2.dev0\n", " Uninstalling datasets-1.18.2.dev0:\n", "\u001b[31mERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: 'entry_points.txt'\n", "Consider using the `--user` option or check the permissions.\n", "\u001b[0m\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n", "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install --upgrade datasets" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting num2words\n", " Downloading num2words-0.5.10-py3-none-any.whl (101 kB)\n", " |████████████████████████████████| 101 kB 7.9 MB/s \n", "\u001b[?25hCollecting docopt>=0.6.2\n", " Downloading docopt-0.6.2.tar.gz (25 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25ldone\n", "\u001b[?25hBuilding wheels for collected packages: docopt\n", " Building wheel for docopt (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=7cda85e4b3980668714aad8f5d706fb5b189c2804ce1d99ca2380537fdc78031\n", " Stored in directory: /workspace/.cache/pip/wheels/56/ea/58/ead137b087d9e326852a851351d1debf4ada529b6ac0ec4e8c\n", "Successfully built docopt\n", "Installing collected packages: docopt, num2words\n", "Successfully installed docopt-0.6.2 num2words-0.5.10\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n", "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install num2words" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0da8fd9cdae64c1fa80fbcfc412bcf9c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "\n", "from num2words import num2words\n", "import regex as re\n", "matches = []\n", "\n", "def replace_numbers(match):\n", " match = match.group()\n", " matches.append(match)\n", " return num2words(match, lang='de')\n", "\n", "def replace_numbers_in_batch(batch):\n", " batch[\"sentence\"] = re.sub(r'\\d+(?:,\\d+)?', replace_numbers, batch[\"sentence\"])\n", " return batch\n", "\n", "common_voice_test_2 = common_voice_test.map(replace_numbers_in_batch)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "54d62ea7a0214b6abc5de1f106b330dc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "common_voice_train_2 = common_voice_train.map(replace_numbers_in_batch)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(matches)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# def replace_accented_characters(batch):\n", "# accented_string = u'Málaga'\n", "# # accented_string is of type 'unicode'\n", "# import unidecode\n", "# unaccented_string = unidecode.unidecode(accented_string)\n", "# batch[\"sentence\"] = re.sub('[â]', 'a', batch[\"sentence\"])\n", "# batch[\"sentence\"] = re.sub('[î]', 'i', batch[\"sentence\"])\n", "# batch[\"sentence\"] = re.sub('[ô]', 'o', batch[\"sentence\"])\n", "# batch[\"sentence\"] = re.sub('[û]', 'u', batch[\"sentence\"])\n", "# return batch\n", "\n", "def strip_accents(batch):\n", " return ''.join(c for c in unicodedata.normalize('NFD', batch[\"sentence\"]) if unicodedata.category(c) != 'Mn')\n", "\n", "common_voice_train = common_voice_train.map(replace_accented_characters)\n", "common_voice_test = common_voice_test.map(replace_accented_characters)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def extract_all_chars(batch):\n", " all_text = \" \".join(batch[\"sentence\"])\n", " vocab = list(set(all_text))\n", " return {\"vocab\": [vocab], \"all_text\": [all_text]}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c40f4d6b6bb74a56b2c570a3a53d7f4b", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1 [00:00\": 0, \"\": 1, \"\": 2, \"\": 3, \"|\": 4, \"'\": 5, \"-\": 6, \"A\": 7, \"B\": 8, \"C\": 9, \"D\": 10, \"E\": 11, \"F\": 12, \"G\": 13, \"H\": 14, \"I\": 15, \"J\": 16, \"K\": 17, \"L\": 18, \"M\": 19, \"N\": 20, \"O\": 21, \"P\": 22, \"Q\": 23, \"R\": 24, \"S\": 25, \"T\": 26, \"U\": 27, \"V\": 28, \"W\": 29, \"X\": 30, \"Y\": 31, \"Z\": 32, \"Ä\": 33, \"Í\": 34, \"Ó\": 35, \"Ö\": 36, \"Ü\": 37}\n" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "manually_kept_values = ['ß', 'ä', 'ö', 'ü']\n", "\n", "punctuation = ['.', ]" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['$', '&', '(', ')', '*', '+', '.', '/', '=', '@', '[', ']', '_', '`', '¡', '§', '«', '°', '´', 'µ', '·', '»', '×', 'à', 'á', 'â', 'ã', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ø', 'ù', 'ú', 'û', 'ý', 'þ', 'ā', 'ă', 'ą', 'ć', 'č', 'ď', 'đ', 'ē', 'ė', 'ę', 'ě', 'ğ', 'ġ', 'ħ', 'ī', 'ı', 'ł', 'ń', 'ņ', 'ň', 'ō', 'ŏ', 'ő', 'œ', 'ř', 'ś', 'ş', 'š', 'ť', 'ū', 'ů', 'ź', 'ż', 'ž', 'ơ', 'ǐ', 'ǔ', 'ș', 'ț', 'ə', 'ʻ', 'ʾ', 'ʿ', '̆', '̇', '̥', 'а', 'в', 'е', 'и', 'к', 'м', 'о', 'р', 'с', 'ф', 'ч', 'ш', 'ѹ', 'א', 'ב', 'נ', 'ע', 'ש', '་', 'ན', 'ḫ', 'ṟ', 'ṣ', 'ṭ', 'ạ', 'ả', 'ắ', 'ằ', 'ế', 'ễ', 'ệ', 'ọ', 'ồ', 'ộ', 'ụ', 'ứ', '‑', '‚', '„', '‟', '′', '″', '‹', '›', '→', '−', '≡', '⟨', '⟩', 'カ', '东', '临', '乡', '关', '合', '城', '孙', '尣', '幺', '支', '比', '毛', '泽', '無', '生', '臣', '辶', '道', '镇', '黃']\n" ] } ], "source": [ "odd_values = []\n", "for index, value in enumerate(sorted(vocab_list)):\n", "# if :\n", " if value not in j_vocab and not (16 <= index <= 41 or value == ' ') and value not in manually_kept_values:\n", " odd_values.append(value)\n", "# print(index, value)\n", " \n", "print(odd_values)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "$ & ( ) * + . / = @ [ ] _ ` ¡ § « ° ´ µ · » × à á â ã å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ø ù ú û ý þ ā ă ą ć č ď đ ē ė ę ě ğ ġ ħ ī ı ł ń ņ ň ō ŏ ő œ ř ś ş š ť ū ů ź ż ž ơ ǐ ǔ ș ț ə ʻ ʾ ʿ ̆ ̇ ̥ а в е и к м о р с ф ч ш ѹ א ב נ ע ש ་ ན ḫ ṟ ṣ ṭ ạ ả ắ ằ ế ễ ệ ọ ồ ộ ụ ứ ‑ ‚ „ ‟ ′ ″ ‹ › → − ≡ ⟨ ⟩ カ 东 临 乡 关 合 城 孙 尣 幺 支 比 毛 泽 無 生 臣 辶 道 镇 黃\n" ] } ], "source": [ "print(\" \".join(odd_values))\n", "\n", "# for value in odd_values:\n", "# if value not in manually_kept_values:\n", "# print(value)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "$ & ( ) * + = @ [ ] _ ` ¡ § « ° ´ µ · » × à á â ã å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ø ù ú û ý þ ā ă ą ć č ď đ ē ė ę ě ğ ġ ħ ī ı ł ń ņ ň ō ŏ ő œ ř ś ş š ť ū ů ź ż ž ơ ǐ ǔ ș ț ə ʻ ʾ ʿ ̆ ̇ ̥ а в е и к м о р с ф ч ш ѹ א ב נ ע ש ་ ན ḫ ṟ ṣ ṭ ạ ả ắ ằ ế ễ ệ ọ ồ ộ ụ ứ ‑ ‚ „ ‟ ′ ″ ‹ › → − ≡ ⟨ ⟩ カ 东 临 乡 关 合 城 孙 尣 幺 支 比 毛 泽 無 生 臣 辶 道 镇 黃" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "filtered_vocab_list = [value for value in vocab_list if value not in odd_values]" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['ß',\n", " 'j',\n", " 'r',\n", " 'h',\n", " 'd',\n", " 'l',\n", " 'z',\n", " 'n',\n", " 'm',\n", " 'c',\n", " 'ä',\n", " \"'\",\n", " 'g',\n", " 'e',\n", " 'w',\n", " 's',\n", " 'u',\n", " 'k',\n", " 'o',\n", " 'f',\n", " ' ',\n", " 'y',\n", " 'v',\n", " 'ö',\n", " 'ü',\n", " 'p',\n", " 'a',\n", " 'x',\n", " 'b',\n", " 'q',\n", " 't',\n", " 'i']" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_vocab_list" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'word_delimiter_token' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Input \u001b[0;32mIn [21]\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m vocab_dict \u001b[38;5;241m=\u001b[39m {v: k \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28msorted\u001b[39m(vocab_list))}\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# replace white space with delimiter token\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mword_delimiter_token\u001b[49m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 5\u001b[0m vocab_dict[word_delimiter_token] \u001b[38;5;241m=\u001b[39m vocab_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m vocab_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", "\u001b[0;31mNameError\u001b[0m: name 'word_delimiter_token' is not defined" ] } ], "source": [ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n", "\n", "# replace white space with delimiter token\n", "if word_delimiter_token is not None:\n", " vocab_dict[word_delimiter_token] = vocab_dict[\" \"]\n", " del vocab_dict[\" \"]\n", "\n", "# add unk and pad token\n", "if unk_token is not None:\n", " vocab_dict[unk_token] = len(vocab_dict)\n", "\n", "if pad_token is not None:\n", " vocab_dict[pad_token] = len(vocab_dict)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "59e89471ea85449ebbc709d0a9d7325c", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/437 [00:00\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m odd_example_texts \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m common_voice_train:\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m odd_values:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence\u001b[39m\u001b[38;5;124m\"\u001b[39m]: \n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1664\u001b[0m, in \u001b[0;36mDataset._iter\u001b[0;34m(self, decoded)\u001b[0m\n\u001b[1;32m 1658\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate through the examples.\u001b[39;00m\n\u001b[1;32m 1659\u001b[0m \n\u001b[1;32m 1660\u001b[0m \u001b[38;5;124;03mIf a formatting is set with :meth:`Dataset.set_format` rows will be returned with the\u001b[39;00m\n\u001b[1;32m 1661\u001b[0m \u001b[38;5;124;03mselected format.\u001b[39;00m\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_rows):\n\u001b[0;32m-> 1664\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecoded\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecoded\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1915\u001b[0m, in \u001b[0;36mDataset._getitem\u001b[0;34m(self, key, decoded, **kwargs)\u001b[0m\n\u001b[1;32m 1913\u001b[0m formatter \u001b[38;5;241m=\u001b[39m get_formatter(format_type, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures, decoded\u001b[38;5;241m=\u001b[39mdecoded, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mformat_kwargs)\n\u001b[1;32m 1914\u001b[0m pa_subtable \u001b[38;5;241m=\u001b[39m query_table(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data, key, indices\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1915\u001b[0m formatted_output \u001b[38;5;241m=\u001b[39m \u001b[43mformat_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1916\u001b[0m \u001b[43m \u001b[49m\u001b[43mpa_subtable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformatter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformatter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformat_columns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_all_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_all_columns\u001b[49m\n\u001b[1;32m 1917\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatted_output\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:533\u001b[0m, in \u001b[0;36mformat_table\u001b[0;34m(table, key, formatter, format_columns, output_all_columns)\u001b[0m\n\u001b[1;32m 531\u001b[0m python_formatter \u001b[38;5;241m=\u001b[39m PythonFormatter(features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 532\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m format_columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mformatter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m format_columns:\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:282\u001b[0m, in \u001b[0;36mFormatter.__call__\u001b[0;34m(self, pa_table, query_type)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, pa_table: pa\u001b[38;5;241m.\u001b[39mTable, query_type: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[RowFormat, ColumnFormat, BatchFormat]:\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrow\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 282\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mformat_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mformat_column(pa_table)\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:313\u001b[0m, in \u001b[0;36mPythonFormatter.format_row\u001b[0;34m(self, pa_table)\u001b[0m\n\u001b[1;32m 311\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpython_arrow_extractor()\u001b[38;5;241m.\u001b[39mextract_row(pa_table)\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoded:\n\u001b[0;32m--> 313\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpython_features_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 314\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m row\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:222\u001b[0m, in \u001b[0;36mPythonFeaturesDecoder.decode_row\u001b[0;34m(self, row)\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_row\u001b[39m(\u001b[38;5;28mself\u001b[39m, row: \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mdict\u001b[39m:\n\u001b[0;32m--> 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures \u001b[38;5;28;01melse\u001b[39;00m row\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1318\u001b[0m, in \u001b[0;36mFeatures.decode_example\u001b[0;34m(self, example)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 1319\u001b[0m column_name: decode_nested_example(feature, value)\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1319\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[0;32m-> 1319\u001b[0m column_name: \u001b[43mdecode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1056\u001b[0m, in \u001b[0;36mdecode_nested_example\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;66;03m# Object with special decoding:\u001b[39;00m\n\u001b[1;32m 1055\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (Audio, Image)):\n\u001b[0;32m-> 1056\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschema\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:97\u001b[0m, in \u001b[0;36mAudio.decode_example\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn audio sample should have one of \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpath\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbytes\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m but both are None in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m path\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmp3\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m---> 97\u001b[0m array, sampling_rate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decode_mp3\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file:\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:183\u001b[0m, in \u001b[0;36mAudio._decode_mp3\u001b[0;34m(self, path_or_file)\u001b[0m\n\u001b[1;32m 181\u001b[0m array \u001b[38;5;241m=\u001b[39m array\u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmono:\n\u001b[0;32m--> 183\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43marray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array, sampling_rate\n", "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/numpy/core/_methods.py:154\u001b[0m, in \u001b[0;36m_mean\u001b[0;34m(a, axis, dtype, out, keepdims)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Cast bool, unsigned int, and int to float64 by default\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, (\u001b[43mnt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minteger\u001b[49m, nt\u001b[38;5;241m.\u001b[39mbool_)):\n\u001b[1;32m 155\u001b[0m dtype \u001b[38;5;241m=\u001b[39m mu\u001b[38;5;241m.\u001b[39mdtype(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mf8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, nt\u001b[38;5;241m.\u001b[39mfloat16):\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 \n", "1 &\n", "2 '\n", "3 .\n", "4 /\n", "5 A\n", "6 B\n", "7 C\n", "8 D\n", "9 E\n", "10 F\n", "11 G\n", "12 H\n", "13 I\n", "14 J\n", "15 K\n", "16 L\n", "17 M\n", "18 N\n", "19 O\n", "20 P\n", "21 Q\n", "22 R\n", "23 S\n", "24 T\n", "25 U\n", "26 V\n", "27 W\n", "28 X\n", "29 Y\n", "30 Z\n", "31 a\n", "32 b\n", "33 c\n", "34 d\n", "35 e\n", "36 f\n", "37 g\n", "38 h\n", "39 i\n", "40 j\n", "41 k\n", "42 l\n", "43 m\n", "44 n\n", "45 o\n", "46 p\n", "47 q\n", "48 r\n", "49 s\n", "50 t\n", "51 u\n", "52 v\n", "53 w\n", "54 x\n", "55 y\n", "56 z\n", "57 ―\n", "58 、\n", "59 。\n", "60 々\n", "61 〇\n", "62 「\n", "63 」\n", "64 『\n", "65 』\n", "66 〜\n", "67 ぁ\n", "68 あ\n", "69 い\n", "70 う\n", "71 ぇ\n", "72 え\n", "73 お\n", "74 か\n", "75 が\n", "76 き\n", "77 ぎ\n", "78 く\n", "79 ぐ\n", "80 け\n", "81 げ\n", "82 こ\n", "83 ご\n", "84 さ\n", "85 ざ\n", "86 し\n", "87 じ\n", "88 す\n", "89 ず\n", "90 せ\n", "91 ぜ\n", "92 そ\n", "93 ぞ\n", "94 た\n", "95 だ\n", "96 ち\n", "97 ぢ\n", "98 っ\n", "99 つ\n", "100 づ\n", "101 て\n", "102 で\n", "103 と\n", "104 ど\n", "105 な\n", "106 に\n", "107 ぬ\n", "108 ね\n", "109 の\n", "110 は\n", "111 ば\n", "112 ぱ\n", "113 ひ\n", "114 び\n", "115 ぴ\n", "116 ふ\n", "117 ぶ\n", "118 ぷ\n", "119 へ\n", "120 べ\n", "121 ぺ\n", "122 ほ\n", "123 ぼ\n", "124 ぽ\n", "125 ま\n", "126 み\n", "127 む\n", "128 め\n", "129 も\n", "130 ゃ\n", "131 や\n", "132 ゅ\n", "133 ゆ\n", "134 ょ\n", "135 よ\n", "136 ら\n", "137 り\n", "138 る\n", "139 れ\n", "140 ろ\n", "141 わ\n", "142 を\n", "143 ん\n", "144 ァ\n", "145 ア\n", "146 ィ\n", "147 イ\n", "148 ゥ\n", "149 ウ\n", "150 ェ\n", "151 エ\n", "152 ォ\n", "153 オ\n", "154 カ\n", "155 ガ\n", "156 キ\n", "157 ギ\n", "158 ク\n", "159 グ\n", "160 ケ\n", "161 ゲ\n", "162 コ\n", "163 ゴ\n", "164 サ\n", "165 ザ\n", "166 シ\n", "167 ジ\n", "168 ス\n", "169 ズ\n", "170 セ\n", "171 ゼ\n", "172 ソ\n", "173 ゾ\n", "174 タ\n", "175 ダ\n", "176 チ\n", "177 ッ\n", "178 ツ\n", "179 ヅ\n", "180 テ\n", "181 デ\n", "182 ト\n", "183 ド\n", "184 ナ\n", "185 ニ\n", "186 ヌ\n", "187 ネ\n", "188 ノ\n", "189 ハ\n", "190 バ\n", "191 パ\n", "192 ヒ\n", "193 ビ\n", "194 ピ\n", "195 フ\n", "196 ブ\n", "197 プ\n", "198 ヘ\n", "199 ベ\n", "200 ペ\n", "201 ホ\n", "202 ボ\n", "203 ポ\n", "204 マ\n", "205 ミ\n", "206 ム\n", "207 メ\n", "208 モ\n", "209 ャ\n", "210 ヤ\n", "211 ュ\n", "212 ユ\n", "213 ョ\n", "214 ヨ\n", "215 ラ\n", "216 リ\n", "217 ル\n", "218 レ\n", "219 ロ\n", "220 ワ\n", "221 ン\n", "222 ヴ\n", "223 ヶ\n", "224 ・\n", "225 ー\n", "226 繫\n", "227 !\n", "228 &\n", "229 )\n", "230 -\n", "231 .\n", "232 :\n", "233 =\n", "234 ?\n", "235 A\n", "236 D\n", "237 F\n", "238 G\n", "239 N\n", "240 O\n", "241 P\n", "242 S\n", "243 U\n", "244 h\n", "245 j\n", "246 「\n", "247 」\n", "248 ・\n" ] } ], "source": [ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n", "for key, value in enumerate(vocab_dict):\n", " print(key, value)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def create_vocabulary_from_data(\n", " datasets: DatasetDict,\n", " word_delimiter_token: Optional[str] = None,\n", " unk_token: Optional[str] = None,\n", " pad_token: Optional[str] = None,\n", "):\n", " # Given training and test labels create vocabulary\n", " def extract_all_chars(batch):\n", " all_text = \" \".join(batch[\"target_text\"])\n", " vocab = list(set(all_text))\n", " return {\"vocab\": [vocab], \"all_text\": [all_text]}\n", "\n", " vocabs = datasets.map(\n", " extract_all_chars,\n", " batched=True,\n", " batch_size=-1,\n", " keep_in_memory=True,\n", " remove_columns=datasets[\"train\"].column_names,\n", " )\n", "\n", " # take union of all unique characters in each dataset\n", " vocab_set = functools.reduce(\n", " lambda vocab_1, vocab_2: set(vocab_1[\"vocab\"][0]) | set(vocab_2[\"vocab\"][0]), vocabs.values()\n", " )\n", "\n", " vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}\n", "\n", " # replace white space with delimiter token\n", " if word_delimiter_token is not None:\n", " vocab_dict[word_delimiter_token] = vocab_dict[\" \"]\n", " del vocab_dict[\" \"]\n", "\n", " # add unk and pad token\n", " if unk_token is not None:\n", " vocab_dict[unk_token] = len(vocab_dict)\n", "\n", " if pad_token is not None:\n", " vocab_dict[pad_token] = len(vocab_dict)\n", "\n", " return vocab_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load processor\n", "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)\n", "# feature_extractor = processor_with_lm.feature_extractor\n", "sampling_rate = feature_extractor.sampling_rate\n", "\n", "# resample audio\n", "dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=sampling_rate))\n", "\n", "# load eval pipeline\n", "asr = pipeline(\"automatic-speech-recognition\", model=repo_name, feature_extractor=feature_extractor)\n", "\n", "# map function to decode audio\n", "def map_to_pred(batch):\n", " prediction = asr(\n", " batch[\"audio\"][\"array\"])\n", "\n", " batch[\"prediction\"] = prediction[\"text\"]\n", " batch[\"target\"] = batch[\"sentence\"]\n", " return batch\n", "\n", "# run inference on all examples\n", "result = dataset.map(map_to_pred, remove_columns=dataset.column_names)\n", "print(result[\"prediction\"])\n", "\n", "result[0]['target']" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 4 }