kimbochen
/

whisper-small-jp

@@ -306,7 +306,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
    "metadata": {},
    "outputs": [],
@@ -328,7 +328,8 @@
     "        transcription = normalizer(transcription).strip()\n",
     "    \n",
     "    # encode target text to label ids\n",
-    "    batch[\"labels\"] = processor.tokenizer(transcription).input_ids\n",
     "    return batch"
    ]
   },
@@ -342,7 +343,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
    "metadata": {},
    "outputs": [],
@@ -360,7 +361,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
    "id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
    "metadata": {},
    "outputs": [],
@@ -381,7 +382,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
    "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
    "metadata": {},
    "outputs": [],
@@ -402,7 +403,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
    "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
    "metadata": {},
    "outputs": [],
@@ -413,14 +414,252 @@
     ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2d56f5bf",
    "metadata": {},
    "outputs": [],
    "source": [
-    "vectorized_datasets['train'][0]"
    ]
   },
   {
@@ -895,7 +1134,7 @@
    "execution_count": 26,
    "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
    "metadata": {
-    "scrolled": false
    },
    "outputs": [
     {
@@ -1139,7 +1378,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
    "metadata": {},
    "outputs": [
@@ -1155,6 +1394,60 @@
       "Special tokens file saved in ./special_tokens_map.json\n",
       "added tokens file saved in ./added_tokens.json\n"
      ]
     }
    ],
    "source": [

   },
   {
    "cell_type": "code",
+   "execution_count": 44,
    "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
    "metadata": {},
    "outputs": [],
     "        transcription = normalizer(transcription).strip()\n",
     "    \n",
     "    # encode target text to label ids\n",
+    "#     batch[\"labels\"] = processor.tokenizer(transcription).input_ids\n",
+    "    batch['labels'] = transcription\n",
     "    return batch"
    ]
   },
   },
   {
    "cell_type": "code",
+   "execution_count": 45,
    "id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 46,
    "id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 47,
    "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 48,
    "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
    "metadata": {},
    "outputs": [],
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "bede1184",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reading metadata...: 6505it [00:00, 35406.66it/s]\n",
+      "Reading metadata...: 4485it [00:00, 19930.24it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'多から一へというのは、世界を因果的に決定論的に考えることである、過去から考えることである、機械的に考えることである。'"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Pull the first example of the train split through an iterator and\n",
+    "# display its `labels` field (presumably the transcription text stored\n",
+    "# by prepare_dataset -- verify against the map call).\n",
+    "xb = next(iter(vectorized_datasets['train']))\n",
+    "xb['labels']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "ac1e8d5b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<|startoftranscript|>\n",
+      "<|ja|>\n",
+      "<|transcribe|>\n",
+      "<|notimestamps|>\n",
+      "多\n",
+      "から\n",
+      "一\n",
+      "へ\n",
+      "という\n",
+      "のは\n",
+      "、\n",
+      "世界\n",
+      "を\n",
+      "因\n",
+      "果\n",
+      "的\n",
+      "に\n",
+      "決\n",
+      "定\n",
+      "論\n",
+      "的\n",
+      "に\n",
+      "考\n",
+      "える\n",
+      "こと\n",
+      "で\n",
+      "ある\n",
+      "、\n",
+      "過去\n",
+      "から\n",
+      "考\n",
+      "える\n",
+      "こと\n",
+      "で\n",
+      "ある\n",
+      "、\n",
+      "機\n",
+      "�\n",
+      "�\n",
+      "的\n",
+      "に\n",
+      "考\n",
+      "える\n",
+      "こと\n",
+      "で\n",
+      "ある\n",
+      "。\n",
+      "<|endoftext|>\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tokenize the label text and print one decoded token per line.\n",
+    "# Every id is decoded on its own; the replacement characters in the\n",
+    "# recorded output are likely tokens that hold only part of a multi-byte\n",
+    "# character -- TODO confirm.\n",
+    "idxs = processor.tokenizer(xb['labels']).input_ids\n",
+    "for piece in map(processor.tokenizer.decode, idxs):\n",
+    "    print(piece)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "d33cefc4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[多から,\n",
+       " 一,\n",
+       " へ,\n",
+       " と,\n",
+       " いう,\n",
+       " の,\n",
+       " は,\n",
+       " 、,\n",
+       " 世界,\n",
+       " を,\n",
+       " 因果,\n",
+       " 的,\n",
+       " に,\n",
+       " 決定,\n",
+       " 論,\n",
+       " 的,\n",
+       " に,\n",
+       " 考える,\n",
+       " こと,\n",
+       " で,\n",
+       " ある,\n",
+       " 、,\n",
+       " 過去,\n",
+       " から,\n",
+       " 考える,\n",
+       " こと,\n",
+       " で,\n",
+       " ある,\n",
+       " 、,\n",
+       " 機械,\n",
+       " 的,\n",
+       " に,\n",
+       " 考える,\n",
+       " こと,\n",
+       " で,\n",
+       " ある,\n",
+       " 。]"
+      ]
+     },
+     "execution_count": 60,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# In the visible part of the notebook `tagger` is only created in a cell\n",
+    "# further down, so a top-to-bottom run on a fresh kernel would fail here.\n",
+    "# Build it locally to keep this cell self-contained.\n",
+    "from fugashi import Tagger\n",
+    "\n",
+    "tagger = Tagger('-Owakati')\n",
+    "# Calling the tagger on the label text returns its list of tokens.\n",
+    "tagger(xb['labels'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "id": "2cbb82ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Help on method decode in module transformers.tokenization_utils_base:\n",
+      "\n",
+      "decode(token_ids: Union[int, List[int], ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), ForwardRef('tf.Tensor')], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, **kwargs) -> str method of transformers.models.whisper.tokenization_whisper.WhisperTokenizer instance\n",
+      "    Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special\n",
+      "    tokens and clean up tokenization spaces.\n",
+      "    \n",
+      "    Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.\n",
+      "    \n",
+      "    Args:\n",
+      "        token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):\n",
+      "            List of tokenized input ids. Can be obtained using the `__call__` method.\n",
+      "        skip_special_tokens (`bool`, *optional*, defaults to `False`):\n",
+      "            Whether or not to remove special tokens in the decoding.\n",
+      "        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):\n",
+      "            Whether or not to clean up the tokenization spaces.\n",
+      "        kwargs (additional keyword arguments, *optional*):\n",
+      "            Will be passed to the underlying model specific decode method.\n",
+      "    \n",
+      "    Returns:\n",
+      "        `str`: The decoded sentence.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Scratch lookup: print the documentation of the tokenizer's decode method.\n",
+    "help(processor.tokenizer.decode)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "b4b9bbfc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'麩 菓子 は 、 麩 を 主材 料 と し た 日本 の 菓子 。'"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from fugashi import Tagger\n",
+    "\n",
+    "# Build a tagger with the '-Owakati' option and segment a sample sentence.\n",
+    "# Per the recorded output, parse() returns the tokens joined by spaces.\n",
+    "tagger = Tagger('-Owakati')\n",
+    "text = \"麩菓子は、麩を主材料とした日本の菓子。\"\n",
+    "tagger.parse(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "833ca62d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[麩, 菓子, は, 、, 麩, を, 主材, 料, と, し, た, 日本, の, 菓子, 。]"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Calling the tagger directly yields a list of tokens rather than the\n",
+    "# single space-joined string returned by parse().\n",
+    "tagger(text)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "7b7854d6",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# NOTE(review): the key is an empty string, so this looks like an unfinished\n",
+    "# scratch cell -- confirm the intended split name or remove the cell.\n",
+    "raw_datasets['']"
    ]
   },
   {
    "execution_count": 26,
    "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
    "metadata": {
+    "scrolled": true
    },
    "outputs": [
     {
   },
   {
    "cell_type": "code",
+   "execution_count": 28,
    "id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
    "metadata": {},
    "outputs": [
       "Special tokens file saved in ./special_tokens_map.json\n",
       "added tokens file saved in ./added_tokens.json\n"
      ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a47d7e61b9144723a4208cc4cc492eee",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a7eb0d82c2fd4f978981915aa2314463",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload file runs/Dec12_04-37-47_150-136-44-233/events.out.tfevents.1670819878.150-136-44-233.69039.0: 100%|###…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "remote: Scanning LFS files for validity, may be slow...        \n",
+      "remote: LFS file scan complete.        \n",
+      "To https://huggingface.co/kimbochen/whisper-small-jp\n",
+      "   d83a98f..0ff52f0  main -> main\n",
+      "\n",
+      "Dropping the following result as it does not have all the necessary fields:\n",
+      "{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'dataset': {'name': 'Common Voice 11.0', 'type': 'mozilla-foundation/common_voice_11_0', 'config': 'ja', 'split': 'test', 'args': 'ja'}}\n",
+      "To https://huggingface.co/kimbochen/whisper-small-jp\n",
+      "   0ff52f0..22e3a01  main -> main\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'https://huggingface.co/kimbochen/whisper-small-jp/commit/0ff52f0f1d63daf816427096a83f7bbf8f3892eb'"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [