import re
import unicodedata

from datasets import load_dataset

# French ("fr") training split of Common Voice 7.0 from the Hugging Face hub.
# `use_auth_token=True` asks `datasets` to authenticate with the locally stored
# Hugging Face token (presumably because the dataset requires accepting its
# terms -- confirm on the hub page).
dataset = load_dataset("mozilla-foundation/common_voice_7_0", "fr", split="train", use_auth_token=True)

# Matches every character that is NOT a lowercase ASCII letter, one of the
# listed accented letters, an apostrophe (straight or typographic) or a space.
chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\'’ ]'


def extract_text(batch):
    """Add a normalized ``text`` field derived from ``batch["sentence"]``.

    The sentence is Unicode-normalized to NFC, lowercased, stripped of every
    character matched by ``chars_to_ignore_regex`` and finally has typographic
    apostrophes replaced by straight ones.

    Args:
        batch: a single example (mapping) holding the raw transcript under the
            ``"sentence"`` key.

    Returns:
        The same mapping with the cleaned transcript stored under ``"text"``.
    """
    # NFC first: the whitelist above contains precomposed letters, so a
    # decomposed "e" + U+0301 would otherwise lose its combining accent and be
    # kept as a bare "e". Already-composed input is left unchanged by NFC.
    sentence = unicodedata.normalize("NFC", batch["sentence"]).lower()
    cleaned = re.sub(chars_to_ignore_regex, "", sentence)
    batch["text"] = cleaned.replace('’', "'")
    return batch


# Replace the raw "sentence" column with the cleaned "text" column.
dataset = dataset.map(extract_text, remove_columns=["sentence"])
import os
import kenlm
import sys

# Location of the 5-gram language model in ARPA format, relative to the
# notebook's working directory.
LM = os.path.join("language_model/", '5gram.arpa')

# The recorded run of this cell failed inside kenlm with
# "Cannot read model ... (End of file Byte: 0)", i.e. the reader hit
# end-of-file at byte 0 (presumably the ARPA file was created but never
# written). Fail early with an actionable message instead; a missing file
# still surfaces as FileNotFoundError (an OSError) from getsize().
if os.path.getsize(LM) == 0:
    raise OSError(
        "Language model file '{0}' is empty; re-run the step that builds the "
        "ARPA file before loading it.".format(LM)
    )

model = kenlm.LanguageModel(LM)
# `order` is the n-gram order reported by the loaded model.
print('{0}-gram model'.format(model.order))