File size: 11,040 Bytes
9ab5bc1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 |
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Bookworm MTL.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Ascendance of a Bookworm MTL\n",
"\n",
"This notebook uses a custom machine translation model to translate the Ascendance of a Bookworm WN into English.\n",
"\n",
"This model is in BETA. Pronouns are not fixed yet, new characters' names may be wrong, and sentence splitting isn't implemented yet, so the model likes making a single long sentence. These issues will be fixed in the future.\n",
"\n",
"If you encounter any poorly translated sentences and want to help improve the model, see the note at the bottom of the page.\n",
"\n",
"To run this notebook, make sure you are using a GPU runtime and then go to\n",
"Runtime > Run all. Once that is done, you can change the text in the translation cell and run it multiple times by clicking the run button to the left of the cell. "
],
"metadata": {
"id": "nkp0dv1zg93C"
}
},
{
"cell_type": "code",
"source": [
"#@title Run this to set up the environment\n",
"\n",
"!pip install transformers\n",
"!pip install accelerate\n",
"!pip install unidecode\n",
"!pip install spacy\n",
"!python -m spacy download ja_core_news_lg"
],
"metadata": {
"cellView": "form",
"id": "nM7cmpX4hl0q"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Run this to import python packages\n",
"\n",
"from functools import partial\n",
"import torch\n",
"from torch.cuda.amp import autocast\n",
"from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM, NllbTokenizerFast\n",
"import spacy\n",
"from tqdm.notebook import tqdm\n",
"import re\n",
"import unidecode\n",
"import unicodedata"
],
"metadata": {
"cellView": "form",
"id": "mSnruJt8r3qP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Run this to set the output language\n",
"#@markdown This model is multi-lingual! Here you can set the output language.\n",
"#@markdown It is best with English, but it can translate into other\n",
"#@markdown languages too. A couple are listed here, but you can enter a different\n",
"#@markdown one if you want. See pages 13-16 in [this pdf](https://arxiv.org/pdf/2207.04672.pdf)\n",
"#@markdown for a full list of supported languages.\n",
"\n",
"target_language = 'eng_Latn' #@param [\"eng_Latn\", \"spa_Latn\", \"fra_Latn\", \"deu_Latn\"] {allow-input: true}"
],
"metadata": {
"cellView": "form",
"id": "6w_HfApfhn9j"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Run this to initialize the model\n",
"\n",
"DEVICE = 'cuda:0'\n",
"model_checkpoint = \"thefrigidliquidation/nllb-200-distilled-1.3B-bookworm\"\n",
"\n",
"config = AutoConfig.from_pretrained(model_checkpoint)\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, src_lang=\"jpn_Jpan\", tgt_lang=target_language)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, torch_dtype=torch.float16).to(DEVICE)\n",
"\n",
"nlp_ja = spacy.load('ja_core_news_lg')"
],
"metadata": {
"cellView": "form",
"id": "cGnkjUgej6Uv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Run this to set up the code to do the translating\n",
"\n",
"DOTS_REGEX = re.compile(r\"^(?P<dots>[.โฆ]+)ใ?$\")\n",
"\n",
"\n",
"def char_filter(string):\n",
" latin = re.compile('[a-zA-Z]+')\n",
" for char in unicodedata.normalize('NFC', string):\n",
" decoded = unidecode.unidecode(char)\n",
" if latin.match(decoded):\n",
" yield char\n",
" else:\n",
" yield decoded\n",
"\n",
"\n",
"def clean_string(string):\n",
" s = \"\".join(char_filter(string))\n",
" s = \"\\n\".join((x.rstrip() for x in s.splitlines()))\n",
" return s\n",
"\n",
"\n",
"def split_lglines_sentences(nlp, text, split_on_len=200):\n",
" lines = text.splitlines()\n",
" for line in lines:\n",
" if len(line) < split_on_len:\n",
" yield line.strip()\n",
" continue\n",
" doc = nlp(line)\n",
" assert doc.has_annotation(\"SENT_START\")\n",
" spacy_sents = [str(x).strip() for x in doc.sents]\n",
" if len(spacy_sents) == 1:\n",
" yield spacy_sents[0]\n",
" continue\n",
" # japanese spacy is bad. combine again if needed\n",
" sents = []\n",
" for sent in spacy_sents:\n",
" if (len(sent) < 4) and (len(sents) > 0) and (len(sents[-1]) == 0 or sents[-1][-1] != '.'):\n",
" sents[-1] += sent\n",
" else:\n",
" sents.append(sent)\n",
" yield from (x for x in sents if not DOTS_REGEX.match(x))\n",
"\n",
"\n",
"def translate_m2m(translator, tokenizer: NllbTokenizerFast, device, pars, verbose: bool = False):\n",
" en_pars = []\n",
" pars_it = tqdm(pars, leave=False, smoothing=0.0) if verbose else pars\n",
" for line in pars_it:\n",
" if line.strip() == \"\":\n",
" en_pars.append(\"\")\n",
" continue\n",
" inputs = tokenizer(f\"{line}\", return_tensors=\"pt\")\n",
" inputs = {k: v.to(device) for (k, v) in inputs.items()}\n",
" generated_tokens = translator.generate(\n",
" **inputs,\n",
" forced_bos_token_id=tokenizer.lang_code_to_id[tokenizer.tgt_lang],\n",
" max_new_tokens=512,\n",
" no_repeat_ngram_size=4,\n",
" ).cpu()\n",
" with tokenizer.as_target_tokenizer():\n",
" outputs = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
" en_pars.append(*outputs)\n",
" return en_pars\n",
"\n",
"\n",
"translate = partial(translate_m2m, model, tokenizer, DEVICE)\n",
"\n",
"\n",
"def translate_long_text(text: str):\n",
" lines = split_lglines_sentences(nlp_ja, text, split_on_len=150)\n",
" with torch.no_grad():\n",
" with autocast(dtype=torch.float16):\n",
" en_lines = translate([clean_string(x).strip() for x in lines], verbose=True)\n",
" for en_line in en_lines:\n",
" print(en_line)"
],
"metadata": {
"cellView": "form",
"id": "zPFc9VP0k4_y"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Run this to translate the text\n",
"\n",
"#@markdown Enter the Japanese text into the box on the left between the three quotation marks (\"\"\").\n",
"#@markdown Make sure there is no text on the lines containing the three quotes.\n",
"#@markdown See the example text for an idea of the formatting required.\n",
"\n",
"text = \"\"\"\n",
"ๆฌ้ ใใจใ้บไนใใใฎใฏๆฌใๅฅฝใใ ใ\n",
"\n",
"ๅฟ็ๅญฆใๅฎๆใๆญดๅฒใๅฐ็ใๆ่ฒๅญฆใๆฐไฟๅญฆใๆฐๅญฆใ็ฉ็ใๅฐๅญฆใๅๅญฆใ็็ฉๅญฆใ่ธ่กใไฝ่ฒใ่จ่ชใ็ฉ่ชโฆโฆไบบ้กใฎ็ฅ่ญใใใฃใกใ่ฉฐใ่พผใพใใๆฌใๅฟใฎๅบใใๆใใฆใใใ\n",
"\n",
"ๆงใ
ใช็ฅ่ญใไธๅใซใพใจใใใใฆใใๆฌใ่ชญใใจใใจใฆใๅพใใใๆฐๅใซใชใใใใ่ชๅใใใฎ็ฎใง่ฆใใใจใใชใไธ็ใใๆฌๅฑใๅณๆธ้คจใซไธฆใถๅ็้ใ้ใใฆ่ฆใใฎใใไธ็ใๅบใใฃใฆใใใใใง้ถ้
ใงใใใ\n",
"\n",
"ๅคๅฝใฎๅคใ็ฉ่ชใ ใฃใฆใ้ใๆไปฃใฎใ้ใๅฝใฎ้ขจ็ฟใๅฃ้่ฆใใฆ่ถฃๆทฑใใใใใใใๅ้ใซใใใฆๆญดๅฒใใใใใใใ็ด่งฃใใฆใใใฐใๆ้ใๅฟใใใชใใฆใใคใใฎใใจใงใใใ\n",
"\n",
"้บไนใฏใๅณๆธ้คจใฎๅคใๆฌใ้ใใใใฆใใๆธๅบซใฎใๅคใๆฌ็ฌ็นใฎๅฐใ
้ปดใใณ่ญใๅใใๅใฃใฝใๅใใๅฅฝใใงใๅณๆธ้คจใซ่กใใจใใใใๆธๅบซใซๅ
ฅใ่พผใใใใใงใใฃใใใจๅคใๅใใฎใใ็ฉบๆฐใๅธใ่พผใฟใๅนดใ็ตใๆฌใ่ฆๅใใฐใ้บไนใฏใใใ ใใงๅฌใใใชใฃใฆใ่ๅฅฎใใฆใใพใใ\n",
"\"\"\"[1:-1]\n",
"\n",
"translate_long_text(text)"
],
"metadata": {
"id": "Rwv_rO9plAsj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Submit corrected sentences to improve the model!\n",
"#@markdown If you encounter poorly translated sentences with the wrong name or term, please correct them!\n",
"#@markdown You can use other translation sites (like [DeepL](https://www.deepl.com/translator))\n",
"#@markdown to make sure the Japanese and English sentences match.\n",
"\n",
"#@markdown Then run this cell and message [u/thefrigidliquidation](https://www.reddit.com/user/thefrigidliquidation/)\n",
"#@markdown on Reddit with this cell's output.\n",
"\n",
"import base64\n",
"import json\n",
"\n",
"\n",
"\n",
"ja_sent = 'The Japanese sentence.' #@param {type:\"string\"}\n",
"en_sent = 'The corrected English sentence.' #@param {type:\"string\"}\n",
"\n",
"df = {'translation': {'en': en_sent, 'ja': ja_sent}}\n",
"df_json = json.dumps(df)\n",
"\n",
"print(base64.b64encode(df_json.encode('ascii')).decode('ascii'))\n"
],
"metadata": {
"cellView": "form",
"id": "0yx9hnj6yBKA"
},
"execution_count": null,
"outputs": []
}
]
} |