tiedeman committed on
Commit f9d5d5a
1 Parent(s): 9f13e8d

Initial commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
---
library_name: transformers
language:
- aai
- ace
- agn
- aia
- akl
- alj
- alp
- ami
- amk
- aoz
- apr
- atq
- aui
- ban
- bcl
- bep
- bhz
- bik
- bku
- blz
- bmk
- bnp
- bpr
- bps
- btd
- bth
- bto
- bts
- btx
- bug
- buk
- bvy
- bzh
- ceb
- cgc
- ch
- dad
- dob
- dtp
- dww
- emi
- en
- far
- fil
- fj
- frd
- gfk
- gil
- gor
- haw
- hil
- hla
- hnn
- hot
- hvn
- iba
- id
- ifa
- ifb
- ifk
- ifu
- ify
- ilo
- iry
- itv
- jv
- jvn
- kbm
- khz
- kje
- kne
- kpg
- kqe
- kqf
- kqw
- krj
- kud
- kwf
- kzf
- laa
- law
- lcm
- leu
- lew
- lex
- lid
- ljp
- lnd
- mad
- mak
- mbb
- mbf
- mbt
- mee
- mek
- mg
- mgm
- mh
- mhy
- mi
- mmo
- mmx
- mna
- mnb
- mog
- mox
- mpx
- mqj
- mrw
- ms
- msm
- mta
- mva
- mvp
- mvv
- mwc
- mwv
- myw
- mzz
- na
- nak
- nia
- nij
- niu
- npy
- nsn
- nss
- nwi
- obo
- pag
- pam
- pau
- plw
- pmf
- pmy
- pne
- ppk
- prf
- ptp
- ptu
- pwg
- rai
- rap
- rej
- rro
- rug
- sas
- sbl
- sda
- sgb
- sgz
- sm
- smk
- sml
- snc
- sps
- stn
- su
- swp
- sxn
- tbc
- tbl
- tbo
- tet
- tgo
- tgp
- tkl
- tl
- tlx
- to
- tpa
- tpz
- trv
- tte
- tuc
- tvl
- twb
- twu
- txa
- ty
- ubr
- uvl
- viv
- war
- wed
- wuv
- xsb
- xsi
- yml

tags:
- translation
- opus-mt-tc-bible

license: apache-2.0
model-index:
- name: opus-mt-tc-bible-big-map-en
  results:
  - task:
      name: Translation multi-eng
      type: translation
      args: multi-eng
    dataset:
      name: tatoeba-test-v2020-07-28-v2023-09-26
      type: tatoeba_mt
      args: multi-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 30.5
    - name: chr-F
      type: chrf
      value: 0.48582
---
# opus-mt-tc-bible-big-map-en

## Table of Contents
- [Model Details](#model-details)
- [Uses](#uses)
- [Risks, Limitations and Biases](#risks-limitations-and-biases)
- [How to Get Started With the Model](#how-to-get-started-with-the-model)
- [Training](#training)
- [Evaluation](#evaluation)
- [Citation Information](#citation-information)
- [Acknowledgements](#acknowledgements)

## Model Details

Neural machine translation model for translating from Austronesian languages (map) to English (en).

This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages of the world. All models were originally trained with [Marian NMT](https://marian-nmt.github.io/), an efficient NMT framework written in pure C++, and then converted to PyTorch using the Hugging Face transformers library. Training data comes from [OPUS](https://opus.nlpl.eu/), and the training pipelines follow the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/OPUS-MT-train).

**Model Description:**
- **Developed by:** Language Technology Research Group at the University of Helsinki
- **Model Type:** Translation (transformer-big)
- **Release**: 2024-08-17
- **License:** Apache-2.0
- **Language(s):**
  - Source Language(s): aai ace agn aia akl alj alp ami amk aoz apr atq aui ban bcl bep bhz bik bku blz bmk bnp bpr bps btd bth bto bts btx bug buk bvy bzh ceb cgc cha dad dob dtp dww emi far fij fil frd gfk gil gor haw hil hla hnn hot hvn iba ifa ifb ifk ifu ify ilo ind iry itv jak jav jvn kbm khz kje kne kpg kqe kqf kqw krj kud kwf kzf laa law lcm leu lew lex lid ljp lnd mad mah mak max mbb mbf mbt mee mek mgm mhy mlg mmo mmx mna mnb mog mox mpx mqj mri mrw msa msm mta mva mvp mvv mwc mwv myw mzz nak nau nia nij niu npy nsn nss nwi obo pag pam pau plt plw pmf pmy pne ppk prf ptp ptu pwg rai rap rej rro rug sas sbl sda sgb sgz smk sml smo snc sps stn sun swp sxn tah tbc tbl tbo tet tgl tgo tgp tkl tlx tmw ton tpa tpz trv tte tuc tvl twb twu txa ubr uvl viv war wed wuv xsb xsi yml zlm zsm
  - Target Language(s): eng
- **Original Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/map-eng/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.zip)
- **Resources for more information:**
  - [OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/map-eng/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-08-17)
  - [OPUS-MT-train GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
  - [More information about MarianNMT models in the transformers library](https://huggingface.co/docs/transformers/model_doc/marian)
  - [Tatoeba Translation Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge/)
  - [HPLT bilingual data v1 (as part of the Tatoeba Translation Challenge dataset)](https://hplt-project.org/datasets/v1)
  - [A massively parallel Bible corpus](https://aclanthology.org/L14-1215/)

## Uses

This model can be used for translation and text-to-text generation.

## Risks, Limitations and Biases

**CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing, offensive, and can propagate historical and current stereotypes.**

Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).

## How to Get Started With the Model

A short code example:

```python
from transformers import MarianMTModel, MarianTokenizer

src_text = [
    "Tidak ada yang harus tahu.",
    "Napintaska."
]

model_name = "Helsinki-NLP/opus-mt-tc-bible-big-map-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

for t in translated:
    print(tokenizer.decode(t, skip_special_tokens=True))

# expected output:
# No one should know.
# You're beautiful.
```

You can also use OPUS-MT models with the transformers pipelines, for example:

```python
from transformers import pipeline
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-bible-big-map-en")
print(pipe("Tidak ada yang harus tahu."))

# expected output: No one should know.
```
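
By default, generation picks up the settings shipped in this repository's `generation_config.json` (beam search with 4 beams, a 512-token cap). If a GPU is available, the same example can run there; a minimal sketch, with the device handling added here for illustration and the generation parameters spelled out explicitly:

```python
import torch
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-tc-bible-big-map-en"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

# Tokenize on CPU, then move the batch to the model's device.
batch = tokenizer(["Tidak ada yang harus tahu."],
                  return_tensors="pt", padding=True).to(device)

# num_beams=4 and max_length=512 mirror the defaults in generation_config.json.
translated = model.generate(**batch, num_beams=4, max_length=512)
print(tokenizer.batch_decode(translated, skip_special_tokens=True))
```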

## Training

- **Data**: opusTCv20230926max50+bt+jhubc ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
- **Pre-processing**: SentencePiece (spm32k,spm32k); see the tokenizer sketch after this list
- **Model Type:** transformer-big
- **Original MarianNMT Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/map-eng/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.zip)
- **Training Scripts**: [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
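
The `source.spm` and `target.spm` files added in this commit are the two SentencePiece models behind the spm32k,spm32k pre-processing. A minimal sketch of inspecting the source-side model directly, assuming the `sentencepiece` package and a local copy of the actual LFS object (not the pointer file):

```python
import sentencepiece as spm

# Load the source-side SentencePiece model shipped with this repository.
sp = spm.SentencePieceProcessor(model_file="source.spm")

print(sp.get_piece_size())  # roughly 32k pieces, per the spm32k,spm32k note
print(sp.encode("Tidak ada yang harus tahu.", out_type=str))  # subword pieces
```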

## Evaluation

* [Model scores at the OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/map-eng/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-08-17)
* test set translations: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/map-eng/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.test.txt)
* test set scores: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/map-eng/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.eval.txt)
* benchmark results: [benchmark_results.txt](benchmark_results.txt)
* benchmark output: [benchmark_translations.zip](benchmark_translations.zip)

| langpair  | testset                              | chr-F   | BLEU | #sent | #words |
|-----------|--------------------------------------|---------|------|-------|--------|
| multi-eng | tatoeba-test-v2020-07-28-v2023-09-26 | 0.48582 | 30.5 | 10000 | 75897 |
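
The figures above are sacrebleu-style corpus scores on the Tatoeba test set. A minimal sketch of scoring your own system output the same way, assuming the `sacrebleu` package (the hypothesis/reference strings are illustrative; note that sacrebleu reports chrF on a 0-100 scale, while the table above uses 0-1):

```python
import sacrebleu

hyps = ["No one should know."]    # system outputs, one per segment
refs = [["No one should know."]]  # one list per reference stream

print(sacrebleu.corpus_bleu(hyps, refs).score)  # BLEU, 0-100
print(sacrebleu.corpus_chrf(hyps, refs).score)  # chrF, 0-100
```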

## Citation Information

* Publications: [Democratizing neural machine translation with OPUS-MT](https://doi.org/10.1007/s10579-023-09704-w), [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/), and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (please cite them if you use this model)

```bibtex
@article{tiedemann2023democratizing,
  title={Democratizing neural machine translation with {OPUS-MT}},
  author={Tiedemann, J{\"o}rg and Aulamo, Mikko and Bakshandaeva, Daria and Boggia, Michele and Gr{\"o}nroos, Stig-Arne and Nieminen, Tommi and Raganato, Alessandro and Scherrer, Yves and Vazquez, Raul and Virpioja, Sami},
  journal={Language Resources and Evaluation},
  number={58},
  pages={713--755},
  year={2023},
  publisher={Springer Nature},
  issn={1574-0218},
  doi={10.1007/s10579-023-09704-w}
}

@inproceedings{tiedemann-thottingal-2020-opus,
  title = "{OPUS}-{MT} {--} Building open translation services for the World",
  author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
  booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
  month = nov,
  year = "2020",
  address = "Lisboa, Portugal",
  publisher = "European Association for Machine Translation",
  url = "https://aclanthology.org/2020.eamt-1.61",
  pages = "479--480",
}

@inproceedings{tiedemann-2020-tatoeba,
  title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
  author = {Tiedemann, J{\"o}rg},
  booktitle = "Proceedings of the Fifth Conference on Machine Translation",
  month = nov,
  year = "2020",
  address = "Online",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2020.wmt-1.139",
  pages = "1174--1182",
}
```

## Acknowledgements

The work is supported by the [HPLT project](https://hplt-project.org/), funded by the European Union’s Horizon Europe research and innovation programme under grant agreement No 101070350. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland, and the [EuroHPC supercomputer LUMI](https://www.lumi-supercomputer.eu/).

## Model conversion info

* transformers version: 4.45.1
* OPUS-MT git hash: 0882077
* port time: Tue Oct 8 12:07:23 EEST 2024
* port machine: LM0-400-22516.local
benchmark_results.txt ADDED
multi-eng	tatoeba-test-v2020-07-28-v2023-09-26	0.48582	30.5	10000	75897
benchmark_translations.zip ADDED
File without changes
config.json ADDED
{
  "_name_or_path": "pytorch-models/opus-mt-tc-bible-big-map-en",
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59739,
  "decoder_vocab_size": 59740,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 769,
  "forced_eos_token_id": null,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length": null,
  "max_position_embeddings": 1024,
  "model_type": "marian",
  "normalize_embedding": false,
  "num_beams": null,
  "num_hidden_layers": 6,
  "pad_token_id": 59739,
  "scale_embedding": true,
  "share_encoder_decoder_embeddings": true,
  "static_position_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 59740
}
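
These hyperparameters (transformer-big: 1024-dimensional embeddings, 6+6 layers, 16 attention heads, a shared 59740-entry vocabulary) can be inspected programmatically; a minimal sketch, assuming the `Helsinki-NLP/opus-mt-tc-bible-big-map-en` repository ID used in the README examples:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Helsinki-NLP/opus-mt-tc-bible-big-map-en")

# Values mirror config.json above.
print(cfg.d_model)         # 1024
print(cfg.encoder_layers)  # 6
print(cfg.decoder_layers)  # 6
print(cfg.vocab_size)      # 59740
```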
generation_config.json ADDED
{
  "_from_model_config": true,
  "bad_words_ids": [
    [
      59739
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59739,
  "eos_token_id": 769,
  "forced_eos_token_id": 769,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59739,
  "transformers_version": "4.45.1"
}
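
These are the defaults `model.generate` picks up for this checkpoint: beam search with 4 beams, a 512-token cap, and the pad token (59739) banned from outputs via `bad_words_ids`. A minimal sketch of overriding them per call (the parameter values here are illustrative, not recommendations):

```python
# Assumes `model`, `tokenizer`, and `batch` from the README example above.
translated = model.generate(
    **batch,
    num_beams=8,     # widen the search beyond the default 4
    max_length=256,  # tighter cap than the default 512
)
print(tokenizer.batch_decode(translated, skip_special_tokens=True))
```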
model.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:a2db3ee80789e513dd4c303e28ffaab9fa28c8042c0e592bc1d392f74949b8ce
size 950393120
pytorch_model.bin ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:828c8da05a8acd5f355017fa03f94c927ada079fa72ff7e138a05d35dff780ae
size 950444357
source.spm ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:61ac724d4323a6c53bc8436ec423335e3590d944c9fb8aff932f501fb3c93ffa
size 772802
special_tokens_map.json ADDED
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:382a138dedba365907b8e56dfdf155cef1f686d1d767837b7d4110d8d7542cf0
size 801284
tokenizer_config.json ADDED
{"source_lang": "map", "target_lang": "en", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17/map-en", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED