tiedeman committed on
Commit
8d39f9f
1 Parent(s): c60f16b

Initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ language:
4
+ - aai
5
+ - ace
6
+ - agn
7
+ - aia
8
+ - akl
9
+ - alj
10
+ - alp
11
+ - amk
12
+ - aoz
13
+ - apr
14
+ - atq
15
+ - aui
16
+ - ban
17
+ - bcl
18
+ - bep
19
+ - bhz
20
+ - bik
21
+ - bku
22
+ - blz
23
+ - bmk
24
+ - bnp
25
+ - bpr
26
+ - bps
27
+ - btd
28
+ - bth
29
+ - bto
30
+ - bts
31
+ - btx
32
+ - bug
33
+ - buk
34
+ - bvy
35
+ - bzh
36
+ - ceb
37
+ - cgc
38
+ - ch
39
+ - dad
40
+ - dob
41
+ - dtp
42
+ - dww
43
+ - emi
44
+ - en
45
+ - far
46
+ - fil
47
+ - fj
48
+ - frd
49
+ - gfk
50
+ - gil
51
+ - gor
52
+ - haw
53
+ - hil
54
+ - hla
55
+ - hnn
56
+ - hot
57
+ - hvn
58
+ - iba
59
+ - id
60
+ - ifa
61
+ - ifb
62
+ - ifk
63
+ - ifu
64
+ - ify
65
+ - ilo
66
+ - iry
67
+ - itv
68
+ - jv
69
+ - jvn
70
+ - kbm
71
+ - khz
72
+ - kje
73
+ - kne
74
+ - kpg
75
+ - kqe
76
+ - kqf
77
+ - kqw
78
+ - krj
79
+ - kud
80
+ - kwf
81
+ - kzf
82
+ - laa
83
+ - law
84
+ - lcm
85
+ - leu
86
+ - lew
87
+ - lex
88
+ - lid
89
+ - ljp
90
+ - lnd
91
+ - mad
92
+ - mak
93
+ - mbb
94
+ - mbf
95
+ - mbt
96
+ - mee
97
+ - mek
98
+ - mg
99
+ - mgm
100
+ - mh
101
+ - mhy
102
+ - mi
103
+ - mmo
104
+ - mmx
105
+ - mna
106
+ - mnb
107
+ - mog
108
+ - mox
109
+ - mpx
110
+ - mqj
111
+ - mrw
112
+ - ms
113
+ - msm
114
+ - mta
115
+ - mva
116
+ - mvp
117
+ - mvv
118
+ - mwc
119
+ - mwv
120
+ - myw
121
+ - mzz
122
+ - na
123
+ - nak
124
+ - nia
125
+ - nij
126
+ - niu
127
+ - npy
128
+ - nsn
129
+ - nss
130
+ - nwi
131
+ - obo
132
+ - pag
133
+ - pam
134
+ - pau
135
+ - plw
136
+ - pmf
137
+ - pmy
138
+ - pne
139
+ - ppk
140
+ - prf
141
+ - ptp
142
+ - ptu
143
+ - pwg
144
+ - rai
145
+ - rap
146
+ - rej
147
+ - rro
148
+ - rug
149
+ - sas
150
+ - sbl
151
+ - sda
152
+ - sgb
153
+ - sgz
154
+ - sm
155
+ - smk
156
+ - sml
157
+ - snc
158
+ - sps
159
+ - stn
160
+ - su
161
+ - swp
162
+ - sxn
163
+ - tbc
164
+ - tbl
165
+ - tbo
166
+ - tet
167
+ - tgo
168
+ - tgp
169
+ - tkl
170
+ - tl
171
+ - tlx
172
+ - to
173
+ - tpa
174
+ - tpz
175
+ - tte
176
+ - tuc
177
+ - tvl
178
+ - twb
179
+ - twu
180
+ - txa
181
+ - ty
182
+ - ubr
183
+ - uvl
184
+ - viv
185
+ - war
186
+ - wed
187
+ - wuv
188
+ - xsb
189
+ - xsi
190
+ - yml
191
+
192
+ tags:
193
+ - translation
194
+ - opus-mt-tc-bible
195
+
196
+ license: apache-2.0
197
+ model-index:
198
+ - name: opus-mt-tc-bible-big-poz-en
199
+ results:
200
+ - task:
201
+ name: Translation multi-eng
202
+ type: translation
203
+ args: multi-eng
204
+ dataset:
205
+ name: tatoeba-test-v2020-07-28-v2023-09-26
206
+ type: tatoeba_mt
207
+ args: multi-eng
208
+ metrics:
209
+ - name: BLEU
210
+ type: bleu
211
+ value: 30.5
212
+ - name: chr-F
213
+ type: chrf
214
+ value: 0.48821
215
+ ---
216
+ # opus-mt-tc-bible-big-poz-en
217
+
218
+ ## Table of Contents
219
+ - [Model Details](#model-details)
220
+ - [Uses](#uses)
221
+ - [Risks, Limitations and Biases](#risks-limitations-and-biases)
222
+ - [How to Get Started With the Model](#how-to-get-started-with-the-model)
223
+ - [Training](#training)
224
+ - [Evaluation](#evaluation)
225
+ - [Citation Information](#citation-information)
226
+ - [Acknowledgements](#acknowledgements)
227
+
228
+ ## Model Details
229
+
230
+ Neural machine translation model for translating from Malayo-Polynesian languages (poz) to English (en).
231
+
232
+ This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models were originally trained using the amazing framework of [Marian NMT](https://marian-nmt.github.io/), an efficient NMT implementation written in pure C++. The models have been converted to PyTorch using the transformers library by Hugging Face. Training data is taken from [OPUS](https://opus.nlpl.eu/) and training pipelines use the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).
233
+ **Model Description:**
234
+ - **Developed by:** Language Technology Research Group at the University of Helsinki
235
+ - **Model Type:** Translation (transformer-big)
236
+ - **Release**: 2024-08-17
237
+ - **License:** Apache-2.0
238
+ - **Language(s):**
239
+ - Source Language(s): aai ace agn aia akl alj alp amk aoz apr atq aui ban bcl bep bhz bik bku blz bmk bnp bpr bps btd bth bto bts btx bug buk bvy bzh ceb cgc cha dad dob dtp dww emi far fij fil frd gfk gil gor haw hil hla hnn hot hvn iba ifa ifb ifk ifu ify ilo ind iry itv jak jav jvn kbm khz kje kne kpg kqe kqf kqw krj kud kwf kzf laa law lcm leu lew lex lid ljp lnd mad mah mak max mbb mbf mbt mee mek mgm mhy mlg mmo mmx mna mnb mog mox mpx mqj mri mrw msa msm mta mva mvp mvv mwc mwv myw mzz nak nau nia nij niu npy nsn nss nwi obo pag pam pau plt plw pmf pmy pne ppk prf ptp ptu pwg rai rap rej rro rug sas sbl sda sgb sgz smk sml smo snc sps stn sun swp sxn tah tbc tbl tbo tet tgl tgo tgp tkl tlx tmw ton tpa tpz tte tuc tvl twb twu txa ubr uvl viv war wed wuv xsb xsi yml zlm zsm
240
+ - Target Language(s): eng
241
+ - **Original Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/poz-eng/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.zip)
242
+ - **Resources for more information:**
243
+ - [OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/poz-eng/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-08-17)
244
+ - [OPUS-MT-train GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
245
+ - [More information about MarianNMT models in the transformers library](https://huggingface.co/docs/transformers/model_doc/marian)
246
+ - [Tatoeba Translation Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge/)
247
+ - [HPLT bilingual data v1 (as part of the Tatoeba Translation Challenge dataset)](https://hplt-project.org/datasets/v1)
248
+ - [A massively parallel Bible corpus](https://aclanthology.org/L14-1215/)
249
+
250
+ ## Uses
251
+
252
+ This model can be used for translation and text-to-text generation.
253
+
254
+ ## Risks, Limitations and Biases
255
+
256
+ **CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing, offensive, and can propagate historical and current stereotypes.**
257
+
258
+ Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).
259
+
260
+ ## How to Get Started With the Model
261
+
262
+ A short code example:
263
+
264
+ ```python
265
+ from transformers import MarianMTModel, MarianTokenizer
266
+
267
+ src_text = [
268
+ "Dapat sila'y may mga ideyang pangahas.",
269
+ "Dia memang seorang pekerja keras."
270
+ ]
271
+
272
+ model_name = "Helsinki-NLP/opus-mt-tc-bible-big-poz-en"
273
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
274
+ model = MarianMTModel.from_pretrained(model_name)
275
+ translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
276
+
277
+ for t in translated:
278
+     print(tokenizer.decode(t, skip_special_tokens=True))
279
+
280
+ # expected output:
281
+ # They should have some ideas.
282
+ # He was a hard worker.
283
+ ```
284
+
285
+ You can also use OPUS-MT models with the transformers pipelines, for example:
286
+
287
+ ```python
288
+ from transformers import pipeline
289
+ pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-bible-big-poz-en")
290
+ print(pipe("Dapat sila'y may mga ideyang pangahas."))
291
+
292
+ # expected output: They should have some ideas.
293
+ ```
294
+
295
+ ## Training
296
+
297
+ - **Data**: opusTCv20230926max50+bt+jhubc ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
298
+ - **Pre-processing**: SentencePiece (spm32k,spm32k)
299
+ - **Model Type:** transformer-big
300
+ - **Original MarianNMT Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/poz-eng/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.zip)
301
+ - **Training Scripts**: [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
302
+
303
+ ## Evaluation
304
+
305
+ * [Model scores at the OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/poz-eng/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-08-17)
306
+ * test set translations: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/poz-eng/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.test.txt)
307
+ * test set scores: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/poz-eng/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17.eval.txt)
308
+ * benchmark results: [benchmark_results.txt](benchmark_results.txt)
309
+ * benchmark output: [benchmark_translations.zip](benchmark_translations.zip)
310
+
311
+ | langpair | testset | chr-F | BLEU | #sent | #words |
312
+ |----------|---------|-------|-------|-------|--------|
313
+ | multi-eng | tatoeba-test-v2020-07-28-v2023-09-26 | 0.48821 | 30.5 | 10000 | 75409 |
314
+
315
+ ## Citation Information
316
+
317
+ * Publications: [Democratizing neural machine translation with OPUS-MT](https://doi.org/10.1007/s10579-023-09704-w) and [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (Please cite these publications if you use this model.)
318
+
319
+ ```bibtex
320
+ @article{tiedemann2023democratizing,
321
+ title={Democratizing neural machine translation with {OPUS-MT}},
322
+ author={Tiedemann, J{\"o}rg and Aulamo, Mikko and Bakshandaeva, Daria and Boggia, Michele and Gr{\"o}nroos, Stig-Arne and Nieminen, Tommi and Raganato, Alessandro and Scherrer, Yves and Vazquez, Raul and Virpioja, Sami},
323
+ journal={Language Resources and Evaluation},
324
+ volume={58},
325
+ pages={713--755},
326
+ year={2023},
327
+ publisher={Springer Nature},
328
+ issn={1574-0218},
329
+ doi={10.1007/s10579-023-09704-w}
330
+ }
331
+
332
+ @inproceedings{tiedemann-thottingal-2020-opus,
333
+ title = "{OPUS}-{MT} {--} Building open translation services for the World",
334
+ author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
335
+ booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
336
+ month = nov,
337
+ year = "2020",
338
+ address = "Lisboa, Portugal",
339
+ publisher = "European Association for Machine Translation",
340
+ url = "https://aclanthology.org/2020.eamt-1.61",
341
+ pages = "479--480",
342
+ }
343
+
344
+ @inproceedings{tiedemann-2020-tatoeba,
345
+ title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
346
+ author = {Tiedemann, J{\"o}rg},
347
+ booktitle = "Proceedings of the Fifth Conference on Machine Translation",
348
+ month = nov,
349
+ year = "2020",
350
+ address = "Online",
351
+ publisher = "Association for Computational Linguistics",
352
+ url = "https://aclanthology.org/2020.wmt-1.139",
353
+ pages = "1174--1182",
354
+ }
355
+ ```
356
+
357
+ ## Acknowledgements
358
+
359
+ The work is supported by the [HPLT project](https://hplt-project.org/), funded by the European Union’s Horizon Europe research and innovation programme under grant agreement No 101070350. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland, and the [EuroHPC supercomputer LUMI](https://www.lumi-supercomputer.eu/).
360
+
361
+ ## Model conversion info
362
+
363
+ * transformers version: 4.45.1
364
+ * OPUS-MT git hash: 0882077
365
+ * port time: Tue Oct 8 13:00:57 EEST 2024
366
+ * port machine: LM0-400-22516.local
benchmark_results.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ multi-eng tatoeba-test-v2020-07-28-v2023-09-26 0.48821 30.5 10000 75409
benchmark_translations.zip ADDED
File without changes
config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "pytorch-models/opus-mt-tc-bible-big-poz-en",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "relu",
5
+ "architectures": [
6
+ "MarianMTModel"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "bos_token_id": 0,
10
+ "classifier_dropout": 0.0,
11
+ "d_model": 1024,
12
+ "decoder_attention_heads": 16,
13
+ "decoder_ffn_dim": 4096,
14
+ "decoder_layerdrop": 0.0,
15
+ "decoder_layers": 6,
16
+ "decoder_start_token_id": 59756,
17
+ "decoder_vocab_size": 59757,
18
+ "dropout": 0.1,
19
+ "encoder_attention_heads": 16,
20
+ "encoder_ffn_dim": 4096,
21
+ "encoder_layerdrop": 0.0,
22
+ "encoder_layers": 6,
23
+ "eos_token_id": 769,
24
+ "forced_eos_token_id": null,
25
+ "init_std": 0.02,
26
+ "is_encoder_decoder": true,
27
+ "max_length": null,
28
+ "max_position_embeddings": 1024,
29
+ "model_type": "marian",
30
+ "normalize_embedding": false,
31
+ "num_beams": null,
32
+ "num_hidden_layers": 6,
33
+ "pad_token_id": 59756,
34
+ "scale_embedding": true,
35
+ "share_encoder_decoder_embeddings": true,
36
+ "static_position_embeddings": true,
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.45.1",
39
+ "use_cache": true,
40
+ "vocab_size": 59757
41
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bad_words_ids": [
4
+ [
5
+ 59756
6
+ ]
7
+ ],
8
+ "bos_token_id": 0,
9
+ "decoder_start_token_id": 59756,
10
+ "eos_token_id": 769,
11
+ "forced_eos_token_id": 769,
12
+ "max_length": 512,
13
+ "num_beams": 4,
14
+ "pad_token_id": 59756,
15
+ "transformers_version": "4.45.1"
16
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2b169e1e3dee47dc80d9eca9bf7897a0550e4a89e70e1eff8b864d073ba3ee8
3
+ size 950462820
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db3785875d155fe8669ec6a53131ba27cf4bb81d40e23e001fb9fad6f0c59292
3
+ size 950514053
source.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82ae9869dcc49b6adbcfd9e83660af6d1c238f214f8deffe4904e375fe034763
3
+ size 773117
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9f4612fd7a5431acb9b1b5497903bc8730ce46135289b08b1c30827679b9f20
3
+ size 801419
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"source_lang": "poz", "target_lang": "en", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20230926max50+bt+jhubc_transformer-big_2024-08-17/poz-en", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff