tiedeman committed on
Commit
579fe1b
1 Parent(s): eed9b20

Initial commit

.gitattributes CHANGED
@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,803 @@
+ ---
+ language:
+ - da
+ - es
+ - fr
+ - gmq
+ - is
+ - it
+ - itc
+ - la
+ - nb
+ - nn
+ - no
+ - pt
+ - sv
+
+ tags:
+ - translation
+ - opus-mt-tc
+
+ license: cc-by-4.0
+ model-index:
+ - name: opus-mt-tc-big-gmq-itc
+   results:
+   - task:
+       name: Translation dan-cat
+       type: translation
+       args: dan-cat
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: dan cat devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 33.4
+     - name: chr-F
+       type: chrf
+       value: 0.59224
+   - task:
+       name: Translation dan-fra
+       type: translation
+       args: dan-fra
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: dan fra devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 38.3
+     - name: chr-F
+       type: chrf
+       value: 0.63387
+   - task:
+       name: Translation dan-glg
+       type: translation
+       args: dan-glg
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: dan glg devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 26.4
+     - name: chr-F
+       type: chrf
+       value: 0.54446
+   - task:
+       name: Translation dan-ita
+       type: translation
+       args: dan-ita
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: dan ita devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 25.7
+     - name: chr-F
+       type: chrf
+       value: 0.55237
+   - task:
+       name: Translation dan-por
+       type: translation
+       args: dan-por
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: dan por devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 36.9
+     - name: chr-F
+       type: chrf
+       value: 0.62233
+   - task:
+       name: Translation dan-ron
+       type: translation
+       args: dan-ron
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: dan ron devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 31.8
+     - name: chr-F
+       type: chrf
+       value: 0.58235
+   - task:
+       name: Translation dan-spa
+       type: translation
+       args: dan-spa
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: dan spa devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 24.3
+     - name: chr-F
+       type: chrf
+       value: 0.52453
+   - task:
+       name: Translation isl-cat
+       type: translation
+       args: isl-cat
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: isl cat devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 22.7
+     - name: chr-F
+       type: chrf
+       value: 0.48930
+   - task:
+       name: Translation isl-fra
+       type: translation
+       args: isl-fra
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: isl fra devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 26.2
+     - name: chr-F
+       type: chrf
+       value: 0.52704
+   - task:
+       name: Translation isl-glg
+       type: translation
+       args: isl-glg
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: isl glg devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 18.0
+     - name: chr-F
+       type: chrf
+       value: 0.45387
+   - task:
+       name: Translation isl-ita
+       type: translation
+       args: isl-ita
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: isl ita devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 18.6
+     - name: chr-F
+       type: chrf
+       value: 0.47303
+   - task:
+       name: Translation isl-por
+       type: translation
+       args: isl-por
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: isl por devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 24.9
+     - name: chr-F
+       type: chrf
+       value: 0.51381
+   - task:
+       name: Translation isl-ron
+       type: translation
+       args: isl-ron
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: isl ron devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 21.6
+     - name: chr-F
+       type: chrf
+       value: 0.48224
+   - task:
+       name: Translation isl-spa
+       type: translation
+       args: isl-spa
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: isl spa devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 18.1
+     - name: chr-F
+       type: chrf
+       value: 0.45786
+   - task:
+       name: Translation nob-cat
+       type: translation
+       args: nob-cat
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: nob cat devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 28.9
+     - name: chr-F
+       type: chrf
+       value: 0.55984
+   - task:
+       name: Translation nob-fra
+       type: translation
+       args: nob-fra
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: nob fra devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 33.8
+     - name: chr-F
+       type: chrf
+       value: 0.60102
+   - task:
+       name: Translation nob-glg
+       type: translation
+       args: nob-glg
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: nob glg devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 23.4
+     - name: chr-F
+       type: chrf
+       value: 0.52145
+   - task:
+       name: Translation nob-ita
+       type: translation
+       args: nob-ita
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: nob ita devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 22.2
+     - name: chr-F
+       type: chrf
+       value: 0.52619
+   - task:
+       name: Translation nob-por
+       type: translation
+       args: nob-por
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: nob por devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 32.2
+     - name: chr-F
+       type: chrf
+       value: 0.58836
+   - task:
+       name: Translation nob-ron
+       type: translation
+       args: nob-ron
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: nob ron devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 27.6
+     - name: chr-F
+       type: chrf
+       value: 0.54845
+   - task:
+       name: Translation nob-spa
+       type: translation
+       args: nob-spa
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: nob spa devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 21.8
+     - name: chr-F
+       type: chrf
+       value: 0.50661
+   - task:
+       name: Translation swe-cat
+       type: translation
+       args: swe-cat
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: swe cat devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 32.4
+     - name: chr-F
+       type: chrf
+       value: 0.58542
+   - task:
+       name: Translation swe-fra
+       type: translation
+       args: swe-fra
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: swe fra devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 39.3
+     - name: chr-F
+       type: chrf
+       value: 0.63688
+   - task:
+       name: Translation swe-glg
+       type: translation
+       args: swe-glg
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: swe glg devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 26.0
+     - name: chr-F
+       type: chrf
+       value: 0.53989
+   - task:
+       name: Translation swe-ita
+       type: translation
+       args: swe-ita
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: swe ita devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 25.9
+     - name: chr-F
+       type: chrf
+       value: 0.55232
+   - task:
+       name: Translation swe-por
+       type: translation
+       args: swe-por
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: swe por devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 36.5
+     - name: chr-F
+       type: chrf
+       value: 0.61882
+   - task:
+       name: Translation swe-ron
+       type: translation
+       args: swe-ron
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: swe ron devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 31.0
+     - name: chr-F
+       type: chrf
+       value: 0.57419
+   - task:
+       name: Translation swe-spa
+       type: translation
+       args: swe-spa
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: swe spa devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 23.8
+     - name: chr-F
+       type: chrf
+       value: 0.52175
+   - task:
+       name: Translation dan-fra
+       type: translation
+       args: dan-fra
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: dan-fra
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 63.8
+     - name: chr-F
+       type: chrf
+       value: 0.76671
+   - task:
+       name: Translation dan-ita
+       type: translation
+       args: dan-ita
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: dan-ita
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 56.2
+     - name: chr-F
+       type: chrf
+       value: 0.74658
+   - task:
+       name: Translation dan-por
+       type: translation
+       args: dan-por
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: dan-por
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 57.8
+     - name: chr-F
+       type: chrf
+       value: 0.74944
+   - task:
+       name: Translation dan-spa
+       type: translation
+       args: dan-spa
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: dan-spa
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 54.8
+     - name: chr-F
+       type: chrf
+       value: 0.72328
+   - task:
+       name: Translation isl-ita
+       type: translation
+       args: isl-ita
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: isl-ita
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 51.0
+     - name: chr-F
+       type: chrf
+       value: 0.69354
+   - task:
+       name: Translation isl-spa
+       type: translation
+       args: isl-spa
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: isl-spa
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 49.2
+     - name: chr-F
+       type: chrf
+       value: 0.66008
+   - task:
+       name: Translation nob-fra
+       type: translation
+       args: nob-fra
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: nob-fra
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 54.4
+     - name: chr-F
+       type: chrf
+       value: 0.70854
+   - task:
+       name: Translation nob-spa
+       type: translation
+       args: nob-spa
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: nob-spa
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 55.9
+     - name: chr-F
+       type: chrf
+       value: 0.73672
+   - task:
+       name: Translation swe-fra
+       type: translation
+       args: swe-fra
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: swe-fra
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 59.2
+     - name: chr-F
+       type: chrf
+       value: 0.73014
+   - task:
+       name: Translation swe-ita
+       type: translation
+       args: swe-ita
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: swe-ita
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 56.6
+     - name: chr-F
+       type: chrf
+       value: 0.73211
+   - task:
+       name: Translation swe-por
+       type: translation
+       args: swe-por
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: swe-por
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 48.7
+     - name: chr-F
+       type: chrf
+       value: 0.68146
+   - task:
+       name: Translation swe-spa
+       type: translation
+       args: swe-spa
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: swe-spa
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 55.3
+     - name: chr-F
+       type: chrf
+       value: 0.71373
+ ---
+ # opus-mt-tc-big-gmq-itc
+
+ ## Table of Contents
+ - [Model Details](#model-details)
+ - [Uses](#uses)
+ - [Risks, Limitations and Biases](#risks-limitations-and-biases)
+ - [How to Get Started With the Model](#how-to-get-started-with-the-model)
+ - [Training](#training)
+ - [Evaluation](#evaluation)
+ - [Citation Information](#citation-information)
+ - [Acknowledgements](#acknowledgements)
+
+ ## Model Details
+
+ Neural machine translation model for translating from North Germanic languages (gmq) to Italic languages (itc).
+
+ This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models are originally trained using the amazing framework of [Marian NMT](https://marian-nmt.github.io/), an efficient NMT implementation written in pure C++. The models have been converted to PyTorch using the transformers library by Hugging Face. Training data is taken from [OPUS](https://opus.nlpl.eu/) and training pipelines use the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).
+
+ **Model Description:**
+ - **Developed by:** Language Technology Research Group at the University of Helsinki
+ - **Model Type:** Translation (transformer-big)
+ - **Release**: 2022-08-09
+ - **License:** CC-BY-4.0
+ - **Language(s):**
+   - Source Language(s): dan isl nno nob nor swe
+   - Target Language(s): fra ita lat por spa
+   - Valid Target Language Labels: >>fra<< >>ita<< >>lat<< >>por<< >>spa<<
+ - **Original Model**: [opusTCv20210807_transformer-big_2022-08-09.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/gmq-itc/opusTCv20210807_transformer-big_2022-08-09.zip)
+ - **Resources for more information:**
+   - [OPUS-MT-train GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
+   - More information about released models for this language pair: [OPUS-MT gmq-itc README](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/gmq-itc/README.md)
+   - [More information about MarianNMT models in the transformers library](https://huggingface.co/docs/transformers/model_doc/marian)
+   - [Tatoeba Translation Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge/)
+
+ This is a multilingual translation model with multiple target languages. A sentence-initial language token of the form `>>id<<` (where `id` is a valid target-language ID) is required, e.g. `>>fra<<`.
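+
+ For example, the set of language tokens a downloaded checkpoint actually accepts can be read from the tokenizer vocabulary; a minimal sketch (the listing shown in the comment is illustrative and should be verified against the checkpoint):
+
+ ```python
+ # a minimal sketch (assumes the checkpoint is available on the Hugging Face Hub)
+ from transformers import MarianTokenizer
+
+ tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-gmq-itc")
+ # language labels are ordinary vocabulary entries of the form >>id<<
+ lang_tokens = [t for t in tokenizer.get_vocab() if t.startswith(">>") and t.endswith("<<")]
+ print(sorted(lang_tokens))
+ # expected to include: ['>>fra<<', '>>ita<<', '>>lat<<', '>>por<<', '>>spa<<']
+ ```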
+
+ ## Uses
+
+ This model can be used for translation and text-to-text generation.
+
+ ## Risks, Limitations and Biases
+
+ **CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing or offensive and that can propagate historical and current stereotypes.**
+
+ Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).
+
+ ## How to Get Started With the Model
+
+ A short code example:
+
+ ```python
+ from transformers import MarianMTModel, MarianTokenizer
+
+ src_text = [
+     ">>spa<< Jag är inte religiös.",
+     ">>por<< Livet er for kort til å lære seg tysk."
+ ]
+
+ model_name = "Helsinki-NLP/opus-mt-tc-big-gmq-itc"
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
+ model = MarianMTModel.from_pretrained(model_name)
+ translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
+
+ for t in translated:
+     print(tokenizer.decode(t, skip_special_tokens=True))
+
+ # expected output:
+ #     No soy religioso.
+ #     A vida é muito curta para aprender alemão.
+ ```
+
+ You can also use OPUS-MT models with the transformers pipelines, for example:
+
+ ```python
+ from transformers import pipeline
+ pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-gmq-itc")
+ print(pipe(">>spa<< Jag är inte religiös."))
+
+ # expected output: [{'translation_text': 'No soy religioso.'}]
+ ```
+
+ ## Training
+
+ - **Data**: opusTCv20210807 ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
+ - **Pre-processing**: SentencePiece (spm32k,spm32k); see the sketch after this list
+ - **Model Type:** transformer-big
+ - **Original MarianNMT Model**: [opusTCv20210807_transformer-big_2022-08-09.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/gmq-itc/opusTCv20210807_transformer-big_2022-08-09.zip)
+ - **Training Scripts**: [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
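+
+ For a quick look at what the spm32k pre-processing does to an input sentence, the `source.spm` model shipped in this repository can be loaded directly; a minimal sketch, assuming the `sentencepiece` package is installed and the file has been downloaded locally:
+
+ ```python
+ # a minimal sketch (assumes `pip install sentencepiece` and a local copy of source.spm)
+ import sentencepiece as spm
+
+ sp = spm.SentencePieceProcessor(model_file="source.spm")
+ # prints the subword pieces the model sees (the exact segmentation depends on the vocabulary)
+ print(sp.encode("Jag är inte religiös.", out_type=str))
+ ```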
+
+ ## Evaluation
+
+ * test set translations: [opusTCv20210807_transformer-big_2022-08-09.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/gmq-itc/opusTCv20210807_transformer-big_2022-08-09.test.txt)
+ * test set scores: [opusTCv20210807_transformer-big_2022-08-09.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/gmq-itc/opusTCv20210807_transformer-big_2022-08-09.eval.txt)
+ * benchmark results: [benchmark_results.txt](benchmark_results.txt)
+ * benchmark output: [benchmark_translations.zip](benchmark_translations.zip)
+
+ | langpair | testset | chr-F | BLEU | #sent | #words |
+ |----------|---------|-------|------|-------|--------|
+ | dan-fra | tatoeba-test-v2021-08-07 | 0.76671 | 63.8 | 1731 | 11882 |
+ | dan-ita | tatoeba-test-v2021-08-07 | 0.74658 | 56.2 | 284 | 2226 |
+ | dan-por | tatoeba-test-v2021-08-07 | 0.74944 | 57.8 | 873 | 5360 |
+ | dan-spa | tatoeba-test-v2021-08-07 | 0.72328 | 54.8 | 5000 | 35528 |
+ | isl-ita | tatoeba-test-v2021-08-07 | 0.69354 | 51.0 | 236 | 1450 |
+ | isl-spa | tatoeba-test-v2021-08-07 | 0.66008 | 49.2 | 238 | 1229 |
+ | nob-fra | tatoeba-test-v2021-08-07 | 0.70854 | 54.4 | 323 | 2269 |
+ | nob-spa | tatoeba-test-v2021-08-07 | 0.73672 | 55.9 | 885 | 6866 |
+ | swe-fra | tatoeba-test-v2021-08-07 | 0.73014 | 59.2 | 1407 | 9580 |
+ | swe-ita | tatoeba-test-v2021-08-07 | 0.73211 | 56.6 | 715 | 4711 |
+ | swe-por | tatoeba-test-v2021-08-07 | 0.68146 | 48.7 | 320 | 2032 |
+ | swe-spa | tatoeba-test-v2021-08-07 | 0.71373 | 55.3 | 1351 | 8235 |
+ | dan-cat | flores101-devtest | 0.59224 | 33.4 | 1012 | 27304 |
+ | dan-fra | flores101-devtest | 0.63387 | 38.3 | 1012 | 28343 |
+ | dan-glg | flores101-devtest | 0.54446 | 26.4 | 1012 | 26582 |
+ | dan-ita | flores101-devtest | 0.55237 | 25.7 | 1012 | 27306 |
+ | dan-por | flores101-devtest | 0.62233 | 36.9 | 1012 | 26519 |
+ | dan-ron | flores101-devtest | 0.58235 | 31.8 | 1012 | 26799 |
+ | dan-spa | flores101-devtest | 0.52453 | 24.3 | 1012 | 29199 |
+ | isl-cat | flores101-devtest | 0.48930 | 22.7 | 1012 | 27304 |
+ | isl-fra | flores101-devtest | 0.52704 | 26.2 | 1012 | 28343 |
+ | isl-glg | flores101-devtest | 0.45387 | 18.0 | 1012 | 26582 |
+ | isl-ita | flores101-devtest | 0.47303 | 18.6 | 1012 | 27306 |
+ | isl-por | flores101-devtest | 0.51381 | 24.9 | 1012 | 26519 |
+ | isl-ron | flores101-devtest | 0.48224 | 21.6 | 1012 | 26799 |
+ | isl-spa | flores101-devtest | 0.45786 | 18.1 | 1012 | 29199 |
+ | nob-cat | flores101-devtest | 0.55984 | 28.9 | 1012 | 27304 |
+ | nob-fra | flores101-devtest | 0.60102 | 33.8 | 1012 | 28343 |
+ | nob-glg | flores101-devtest | 0.52145 | 23.4 | 1012 | 26582 |
+ | nob-ita | flores101-devtest | 0.52619 | 22.2 | 1012 | 27306 |
+ | nob-por | flores101-devtest | 0.58836 | 32.2 | 1012 | 26519 |
+ | nob-ron | flores101-devtest | 0.54845 | 27.6 | 1012 | 26799 |
+ | nob-spa | flores101-devtest | 0.50661 | 21.8 | 1012 | 29199 |
+ | swe-cat | flores101-devtest | 0.58542 | 32.4 | 1012 | 27304 |
+ | swe-fra | flores101-devtest | 0.63688 | 39.3 | 1012 | 28343 |
+ | swe-glg | flores101-devtest | 0.53989 | 26.0 | 1012 | 26582 |
+ | swe-ita | flores101-devtest | 0.55232 | 25.9 | 1012 | 27306 |
+ | swe-por | flores101-devtest | 0.61882 | 36.5 | 1012 | 26519 |
+ | swe-ron | flores101-devtest | 0.57419 | 31.0 | 1012 | 26799 |
+ | swe-spa | flores101-devtest | 0.52175 | 23.8 | 1012 | 29199 |
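+
+ Scores of this kind are typically computed with the `sacrebleu` package; a minimal sketch of how BLEU and chr-F can be obtained for a set of system outputs (the strings below are illustrative placeholders, not test data):
+
+ ```python
+ # a minimal sketch (assumes `pip install sacrebleu`)
+ from sacrebleu.metrics import BLEU, CHRF
+
+ hyps = ["No soy religioso."]      # system translations, one string per sentence
+ refs = [["No soy religioso."]]    # list of reference streams (one stream here)
+
+ print(BLEU().corpus_score(hyps, refs).score)        # BLEU on a 0-100 scale
+ print(CHRF().corpus_score(hyps, refs).score / 100)  # chr-F rescaled to 0-1 as in the table above
+ ```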
+
+ ## Citation Information
+
+ * Publications: [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (Please cite if you use this model.)
+
+ ```bibtex
+ @inproceedings{tiedemann-thottingal-2020-opus,
+     title = "{OPUS}-{MT} {--} Building open translation services for the World",
+     author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
+     booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
+     month = nov,
+     year = "2020",
+     address = "Lisboa, Portugal",
+     publisher = "European Association for Machine Translation",
+     url = "https://aclanthology.org/2020.eamt-1.61",
+     pages = "479--480",
+ }
+
+ @inproceedings{tiedemann-2020-tatoeba,
+     title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
+     author = {Tiedemann, J{\"o}rg},
+     booktitle = "Proceedings of the Fifth Conference on Machine Translation",
+     month = nov,
+     year = "2020",
+     address = "Online",
+     publisher = "Association for Computational Linguistics",
+     url = "https://aclanthology.org/2020.wmt-1.139",
+     pages = "1174--1182",
+ }
+ ```
+
+ ## Acknowledgements
+
+ The work is supported by the [European Language Grid](https://www.european-language-grid.eu/) as [pilot project 2866](https://live.european-language-grid.eu/catalogue/#/resource/projects/2866), by the [FoTran project](https://www.helsinki.fi/en/researchgroups/natural-language-understanding-with-cross-lingual-grounding), funded by the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement No 771113), and by the [MeMAD project](https://memad.eu/), funded by the European Union’s Horizon 2020 Research and Innovation Programme under grant agreement No 780069. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland.
+
+ ## Model conversion info
+
+ * transformers version: 4.16.2
+ * OPUS-MT git hash: 8b9f0b0
+ * port time: Fri Aug 12 14:15:19 EEST 2022
+ * port machine: LM0-400-22516.local
benchmark_results.txt ADDED
@@ -0,0 +1,85 @@
+ dan-cat	flores101-dev	0.58904	32.3	997	25962
+ dan-fra	flores101-dev	0.63627	38.7	997	26706
+ dan-glg	flores101-dev	0.53965	26.5	997	25265
+ dan-ita	flores101-dev	0.54662	25.0	997	25840
+ dan-por	flores101-dev	0.61581	36.3	997	25287
+ dan-ron	flores101-dev	0.57907	31.9	997	25616
+ dan-spa	flores101-dev	0.51873	24.1	997	27793
+ isl-cat	flores101-dev	0.48721	22.7	997	25962
+ isl-fra	flores101-dev	0.52986	26.4	997	26706
+ isl-glg	flores101-dev	0.45577	18.5	997	25265
+ isl-ita	flores101-dev	0.47766	18.7	997	25840
+ isl-por	flores101-dev	0.51411	25.2	997	25287
+ isl-ron	flores101-dev	0.48374	22.1	997	25616
+ isl-spa	flores101-dev	0.45296	18.5	997	27793
+ nob-cat	flores101-dev	0.55963	28.5	997	25962
+ nob-fra	flores101-dev	0.59978	34.3	997	26706
+ nob-glg	flores101-dev	0.51661	23.7	997	25265
+ nob-ita	flores101-dev	0.52610	22.5	997	25840
+ nob-por	flores101-dev	0.58589	31.8	997	25287
+ nob-ron	flores101-dev	0.54908	27.6	997	25616
+ nob-spa	flores101-dev	0.49895	21.4	997	27793
+ swe-cat	flores101-dev	0.58623	32.4	997	25962
+ swe-fra	flores101-dev	0.63855	39.4	997	26706
+ swe-glg	flores101-dev	0.53654	26.2	997	25265
+ swe-ita	flores101-dev	0.54976	25.3	997	25840
+ swe-por	flores101-dev	0.61208	36.2	997	25287
+ swe-ron	flores101-dev	0.58319	32.6	997	25616
+ swe-spa	flores101-dev	0.51446	23.7	997	27793
+ dan-cat	flores101-devtest	0.59224	33.4	1012	27304
+ dan-fra	flores101-devtest	0.63387	38.3	1012	28343
+ dan-glg	flores101-devtest	0.54446	26.4	1012	26582
+ dan-ita	flores101-devtest	0.55237	25.7	1012	27306
+ dan-por	flores101-devtest	0.62233	36.9	1012	26519
+ dan-ron	flores101-devtest	0.58235	31.8	1012	26799
+ dan-spa	flores101-devtest	0.52453	24.3	1012	29199
+ isl-cat	flores101-devtest	0.48930	22.7	1012	27304
+ isl-fra	flores101-devtest	0.52704	26.2	1012	28343
+ isl-glg	flores101-devtest	0.45387	18.0	1012	26582
+ isl-ita	flores101-devtest	0.47303	18.6	1012	27306
+ isl-por	flores101-devtest	0.51381	24.9	1012	26519
+ isl-ron	flores101-devtest	0.48224	21.6	1012	26799
+ isl-spa	flores101-devtest	0.45786	18.1	1012	29199
+ nob-cat	flores101-devtest	0.55984	28.9	1012	27304
+ nob-fra	flores101-devtest	0.60102	33.8	1012	28343
+ nob-glg	flores101-devtest	0.52145	23.4	1012	26582
+ nob-ita	flores101-devtest	0.52619	22.2	1012	27306
+ nob-por	flores101-devtest	0.58836	32.2	1012	26519
+ nob-ron	flores101-devtest	0.54845	27.6	1012	26799
+ nob-spa	flores101-devtest	0.50661	21.8	1012	29199
+ swe-cat	flores101-devtest	0.58542	32.4	1012	27304
+ swe-fra	flores101-devtest	0.63688	39.3	1012	28343
+ swe-glg	flores101-devtest	0.53989	26.0	1012	26582
+ swe-ita	flores101-devtest	0.55232	25.9	1012	27306
+ swe-por	flores101-devtest	0.61882	36.5	1012	26519
+ swe-ron	flores101-devtest	0.57419	31.0	1012	26799
+ swe-spa	flores101-devtest	0.52175	23.8	1012	29199
+ dan-fra	tatoeba-test-v2020-07-28	0.76678	63.8	1742	11929
+ dan-ita	tatoeba-test-v2020-07-28	0.74650	56.2	280	2202
+ dan-por	tatoeba-test-v2020-07-28	0.74906	57.8	871	5351
+ nob-fra	tatoeba-test-v2020-07-28	0.70768	54.2	322	2261
+ swe-fra	tatoeba-test-v2020-07-28	0.73024	59.2	1409	9585
+ swe-ita	tatoeba-test-v2020-07-28	0.72267	55.2	673	4420
+ swe-por	tatoeba-test-v2020-07-28	0.67926	48.2	299	1878
+ dan-fra	tatoeba-test-v2021-03-30	0.76678	63.8	1742	11929
+ dan-ita	tatoeba-test-v2021-03-30	0.74688	56.1	291	2289
+ dan-por	tatoeba-test-v2021-03-30	0.75007	57.9	880	5406
+ isl-ita	tatoeba-test-v2021-03-30	0.69485	51.2	237	1455
+ isl-spa	tatoeba-test-v2021-03-30	0.66114	49.4	239	1233
+ nob-fra	tatoeba-test-v2021-03-30	0.70983	54.6	326	2286
+ nob-spa	tatoeba-test-v2021-03-30	0.73669	55.9	894	6934
+ swe-fra	tatoeba-test-v2021-03-30	0.73024	59.2	1409	9585
+ swe-ita	tatoeba-test-v2021-03-30	0.72795	55.8	723	4741
+ swe-por	tatoeba-test-v2021-03-30	0.68052	48.2	319	1996
+ dan-fra	tatoeba-test-v2021-08-07	0.76671	63.8	1731	11882
+ dan-ita	tatoeba-test-v2021-08-07	0.74658	56.2	284	2226
+ dan-por	tatoeba-test-v2021-08-07	0.74944	57.8	873	5360
+ dan-spa	tatoeba-test-v2021-08-07	0.72328	54.8	5000	35528
+ isl-ita	tatoeba-test-v2021-08-07	0.69354	51.0	236	1450
+ isl-spa	tatoeba-test-v2021-08-07	0.66008	49.2	238	1229
+ nob-fra	tatoeba-test-v2021-08-07	0.70854	54.4	323	2269
+ nob-spa	tatoeba-test-v2021-08-07	0.73672	55.9	885	6866
+ swe-fra	tatoeba-test-v2021-08-07	0.73014	59.2	1407	9580
+ swe-ita	tatoeba-test-v2021-08-07	0.73211	56.6	715	4711
+ swe-por	tatoeba-test-v2021-08-07	0.68146	48.7	320	2032
+ swe-spa	tatoeba-test-v2021-08-07	0.71373	55.3	1351	8235
benchmark_translations.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85b21a39e84fba3c2b3806c5a6f22b624dd2db775f08e8ad6190eb81541dc830
+ size 9846623
config.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "activation_dropout": 0.0,
+   "activation_function": "relu",
+   "architectures": [
+     "MarianMTModel"
+   ],
+   "attention_dropout": 0.0,
+   "bad_words_ids": [
+     [
+       57968
+     ]
+   ],
+   "bos_token_id": 0,
+   "classifier_dropout": 0.0,
+   "d_model": 1024,
+   "decoder_attention_heads": 16,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 57968,
+   "decoder_vocab_size": 57969,
+   "dropout": 0.1,
+   "encoder_attention_heads": 16,
+   "encoder_ffn_dim": 4096,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "eos_token_id": 45463,
+   "forced_eos_token_id": 45463,
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "max_length": 512,
+   "max_position_embeddings": 1024,
+   "model_type": "marian",
+   "normalize_embedding": false,
+   "num_beams": 4,
+   "num_hidden_layers": 6,
+   "pad_token_id": 57968,
+   "scale_embedding": true,
+   "share_encoder_decoder_embeddings": true,
+   "static_position_embeddings": true,
+   "torch_dtype": "float16",
+   "transformers_version": "4.18.0.dev0",
+   "use_cache": true,
+   "vocab_size": 57969
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f5051ad03b0826001a0cf16f5fd563d782cf8987c8fe02842cb5d2fc155e660
+ size 590363395
source.spm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:48b6f16ca3913c42bf46c4af955e15d461d78ff660b291efbdb7df9bc28c371c
+ size 807660
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8125135d04b6fa8861f1fc8d0d8f8bf9b2af4be394d8c2e4aa89c340ad38105
+ size 815536
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"source_lang": "gmq", "target_lang": "itc", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20210807_transformer-big_2022-08-09/gmq-itc", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff