Initial submission of a model with the new conversion procedure.
Browse files- README.md +111 -0
- config.json +43 -0
- metadata.json +1 -0
- pytorch_model.bin +3 -0
- source.spm +0 -0
- special_tokens_map.json +1 -0
- target.spm +0 -0
- tokenizer_config.json +1 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- dsb
|
4 |
+
- cs
|
5 |
+
- csb_Latn
|
6 |
+
- hsb
|
7 |
+
- pl
|
8 |
+
- zlw
|
9 |
+
- hu
|
10 |
+
- vro
|
11 |
+
- fi
|
12 |
+
- liv_Latn
|
13 |
+
- mdf
|
14 |
+
- krl
|
15 |
+
- fkv_Latn
|
16 |
+
- mhr
|
17 |
+
- et
|
18 |
+
- sma
|
19 |
+
- udm
|
20 |
+
- vep
|
21 |
+
- myv
|
22 |
+
- kpv
|
23 |
+
- se
|
24 |
+
- izh
|
25 |
+
- fiu
|
26 |
+
|
27 |
+
tags:
|
28 |
+
- translation
|
29 |
+
|
30 |
+
license: apache-2.0
|
31 |
+
---
|
32 |
+
### zlw-fiu
|
33 |
+
* source language name: West Slavic languages
|
34 |
+
* target language name: Finno-Ugrian languages
|
35 |
+
* OPUS readme: [README.md](https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/README.md)
|
36 |
+
* model: transformer
|
37 |
+
* source language codes: dsb, cs, csb_Latn, hsb, pl, zlw
|
38 |
+
* target language codes: hu, vro, fi, liv_Latn, mdf, krl, fkv_Latn, mhr, et, sma, udm, vep, myv, kpv, se, izh, fiu
|
39 |
+
* dataset: opus
|
40 |
+
* release date: 2021-02-18
|
41 |
+
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
|
42 |
+
* download original weights: [opus-2021-02-18.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.zip)
|
43 |
+
* a sentence-initial language token is required in the form of `>>id<<` (where id is a valid target language ID, usually three letters)
|
44 |
+
* Training data:
|
45 |
+
* ces-fin: Tatoeba-train (1000000)
|
46 |
+
* ces-hun: Tatoeba-train (1000000)
|
47 |
+
* pol-est: Tatoeba-train (1000000)
|
48 |
+
* pol-fin: Tatoeba-train (1000000)
|
49 |
+
* pol-hun: Tatoeba-train (1000000)
|
50 |
+
* Validation data:
|
51 |
+
* ces-fin: Tatoeba-dev, 1000
|
52 |
+
* ces-hun: Tatoeba-dev, 1000
|
53 |
+
* est-pol: Tatoeba-dev, 1000
|
54 |
+
* fin-pol: Tatoeba-dev, 1000
|
55 |
+
* hun-pol: Tatoeba-dev, 1000
|
56 |
+
* mhr-pol: Tatoeba-dev, 461
|
57 |
+
* total-size-shuffled: 5426
|
58 |
+
* devset-selected: top 5000 lines of Tatoeba-dev.src.shuffled!
|
59 |
+
* Test data:
|
60 |
+
* newssyscomb2009.ces-hun: 502/9733
|
61 |
+
* newstest2009.ces-hun: 2525/54965
|
62 |
+
* Tatoeba-test.ces-fin: 88/408
|
63 |
+
* Tatoeba-test.ces-hun: 1911/10336
|
64 |
+
* Tatoeba-test.multi-multi: 4562/25497
|
65 |
+
* Tatoeba-test.pol-chm: 5/36
|
66 |
+
* Tatoeba-test.pol-est: 15/98
|
67 |
+
* Tatoeba-test.pol-fin: 609/3293
|
68 |
+
* Tatoeba-test.pol-hun: 1934/11285
|
69 |
+
* test set translations file: [test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.test.txt)
|
70 |
+
* test set scores file: [eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.eval.txt)
|
71 |
+
* BLEU-scores
|
72 |
+
|Test set|score|
|
73 |
+
|---|---|
|
74 |
+
|Tatoeba-test.ces-fin|57.2|
|
75 |
+
|Tatoeba-test.ces-hun|42.6|
|
76 |
+
|Tatoeba-test.multi-multi|39.4|
|
77 |
+
|Tatoeba-test.pol-hun|36.6|
|
78 |
+
|Tatoeba-test.pol-fin|36.1|
|
79 |
+
|Tatoeba-test.pol-est|20.9|
|
80 |
+
|newssyscomb2009.ces-hun|13.9|
|
81 |
+
|newstest2009.ces-hun|13.9|
|
82 |
+
|Tatoeba-test.pol-chm|2.0|
|
83 |
+
* chr-F-scores
|
84 |
+
|Test set|score|
|
85 |
+
|---|---|
|
86 |
+
|Tatoeba-test.ces-fin|0.71|
|
87 |
+
|Tatoeba-test.ces-hun|0.637|
|
88 |
+
|Tatoeba-test.multi-multi|0.616|
|
89 |
+
|Tatoeba-test.pol-hun|0.605|
|
90 |
+
|Tatoeba-test.pol-fin|0.592|
|
91 |
+
|newssyscomb2009.ces-hun|0.449|
|
92 |
+
|newstest2009.ces-hun|0.443|
|
93 |
+
|Tatoeba-test.pol-est|0.372|
|
94 |
+
|Tatoeba-test.pol-chm|0.007|
|
95 |
+
|
96 |
+
### System Info:
|
97 |
+
* hf_name: zlw-fiu
|
98 |
+
* source_languages: dsb,cs,csb_Latn,hsb,pl,zlw
|
99 |
+
* target_languages: hu,vro,fi,liv_Latn,mdf,krl,fkv_Latn,mhr,et,sma,udm,vep,myv,kpv,se,izh,fiu
|
100 |
+
* opus_readme_url: https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.zip/README.md
|
101 |
+
* original_repo: Tatoeba-Challenge
|
102 |
+
* tags: ['translation']
|
103 |
+
* languages: ['dsb', 'cs', 'csb_Latn', 'hsb', 'pl', 'zlw', 'hu', 'vro', 'fi', 'liv_Latn', 'mdf', 'krl', 'fkv_Latn', 'mhr', 'et', 'sma', 'udm', 'vep', 'myv', 'kpv', 'se', 'izh', 'fiu']
|
104 |
+
* src_constituents: ['dsb', 'ces', 'csb_Latn', 'hsb', 'pol']
|
105 |
+
* tgt_constituents: ['hun', 'vro', 'fin', 'liv_Latn', 'mdf', 'krl', 'fkv_Latn', 'mhr', 'est', 'sma', 'udm', 'vep', 'myv', 'kpv', 'sme', 'izh']
|
106 |
+
* src_multilingual: True
|
107 |
+
* tgt_multilingual: True
|
108 |
+
* helsinki_git_sha: a0966db6db0ae616a28471ff0faf461b36fec07d
|
109 |
+
* transformers_git_sha: 3857f2b4e34912c942694489c2b667d9476e55f5
|
110 |
+
* port_machine: bungle
|
111 |
+
* port_time: 2021-06-29-15:24
|
config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"activation_dropout": 0.0,
|
3 |
+
"activation_function": "swish",
|
4 |
+
"architectures": [
|
5 |
+
"MarianMTModel"
|
6 |
+
],
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bad_words_ids": [
|
9 |
+
[
|
10 |
+
59746
|
11 |
+
]
|
12 |
+
],
|
13 |
+
"bos_token_id": 0,
|
14 |
+
"classifier_dropout": 0.0,
|
15 |
+
"d_model": 512,
|
16 |
+
"decoder_attention_heads": 8,
|
17 |
+
"decoder_ffn_dim": 2048,
|
18 |
+
"decoder_layerdrop": 0.0,
|
19 |
+
"decoder_layers": 6,
|
20 |
+
"decoder_start_token_id": 59746,
|
21 |
+
"dropout": 0.1,
|
22 |
+
"encoder_attention_heads": 8,
|
23 |
+
"encoder_ffn_dim": 2048,
|
24 |
+
"encoder_layerdrop": 0.0,
|
25 |
+
"encoder_layers": 6,
|
26 |
+
"eos_token_id": 0,
|
27 |
+
"forced_eos_token_id": 0,
|
28 |
+
"gradient_checkpointing": false,
|
29 |
+
"init_std": 0.02,
|
30 |
+
"is_encoder_decoder": true,
|
31 |
+
"max_length": 512,
|
32 |
+
"max_position_embeddings": 512,
|
33 |
+
"model_type": "marian",
|
34 |
+
"normalize_embedding": false,
|
35 |
+
"num_beams": 6,
|
36 |
+
"num_hidden_layers": 6,
|
37 |
+
"pad_token_id": 59746,
|
38 |
+
"scale_embedding": true,
|
39 |
+
"static_position_embeddings": true,
|
40 |
+
"transformers_version": "4.7.0.dev0",
|
41 |
+
"use_cache": true,
|
42 |
+
"vocab_size": 59747
|
43 |
+
}
|
metadata.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"hf_name": "zlw-fiu", "source_languages": "dsb,cs,csb_Latn,hsb,pl,zlw", "target_languages": "hu,vro,fi,liv_Latn,mdf,krl,fkv_Latn,mhr,et,sma,udm,vep,myv,kpv,se,izh,fiu", "opus_readme_url": "https://object.pouta.csc.fi/Tatoeba-MT-models/zlw-fiu/opus-2021-02-18.zip/README.md", "original_repo": "Tatoeba-Challenge", "tags": ["translation"], "languages": ["dsb", "cs", "csb_Latn", "hsb", "pl", "zlw", "hu", "vro", "fi", "liv_Latn", "mdf", "krl", "fkv_Latn", "mhr", "et", "sma", "udm", "vep", "myv", "kpv", "se", "izh", "fiu"], "src_constituents": ["dsb", "ces", "csb_Latn", "hsb", "pol"], "tgt_constituents": ["hun", "vro", "fin", "liv_Latn", "mdf", "krl", "fkv_Latn", "mhr", "est", "sma", "udm", "vep", "myv", "kpv", "sme", "izh"], "src_multilingual": true, "tgt_multilingual": true, "helsinki_git_sha": "a0966db6db0ae616a28471ff0faf461b36fec07d", "transformers_git_sha": "3857f2b4e34912c942694489c2b667d9476e55f5", "port_machine": "bungle", "port_time": "2021-06-29-15:24"}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:821f51c216a291c58c2cb8eb892709b247de79549c61e73ce60e9b02bf7d447c
|
3 |
+
size 210842993
|
source.spm
ADDED
Binary file (824 kB). View file
|
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
|
target.spm
ADDED
Binary file (813 kB). View file
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"source_lang": "zlw", "target_lang": "fiu", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "marian_ckpt/zlw-fiu"}
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|