imdbo committed
Commit 66a1dea
1 Parent(s): 1e95249

update to ct2

bpe-gl-en_emb.yaml DELETED
@@ -1,138 +0,0 @@
-
- save_data: run
- ## Where the vocab(s) will be written
- src_vocab: run/bpe.vocab.src
- tgt_vocab: run/bpe.vocab.tgt
- overwrite: True
-
- # Corpus opts:
- data:
-     europarl:
-         path_src: corpora/europarl/partitions/en_train.txt
-         path_tgt: corpora/europarl/partitions/gl_train.txt
-         transforms: [bpe, filtertoolong]
-         weight: 120
-     opensub:
-         path_tgt: corpora/opensub/partitions/en_train.txt
-         path_src: corpora/opensub/partitions/gl_train.txt
-         transforms: [bpe, filtertoolong]
-         weight: 152
-     opus:
-         path_tgt: corpora/opus/partitions/en_train.txt
-         path_src: corpora/opus/partitions/gl_train.txt
-         transforms: [bpe, filtertoolong]
-         weight: 160
-     ted2020:
-         path_tgt: corpora/ted2020/partitions/en_train.txt
-         path_src: corpora/ted2020/partitions/gl_train.txt
-         transforms: [bpe, filtertoolong]
-         weight: 10
-     corgaback:
-         path_tgt: corpora/corgaback/partitions/en_train.txt
-         path_src: corpora/corgaback/partitions/gl_train.txt
-         transforms: [bpe, filtertoolong]
-         weight: 15
-     ccmatrix:
-         path_tgt: corpora/ccmatrix/en_tok_dbo.txt
-         path_src: corpora/ccmatrix/gl_tok_dbo.txt
-         transforms: [bpe, filtertoolong]
-         weight: 380
-     wikimatrix:
-         path_tgt: corpora/wikimatrix/en.txt
-         path_src: corpora/wikimatrix/gl.txt
-         transforms: [bpe, filtertoolong]
-         weight: 70
-     cluvi:
-         path_tgt: corpora/cluvi/en.txt
-         path_src: corpora/cluvi/gl.txt
-         transforms: [bpe, filtertoolong]
-         weight: 70
-     valid:
-         path_tgt: corpora/partitions/all-en_valid.txt
-         path_src: corpora/partitions/all-gl_valid.txt
-         transforms: [bpe, filtertoolong]
-
- ### Transform related opts:
- #### Subword
- tgt_subword_model: ./bpe/en.code
- src_subword_model: ./bpe/gl.code
- src_subword_vocab: ./run/bpe.vocab.src
- tgt_subword_vocab: ./run/bpe.vocab.tgt
- src_subword_type: bpe
- tgt_subord_type: bpe
-
- src_subword_nbest: 1
- src_subword_alpha: 0.0
- tgt_subword_nbest: 1
- tgt_subword_alpha: 0.0
- #### Filter
- src_seq_length: 150
- tgt_seq_length: 150
-
- # silently ignore empty lines in the data
- skip_empty_level: silent
-
- ##embeddings
- tgt_embeddings: ../embeddings/en.emb.txt
- src_embeddings: ../embeddings/gl.emb.txt
-
- ## supported types: GloVe, word2vec
- embeddings_type: "word2vec"
-
- # word_vec_size need to match with the pretrained embeddings dimensions
- word_vec_size: 300
-
- # General opts
- save_model: run/model
- keep_checkpoint: 50
- save_checkpoint_steps: 10000
- average_decay: 0.0005
- seed: 1234
- report_every: 1000
- train_steps: 200000
- valid_steps: 10000
-
- # Batching
- queue_size: 10000
- bucket_size: 32768
- world_size: 1
- gpu_ranks: [0]
- batch_type: "tokens"
- batch_size: 8192
- #batch_size: 4096
- valid_batch_size: 64
- batch_size_multiple: 1
- max_generator_batches: 2
- accum_count: [4]
- accum_steps: [0]
-
- # Optimization
- model_dtype: "fp16"
- optim: "adam"
- learning_rate: 2
- warmup_steps: 8000
- decay_method: "noam"
- adam_beta2: 0.998
- max_grad_norm: 0
- label_smoothing: 0.1
- param_init: 0
- param_init_glorot: true
- normalization: "tokens"
-
- # Model
- encoder_type: transformer
- decoder_type: transformer
- position_encoding: true
- enc_layers: 6
- dec_layers: 6
- heads: 8
- rnn_size: 512
- word_vec_size: 512
- transformer_ff: 2048
- dropout_steps: [0]
- dropout: [0.1]
- attention_dropout: [0.1]
- share_decoder_embeddings: true
- share_embeddings: false
NOS-MT-gl-en.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c624e72797fc8476941125ae8b490b83d277fe1772fd3b2b561708de3dce17be
- size 1563576538
README.md CHANGED
@@ -3,63 +3,55 @@ license: mit
 language:
 - gl
 metrics:
- - bleu (Gold1): 43.6
- - bleu (Gold2): 58.3
- - bleu (Flores): 39.0
- - bleu (Test-suite): 48.7
- ---
- license: mit
 ---

- **English text [here](https://huggingface.co/proxectonos/Nos_MT-OpenNMT-gl-en/blob/main/README_English.md)**

- **Model Description**

- Model built with OpenNMT for the Galician-English pair using a transformer architecture.

- **How to translate**

- + Open a bash terminal
- + Install [Python 3.9](https://www.python.org/downloads/release/python-390/)
- + Install the [Open NMT toolkit v.2.2](https://github.com/OpenNMT/OpenNMT-py)
 + Translate an input_text using the NOS-MT-gl-en model with the following command:
-
 ```bash
- onmt_translate -src input_text -model NOS-MT-gl-en.pt --output ./output_file.txt --replace_unk -gpu 0
 ```
- + The resulting translation will be at the PATH indicated by the -output flag.

 **Training**

- For training we used authentic and synthetic corpora from [ProxectoNós](https://github.com/proxectonos/corpora). The former are corpora of translations made directly by human translators. The latter are English-Portuguese translation corpora, which we converted into English-Galician through Portuguese-Galician machine translation with Opentrad/Apertium and transliteration for out-of-vocabulary words.
-

- **Training process**

 + Tokenization of the datasets was done with the tokenizer (tokenizer.pl) from [linguakit](https://github.com/citiususc/Linguakit), modified to avoid the one-line-per-token output of the original file.

 + The BPE vocabulary for the models was generated with the [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py) script from OpenNMT

- + Using the .yaml in this repository you can replicate the training process. The paths in the .yaml file must be edited so that OpenNMT knows where to find the texts. Once this is done, the process can be started as follows:
-
- ```bash
- onmt_build_vocab -config bpe-gl-en_emb.yaml -n_sample 100000
- onmt_train -config bpe-gl-en_emb.yaml
- ```

- **Hyperparameters**
-
- The parameters used to develop the model can be consulted directly in the bpe-gl-en_emb.yaml file itself.

 **Evaluation**

- The BLEU evaluation of the models is done with a mix of internally developed tests (gold1, gold2, test-suite) and other datasets available in Galician (Flores).

 | GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
 | ------------- |:-------------:| -------:|----------:|
- | 43.6 | 58.3 | 39.0 | 48.7 |
-
-

 **Model Licenses**

@@ -89,15 +81,12 @@ SOFTWARE.

 This research was funded by the project "Nós: Galician in the society and economy of artificial intelligence", the result of an agreement between the Xunta de Galicia and the University of Santiago de Compostela, which led to grant ED431G2019/04 from the Galician Ministry of Education, University and Professional Training, and by the European Regional Development Fund (ERDF/FEDER programme), and Groups of Reference: ED431C 2020/21.

-
- **Citing this work**

 If you use this model in your work, please cite it as follows:

- Daniel Bardanca Outeirinho, Pablo Gamallo Otero, Iria de-Dios-Flores, and José Ramom Pichel Campos. 2024.
- Exploring the effects of vocabulary size in neural machine translation: Galician as a target language.
- In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 600–604,
 Santiago de Compostela, Galiza. Association for Computational Linguistics.

-
-
 language:
 - gl
 metrics:
+ - bleu (Gold1): 41.7
+ - bleu (Gold2): 55.5
+ - bleu (Flores): 37
+ - bleu (Test-suite): 45.2
 ---

+ **English text [here](https://huggingface.co/proxectonos/NOS-MT-OpenNMT-gl-en/blob/main/README_English.md)**

+ **Model Description**

+ Model built with OpenNMT-py 3.2 for the Galician-English pair using a transformer architecture. The model was converted to the CTranslate2 format.

+ **How to translate with this Model**

+ + Install [Python 3.9](https://www.python.org/downloads/release/python-390/)
+ + Install [CTranslate2 3.2](https://github.com/OpenNMT/CTranslate2)
 + Translate an input_text using the NOS-MT-gl-en model with the following commands (a sketch of the translate.py step follows below):
+ ```bash
+ perl tokenizer.perl < input.txt > input.tok
+ ```
+ ```bash
+ subword_nmt.apply_bpe -c ./bpe/es.bpe < input.tok > input.bpe
+ ```
 ```bash
+ python3 translate.py ./ct2-gl-en_12L input.bpe > output.txt
+ ```
+ ```bash
+ sed -i 's/@@ //g' output.txt
 ```
35
 
36
  **Adestramento**
37
 
38
+ No adestramento, utilizamos córpora auténticos e sintéticos do [ProxectoNós](https://github.com/proxectonos/corpora). Os primeiros son córpora de traducións feitas directamente por tradutores humanos. É importante salientar que a pesar destes textos seren feitos por humanos, non están libres de erros lingüísticos. Os segundos son córpora de traducións español-portugués, que convertemos en español-galego a través da tradución automática portugués-galego con Opentrad/Apertium e transliteración para palabras fóra de vocabulario.
 
39
 
40
+ **Procedemento de adestramento**
41
 
42
  + Tokenización dos datasets feita co tokenizador (tokenizer.pl) de [linguakit](https://github.com/citiususc/Linguakit) que foi modificado para evitar o salto de liña por token do ficheiro orixinal.
43
 
44
  + O vocabulario BPE para os modelos foi xerado a través do script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py) da OpenNMT
45
 
 
 
 
 
 
 
46
 
 
 
 
47
 
48
  **Avaliación**
49
 
50
+ A avaliación BLEU dos modelos é feita cunha mistura de tests desenvolvidos internamente (gold1, gold2, test-suite) con outros datasets disponíbeis en galego (Flores).
51
 
52
  | GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
53
  | ------------- |:-------------:| -------:|----------:|
54
+ | 41.7 | 55.5 | 37 | 45.2 |
 
 
55
 
56
  **Licenzas do Modelo**
57
 
 
81
 
82
  Esta investigación foi financiada polo proxecto "Nós: o galego na sociedade e economía da intelixencia artificial", resultado dun acordo entre a Xunta de Galicia e a Universidade de Santiago de Compostela, o que resultou no subsidio ED431G2019/04 da Consellaría de Educación, Universidade e Formación Profesional da Galiza, e polo Fondo Europeo de Desenvolvemento Rexional (programa ERDF/FEDER), e Grupos de Referencia: ED431C 2020/21.
83
 
84
+ **Citar este traballo**
 
85
 
86
  Se utilizar este modelo no seu traballo, cite por favor así:
87
 
88
+ Daniel Bardanca Outeirinho, Pablo Gamallo Otero, Iria de-Dios-Flores, and José Ramom Pichel Campos. 2024.
89
+ Exploring the effects of vocabulary size in neural machine translation: Galician as a target language.
90
+ In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 600–604,
91
  Santiago de Compostela, Galiza. Association for Computational Lingustics.
92
 
 
 
README_English.md CHANGED
@@ -3,60 +3,54 @@ license: mit
 language:
 - gl
 metrics:
- - bleu (Gold1): 43.6
- - bleu (Gold2): 58.3
- - bleu (Flores): 39.0
- - bleu (Test-suite): 48.7
 ---

- ---
- License: MIT
- ---

- **Model description**

- Model developed with OpenNMT for the Galician-Spanish pair using the transformer architecture.

- **How to translate**

- + Open bash terminal
 + Install [Python 3.9](https://www.python.org/downloads/release/python-390/)
- + Install [Open NMT toolkit v.2.2](https://github.com/OpenNMT/OpenNMT-py)
 + Translate an input_text using the NOS-MT-gl-en model with the following command:
-
 ```bash
- onmt_translate -src input_text -model NOS-MT-gl-en -output ./output_file.txt -replace_unk -gpu 0
 ```
- + The resulting translation will be in the PATH indicated by the -output flag.

 **Training**

- To train this model, we have used authentic and synthetic corpora from [ProxectoNós](https://github.com/proxectonos/corpora).

- Authentic corpora are corpora produced by human translators. Synthetic corpora are Spanish-Portuguese translations, which have been converted to Spanish-Galician by means of Portuguese-Galician translation with Opentrad/Apertium and transliteration for out-of-vocabulary words.

- **Training process**

- + Tokenisation was performed with a modified version of the [linguakit](https://github.com/citiususc/Linguakit) tokeniser (tokenizer.pl) that does not append a new line after each token.
- + All BPE models were generated with the script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py)
- + Using the .yaml in this repository, it is possible to replicate the original training process. Before training the model, please verify that the path to each target (tgt) and (src) file is correct. Once this is done, proceed as follows:

- ```bash
- onmt_build_vocab -config bpe-gl-en_emb.yaml -n_sample 100000
- onmt_train -config bpe-gl-en_emb.yaml
- ```
- **Hyperparameters**
-
- You may find the parameters used for this model inside the file bpe-gl-en_emb.yaml

- **Evaluation**
-
- The BLEU evaluation of the models is a mixture of internally developed tests (gold1, gold2, test-suite) and other datasets available in Galician (Flores).

 | GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
 | ------------- |:-------------:| -------:|----------:|
- | 43.6 | 58.3 | 39.0 | 48.7 |

 **Licensing information**

@@ -86,6 +80,11 @@ SOFTWARE.

 This research was funded by the project "Nós: Galician in the society and economy of artificial intelligence", agreement between Xunta de Galicia and University of Santiago de Compostela, and grant ED431G2019/04 by the Galician Ministry of Education, University and Professional Training, and the European Regional Development Fund (ERDF/FEDER program), and Groups of Reference: ED431C 2020/21.

- **Citation Information**

- Gamallo, Pablo; Bardanca, Daniel; Pichel, José Ramom; García, Marcos; Rodríguez-Rey, Sandra; de-Dios-Flores, Iria. 2023. NOS-MT-OpenNMT-gl-en. Url: https://huggingface.co/proxectonos/NOS-MT-OpenNMT-gl-en
 language:
 - gl
 metrics:
+ - bleu (Gold1): 41.7
+ - bleu (Gold2): 55.5
+ - bleu (Flores): 37
+ - bleu (Test-suite): 45.2
 ---

+ **English text [here](https://huggingface.co/proxectonos/NOS-MT-OpenNMT-gl-en/blob/main/README_English.md)**

+ **Model Description**

+ Model created with OpenNMT-py 3.2 for the Galician-English pair using a transformer architecture. The model was converted to the CTranslate2 format.

+ **How to Translate with this Model**

 + Install [Python 3.9](https://www.python.org/downloads/release/python-390/)
+ + Install [CTranslate2 3.2](https://github.com/OpenNMT/CTranslate2)
 + Translate an input_text using the NOS-MT-gl-en model with the following commands:
+ ```bash
+ perl tokenizer.perl < input.txt > input.tok
+ ```
+ ```bash
+ subword_nmt.apply_bpe -c ./bpe/es.bpe < input.tok > input.bpe
+ ```
 ```bash
+ python3 translate.py ./ct2-gl-en_12L input.bpe > output.txt
+ ```
+ ```bash
+ sed -i 's/@@ //g' output.txt
 ```
 
 **Training**

+ We used authentic and synthetic corpora from [ProxectoNós](https://github.com/proxectonos/corpora). The former are translation corpora made directly by human translators. It is important to note that, despite these texts being made by humans, they are not free from linguistic errors. The latter are English-Portuguese translation corpora, which we converted into English-Galician through Portuguese-Galician automatic translation with Opentrad/Apertium and transliteration for out-of-vocabulary words.

+ **Training Procedure**

+ Tokenization of the datasets was done with the tokenizer (tokenizer.pl) from [linguakit](https://github.com/citiususc/Linguakit), which was modified to avoid line breaks per token.

+ The BPE vocabulary for the models was generated using the script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py) from OpenNMT; a sketch of this step follows below.
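The card does not record the exact learn_bpe.py invocation. Below is a minimal sketch of how a BPE code file such as ./bpe/gl.code (referenced in the deleted bpe-gl-en_emb.yaml above) could be learned with the subword-nmt library, which provides the same functionality as OpenNMT-py's tools/learn_bpe.py; the corpus path and the 32,000 merge operations are assumptions, not documented values.

```python
# Sketch only: learn BPE merge operations for the Galician side of the corpus.
# The input path and the number of merges (32k) are assumptions.
from subword_nmt.learn_bpe import learn_bpe

with open("corpora/all-gl_train.tok", encoding="utf-8") as corpus, \
     open("bpe/gl.code", "w", encoding="utf-8") as codes:
    learn_bpe(corpus, codes, num_symbols=32000)
```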
+ **Evaluation**

+ The BLEU evaluation of the models is done with a mix of internally developed tests (gold1, gold2, test-suite) and other datasets available in Galician (Flores).

 | GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
 | ------------- |:-------------:| -------:|----------:|
+ | 35.5 | 44.5 | 32.4 | 41.4 |
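The card does not state which BLEU implementation produced these scores. A minimal sketch of such an evaluation with sacrebleu is shown below, assuming one detokenised hypothesis and one reference per line; the file names are placeholders.

```python
# Sketch: corpus-level BLEU with sacrebleu; file names are placeholders.
import sacrebleu

with open("flores.gl-en.hyp", encoding="utf-8") as hyp_file, \
     open("flores.gl-en.ref", encoding="utf-8") as ref_file:
    hypotheses = [line.strip() for line in hyp_file]
    references = [line.strip() for line in ref_file]

bleu = sacrebleu.corpus_bleu(hypotheses, [references])
print(round(bleu.score, 1))
```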
 **Licensing information**

 This research was funded by the project "Nós: Galician in the society and economy of artificial intelligence", agreement between Xunta de Galicia and University of Santiago de Compostela, and grant ED431G2019/04 by the Galician Ministry of Education, University and Professional Training, and the European Regional Development Fund (ERDF/FEDER program), and Groups of Reference: ED431C 2020/21.

+ **Citation Information**
+
+ Daniel Bardanca Outeirinho, Pablo Gamallo Otero, Iria de-Dios-Flores, and José Ramom Pichel Campos. 2024.
+ Exploring the effects of vocabulary size in neural machine translation: Galician as a target language.
+ In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 600–604,
+ Santiago de Compostela, Galiza. Association for Computational Linguistics.
ct2-gl-en_12L/config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "add_source_bos": false,
+   "add_source_eos": false,
+   "bos_token": "<s>",
+   "decoder_start_token": "<s>",
+   "eos_token": "</s>",
+   "layer_norm_epsilon": null,
+   "unk_token": "<unk>"
+ }
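This commit removes the OpenNMT-py checkpoint NOS-MT-gl-en.pt and adds the ct2-gl-en_12L directory (model.bin, config.json and the vocabulary files), but it does not record how the conversion was done. A hedged sketch using CTranslate2's OpenNMT-py converter is shown below; the checkpoint name is taken from the deleted file, and everything else is an assumption. The same conversion should also be available through the ct2-opennmt-py-converter command-line entry point.

```python
# Sketch of the conversion step; not necessarily the command used for this commit.
import ctranslate2

# Convert the (now removed) OpenNMT-py checkpoint into the CTranslate2 directory.
converter = ctranslate2.converters.OpenNMTPyConverter("NOS-MT-gl-en.pt")
converter.convert("ct2-gl-en_12L", force=True)
```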
embeddings/en.emb.txt → ct2-gl-en_12L/model.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:6f242903b80ac06548e8c3648ca309d11dbb2c18c3fca775378b43ada3ff6f36
- size 288418417
+ oid sha256:ffb9614bc9c5f33c1fa56db6e1f5e24b384a191f50979cc5a57510cc97f20e69
+ size 497179365
ct2-gl-en_12L/source_vocabulary.json ADDED
The diff for this file is too large to render. See raw diff
 
ct2-gl-en_12L/target_vocabulary.json ADDED
The diff for this file is too large to render. See raw diff
 
embeddings/gl.emb.txt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3590ee6395bd0de984feee52b9e5015bc8f405d0f267c5e6cfdb7e38c466da57
- size 382372326