update to ct2
- bpe-gl-en_emb.yaml +0 -138
- NOS-MT-gl-en.pt +0 -3
- README.md +28 -39
- README_English.md +33 -34
- ct2-gl-en_12L/config.json +9 -0
- embeddings/en.emb.txt → ct2-gl-en_12L/model.bin +2 -2
- ct2-gl-en_12L/source_vocabulary.json +0 -0
- ct2-gl-en_12L/target_vocabulary.json +0 -0
- embeddings/gl.emb.txt +0 -3
bpe-gl-en_emb.yaml
DELETED
@@ -1,138 +0,0 @@
```yaml
save_data: run
## Where the vocab(s) will be written
src_vocab: run/bpe.vocab.src
tgt_vocab: run/bpe.vocab.tgt
overwrite: True

# Corpus opts:
data:
    europarl:
        path_src: corpora/europarl/partitions/en_train.txt
        path_tgt: corpora/europarl/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 120
    opensub:
        path_tgt: corpora/opensub/partitions/en_train.txt
        path_src: corpora/opensub/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 152
    opus:
        path_tgt: corpora/opus/partitions/en_train.txt
        path_src: corpora/opus/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 160
    ted2020:
        path_tgt: corpora/ted2020/partitions/en_train.txt
        path_src: corpora/ted2020/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 10
    corgaback:
        path_tgt: corpora/corgaback/partitions/en_train.txt
        path_src: corpora/corgaback/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 15
    ccmatrix:
        path_tgt: corpora/ccmatrix/en_tok_dbo.txt
        path_src: corpora/ccmatrix/gl_tok_dbo.txt
        transforms: [bpe, filtertoolong]
        weight: 380
    wikimatrix:
        path_tgt: corpora/wikimatrix/en.txt
        path_src: corpora/wikimatrix/gl.txt
        transforms: [bpe, filtertoolong]
        weight: 70
    cluvi:
        path_tgt: corpora/cluvi/en.txt
        path_src: corpora/cluvi/gl.txt
        transforms: [bpe, filtertoolong]
        weight: 70
    valid:
        path_tgt: corpora/partitions/all-en_valid.txt
        path_src: corpora/partitions/all-gl_valid.txt
        transforms: [bpe, filtertoolong]

### Transform related opts:
#### Subword
tgt_subword_model: ./bpe/en.code
src_subword_model: ./bpe/gl.code
src_subword_vocab: ./run/bpe.vocab.src
tgt_subword_vocab: ./run/bpe.vocab.tgt
src_subword_type: bpe
tgt_subord_type: bpe

src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0
#### Filter
src_seq_length: 150
tgt_seq_length: 150

# silently ignore empty lines in the data
skip_empty_level: silent

##embeddings
tgt_embeddings: ../embeddings/en.emb.txt
src_embeddings: ../embeddings/gl.emb.txt

## supported types: GloVe, word2vec
embeddings_type: "word2vec"

# word_vec_size need to match with the pretrained embeddings dimensions
word_vec_size: 300

# General opts
save_model: run/model
keep_checkpoint: 50
save_checkpoint_steps: 10000
average_decay: 0.0005
seed: 1234
report_every: 1000
train_steps: 200000
valid_steps: 10000

# Batching
queue_size: 10000
bucket_size: 32768
world_size: 1
gpu_ranks: [0]
batch_type: "tokens"
batch_size: 8192
#batch_size: 4096
valid_batch_size: 64
batch_size_multiple: 1
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
rnn_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
share_decoder_embeddings: true
share_embeddings: false
```
NOS-MT-gl-en.pt
DELETED
@@ -1,3 +0,0 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:c624e72797fc8476941125ae8b490b83d277fe1772fd3b2b561708de3dce17be
size 1563576538
```
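This commit removes the OpenNMT-py checkpoint above and adds the CTranslate2 model directory ct2-gl-en_12L further down in the diff. The conversion step itself is not part of the diff; below is a minimal sketch of how such a checkpoint can be converted with the CTranslate2 Python API. The output directory name is taken from this repo, while the quantization and force options are illustrative assumptions.

```python
# Sketch only: convert an OpenNMT-py checkpoint into a CTranslate2 model directory.
from ctranslate2.converters import OpenNMTPyConverter

converter = OpenNMTPyConverter("NOS-MT-gl-en.pt")
converter.convert(
    "ct2-gl-en_12L",    # writes model.bin, config.json and the vocabulary files
    quantization=None,  # e.g. "int8" would shrink model.bin; not confirmed for this repo
    force=True,         # overwrite the output directory if it already exists
)
```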
README.md
CHANGED
@@ -3,63 +3,55 @@ license: mit
Old version (removed lines are prefixed with "-"):

language:
- gl
metrics:
- - bleu (Gold1):
- - bleu (Gold2):
- - bleu (Flores):
- - bleu (Test-suite):
- ---
- license: mit
---

- **English text [here](https://huggingface.co/proxectonos/

- **Descrición do Modelo**

- Modelo feito con OpenNMT para o par galego

- **Como traducir**

- +
- + Instalar [
- + Instalar [Open NMT toolkit v.2.2](https://github.com/OpenNMT/OpenNMT-py)
+ Traducir un input_text utilizando o modelo NOS-MT-gl-en co seguinte comando:
```bash
-
```
- + O resultado da tradución estará no PATH indicado no flag -output.

**Adestramento**

- No adestramento, utilizamos córpora auténticos e sintéticos do [ProxectoNós](https://github.com/proxectonos/corpora). Os primeiros son córpora de traducións feitas directamente por tradutores humanos. Os segundos son córpora de traducións

- **Procedemento de adestramento

+ Tokenización dos datasets feita co tokenizador (tokenizer.pl) de [linguakit](https://github.com/citiususc/Linguakit) que foi modificado para evitar o salto de liña por token do ficheiro orixinal.

+ O vocabulario BPE para os modelos foi xerado a través do script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py) da OpenNMT

- + Utilizando o .yaml deste repositorio pode replicar o proceso de adestramento. É preciso modificar os paths do ficheiro .yaml para a Open NMT saber onde ir buscar os textos. Após facer isto, pode do seguinte xeito comezar o proceso:
- ```bash
- onmt_build_vocab -config bpe-gl-en_emb.yaml -n_sample 100000
- onmt_train -config bpe-gl-en_emb.yaml
- ```

- **Hiperparámetros**
- Os parámetros usados para o desenvolvemento do modelo poden ser consultados directamente no mesmo ficheiro .yaml bpe-gl-en_emb.yaml

**Avaliación**

- A

| GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
| ------------- |:-------------:| -------:|----------:|
- |

**Licenzas do Modelo**

@@ -89,15 +81,12 @@ SOFTWARE.

Esta investigación foi financiada polo proxecto "Nós: o galego na sociedade e economía da intelixencia artificial", resultado dun acordo entre a Xunta de Galicia e a Universidade de Santiago de Compostela, o que resultou no subsidio ED431G2019/04 da Consellaría de Educación, Universidade e Formación Profesional da Galiza, e polo Fondo Europeo de Desenvolvemento Rexional (programa ERDF/FEDER), e Grupos de Referencia: ED431C 2020/21.

- **Citar este traballo**

Se utilizar este modelo no seu traballo, cite por favor así:

- Daniel Bardanca Outeirinho, Pablo Gamallo Otero, Iria de-Dios-Flores, and José Ramom Pichel Campos. 2024.
- Exploring the effects of vocabulary size in neural machine translation: Galician as a target language.
- In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 600–604,
Santiago de Compostela, Galiza. Association for Computational Lingustics.
New version (added lines are prefixed with "+"):

language:
- gl
metrics:
+ - bleu (Gold1): 41.7
+ - bleu (Gold2): 55.5
+ - bleu (Flores): 37
+ - bleu (Test-suite): 45.2
---

+ **English text [here](https://huggingface.co/proxectonos/NOS-MT-OpenNMT-gl-en/blob/main/README_English.md)**

+ **Descrición do Modelo**

+ Modelo feito con OpenNMT-py 3.2 para o par español-galego utilizando unha arquitectura transformer. O modelo foi transformado para o formato da ctranslate2.

+ **Como traducir con este Modelo**

+ + Instalar o [Python 3.9](https://www.python.org/downloads/release/python-390/)
+ + Instalar o [ctranslate 3.2](https://github.com/OpenNMT/CTranslate2)
+ Traducir un input_text utilizando o modelo NOS-MT-gl-en co seguinte comando:
+ ```bash
+ perl tokenizer.perl < input.txt > input.tok
+ ```
+ ```bash
+ subword_nmt.apply_bpe -c ./bpe/es.bpe < input.tok > input.bpe
+ ```
```bash
+ python3 translate.py ./ct2-gl-en_12L input.bpe > output.txt
+ ```
+ ```bash
+ sed -i 's/@@ //g' output.txt
```

**Adestramento**

+ No adestramento, utilizamos córpora auténticos e sintéticos do [ProxectoNós](https://github.com/proxectonos/corpora). Os primeiros son córpora de traducións feitas directamente por tradutores humanos. É importante salientar que a pesar destes textos seren feitos por humanos, non están libres de erros lingüísticos. Os segundos son córpora de traducións español-portugués, que convertemos en español-galego a través da tradución automática portugués-galego con Opentrad/Apertium e transliteración para palabras fóra de vocabulario.

+ **Procedemento de adestramento**

+ Tokenización dos datasets feita co tokenizador (tokenizer.pl) de [linguakit](https://github.com/citiususc/Linguakit) que foi modificado para evitar o salto de liña por token do ficheiro orixinal.

+ O vocabulario BPE para os modelos foi xerado a través do script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py) da OpenNMT

**Avaliación**

+ A avaliación BLEU dos modelos é feita cunha mistura de tests desenvolvidos internamente (gold1, gold2, test-suite) con outros datasets disponíbeis en galego (Flores).

| GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
| ------------- |:-------------:| -------:|----------:|
+ | 41.7 | 55.5 | 37 | 45.2 |

**Licenzas do Modelo**

Esta investigación foi financiada polo proxecto "Nós: o galego na sociedade e economía da intelixencia artificial", resultado dun acordo entre a Xunta de Galicia e a Universidade de Santiago de Compostela, o que resultou no subsidio ED431G2019/04 da Consellaría de Educación, Universidade e Formación Profesional da Galiza, e polo Fondo Europeo de Desenvolvemento Rexional (programa ERDF/FEDER), e Grupos de Referencia: ED431C 2020/21.

+ **Citar este traballo**

Se utilizar este modelo no seu traballo, cite por favor así:

+ Daniel Bardanca Outeirinho, Pablo Gamallo Otero, Iria de-Dios-Flores, and José Ramom Pichel Campos. 2024.
+ Exploring the effects of vocabulary size in neural machine translation: Galician as a target language.
+ In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 600–604,
Santiago de Compostela, Galiza. Association for Computational Lingustics.
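The translate.py script invoked in the instructions above is not included in this commit. The sketch below shows one way such a script could look with the ctranslate2 Python API: it takes the model directory and a BPE-encoded input file as arguments and writes the translation to stdout, mirroring the README usage. The beam size and CPU device are assumptions, not settings documented in this repo.

```python
# Sketch of a translate.py compatible with:
#   python3 translate.py ./ct2-gl-en_12L input.bpe > output.txt
# Reads whitespace-tokenized, BPE-encoded lines and prints the translated tokens,
# which still contain the "@@ " joiners (removed afterwards with sed, as in the README).
import sys

import ctranslate2

model_dir, input_path = sys.argv[1], sys.argv[2]
translator = ctranslate2.Translator(model_dir, device="cpu")

with open(input_path, encoding="utf-8") as f:
    for line in f:
        tokens = line.strip().split()
        if not tokens:
            print()
            continue
        results = translator.translate_batch([tokens], beam_size=5)  # beam size is illustrative
        print(" ".join(results[0].hypotheses[0]))
```

Translating several lines per translate_batch call would be faster; one line at a time keeps the sketch short.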
README_English.md
CHANGED
@@ -3,60 +3,54 @@ license: mit
Old version (removed lines are prefixed with "-"):

language:
- gl
metrics:
- - bleu (Gold1):
- - bleu (Gold2):
- - bleu (Flores):
- - bleu (Test-suite):
---

- License: MIT
- ---

- **Model

- Model

- **How to

- + Open bash terminal
+ Install [Python 3.9](https://www.python.org/downloads/release/python-390/)
- + Install [
+ Translate an input_text using the NOS-MT-gl-en model with the following command:
```bash
-
```
- + The resulting translation will be in the PATH indicated by the -output flag.

**Training**

- + Tokenisation was performed with a modified version of the [linguakit](https://github.com/citiususc/Linguakit) tokeniser (tokenizer.pl) that does not append a new line after each token.
- + All BPE models were generated with the script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py)
- + Using the .yaml in this repository, it is possible to replicate the original training process. Before training the model, please verify that the path to each target (tgt) and (src) file is correct. Once this is done, proceed as follows:

- onmt_build_vocab -config bpe-gl-en_emb.yaml -n_sample 100000
- onmt_train -config bpe-gl-en_emb.yaml
- ```
- **Hyperparameters**
- You may find the parameters used for this model inside the file bpe-gl-en_emb.yaml

- The BLEU evaluation of the models is a mixture of internally developed tests (gold1, gold2, test-suite) and other datasets available in Galician (Flores).

| GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
| ------------- |:-------------:| -------:|----------:|
- |

**Licensing information**

@@ -86,6 +80,11 @@ SOFTWARE.

This research was funded by the project "Nós: Galician in the society and economy of artificial intelligence", agreement between Xunta de Galicia and University of Santiago de Compostela, and grant ED431G2019/04 by the Galician Ministry of Education, University and Professional Training, and the European Regional Development Fund (ERDF/FEDER program), and Groups of Reference: ED431C 2020/21.

- **Citation Information**

- Gamallo, Pablo; Bardanca, Daniel; Pichel, José Ramom; García, Marcos; Rodríguez-Rey, Sandra; de-Dios-Flores, Iria. 2023. NOS-MT-OpenNMT-gl-en. Url: https://huggingface.co/proxectonos/NOS-MT-OpenNMT-gl-en
New version (added lines are prefixed with "+"):

language:
- gl
metrics:
+ - bleu (Gold1): 41.7
+ - bleu (Gold2): 55.5
+ - bleu (Flores): 37
+ - bleu (Test-suite): 45.2
---

+ **English text [here](https://huggingface.co/proxectonos/NOS-MT-OpenNMT-gl-en/blob/main/README_English.md)**

+ **Model Description**

+ Model created with OpenNMT-py 3.2 for the Spanish-Galician pair using a transformer architecture. The model was converted to the ctranslate2 format.

+ **How to Translate with this Model**

+ Install [Python 3.9](https://www.python.org/downloads/release/python-390/)
+ + Install [ctranslate 3.2](https://github.com/OpenNMT/CTranslate2)
+ Translate an input_text using the NOS-MT-gl-en model with the following command:
+ ```bash
+ perl tokenizer.perl < input.txt > input.tok
+ ```
+ ```bash
+ subword_nmt.apply_bpe -c ./bpe/es.bpe < input.tok > input.bpe
+ ```
```bash
+ python3 translate.py ./ct2-gl-en_12L input.bpe > output.txt
+ ```
+ ```bash
+ sed -i 's/@@ //g' output.txt
```

**Training**

+ We used authentic and synthetic corpora from the [ProxectoNós](https://github.com/proxectonos/corpora). The former are translation corpora made directly by human translators. It is important to note that despite these texts being made by humans, they are not free from linguistic errors. The latter are Spanish-Portuguese translation corpora, which we converted into Spanish-Galician through Portuguese-Galician automatic translation with Opentrad/Apertium and transliteration for out-of-vocabulary words.

+ **Training Procedure**

+ + Tokenization of the datasets was done with the tokenizer (tokenizer.pl) from [linguakit](https://github.com/citiususc/Linguakit), which was modified to avoid line breaks per token.

+ + The BPE vocabulary for the models was generated using the script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py) from OpenNMT.

+ **Evaluation**

+ The BLEU evaluation of the models is done with a mix of internally developed tests (gold1, gold2, test-suite) with other available datasets in Galician (Flores).

| GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
| ------------- |:-------------:| -------:|----------:|
+ | 35.5 | 44.5 | 32.4 | 41.4 |

**Licensing information**

This research was funded by the project "Nós: Galician in the society and economy of artificial intelligence", agreement between Xunta de Galicia and University of Santiago de Compostela, and grant ED431G2019/04 by the Galician Ministry of Education, University and Professional Training, and the European Regional Development Fund (ERDF/FEDER program), and Groups of Reference: ED431C 2020/21.

+ **Citation Information**

+ Daniel Bardanca Outeirinho, Pablo Gamallo Otero, Iria de-Dios-Flores, and José Ramom Pichel Campos. 2024.
+ Exploring the effects of vocabulary size in neural machine translation: Galician as a target language.
+ In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 600–604,
+ Santiago de Compostela, Galiza. Association for Computational Lingustics.
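The BPE step and the later removal of the "@@ " joiners can also be done from Python instead of the subword_nmt and sed commands listed above. The sketch below uses the subword_nmt package; the codes path ./bpe/es.bpe is copied from the README commands, and the sample sentence is only an illustration.

```python
# Sketch: apply the repo's BPE codes to a tokenized sentence, then undo the segmentation.
from subword_nmt.apply_bpe import BPE

with open("./bpe/es.bpe", encoding="utf-8") as codes:
    bpe = BPE(codes)

tokenized = "isto é un exemplo tokenizado"  # output of tokenizer.perl (illustrative)
segmented = bpe.process_line(tokenized)     # subword units joined with "@@ "
restored = segmented.replace("@@ ", "")     # same effect as: sed 's/@@ //g'
print(segmented)
print(restored)
```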
ct2-gl-en_12L/config.json
ADDED
@@ -0,0 +1,9 @@
```json
{
    "add_source_bos": false,
    "add_source_eos": false,
    "bos_token": "<s>",
    "decoder_start_token": "<s>",
    "eos_token": "</s>",
    "layer_norm_epsilon": null,
    "unk_token": "<unk>"
}
```
embeddings/en.emb.txt → ct2-gl-en_12L/model.bin
RENAMED
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
- oid sha256:
- size
+ oid sha256:ffb9614bc9c5f33c1fa56db6e1f5e24b384a191f50979cc5a57510cc97f20e69
+ size 497179365
ct2-gl-en_12L/source_vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff
ct2-gl-en_12L/target_vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff
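Both vocabulary files are too large for the diff view, but in the standard CTranslate2 layout they are plain JSON arrays of subword tokens loaded together with model.bin and config.json. A quick way to inspect them, assuming that layout:

```python
# Sketch: peek at the converted model's source and target vocabularies.
import json

with open("ct2-gl-en_12L/source_vocabulary.json", encoding="utf-8") as f:
    src_vocab = json.load(f)
with open("ct2-gl-en_12L/target_vocabulary.json", encoding="utf-8") as f:
    tgt_vocab = json.load(f)

# Special tokens such as <unk>, <s> and </s> (see config.json) usually appear near the start.
print(len(src_vocab), src_vocab[:5])
print(len(tgt_vocab), tgt_vocab[:5])
```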
embeddings/gl.emb.txt
DELETED
@@ -1,3 +0,0 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:3590ee6395bd0de984feee52b9e5015bc8f405d0f267c5e6cfdb7e38c466da57
size 382372326
```