update to ct2
- bpe-gl-en_emb.yaml +0 -138
- NOS-MT-gl-en.pt +0 -3
- README.md +28 -39
- README_English.md +33 -34
- ct2-gl-en_12L/config.json +9 -0
- embeddings/en.emb.txt → ct2-gl-en_12L/model.bin +2 -2
- ct2-gl-en_12L/source_vocabulary.json +0 -0
- ct2-gl-en_12L/target_vocabulary.json +0 -0
- embeddings/gl.emb.txt +0 -3
bpe-gl-en_emb.yaml
DELETED
@@ -1,138 +0,0 @@
```yaml
save_data: run
## Where the vocab(s) will be written
src_vocab: run/bpe.vocab.src
tgt_vocab: run/bpe.vocab.tgt
overwrite: True

# Corpus opts:
data:
    europarl:
        path_src: corpora/europarl/partitions/en_train.txt
        path_tgt: corpora/europarl/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 120
    opensub:
        path_tgt: corpora/opensub/partitions/en_train.txt
        path_src: corpora/opensub/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 152
    opus:
        path_tgt: corpora/opus/partitions/en_train.txt
        path_src: corpora/opus/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 160
    ted2020:
        path_tgt: corpora/ted2020/partitions/en_train.txt
        path_src: corpora/ted2020/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 10
    corgaback:
        path_tgt: corpora/corgaback/partitions/en_train.txt
        path_src: corpora/corgaback/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 15
    ccmatrix:
        path_tgt: corpora/ccmatrix/en_tok_dbo.txt
        path_src: corpora/ccmatrix/gl_tok_dbo.txt
        transforms: [bpe, filtertoolong]
        weight: 380
    wikimatrix:
        path_tgt: corpora/wikimatrix/en.txt
        path_src: corpora/wikimatrix/gl.txt
        transforms: [bpe, filtertoolong]
        weight: 70
    cluvi:
        path_tgt: corpora/cluvi/en.txt
        path_src: corpora/cluvi/gl.txt
        transforms: [bpe, filtertoolong]
        weight: 70
    valid:
        path_tgt: corpora/partitions/all-en_valid.txt
        path_src: corpora/partitions/all-gl_valid.txt
        transforms: [bpe, filtertoolong]

### Transform related opts:
#### Subword
tgt_subword_model: ./bpe/en.code
src_subword_model: ./bpe/gl.code
src_subword_vocab: ./run/bpe.vocab.src
tgt_subword_vocab: ./run/bpe.vocab.tgt
src_subword_type: bpe
tgt_subord_type: bpe

src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0
#### Filter
src_seq_length: 150
tgt_seq_length: 150

# silently ignore empty lines in the data
skip_empty_level: silent

##embeddings
tgt_embeddings: ../embeddings/en.emb.txt
src_embeddings: ../embeddings/gl.emb.txt

## supported types: GloVe, word2vec
embeddings_type: "word2vec"

# word_vec_size need to match with the pretrained embeddings dimensions
word_vec_size: 300

# General opts
save_model: run/model
keep_checkpoint: 50
save_checkpoint_steps: 10000
average_decay: 0.0005
seed: 1234
report_every: 1000
train_steps: 200000
valid_steps: 10000

# Batching
queue_size: 10000
bucket_size: 32768
world_size: 1
gpu_ranks: [0]
batch_type: "tokens"
batch_size: 8192
#batch_size: 4096
valid_batch_size: 64
batch_size_multiple: 1
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
rnn_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
share_decoder_embeddings: true
share_embeddings: false
```
NOS-MT-gl-en.pt
DELETED
@@ -1,3 +0,0 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:c624e72797fc8476941125ae8b490b83d277fe1772fd3b2b561708de3dce17be
size 1563576538
```
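This commit removes the OpenNMT-py checkpoint above and adds the CTranslate2 model directory ct2-gl-en_12L further down in the diff. The conversion step itself is not part of the diff; below is a minimal sketch of how such a checkpoint can be converted with the CTranslate2 Python API. The output directory name is taken from this repo, while the quantization and force options are illustrative assumptions.

```python
# Sketch only: convert an OpenNMT-py checkpoint into a CTranslate2 model directory.
from ctranslate2.converters import OpenNMTPyConverter

converter = OpenNMTPyConverter("NOS-MT-gl-en.pt")
converter.convert(
    "ct2-gl-en_12L",    # writes model.bin, config.json and the vocabulary files
    quantization=None,  # e.g. "int8" would shrink model.bin; not confirmed for this repo
    force=True,         # overwrite the output directory if it already exists
)
```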
README.md
CHANGED
@@ -3,63 +3,55 @@ license: mit
Old version (removed lines are prefixed with "-"):

language:
- gl
metrics:
- - bleu (Gold1):
- - bleu (Gold2):
- - bleu (Flores):
- - bleu (Test-suite):
- ---
- license: mit
---

- **English text [here](https://huggingface.co/proxectonos/

- **Descrición do Modelo**

- Modelo feito con OpenNMT para o par galego

- **Como traducir**

- +
- + Instalar [
- + Instalar [Open NMT toolkit v.2.2](https://github.com/OpenNMT/OpenNMT-py)
+ Traducir un input_text utilizando o modelo NOS-MT-gl-en co seguinte comando:
```bash
-
```
- + O resultado da tradución estará no PATH indicado no flag -output.

**Adestramento**

- No adestramento, utilizamos córpora auténticos e sintéticos do [ProxectoNós](https://github.com/proxectonos/corpora). Os primeiros son córpora de traducións feitas directamente por tradutores humanos. Os segundos son córpora de traducións

- **Procedemento de adestramento

+ Tokenización dos datasets feita co tokenizador (tokenizer.pl) de [linguakit](https://github.com/citiususc/Linguakit) que foi modificado para evitar o salto de liña por token do ficheiro orixinal.

+ O vocabulario BPE para os modelos foi xerado a través do script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py) da OpenNMT

- + Utilizando o .yaml deste repositorio pode replicar o proceso de adestramento. É preciso modificar os paths do ficheiro .yaml para a Open NMT saber onde ir buscar os textos. Após facer isto, pode do seguinte xeito comezar o proceso:
- ```bash
- onmt_build_vocab -config bpe-gl-en_emb.yaml -n_sample 100000
- onmt_train -config bpe-gl-en_emb.yaml
- ```

- **Hiperparámetros**
- Os parámetros usados para o desenvolvemento do modelo poden ser consultados directamente no mesmo ficheiro .yaml bpe-gl-en_emb.yaml

**Avaliación**

- A

| GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
| ------------- |:-------------:| -------:|----------:|
- |

**Licenzas do Modelo**

@@ -89,15 +81,12 @@ SOFTWARE.

Esta investigación foi financiada polo proxecto "Nós: o galego na sociedade e economía da intelixencia artificial", resultado dun acordo entre a Xunta de Galicia e a Universidade de Santiago de Compostela, o que resultou no subsidio ED431G2019/04 da Consellaría de Educación, Universidade e Formación Profesional da Galiza, e polo Fondo Europeo de Desenvolvemento Rexional (programa ERDF/FEDER), e Grupos de Referencia: ED431C 2020/21.

- **Citar este traballo**

Se utilizar este modelo no seu traballo, cite por favor así:

- Daniel Bardanca Outeirinho, Pablo Gamallo Otero, Iria de-Dios-Flores, and José Ramom Pichel Campos. 2024.
- Exploring the effects of vocabulary size in neural machine translation: Galician as a target language.
- In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 600–604,
Santiago de Compostela, Galiza. Association for Computational Lingustics.
New version (added lines are prefixed with "+"):

language:
- gl
metrics:
+ - bleu (Gold1): 41.7
+ - bleu (Gold2): 55.5
+ - bleu (Flores): 37
+ - bleu (Test-suite): 45.2
---

+ **English text [here](https://huggingface.co/proxectonos/NOS-MT-OpenNMT-gl-en/blob/main/README_English.md)**

+ **Descrición do Modelo**

+ Modelo feito con OpenNMT-py 3.2 para o par español-galego utilizando unha arquitectura transformer. O modelo foi transformado para o formato da ctranslate2.

+ **Como traducir con este Modelo**

+ + Instalar o [Python 3.9](https://www.python.org/downloads/release/python-390/)
+ + Instalar o [ctranslate 3.2](https://github.com/OpenNMT/CTranslate2)
+ Traducir un input_text utilizando o modelo NOS-MT-gl-en co seguinte comando:
+ ```bash
+ perl tokenizer.perl < input.txt > input.tok
+ ```
+ ```bash
+ subword_nmt.apply_bpe -c ./bpe/es.bpe < input.tok > input.bpe
+ ```
```bash
+ python3 translate.py ./ct2-gl-en_12L input.bpe > output.txt
+ ```
+ ```bash
+ sed -i 's/@@ //g' output.txt
```

**Adestramento**

+ No adestramento, utilizamos córpora auténticos e sintéticos do [ProxectoNós](https://github.com/proxectonos/corpora). Os primeiros son córpora de traducións feitas directamente por tradutores humanos. É importante salientar que a pesar destes textos seren feitos por humanos, non están libres de erros lingüísticos. Os segundos son córpora de traducións español-portugués, que convertemos en español-galego a través da tradución automática portugués-galego con Opentrad/Apertium e transliteración para palabras fóra de vocabulario.

+ **Procedemento de adestramento**

+ Tokenización dos datasets feita co tokenizador (tokenizer.pl) de [linguakit](https://github.com/citiususc/Linguakit) que foi modificado para evitar o salto de liña por token do ficheiro orixinal.

+ O vocabulario BPE para os modelos foi xerado a través do script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py) da OpenNMT

**Avaliación**

+ A avaliación BLEU dos modelos é feita cunha mistura de tests desenvolvidos internamente (gold1, gold2, test-suite) con outros datasets disponíbeis en galego (Flores).

| GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
| ------------- |:-------------:| -------:|----------:|
+ | 41.7 | 55.5 | 37 | 45.2 |

**Licenzas do Modelo**

Esta investigación foi financiada polo proxecto "Nós: o galego na sociedade e economía da intelixencia artificial", resultado dun acordo entre a Xunta de Galicia e a Universidade de Santiago de Compostela, o que resultou no subsidio ED431G2019/04 da Consellaría de Educación, Universidade e Formación Profesional da Galiza, e polo Fondo Europeo de Desenvolvemento Rexional (programa ERDF/FEDER), e Grupos de Referencia: ED431C 2020/21.

+ **Citar este traballo**

Se utilizar este modelo no seu traballo, cite por favor así:

+ Daniel Bardanca Outeirinho, Pablo Gamallo Otero, Iria de-Dios-Flores, and José Ramom Pichel Campos. 2024.
+ Exploring the effects of vocabulary size in neural machine translation: Galician as a target language.
+ In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 600–604,
Santiago de Compostela, Galiza. Association for Computational Lingustics.
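The translate.py script invoked in the instructions above is not included in this commit. The sketch below shows one way such a script could look with the ctranslate2 Python API: it takes the model directory and a BPE-encoded input file as arguments and writes the translation to stdout, mirroring the README usage. The beam size and CPU device are assumptions, not settings documented in this repo.

```python
# Sketch of a translate.py compatible with:
#   python3 translate.py ./ct2-gl-en_12L input.bpe > output.txt
# Reads whitespace-tokenized, BPE-encoded lines and prints the translated tokens,
# which still contain the "@@ " joiners (removed afterwards with sed, as in the README).
import sys

import ctranslate2

model_dir, input_path = sys.argv[1], sys.argv[2]
translator = ctranslate2.Translator(model_dir, device="cpu")

with open(input_path, encoding="utf-8") as f:
    for line in f:
        tokens = line.strip().split()
        if not tokens:
            print()
            continue
        results = translator.translate_batch([tokens], beam_size=5)  # beam size is illustrative
        print(" ".join(results[0].hypotheses[0]))
```

Translating several lines per translate_batch call would be faster; one line at a time keeps the sketch short.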
README_English.md
CHANGED
@@ -3,60 +3,54 @@ license: mit
Old version (removed lines are prefixed with "-"):

language:
- gl
metrics:
- - bleu (Gold1):
- - bleu (Gold2):
- - bleu (Flores):
- - bleu (Test-suite):
---

- License: MIT
- ---

- **Model

- Model

- **How to

- + Open bash terminal
+ Install [Python 3.9](https://www.python.org/downloads/release/python-390/)
- + Install [
+ Translate an input_text using the NOS-MT-gl-en model with the following command:
```bash
-
```
- + The resulting translation will be in the PATH indicated by the -output flag.

**Training**

- + Tokenisation was performed with a modified version of the [linguakit](https://github.com/citiususc/Linguakit) tokeniser (tokenizer.pl) that does not append a new line after each token.
- + All BPE models were generated with the script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py)
- + Using the .yaml in this repository, it is possible to replicate the original training process. Before training the model, please verify that the path to each target (tgt) and (src) file is correct. Once this is done, proceed as follows:

- onmt_build_vocab -config bpe-gl-en_emb.yaml -n_sample 100000
- onmt_train -config bpe-gl-en_emb.yaml
- ```
- **Hyperparameters**
- You may find the parameters used for this model inside the file bpe-gl-en_emb.yaml

- The BLEU evaluation of the models is a mixture of internally developed tests (gold1, gold2, test-suite) and other datasets available in Galician (Flores).

| GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
| ------------- |:-------------:| -------:|----------:|
- |

**Licensing information**

@@ -86,6 +80,11 @@ SOFTWARE.

This research was funded by the project "Nós: Galician in the society and economy of artificial intelligence", agreement between Xunta de Galicia and University of Santiago de Compostela, and grant ED431G2019/04 by the Galician Ministry of Education, University and Professional Training, and the European Regional Development Fund (ERDF/FEDER program), and Groups of Reference: ED431C 2020/21.

- **Citation Information**

- Gamallo, Pablo; Bardanca, Daniel; Pichel, José Ramom; García, Marcos; Rodríguez-Rey, Sandra; de-Dios-Flores, Iria. 2023. NOS-MT-OpenNMT-gl-en. Url: https://huggingface.co/proxectonos/NOS-MT-OpenNMT-gl-en
New version (added lines are prefixed with "+"):

language:
- gl
metrics:
+ - bleu (Gold1): 41.7
+ - bleu (Gold2): 55.5
+ - bleu (Flores): 37
+ - bleu (Test-suite): 45.2
---

+ **English text [here](https://huggingface.co/proxectonos/NOS-MT-OpenNMT-gl-en/blob/main/README_English.md)**

+ **Model Description**

+ Model created with OpenNMT-py 3.2 for the Spanish-Galician pair using a transformer architecture. The model was converted to the ctranslate2 format.

+ **How to Translate with this Model**

+ Install [Python 3.9](https://www.python.org/downloads/release/python-390/)
+ + Install [ctranslate 3.2](https://github.com/OpenNMT/CTranslate2)
+ Translate an input_text using the NOS-MT-gl-en model with the following command:
+ ```bash
+ perl tokenizer.perl < input.txt > input.tok
+ ```
+ ```bash
+ subword_nmt.apply_bpe -c ./bpe/es.bpe < input.tok > input.bpe
+ ```
```bash
+ python3 translate.py ./ct2-gl-en_12L input.bpe > output.txt
+ ```
+ ```bash
+ sed -i 's/@@ //g' output.txt
```

**Training**

+ We used authentic and synthetic corpora from the [ProxectoNós](https://github.com/proxectonos/corpora). The former are translation corpora made directly by human translators. It is important to note that despite these texts being made by humans, they are not free from linguistic errors. The latter are Spanish-Portuguese translation corpora, which we converted into Spanish-Galician through Portuguese-Galician automatic translation with Opentrad/Apertium and transliteration for out-of-vocabulary words.

+ **Training Procedure**

+ + Tokenization of the datasets was done with the tokenizer (tokenizer.pl) from [linguakit](https://github.com/citiususc/Linguakit), which was modified to avoid line breaks per token.

+ + The BPE vocabulary for the models was generated using the script [learn_bpe.py](https://github.com/OpenNMT/OpenNMT-py/blob/master/tools/learn_bpe.py) from OpenNMT.

+ **Evaluation**

+ The BLEU evaluation of the models is done with a mix of internally developed tests (gold1, gold2, test-suite) with other available datasets in Galician (Flores).

| GOLD 1 | GOLD 2 | FLORES | TEST-SUITE|
| ------------- |:-------------:| -------:|----------:|
+ | 35.5 | 44.5 | 32.4 | 41.4 |

**Licensing information**

This research was funded by the project "Nós: Galician in the society and economy of artificial intelligence", agreement between Xunta de Galicia and University of Santiago de Compostela, and grant ED431G2019/04 by the Galician Ministry of Education, University and Professional Training, and the European Regional Development Fund (ERDF/FEDER program), and Groups of Reference: ED431C 2020/21.

+ **Citation Information**

+ Daniel Bardanca Outeirinho, Pablo Gamallo Otero, Iria de-Dios-Flores, and José Ramom Pichel Campos. 2024.
+ Exploring the effects of vocabulary size in neural machine translation: Galician as a target language.
+ In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 600–604,
+ Santiago de Compostela, Galiza. Association for Computational Lingustics.
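The BPE step and the later removal of the "@@ " joiners can also be done from Python instead of the subword_nmt and sed commands listed above. The sketch below uses the subword_nmt package; the codes path ./bpe/es.bpe is copied from the README commands, and the sample sentence is only an illustration.

```python
# Sketch: apply the repo's BPE codes to a tokenized sentence, then undo the segmentation.
from subword_nmt.apply_bpe import BPE

with open("./bpe/es.bpe", encoding="utf-8") as codes:
    bpe = BPE(codes)

tokenized = "isto é un exemplo tokenizado"  # output of tokenizer.perl (illustrative)
segmented = bpe.process_line(tokenized)     # subword units joined with "@@ "
restored = segmented.replace("@@ ", "")     # same effect as: sed 's/@@ //g'
print(segmented)
print(restored)
```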
ct2-gl-en_12L/config.json
ADDED
@@ -0,0 +1,9 @@
```json
{
    "add_source_bos": false,
    "add_source_eos": false,
    "bos_token": "<s>",
    "decoder_start_token": "<s>",
    "eos_token": "</s>",
    "layer_norm_epsilon": null,
    "unk_token": "<unk>"
}
```
embeddings/en.emb.txt → ct2-gl-en_12L/model.bin
RENAMED
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
- oid sha256:
- size
+ oid sha256:ffb9614bc9c5f33c1fa56db6e1f5e24b384a191f50979cc5a57510cc97f20e69
+ size 497179365
ct2-gl-en_12L/source_vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff
ct2-gl-en_12L/target_vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff
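Both vocabulary files are too large for the diff view, but in the standard CTranslate2 layout they are plain JSON arrays of subword tokens loaded together with model.bin and config.json. A quick way to inspect them, assuming that layout:

```python
# Sketch: peek at the converted model's source and target vocabularies.
import json

with open("ct2-gl-en_12L/source_vocabulary.json", encoding="utf-8") as f:
    src_vocab = json.load(f)
with open("ct2-gl-en_12L/target_vocabulary.json", encoding="utf-8") as f:
    tgt_vocab = json.load(f)

# Special tokens such as <unk>, <s> and </s> (see config.json) usually appear near the start.
print(len(src_vocab), src_vocab[:5])
print(len(tgt_vocab), tgt_vocab[:5])
```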
embeddings/gl.emb.txt
DELETED
@@ -1,3 +0,0 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:3590ee6395bd0de984feee52b9e5015bc8f405d0f267c5e6cfdb7e38c466da57
size 382372326
```