imdbo committed
Commit 632f87a
1 Parent(s): 0cbe0d1

Upload bpe-en-gl_emb.yaml

Files changed (1):
  1. bpe-en-gl_emb.yaml +158 -0
bpe-en-gl_emb.yaml ADDED
@@ -0,0 +1,158 @@
save_data: run
## Where the vocab(s) will be written
src_vocab: run/bpe.vocab.src
tgt_vocab: run/bpe.vocab.tgt
overwrite: True
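
# These vocab files are produced by the vocabulary-building step rather than
# supplied by hand, so with a standard OpenNMT-py 2.x install a typical first
# run (assuming this file is saved as bpe-en-gl_emb.yaml) is:
#   onmt_build_vocab -config bpe-en-gl_emb.yaml -n_sample -1
# where -n_sample -1 counts tokens over the full corpora instead of a sample.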
# Corpus opts:
data:
    europarl:
        path_src: ../DGTcorpora_tokenized/en_gl/europarl/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/europarl/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 120
    opensub:
        path_src: ../DGTcorpora_tokenized/en_gl/opensub/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/opensub/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 152
    opus:
        path_src: ../DGTcorpora_tokenized/en_gl/opus/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/opus/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 160
    ted2020:
        path_src: ../DGTcorpora_tokenized/en_gl/ted2020/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/ted2020/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 10
    corgaback:
        path_src: ../DGTcorpora_tokenized/en_gl/corgaback/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/corgaback/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 15
    ccmatrix:
        path_src: ../DGTcorpora_tokenized/en_gl/ccmatrix/en_tok_dbo.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/ccmatrix/gl_tok_dbo.txt
        transforms: [bpe, filtertoolong]
        weight: 380 ##75 ## 25000000/13000000 = 2; 760/2 = 380 * 5 = 1900 (380/5=75)
    wikimatrix:
        path_src: ../DGTcorpora_tokenized/en_gl/wikimatrix/en.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/wikimatrix/gl.txt
        transforms: [bpe, filtertoolong]
        weight: 70 #25000000/450000 = 55 ; 760/55 = 14 ; 14 * 5 = 70
    cluvi:
        path_src: ../DGTcorpora_tokenized/en_gl/cluvi/en.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/cluvi/gl.txt
        transforms: [bpe, filtertoolong]
        weight: 70 #25000000/295000 = 84 ; 760/84 = 9 ; 9 * 10 = 90
    #wikimedia:
    #    path_src: ../DGTcorpora_tokenized/en_gl/wikimedia/en.txt
    #    path_tgt: ../DGTcorpora_tokenized/en_gl/wikimedia/gl.txt
    #    transforms: [bpe, filtertoolong]
    #    weight: 4
    #xlent:
    #    path_src: ../DGTcorpora_tokenized/en_gl/xlent/en.txt
    #    path_tgt: ../DGTcorpora_tokenized/en_gl/xlent/gl.txt
    #    transforms: [bpe, filtertoolong]
    #    weight: 50 #25000000/1600000=15; 760/15=50
    #linux:
    #    path_src: ../DGTcorpora_tokenized/en_gl/linux/en.txt
    #    path_tgt: ../DGTcorpora_tokenized/en_gl/linux/gl.txt
    #    transforms: [bpe, filtertoolong]
    #    weight: 20 #25000000/150000=166; 760/166=5 * 5 = 20
    valid:
        path_src: ../DGTcorpora_tokenized/en_gl/partitions/all-en_valid.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/partitions/all-gl_valid.txt
        transforms: [bpe, filtertoolong]
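
# How the corpus weights behave: during training, examples are drawn from each
# corpus in proportion to its weight, so e.g. opus (160) is sampled 16x as
# often as ted2020 (10). The inline notes above appear to derive each weight
# roughly as 760 / (25000000 / corpus_size), sometimes rescaled; e.g. for
# ccmatrix: 25000000 / 13000000 ~= 2, and 760 / 2 = 380.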

### Transform related opts:
#### Subword
src_subword_model: ./bpe/en.code
tgt_subword_model: ./bpe/gl.code
src_subword_vocab: ./run/bpe.vocab.src
tgt_subword_vocab: ./run/bpe.vocab.tgt
#src_subword_model: ../sentencepiece/en-gl/en.sp.model
#tgt_subword_model: ../sentencepiece/en-gl/gl.sp.model
src_subword_type: bpe
tgt_subword_type: bpe

src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0
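
# The bpe transform applies these merge codes on the fly during training. To
# sanity-check the segmentation outside OpenNMT-py (assuming the .code files
# were learned with subword-nmt, which produces this format; sample.en is a
# placeholder input file):
#   subword-nmt apply-bpe -c ./bpe/en.code < sample.en | head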
#### Filter
src_seq_length: 150
tgt_seq_length: 150
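# filtertoolong drops any pair whose source or target side exceeds these
# lengths, counted after the transforms listed before it; since bpe comes
# first in each corpus's transforms, the 150-token limit applies to subword
# units rather than whitespace tokens.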

# silently ignore empty lines in the data
skip_empty_level: silent

## embeddings
src_embeddings: ../embeddings/en.emb.txt
tgt_embeddings: ../embeddings/gl.emb.txt

## supported types: GloVe, word2vec
embeddings_type: "word2vec"

# word_vec_size needs to match the pretrained embeddings' dimensions
word_vec_size: 300
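
# Quick dimensionality check (assuming the .emb.txt files are in word2vec
# text format, whose header line is "<vocab_size> <dim>"):
#   head -n 1 ../embeddings/en.emb.txt   # second field should be 300
# Caveat: word_vec_size appears again as 512 under # Model below; with a
# plain YAML load the later duplicate key typically wins, which would
# override the 300 set here.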

# General opts
save_model: run/model
keep_checkpoint: 50
save_checkpoint_steps: 10000
average_decay: 0.0005
seed: 1234
report_every: 1000
train_steps: 200000
valid_steps: 10000

# Batching
queue_size: 10000
bucket_size: 32768
world_size: 1
gpu_ranks: [0]
batch_type: "tokens"
batch_size: 8192
#batch_size: 4096
valid_batch_size: 64
batch_size_multiple: 1
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]
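
# Effective tokens per optimizer update under these settings:
#   batch_size * accum_count * world_size = 8192 * 4 * 1 = 32768,
# since batch_type "tokens" sizes batches in tokens rather than sentences and
# gradients are accumulated over 4 batches before each parameter update.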

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"
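
# With decay_method "noam", the actual rate follows the Transformer schedule:
#   lr = learning_rate * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# so with learning_rate 2, d_model 512 (rnn_size below) and warmup_steps 8000,
# the peak rate, reached at step 8000, is about 2 * 512^-0.5 * 8000^-0.5 ~= 1e-3.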

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
rnn_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
share_decoder_embeddings: true
share_embeddings: false
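
# Once the vocab files exist, training would typically be launched with:
#   onmt_train -config bpe-en-gl_emb.yaml
# (OpenNMT-py 2.x CLI; runs on the single GPU given by world_size/gpu_ranks).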