Ubuntu committed on
Commit
e9ab399
1 Parent(s): 9007b04
.ipynb_checkpoints/convert-from-malaya-checkpoint.ipynb ADDED
README.md ADDED
@@ -0,0 +1,46 @@
+ ---
+ language: ms
+ ---
+ 
+ # xlnet-large-bahasa-cased
+ 
+ Pretrained XLNet large language model for Malay.
+ 
+ ## Pretraining Corpus
+ 
+ The `xlnet-large-bahasa-cased` model was pretrained on ~1.4 billion words. Below is the list of data we trained on:
+ 
+ 1. [cleaned local texts](https://github.com/huseinzol05/malay-dataset/tree/master/dumping/clean).
+ 2. [translated The Pile](https://github.com/huseinzol05/malay-dataset/tree/master/corpus/pile).
+ 
+ ## Pretraining details
+ 
+ - All steps can be reproduced from [Malaya/pretrained-model/xlnet](https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/xlnet).
+ 
+ ## Load Pretrained Model
+ 
+ You can use this model by installing `torch` or `tensorflow` and the Hugging Face `transformers` library, then initializing it like this:
+ 
+ ```python
+ from transformers import XLNetModel, XLNetTokenizer
+ 
+ model = XLNetModel.from_pretrained('malay-huggingface/xlnet-large-bahasa-cased')
+ tokenizer = XLNetTokenizer.from_pretrained(
+     'malay-huggingface/xlnet-large-bahasa-cased',
+     do_lower_case = False,
+ )
+ ```
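+ 
+ Once loaded, here is a minimal sketch of extracting contextual embeddings with the objects above (the Malay example sentence is illustrative):
+ 
+ ```python
+ import torch
+ 
+ # 'Saya suka makan ayam' ~ 'I like to eat chicken'
+ inputs = tokenizer('Saya suka makan ayam', return_tensors = 'pt')
+ with torch.no_grad():
+     outputs = model(**inputs)
+ 
+ # (batch_size, sequence_length, d_model) == (1, seq_len, 1024)
+ print(outputs.last_hidden_state.shape)
+ ```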
config.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "_name_or_path": "./",
+ "architectures": [
+ "XLNetModel"
+ ],
+ "attn_type": "bi",
+ "bi_data": false,
+ "bos_token_id": 1,
+ "clamp_len": -1,
+ "d_head": 64,
+ "d_inner": 4096,
+ "d_model": 1024,
+ "dropout": 0.1,
+ "end_n_top": 5,
+ "eos_token_id": 2,
+ "ff_activation": "gelu",
+ "initializer_range": 0.02,
+ "layer_norm_eps": 1e-12,
+ "mem_len": 512,
+ "model_type": "xlnet",
+ "n_head": 16,
+ "n_layer": 20,
+ "pad_token_id": 5,
+ "reuse_len": null,
+ "same_length": false,
+ "start_n_top": 5,
+ "summary_activation": "tanh",
+ "summary_last_dropout": 0.1,
+ "summary_type": "last",
+ "summary_use_proj": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.10.0",
+ "untie_r": true,
+ "use_mems_eval": true,
+ "use_mems_train": false,
+ "vocab_size": 32000
+ }
convert-from-malaya.ipynb ADDED
@@ -0,0 +1,184 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "58d45708",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import XLNetTokenizer, XLNetModel, XLNetConfig, AutoTokenizer, AutoModelWithLMHead, pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "e0314358",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "model.ckpt-320000.data-00000-of-00001 model.ckpt-320000.meta\r\n",
+ "model.ckpt-320000.index\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "# !tar -zxf xlnet-large-2021-09-06.tar.gz\n",
+ "# !rm xlnet-large-2021-09-06.tar.gz\n",
+ "!ls xlnet-large"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "59d2c8b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !wget https://raw.githubusercontent.com/huseinzol05/malaya/master/pretrained-model/xlnet/tokenizer/sp10m.cased.v9.vocab\n",
+ "# !wget https://raw.githubusercontent.com/huseinzol05/malaya/master/pretrained-model/xlnet/tokenizer/sp10m.cased.v9.model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "f35e09f4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "('./tokenizer_config.json',\n",
+ " './special_tokens_map.json',\n",
+ " './spiece.model',\n",
+ " './added_tokens.json')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokenizer = XLNetTokenizer('sp10m.cased.v9.model', do_lower_case = False)\n",
+ "tokenizer.save_pretrained('./')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "4438ff5c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "config = {\n",
+ " \"d_head\": 64,\n",
+ " \"d_inner\": 4096,\n",
+ " \"d_model\": 1024,\n",
+ " \"ff_activation\": \"gelu\",\n",
+ " \"n_head\": 16,\n",
+ " \"n_layer\": 20,\n",
+ " \"n_token\": 32000,\n",
+ " \"untie_r\": True\n",
+ "}\n",
+ "\n",
+ "with open('config.json', 'w') as fopen:\n",
+ " json.dump(config, fopen)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "a265f23c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !transformers-cli convert --model_type xlnet \\\n",
+ "# --tf_checkpoint xlnet-large/model.ckpt-320000 \\\n",
+ "# --config config.json \\\n",
+ "# --pytorch_dump_output ./"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "22b94055",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "config = XLNetConfig.from_json_file('./config.json')\n",
+ "config.vocab_size = 32000\n",
+ "config.d_inner = 4096\n",
+ "config.d_model = 1024\n",
+ "config.n_head = 16\n",
+ "config.n_layer = 20"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "17c6d447",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of the model checkpoint at ./ were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']\n",
+ "- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = XLNetModel.from_pretrained('./', config = config)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "d0fc0138",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer = XLNetTokenizer.from_pretrained('./', do_lower_case = False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "ec2c0661",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save_pretrained('./')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
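
As a quick sanity check on the conversion above, here is a minimal sketch (assuming it runs in the notebook's working directory, where the dump was saved; the Malay sentence is illustrative):

```python
import torch
from transformers import XLNetModel, XLNetTokenizer

# reload the artifacts written by save_pretrained('./') in the notebook
model = XLNetModel.from_pretrained('./')
tokenizer = XLNetTokenizer.from_pretrained('./', do_lower_case = False)

# parameter count should be in the hundreds of millions for an xlnet-large config
print(sum(p.numel() for p in model.parameters()))

# a single forward pass; the hidden width should match d_model = 1024 from config.json
inputs = tokenizer('Saya suka makan ayam', return_tensors = 'pt')
with torch.no_grad():
    assert model(**inputs).last_hidden_state.shape[-1] == 1024
```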
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1b26afddfeb5ebe45d8a6ebbb0acd8e39367cf1f3e32d97e4419552bdd40c30
+ size 1222853333
sp10m.cased.v9.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0caa407e56cc60c5a74aad6f90d3c3a7d25231b0c0d92211df9f4c7442b839a
+ size 778744
sp10m.cased.v9.vocab ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "<sep>", "pad_token": "<pad>", "cls_token": "<cls>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}, "additional_special_tokens": ["<eop>", "<eod>"]}
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0caa407e56cc60c5a74aad6f90d3c3a7d25231b0c0d92211df9f4c7442b839a
+ size 778744
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "remove_space": true, "keep_accents": false, "bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "<sep>", "pad_token": "<pad>", "cls_token": "<cls>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "additional_special_tokens": ["<eop>", "<eod>"], "sp_model_kwargs": {}, "tokenizer_class": "XLNetTokenizer"}