hibikaze committed
Commit f1660c1
1 Parent(s): 7b91767

upload model

README.md ADDED
@@ -0,0 +1,62 @@
+ ---
+ license: apache-2.0
+ language:
+ - en
+ - ja
+ datasets:
+ - izumi-lab/wikipedia-ja-20230720
+ - izumi-lab/wikipedia-en-20230720
+ - izumi-lab/open-text-books
+ - if001/aozorabunko-clean-sin
+ - if001/oscar_2023_filtered
+ tags:
+ - ja
+ - japanese
+ - mixtral
+ inference: false
+ ---
+
+ A 275.86M-parameter Mixtral model pretrained on Japanese datasets.
+
+ ## sample
+
+ ```
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ model = AutoModelForCausalLM.from_pretrained("if001/tiny_mixtral_ja")
+ tokenizer = AutoTokenizer.from_pretrained("if001/sentencepiece_ja", trust_remote_code=True)
+
+ prompt = "それは九月初旬のある蒸し暑い晩のことであった。私は、D坂の"
+ inputs = tokenizer(prompt, return_tensors="pt")
+
+ generate_ids = model.generate(
+     inputs.input_ids,
+     max_length=30,
+     top_k=30,
+     top_p=0.95,
+     temperature=0.6,
+     repetition_penalty=1.2,
+     do_sample=True,
+ )
+ tokenizer.decode(generate_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+
+ >> それは九月初旬のある蒸し暑い晩のことであった。私は、D坂の茶舗を後にして、その路地の角に横丁をあるいて居る、と云うと、丁度其処から、
+ ```
+
+ ## dataset
+ Trained on a mix of English and Japanese datasets.
+
+ ```
+ total tokens: 8.64B
+
+ wikipedia_ja: 844.65M
+ wikipedia_en: 3.80B
+ open-text-books: 60.17M
+ oscar: 3.85B
+ aozorabunko: 92.97M
+ ```
+
+ ## tokenizer
+ ```
+ all_special_ids: [1, 2, 3, 0, 4]
+ all_special_tokens: ['<BOS>', '<EOS>', '<UNK>', '<PAD>', '<MASK>']
+ ```
config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "MixtralForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 640,
+   "initializer_range": 0.02,
+   "intermediate_size": 2400,
+   "max_position_embeddings": 131072,
+   "model_type": "mixtral",
+   "num_attention_heads": 8,
+   "num_experts_per_tok": 2,
+   "num_hidden_layers": 8,
+   "num_key_value_heads": 4,
+   "num_local_experts": 6,
+   "output_router_logits": true,
+   "pad_token_id": 0,
+   "rms_norm_eps": 1e-05,
+   "rope_theta": 1000000.0,
+   "router_aux_loss_coef": 0.001,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.0",
+   "use_cache": true,
+   "vocab_size": 35000
+ }
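
The config above describes a small Mixtral-style mixture-of-experts model: 8 hidden layers, hidden size 640, 6 local experts with 2 routed per token, and a 35,000-token vocabulary. As a rough cross-check of the 275.86M parameter figure stated in the README, the same architecture can be rebuilt from an equivalent `MixtralConfig` and its parameters counted. This is a minimal sketch, not part of the uploaded files; it assumes transformers >= 4.37 (which includes Mixtral support) and only instantiates random weights.

```
from transformers import MixtralConfig, MixtralForCausalLM

# Mirror the architecture-relevant fields of the uploaded config.json.
config = MixtralConfig(
    vocab_size=35000,
    hidden_size=640,
    intermediate_size=2400,
    num_hidden_layers=8,
    num_attention_heads=8,
    num_key_value_heads=4,
    num_local_experts=6,
    num_experts_per_tok=2,
    max_position_embeddings=131072,
    rope_theta=1000000.0,
    output_router_logits=True,
    tie_word_embeddings=False,
)

# Instantiate with random weights and count parameters.
model = MixtralForCausalLM(config)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.2f}M parameters")  # should land near the 275.86M quoted in the README
```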
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.37.0"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:797294a9ec3adcf8b2f3af8dd4896d9d4aeb7e2b5e9821e51908d725d5f0e4cf
+ size 1103448792
sentencepiece_ja.py ADDED
@@ -0,0 +1,70 @@
+ import os
+ from typing import Union, List, Optional, Tuple
+
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, AutoTokenizer
+ from transformers.utils.hub import cached_file
+ class SentencePieceJA(PreTrainedTokenizer):
+     def __init__(self,
+                  model_path = "./tokenizer.json",
+                  pad = "<PAD>",
+                  bos = "<BOS>",
+                  eos = "<EOS>",
+                  unk = "<UNK>",
+                  mask = "<MASK>",
+                  **kwargs):
+         from tokenizers import Tokenizer
+         try:
+             self._tokenizer = Tokenizer.from_file(model_path)
+         except Exception as e:
+             print('exception: ', e)
+             print('load from cache...')
+             model_path = cached_file('if001/sentencepiece_ja', 'tokenizer.json')
+             self._tokenizer = Tokenizer.from_file(model_path)
+         super().__init__(**kwargs)
+         self.add_special_tokens({
+             'pad_token': pad,
+             'bos_token': bos,
+             'eos_token': eos,
+             'unk_token': unk,
+             'mask_token': mask
+         })
+         self._tokenizer.add_tokens([" ", " "])
+
+     def get_vocab(self) -> dict:
+         return self._tokenizer.get_vocab()
+
+     @property
+     def vocab_size(self) -> int:
+         return self._tokenizer.get_vocab_size()
+
+     def _tokenize(self, text, **kwargs):
+         return self._tokenizer.encode(text).tokens
+
+     def _convert_token_to_id(self, token):
+         ids = self._tokenizer.encode(token).ids
+         if len(ids) == 0:
+             return self.unk_token_id
+         return self._tokenizer.encode(token).ids[0]
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self._tokenizer.decode([index])
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         ## for Japanese text: join tokens without spaces
+         return "".join(tokens)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         index = 0
+         if os.path.isdir(save_directory):
+             vocab_file = os.path.join(
+                 save_directory, (filename_prefix + "-" if filename_prefix else "") + 'vocab.txt'
+             )
+         else:
+             vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+         with open(vocab_file, "w", encoding="utf-8") as writer:
+             for token, token_index in sorted(self.get_vocab().items(), key=lambda kv: kv[1]):
+                 if index != token_index:
+                     index = token_index
+                 writer.write(token + "\n")
+                 index += 1
+         return (vocab_file,)
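
This is the custom tokenizer class that the `auto_map` entry in tokenizer_config.json points `AutoTokenizer` at. A minimal usage sketch, not part of the commit: it assumes the class file is importable from the working directory and relies on the fallback in `__init__` that fetches tokenizer.json from if001/sentencepiece_ja when no local copy exists.

```
from sentencepiece_ja import SentencePieceJA

# Instantiate directly; if ./tokenizer.json is missing, __init__ above
# falls back to the cached copy from if001/sentencepiece_ja.
tokenizer = SentencePieceJA()

text = "それは九月初旬のある蒸し暑い晩のことであった。"
ids = tokenizer(text).input_ids
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=True))  # tokens are re-joined without spaces (see convert_tokens_to_string)
```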
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "bos_token": "<BOS>",
+   "eos_token": "<EOS>",
+   "mask_token": "<MASK>",
+   "pad_token": "<PAD>",
+   "unk_token": "<UNK>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token": "<BOS>",
+   "eos_token": "<EOS>",
+   "mask_token": "<MASK>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<PAD>",
+   "unk_token": "<UNK>",
+   "clean_up_tokenization_spaces": true,
+   "tokenizer_class": "SentencePieceJA",
+   "auto_map": {
+     "AutoTokenizer": ["", "sentencepiece_ja.SentencePieceJA"]
+   },
+   "transformers_version": " 4.34.1"
+ }
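
The `auto_map` entry is what lets `AutoTokenizer` resolve the custom `SentencePieceJA` class shipped in sentencepiece_ja.py when remote code is enabled, as in the README example. A hedged sketch of that loading path (repo id taken from the README; expected values are the ones listed in the README's tokenizer section):

```
from transformers import AutoTokenizer

# auto_map -> "sentencepiece_ja.SentencePieceJA": the class is fetched and
# imported from the repo, which is why trust_remote_code=True is required.
tokenizer = AutoTokenizer.from_pretrained("if001/sentencepiece_ja", trust_remote_code=True)
print(tokenizer.all_special_tokens)  # ['<BOS>', '<EOS>', '<UNK>', '<PAD>', '<MASK>'] per the README
print(tokenizer.all_special_ids)     # [1, 2, 3, 0, 4] per the README
```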
vocab.txt ADDED
The diff for this file is too large to render. See raw diff