upload model

Browse files

Files changed (10) hide show

README.md +0 -0
args.json +24 -0
config.json +25 -0
dict.txt +0 -0
merges.txt +0 -0
process.log +8 -0
pytorch_model.bin +3 -0
special_tokens_map.json +1 -0
tokenizer_config.json +1 -0
vocab.json +0 -0

README.md ADDED Viewed

File without changes

args.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+    "custom_vocab_files": [
+        "/home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/corpora/bio/biomedical-clinical.txt"
+    ],
+    "vocab_name": "bio-biomedical-clinical-vocab-52k",
+    "tokenizer": "bbpe",
+    "lowercase": false,
+    "vocab_size": 52000,
+    "min_frequency": 10,
+    "extra_tokens": [],
+    "limit_alphabet": 1000,
+    "no_show_progress": false,
+    "strip_accents": false,
+    "no_handle_chinese_chars": false,
+    "no_clean_text": false,
+    "reserve_tokens": 0,
+    "use_tokenizers": false,
+    "no_fairseq": false,
+    "files": [
+        "/home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/corpora/bio/biomedical-clinical.txt"
+    ],
+    "output_root_path": "/home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f",
+    "commit_hash": "3a7116cf776527c411869becbe6fad8b9e3f5e56"
+}

config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.4.0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 52000
+}

dict.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

process.log ADDED Viewed

	@@ -0,0 +1,8 @@

+Executing train_tokenizer.py
+------------------------------
+training bbpe tokenizer
+Initialize an empty tokenizer
+training
+saving model tokenizer to /home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f/train_tokenizer_output/train-tokenizer-2021-04-26-1009-3a71-e9ca
+saving pretrained to /home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f/train_tokenizer_output/train-tokenizer-2021-04-26-1009-3a71-e9ca
+saving config to /home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f/train_tokenizer_output/train-tokenizer-2021-04-26-1009-3a71-e9ca

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e0c8d7bb348e40327b7c38eb6995c74d5c64345bc4ab9b3deff58e7359f15f7
+size 504420627

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": true, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "max_len": 512, "special_tokens_map_file": null, "name_or_path": "/home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f/train_tokenizer_output/train-tokenizer-2021-04-26-1009-3a71-e9ca"}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff