SuperBigtoo committed on
Commit
833997d
·
1 Parent(s): 93acf27
__pycache__/thai_tokenization.cpython-311.pyc ADDED
Binary file (5.04 kB). View file
 
app.py CHANGED
@@ -1,12 +1,34 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
-
6
  iface = gr.Interface(
7
- fn=greet,
8
  inputs=gr.Textbox(lines=1, max_lines=10, label="Input News's Title"),
9
- outputs=gr.Textbox(lines=1, max_lines=10, label="Predicted News's Type"),
10
  title="Thai News Classify",
11
  examples=["จบสกอร์ไม่คม หมดครึ่งแรก ยูเครน เจ๊า โปแลนด์ 0-0",
12
  "แอรินยินดีนาฑี มีรักใหม่ ยันจบกันด้วยดี ปัดถ่ายแฟชั่นเซ็กซี่ประชดรัก อ้างถูกใจคอนเซปต์ (คลิป)",
 
1
  import gradio as gr
2
+ import torch
3
+ from simpletransformers.classification import ClassificationModel
4
+ from pythainlp import sent_tokenize
5
+ from thai_tokenization import ThaiTokenizer
6
+
7
# Sub-word tokenizer used to pre-process titles before they reach the model.
tokenizer = ThaiTokenizer(
    vocab_file='ThaiNewsClassify/th.wiki.bpe.op25000.vocab',
    spm_file='ThaiNewsClassify/th.wiki.bpe.op25000.model',
)

# News category name -> integer class id (14 entries, ids 0-13).
typeId = {
    'การเมือง': 0,
    'กีฬา': 1,
    'คุณภาพชีวิต': 2,
    'ทั่วไทย': 3,
    'ไลฟ์สไตล์': 4,
    'อื่นๆ': 5,
    'อาชญากรรม': 6,
    'สิ่งแวดล้อม': 7,
    'บันเทิง & วัฒนธรรม': 8,
    'เศรษฐกิจ': 9,
    'วิทยาศาสตร์ & การศึกษา': 10,
    'สังคม': 11,
    'unspecified': 12,
    'ต่างประเทศ': 13,
}

# Fine-tuned BERT classifier; num_labels matches the 14 entries of typeId.
# Runs on the GPU only when torch reports that CUDA is available.
loaded_model = ClassificationModel(
    "bert",
    "ThaiNewsClassify/model/thainewsClassify_model_3_14",
    num_labels=14,
    use_cuda=torch.cuda.is_available(),
)
19
+
20
def predict_type(title_input):
    """Classify a news title and return a display string naming its category.

    The title is lower-cased, run through ``sent_tokenize`` and re-joined with
    spaces, split into sub-word tokens by the module-level ``tokenizer``, and
    the space-joined tokens are passed to ``loaded_model.predict`` as a
    one-element batch.

    Args:
        title_input: The news title entered by the user (a string).

    Returns:
        ``"Predicted News Type: <name>"``, where ``<name>`` is the ``typeId``
        key whose id equals the first prediction.

    Raises:
        IndexError: If no ``typeId`` entry matches the predicted id.
    """
    lowered = title_input.lower()
    sentences = sent_tokenize(lowered)
    pieces = tokenizer.tokenize(' '.join(sentences))
    model_text = ' '.join(pieces)

    # predict() works on a batch; only the first (and only) result is used.
    predictions, raw_outputs = loaded_model.predict([model_text])
    predicted_id = predictions[0]

    # Reverse lookup: typeId maps name -> id, so scan for the matching id.
    matching_names = [
        name for name, class_id in typeId.items() if class_id == predicted_id
    ]
    return f"Predicted News Type: {matching_names[0]}"
27
 
 
 
 
28
  iface = gr.Interface(
29
+ fn=predict_type,
30
  inputs=gr.Textbox(lines=1, max_lines=10, label="Input News's Title"),
31
+ outputs=gr.Textbox(lines=1, max_lines=2, label="Predicted News's Type"),
32
  title="Thai News Classify",
33
  examples=["จบสกอร์ไม่คม หมดครึ่งแรก ยูเครน เจ๊า โปแลนด์ 0-0",
34
  "แอรินยินดีนาฑี มีรักใหม่ ยันจบกันด้วยดี ปัดถ่ายแฟชั่นเซ็กซี่ประชดรัก อ้างถูกใจคอนเซปต์ (คลิป)",
model/thainewsClassify_model_3_14/config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ThaiNewsClassify/model/thainewsClassify_model_3_14",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "LABEL_0",
14
+ "1": "LABEL_1",
15
+ "2": "LABEL_2",
16
+ "3": "LABEL_3",
17
+ "4": "LABEL_4",
18
+ "5": "LABEL_5",
19
+ "6": "LABEL_6",
20
+ "7": "LABEL_7",
21
+ "8": "LABEL_8",
22
+ "9": "LABEL_9",
23
+ "10": "LABEL_10",
24
+ "11": "LABEL_11",
25
+ "12": "LABEL_12",
26
+ "13": "LABEL_13"
27
+ },
28
+ "initializer_range": 0.02,
29
+ "intermediate_size": 3072,
30
+ "label2id": {
31
+ "LABEL_0": 0,
32
+ "LABEL_1": 1,
33
+ "LABEL_10": 10,
34
+ "LABEL_11": 11,
35
+ "LABEL_12": 12,
36
+ "LABEL_13": 13,
37
+ "LABEL_2": 2,
38
+ "LABEL_3": 3,
39
+ "LABEL_4": 4,
40
+ "LABEL_5": 5,
41
+ "LABEL_6": 6,
42
+ "LABEL_7": 7,
43
+ "LABEL_8": 8,
44
+ "LABEL_9": 9
45
+ },
46
+ "layer_norm_eps": 1e-12,
47
+ "max_position_embeddings": 512,
48
+ "model_type": "bert",
49
+ "num_attention_heads": 12,
50
+ "num_hidden_layers": 12,
51
+ "pad_token_id": 0,
52
+ "position_embedding_type": "absolute",
53
+ "problem_type": "single_label_classification",
54
+ "torch_dtype": "float32",
55
+ "transformers_version": "4.34.1",
56
+ "type_vocab_size": 2,
57
+ "use_cache": true,
58
+ "vocab_size": 25004
59
+ }
model/thainewsClassify_model_3_14/model_args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"adafactor_beta1": null, "adafactor_clip_threshold": 1.0, "adafactor_decay_rate": -0.8, "adafactor_eps": [1e-30, 0.001], "adafactor_relative_step": true, "adafactor_scale_parameter": true, "adafactor_warmup_init": true, "adam_betas": [0.9, 0.999], "adam_epsilon": 1e-08, "best_model_dir": "outputs/best_model", "cache_dir": "cache_dir/", "config": {}, "cosine_schedule_num_cycles": 0.5, "custom_layer_parameters": [], "custom_parameter_groups": [{"lr": 0.01}], "dataloader_num_workers": 0, "do_lower_case": false, "dynamic_quantize": false, "early_stopping_consider_epochs": false, "early_stopping_delta": 0, "early_stopping_metric": "eval_loss", "early_stopping_metric_minimize": true, "early_stopping_patience": 3, "encoding": null, "eval_batch_size": 64, "evaluate_during_training": false, "evaluate_during_training_silent": true, "evaluate_during_training_steps": 2000, "evaluate_during_training_verbose": false, "evaluate_each_epoch": true, "fp16": false, "gradient_accumulation_steps": 1, "learning_rate": 4e-05, "local_rank": -1, "logging_steps": 50, "loss_type": null, "loss_args": {}, "manual_seed": null, "max_grad_norm": 1.0, "max_seq_length": 128, "model_name": "/content/models/simple_transformer/thainewsClassify_model_2_14", "model_type": "bert", "multiprocessing_chunksize": -1, "n_gpu": 1, "no_cache": true, "no_save": false, "not_saved_args": [], "num_train_epochs": 3, "optimizer": "AdamW", "output_dir": "/content/models/simple_transformer/thainewsClassify_model_2_14", "overwrite_output_dir": true, "polynomial_decay_schedule_lr_end": 1e-07, "polynomial_decay_schedule_power": 1.0, "process_count": 1, "quantized_model": false, "reprocess_input_data": true, "save_best_model": true, "save_eval_checkpoints": true, "save_model_every_epoch": false, "save_optimizer_and_scheduler": true, "save_steps": -1, "scheduler": "linear_schedule_with_warmup", "silent": false, "skip_special_tokens": true, "tensorboard_dir": null, "thread_count": null, "tokenizer_name": 
"/content/models/simple_transformer/thainewsClassify_model_2_14", "tokenizer_type": null, "train_batch_size": 64, "train_custom_parameters_only": false, "use_cached_eval_features": false, "use_early_stopping": false, "use_hf_datasets": false, "use_multiprocessing": false, "use_multiprocessing_for_evaluation": false, "wandb_kwargs": {}, "wandb_project": null, "warmup_ratio": 0.06, "warmup_steps": 707, "weight_decay": 0.0, "model_class": "ClassificationModel", "labels_list": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "labels_map": {}, "lazy_delimiter": "\t", "lazy_labels_column": 1, "lazy_loading": false, "lazy_loading_start_line": 1, "lazy_text_a_column": null, "lazy_text_b_column": null, "lazy_text_column": 0, "onnx": false, "regression": false, "sliding_window": false, "special_tokens_list": [], "stride": 0.8, "tie_value": 1}
model/thainewsClassify_model_3_14/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dad22f227d95fc5f151dcbe803e6a728c00e35558c2c60fccb30cf30de21a93
3
+ size 421089518
model/thainewsClassify_model_3_14/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<unk>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
model/thainewsClassify_model_3_14/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/thainewsClassify_model_3_14/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[MASK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<unk>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "lowercase": false,
49
+ "mask_token": "[MASK]",
50
+ "max_length": 128,
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "never_split": null,
53
+ "pad_to_multiple_of": null,
54
+ "pad_token": "[PAD]",
55
+ "pad_token_type_id": 0,
56
+ "padding_side": "right",
57
+ "sep_token": "[SEP]",
58
+ "stride": 0,
59
+ "strip_accents": false,
60
+ "tokenize_chinese_chars": true,
61
+ "tokenizer_class": "BertTokenizer",
62
+ "truncation_side": "right",
63
+ "truncation_strategy": "longest_first",
64
+ "unk_token": "<unk>"
65
+ }
model/thainewsClassify_model_3_14/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69373dbab9d89dd72fa6822e7fddc57d5408a32d55680d343b993634892f0f34
3
+ size 3768
model/thainewsClassify_model_3_14/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ tensorflow
3
+ transformers
4
+ simpletransformers
5
+ pythainlp
6
+ numpy
th.wiki.bpe.op25000.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c58c571078266e44a63d151ee1a14c7f3c4adfdf44b3282f21a0d7bc2b97a1d
3
+ size 926663
th.wiki.bpe.op25000.vocab ADDED
The diff for this file is too large to render. See raw diff
 
thai_tokenization.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import unicodedata
3
+ import six
4
+
5
def convert_to_unicode(text):
    """Return `text` as a Unicode `str`, assuming UTF-8 for byte input.

    Args:
        text: A `str` (returned unchanged) or a `bytes` object (decoded as
            UTF-8).

    Returns:
        The text as a `str`.

    Raises:
        ValueError: If `text` is neither `str` nor `bytes`.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        # "ignore" drops undecodable bytes instead of raising, so malformed
        # input degrades to a shorter string rather than an exception.
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))
23
+
24
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered ``token -> index`` mapping.

    Each line of the file contributes one entry whose index is the zero-based
    line number. Only the first whitespace-separated field of a line is used
    as the token, which lets SentencePiece-style vocab files (token followed
    by extra columns) be read directly.

    Args:
        vocab_file: Path to a UTF-8 encoded vocabulary file.

    Returns:
        A `collections.OrderedDict` mapping each token to its index. If a
        token occurs on several lines, the last index wins.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            fields = line.split()
            # A whitespace-only line still consumes an index (stored under the
            # empty token) so that later tokens keep their line-number ids.
            token = fields[0] if fields else ""
            vocab[token] = index
    return vocab
38
+
39
+ #####
40
+
41
+ from bert.bpe_helper import BPE
42
+ import sentencepiece as spm
43
+
44
def convert_by_vocab(vocab, items):
    """Translate every element of `items` through the mapping `vocab`.

    Args:
        vocab: Mapping used for the lookup (token -> id or id -> token).
        items: Iterable of keys to look up.

    Returns:
        A list holding ``vocab[item]`` for each item, in input order.

    Raises:
        KeyError: If an item is not a key of `vocab`.
    """
    return [vocab[item] for item in items]
49
+
50
class ThaiTokenizer(object):
    """Split Thai text into sub-word pieces restricted to a fixed vocabulary."""

    def __init__(self, vocab_file, spm_file):
        """Load the vocabulary and both segmenters.

        Args:
            vocab_file: Path to the vocabulary file; read by `load_vocab` and
                also handed to the `BPE` helper.
            spm_file: Path to the SentencePiece model file.
        """
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping (index -> token) for convert_ids_to_tokens.
        self.inv_vocab = dict(
            (index, token) for token, index in self.vocab.items()
        )

        self.bpe = BPE(vocab_file)
        self.s = spm.SentencePieceProcessor()
        self.s.Load(spm_file)

    def tokenize(self, text):
        """Return the vocabulary-checked pieces for `text`.

        The text is segmented by both the BPE helper and SentencePiece, and
        the segmentation with fewer pieces is kept (SentencePiece on a tie).
        A piece starting with '_' that is not in the vocabulary is emitted as
        a separate '_' followed by the remainder. Any piece (or remainder)
        missing from the vocabulary is replaced by '<unk>'.
        """
        from_bpe = self.bpe.encode(text).split(' ')
        from_spm = self.s.EncodeAsPieces(text)

        if len(from_bpe) < len(from_spm):
            pieces = from_bpe
        else:
            pieces = from_spm

        result = []
        for piece in pieces:
            candidate = piece

            if piece.startswith('_') and piece not in self.vocab:
                # Detach the leading marker and look up the rest on its own.
                result.append('_')
                candidate = piece[1:]

            result.append(candidate if candidate in self.vocab else '<unk>')

        return result

    def convert_tokens_to_ids(self, tokens):
        """Map each token to its vocabulary index."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Map each vocabulary index back to its token."""
        return convert_by_vocab(self.inv_vocab, ids)