Spaces:
Sleeping
Sleeping
SuperBigtoo
committed on
Commit
·
833997d
1
Parent(s):
93acf27
commit
Browse files
- __pycache__/thai_tokenization.cpython-311.pyc +0 -0
- app.py +27 -5
- model/thainewsClassify_model_3_14/config.json +59 -0
- model/thainewsClassify_model_3_14/model_args.json +1 -0
- model/thainewsClassify_model_3_14/pytorch_model.bin +3 -0
- model/thainewsClassify_model_3_14/special_tokens_map.json +37 -0
- model/thainewsClassify_model_3_14/tokenizer.json +0 -0
- model/thainewsClassify_model_3_14/tokenizer_config.json +65 -0
- model/thainewsClassify_model_3_14/training_args.bin +3 -0
- model/thainewsClassify_model_3_14/vocab.txt +0 -0
- requirements.txt +6 -0
- th.wiki.bpe.op25000.model +3 -0
- th.wiki.bpe.op25000.vocab +0 -0
- thai_tokenization.py +87 -0
__pycache__/thai_tokenization.cpython-311.pyc
ADDED
Binary file (5.04 kB). View file
|
|
app.py
CHANGED
@@ -1,12 +1,34 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
def greet(name):
|
4 |
-
return "Hello " + name + "!!"
|
5 |
-
|
6 |
iface = gr.Interface(
|
7 |
-
fn=
|
8 |
inputs=gr.Textbox(lines=1, max_lines=10, label="Input News's Title"),
|
9 |
-
outputs=gr.Textbox(lines=1, max_lines=
|
10 |
title="Thai News Classify",
|
11 |
examples=["จบสกอร์ไม่คม หมดครึ่งแรก ยูเครน เจ๊า โปแลนด์ 0-0",
|
12 |
"แอรินยินดีนาฑี มีรักใหม่ ยันจบกันด้วยดี ปัดถ่ายแฟชั่นเซ็กซี่ประชดรัก อ้างถูกใจคอนเซปต์ (คลิป)",
|
|
|
1 |
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from simpletransformers.classification import ClassificationModel
|
4 |
+
from pythainlp import sent_tokenize
|
5 |
+
from thai_tokenization import ThaiTokenizer
|
6 |
+
|
7 |
+
import os

# Directory that contains this file. The tokenizer files and the model folder
# are committed alongside app.py, so they are looked up here first instead of
# relying on the process being started from one particular working directory.
_APP_DIR = os.path.dirname(os.path.abspath(__file__))


def _asset_path(*parts):
    """Return the path of a bundled asset.

    Prefers the copy located next to this file. If nothing exists there, falls
    back to the original 'ThaiNewsClassify/<parts>' path relative to the
    current working directory, so existing launch setups keep working.
    """
    bundled = os.path.join(_APP_DIR, *parts)
    if os.path.exists(bundled):
        return bundled
    return os.path.join('ThaiNewsClassify', *parts)


# Tokenizer applied to every title before it is passed to the classifier.
tokenizer = ThaiTokenizer(
    vocab_file=_asset_path('th.wiki.bpe.op25000.vocab'),
    spm_file=_asset_path('th.wiki.bpe.op25000.model'),
)

# Category name -> integer class id. predict_type() reverses this lookup to
# turn the model's predicted id back into a display name.
typeId = {
    'การเมือง': 0,
    'กีฬา': 1,
    'คุณภาพชีวิต': 2,
    'ทั่วไทย': 3,
    'ไลฟ์สไตล์': 4,
    'อื่นๆ': 5,
    'อาชญากรรม': 6,
    'สิ่งแวดล้อม': 7,
    'บันเทิง & วัฒนธรรม': 8,
    'เศรษฐกิจ': 9,
    'วิทยาศาสตร์ & การศึกษา': 10,
    'สังคม': 11,
    'unspecified': 12,
    'ต่างประเทศ': 13,
}

# Fine-tuned BERT classifier loaded from the bundled model directory; CUDA is
# used only when torch reports it as available.
loaded_model = ClassificationModel(
    "bert",
    _asset_path('model', 'thainewsClassify_model_3_14'),
    use_cuda=torch.cuda.is_available(),
    num_labels=len(typeId),
)
|
19 |
+
|
20 |
+
def predict_type(title_input):
    """Classify a news title and return a display string with its category.

    The title is lower-cased, split with ``sent_tokenize`` and re-joined with
    spaces, tokenized with the module-level ``tokenizer``, and the resulting
    tokens are joined with spaces before being passed to
    ``loaded_model.predict``. The first predicted id is mapped back to its
    name through ``typeId``.

    Args:
        title_input: The news title entered by the user.

    Returns:
        A string of the form ``"Predicted News Type: <category name>"``.
    """
    lowered = title_input.lower()
    sentences = sent_tokenize(lowered)
    pieces = tokenizer.tokenize(' '.join(sentences))
    model_text = ' '.join(pieces)

    # predict() takes a list of texts; the raw outputs are not needed here.
    predictions, _ = loaded_model.predict([model_text])
    top_id = predictions[0]

    # Reverse lookup: collect the category name(s) whose id matches.
    matches = [name for name, idx in typeId.items() if idx == top_id]
    return f"Predicted News Type: {matches[0]}"
|
27 |
|
|
|
|
|
|
|
28 |
iface = gr.Interface(
|
29 |
+
fn=predict_type,
|
30 |
inputs=gr.Textbox(lines=1, max_lines=10, label="Input News's Title"),
|
31 |
+
outputs=gr.Textbox(lines=1, max_lines=2, label="Predicted News's Type"),
|
32 |
title="Thai News Classify",
|
33 |
examples=["จบสกอร์ไม่คม หมดครึ่งแรก ยูเครน เจ๊า โปแลนด์ 0-0",
|
34 |
"แอรินยินดีนาฑี มีรักใหม่ ยันจบกันด้วยดี ปัดถ่ายแฟชั่นเซ็กซี่ประชดรัก อ้างถูกใจคอนเซปต์ (คลิป)",
|
model/thainewsClassify_model_3_14/config.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "ThaiNewsClassify/model/thainewsClassify_model_3_14",
|
3 |
+
"architectures": [
|
4 |
+
"BertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"embedding_size": 768,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"id2label": {
|
13 |
+
"0": "LABEL_0",
|
14 |
+
"1": "LABEL_1",
|
15 |
+
"2": "LABEL_2",
|
16 |
+
"3": "LABEL_3",
|
17 |
+
"4": "LABEL_4",
|
18 |
+
"5": "LABEL_5",
|
19 |
+
"6": "LABEL_6",
|
20 |
+
"7": "LABEL_7",
|
21 |
+
"8": "LABEL_8",
|
22 |
+
"9": "LABEL_9",
|
23 |
+
"10": "LABEL_10",
|
24 |
+
"11": "LABEL_11",
|
25 |
+
"12": "LABEL_12",
|
26 |
+
"13": "LABEL_13"
|
27 |
+
},
|
28 |
+
"initializer_range": 0.02,
|
29 |
+
"intermediate_size": 3072,
|
30 |
+
"label2id": {
|
31 |
+
"LABEL_0": 0,
|
32 |
+
"LABEL_1": 1,
|
33 |
+
"LABEL_10": 10,
|
34 |
+
"LABEL_11": 11,
|
35 |
+
"LABEL_12": 12,
|
36 |
+
"LABEL_13": 13,
|
37 |
+
"LABEL_2": 2,
|
38 |
+
"LABEL_3": 3,
|
39 |
+
"LABEL_4": 4,
|
40 |
+
"LABEL_5": 5,
|
41 |
+
"LABEL_6": 6,
|
42 |
+
"LABEL_7": 7,
|
43 |
+
"LABEL_8": 8,
|
44 |
+
"LABEL_9": 9
|
45 |
+
},
|
46 |
+
"layer_norm_eps": 1e-12,
|
47 |
+
"max_position_embeddings": 512,
|
48 |
+
"model_type": "bert",
|
49 |
+
"num_attention_heads": 12,
|
50 |
+
"num_hidden_layers": 12,
|
51 |
+
"pad_token_id": 0,
|
52 |
+
"position_embedding_type": "absolute",
|
53 |
+
"problem_type": "single_label_classification",
|
54 |
+
"torch_dtype": "float32",
|
55 |
+
"transformers_version": "4.34.1",
|
56 |
+
"type_vocab_size": 2,
|
57 |
+
"use_cache": true,
|
58 |
+
"vocab_size": 25004
|
59 |
+
}
|
model/thainewsClassify_model_3_14/model_args.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"adafactor_beta1": null, "adafactor_clip_threshold": 1.0, "adafactor_decay_rate": -0.8, "adafactor_eps": [1e-30, 0.001], "adafactor_relative_step": true, "adafactor_scale_parameter": true, "adafactor_warmup_init": true, "adam_betas": [0.9, 0.999], "adam_epsilon": 1e-08, "best_model_dir": "outputs/best_model", "cache_dir": "cache_dir/", "config": {}, "cosine_schedule_num_cycles": 0.5, "custom_layer_parameters": [], "custom_parameter_groups": [{"lr": 0.01}], "dataloader_num_workers": 0, "do_lower_case": false, "dynamic_quantize": false, "early_stopping_consider_epochs": false, "early_stopping_delta": 0, "early_stopping_metric": "eval_loss", "early_stopping_metric_minimize": true, "early_stopping_patience": 3, "encoding": null, "eval_batch_size": 64, "evaluate_during_training": false, "evaluate_during_training_silent": true, "evaluate_during_training_steps": 2000, "evaluate_during_training_verbose": false, "evaluate_each_epoch": true, "fp16": false, "gradient_accumulation_steps": 1, "learning_rate": 4e-05, "local_rank": -1, "logging_steps": 50, "loss_type": null, "loss_args": {}, "manual_seed": null, "max_grad_norm": 1.0, "max_seq_length": 128, "model_name": "/content/models/simple_transformer/thainewsClassify_model_2_14", "model_type": "bert", "multiprocessing_chunksize": -1, "n_gpu": 1, "no_cache": true, "no_save": false, "not_saved_args": [], "num_train_epochs": 3, "optimizer": "AdamW", "output_dir": "/content/models/simple_transformer/thainewsClassify_model_2_14", "overwrite_output_dir": true, "polynomial_decay_schedule_lr_end": 1e-07, "polynomial_decay_schedule_power": 1.0, "process_count": 1, "quantized_model": false, "reprocess_input_data": true, "save_best_model": true, "save_eval_checkpoints": true, "save_model_every_epoch": false, "save_optimizer_and_scheduler": true, "save_steps": -1, "scheduler": "linear_schedule_with_warmup", "silent": false, "skip_special_tokens": true, "tensorboard_dir": null, "thread_count": null, "tokenizer_name": 
"/content/models/simple_transformer/thainewsClassify_model_2_14", "tokenizer_type": null, "train_batch_size": 64, "train_custom_parameters_only": false, "use_cached_eval_features": false, "use_early_stopping": false, "use_hf_datasets": false, "use_multiprocessing": false, "use_multiprocessing_for_evaluation": false, "wandb_kwargs": {}, "wandb_project": null, "warmup_ratio": 0.06, "warmup_steps": 707, "weight_decay": 0.0, "model_class": "ClassificationModel", "labels_list": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "labels_map": {}, "lazy_delimiter": "\t", "lazy_labels_column": 1, "lazy_loading": false, "lazy_loading_start_line": 1, "lazy_text_a_column": null, "lazy_text_b_column": null, "lazy_text_column": 0, "onnx": false, "regression": false, "sliding_window": false, "special_tokens_list": [], "stride": 0.8, "tie_value": 1}
|
model/thainewsClassify_model_3_14/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2dad22f227d95fc5f151dcbe803e6a728c00e35558c2c60fccb30cf30de21a93
|
3 |
+
size 421089518
|
model/thainewsClassify_model_3_14/special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": {
|
3 |
+
"content": "[CLS]",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"mask_token": {
|
10 |
+
"content": "[MASK]",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "[PAD]",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"sep_token": {
|
24 |
+
"content": "[SEP]",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"unk_token": {
|
31 |
+
"content": "<unk>",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
}
|
37 |
+
}
|
model/thainewsClassify_model_3_14/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
model/thainewsClassify_model_3_14/tokenizer_config.json
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "[CLS]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "[SEP]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"3": {
|
28 |
+
"content": "[MASK]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"4": {
|
36 |
+
"content": "<unk>",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": true,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_basic_tokenize": true,
|
47 |
+
"do_lower_case": false,
|
48 |
+
"lowercase": false,
|
49 |
+
"mask_token": "[MASK]",
|
50 |
+
"max_length": 128,
|
51 |
+
"model_max_length": 1000000000000000019884624838656,
|
52 |
+
"never_split": null,
|
53 |
+
"pad_to_multiple_of": null,
|
54 |
+
"pad_token": "[PAD]",
|
55 |
+
"pad_token_type_id": 0,
|
56 |
+
"padding_side": "right",
|
57 |
+
"sep_token": "[SEP]",
|
58 |
+
"stride": 0,
|
59 |
+
"strip_accents": false,
|
60 |
+
"tokenize_chinese_chars": true,
|
61 |
+
"tokenizer_class": "BertTokenizer",
|
62 |
+
"truncation_side": "right",
|
63 |
+
"truncation_strategy": "longest_first",
|
64 |
+
"unk_token": "<unk>"
|
65 |
+
}
|
model/thainewsClassify_model_3_14/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69373dbab9d89dd72fa6822e7fddc57d5408a32d55680d343b993634892f0f34
|
3 |
+
size 3768
|
model/thainewsClassify_model_3_14/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
tensorflow
|
3 |
+
transformers
|
4 |
+
simpletransformers
|
5 |
+
pythainlp
|
6 |
+
numpy
|
th.wiki.bpe.op25000.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c58c571078266e44a63d151ee1a14c7f3c4adfdf44b3282f21a0d7bc2b97a1d
|
3 |
+
size 926663
|
th.wiki.bpe.op25000.vocab
ADDED
The diff for this file is too large to render.
See raw diff
|
|
thai_tokenization.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import collections
|
2 |
+
import unicodedata
|
3 |
+
import six
|
4 |
+
|
5 |
+
def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input.

    Args:
        text: A `str` (returned unchanged) or `bytes` (decoded as UTF-8, with
            undecodable bytes dropped).

    Returns:
        The text as a `str`.

    Raises:
        ValueError: If `text` is neither `str` nor `bytes`.
    """
    # The former Python 2 branch was unreachable on Python 3 and referenced
    # the undefined name `unicode`; only the Python 3 behaviour is kept.
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))
|
23 |
+
|
24 |
+
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered token -> index mapping.

    Each line contributes one entry whose index is its zero-based line number.
    Only the first whitespace-separated field of a line is used as the token,
    which supports SentencePiece vocab files that carry extra columns. A line
    with no fields (blank or whitespace only) still consumes an index and is
    stored under the empty-string token. If a token repeats, the later index
    replaces the earlier one.

    Args:
        vocab_file: Path to a UTF-8 encoded vocabulary file.

    Returns:
        A `collections.OrderedDict` mapping token to index.
    """
    vocab = collections.OrderedDict()
    # The file is opened in text mode, so every line is already a `str` and
    # needs no further unicode conversion.
    with open(vocab_file, "r", encoding='utf-8') as reader:
        for index, line in enumerate(reader):
            fields = line.split()
            # Keep only the first column; fall back to the stripped line
            # (an empty string) when the line has no fields at all.
            token = fields[0] if fields else line.strip()
            vocab[token] = index
    return vocab
|
38 |
+
|
39 |
+
#####
|
40 |
+
|
41 |
+
from bert.bpe_helper import BPE
|
42 |
+
import sentencepiece as spm
|
43 |
+
|
44 |
+
def convert_by_vocab(vocab, items):
    """Map each element of `items` through `vocab`, preserving order.

    Args:
        vocab: A mapping used for the lookup (token -> id, or id -> token).
        items: An iterable of keys to look up.

    Returns:
        A list with `vocab[item]` for every item, in input order.

    Raises:
        KeyError: If an item is not present in `vocab`.
    """
    return [vocab[item] for item in items]
|
49 |
+
|
50 |
+
class ThaiTokenizer(object):
    """Tokenizes Thai texts.

    Holds a vocabulary loaded from `vocab_file`, a `BPE` encoder built from
    the same file, and a SentencePiece processor loaded from `spm_file`.
    """

    def __init__(self, vocab_file, spm_file):
        """Load the vocabulary and both sub-word segmenters.

        Args:
            vocab_file: Path to the vocabulary file (also passed to `BPE`).
            spm_file: Path to the SentencePiece model file.
        """
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        self.bpe = BPE(vocab_file)
        self.s = spm.SentencePieceProcessor()
        self.s.Load(spm_file)

    def tokenize(self, text):
        """Split `text` into tokens that are all present in the vocabulary.

        The text is segmented twice, by the BPE encoder and by SentencePiece,
        and the segmentation with strictly fewer tokens is used (SentencePiece
        on ties). A token that starts with '_' and is not itself in the
        vocabulary is emitted as a separate '_' followed by the remainder.
        Any resulting piece missing from the vocabulary becomes '<unk>'.

        Args:
            text: The input string.

        Returns:
            A list of token strings.
        """
        bpe_tokens = self.bpe.encode(text).split(' ')
        spm_tokens = self.s.EncodeAsPieces(text)

        tokens = bpe_tokens if len(bpe_tokens) < len(spm_tokens) else spm_tokens

        split_tokens = []
        for token in tokens:
            new_token = token

            # NOTE(review): '_' is presumably the segmenter's prefix marker;
            # it is emitted without checking that '_' itself is in the vocab.
            if token.startswith('_') and token not in self.vocab:
                split_tokens.append('_')
                new_token = token[1:]

            split_tokens.append(new_token if new_token in self.vocab else '<unk>')

        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Return the vocabulary id of each token; raises KeyError if unknown."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Return the token for each vocabulary id; raises KeyError if unknown."""
        return convert_by_vocab(self.inv_vocab, ids)
|