kobart-news
- README.md +37 -0
- config.json +60 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer.json +0 -0
- tokenizer_config.json +1 -0
README.md
ADDED
@@ -0,0 +1,37 @@
---
language: ko
license: mit
tags:
- summarization
- bart
---
# kobart-news
- This model is [kobart](https://huggingface.co/hyunwoongko/kobart) fine-tuned on the [문서요약 텍스트/신문기사 (Document Summarization Text / News Articles)](https://aihub.or.kr/aidata/8054) dataset, using [Ainize Teachable-NLP](https://ainize.ai/teachable-nlp).

## Usage
### Python Code
```python
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
# Load Model and Tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("ainize/kobart-news")
model = BartForConditionalGeneration.from_pretrained("ainize/kobart-news")
# Encode Input Text
input_text = '국내 전반적인 경기침체로 상가 건물주의 수익도 전국적인 감소세를 보이고 있는 것으로 나타났다. 수익형 부동산 연구개발기업 상가정보연구소는 한국감정원 통계를 분석한 결과 전국 중대형 상가 순영업소득(부동산에서 발생하는 임대수입, 기타수입에서 제반 경비를 공제한 순소득)이 1분기 ㎡당 3만4200원에서 3분기 2만5800원으로 감소했다고 17일 밝혔다. 수도권, 세종시, 지방광역시에서 순영업소득이 가장 많이 감소한 지역은 3분기 1만3100원을 기록한 울산으로, 1분기 1만9100원 대비 31.4% 감소했다. 이어 대구(-27.7%), 서울(-26.9%), 광주(-24.9%), 부산(-23.5%), 세종(-23.4%), 대전(-21%), 경기(-19.2%), 인천(-18.5%) 순으로 감소했다. 지방 도시의 경우도 비슷했다. 경남의 3분기 순영업소득은 1만2800원으로 1분기 1만7400원 대비 26.4% 감소했으며 제주(-25.1%), 경북(-24.1%), 충남(-20.9%), 강원(-20.9%), 전남(-20.1%), 전북(-17%), 충북(-15.3%) 등도 감소세를 보였다. 조현택 상가정보연구소 연구원은 "올해 내에 경기의 침체된 분위기가 유지되며 상가, 오피스 등을 비롯한 수익형 부동산 시장의 분위기도 경직된 모습을 보였고 오피스텔, 지식산업센터 등의 수익형 부동산 공급도 증가해 공실의 위험도 늘었다"며 "실제 올 3분기 전국 중대형 상가 공실률은 11.5%를 기록하며 1분기 11.3% 대비 0.2% 포인트 증가했다"고 말했다. 그는 "최근 소셜 커머스(SNS를 통한 전자상거래), 음식 배달 중개 애플리케이션, 중고 물품 거래 애플리케이션 등의 사용 증가로 오프라인 매장에 영향을 미쳤다"며 "향후 지역, 콘텐츠에 따른 상권 양극화 현상은 심화될 것으로 보인다"고 덧붙였다.'
input_ids = tokenizer.encode(input_text, return_tensors="pt")
# Generate Summary Text Ids
summary_text_ids = model.generate(
    input_ids=input_ids,
    bos_token_id=model.config.bos_token_id,
    eos_token_id=model.config.eos_token_id,
    length_penalty=2.0,
    max_length=142,
    min_length=56,
    num_beams=4,
)
# Decode the Summary Text
print(tokenizer.decode(summary_text_ids[0], skip_special_tokens=True))
```
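The same checkpoint can also be driven through the high-level `pipeline` API instead of calling `generate` directly. The following is a minimal sketch, not part of the original card; it reuses `input_text` from the example above and passes the same generation settings through the pipeline:

```python
from transformers import pipeline

# Summarization pipeline wrapping the same checkpoint; the tokenizer
# is resolved automatically from the repository's tokenizer files.
summarizer = pipeline("summarization", model="ainize/kobart-news")
# Generation keyword arguments are forwarded to model.generate().
result = summarizer(input_text, max_length=142, min_length=56,
                    num_beams=4, length_penalty=2.0)
print(result[0]["summary_text"])
```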
### API and Demo
You can experience this model through [ainize-api](https://ainize.ai/gkswjdzz/summarize-torchserve?branch=main) and [ainize-demo](https://main-summarize-torchserve-gkswjdzz.endpoint.ainize.ai/).
config.json
ADDED
@@ -0,0 +1,60 @@
{
  "_name_or_path": "kobart_news",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 1026,
  "model_type": "bart",
  "normalize_before": false,
  "normalize_embedding": true,
  "num_hidden_layers": 6,
  "pad_token_id": 3,
  "scale_embedding": false,
  "static_position_embeddings": false,
  "transformers_version": "4.8.1",
  "use_cache": true,
  "task_specific_params": {
    "summarization": {
      "length_penalty": 1.0,
      "max_length": 128,
      "min_length": 12,
      "num_beams": 4
    }
  },
  "vocab_size": 30000
}
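The `task_specific_params.summarization` block above supplies default generation settings for summarization (the README example overrides most of them). A minimal sketch, not part of the original repository, for inspecting those defaults from Python:

```python
from transformers import BartConfig

# Loads the config.json shown above from the hub checkpoint.
config = BartConfig.from_pretrained("ainize/kobart-news")
# Prints {'length_penalty': 1.0, 'max_length': 128, 'min_length': 12, 'num_beams': 4}
print(config.task_specific_params["summarization"])
```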
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cd2e832d56099465123486157f7a0c53b4ef3a6dadfdf48fb76fa089529859d2
size 495662867
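The three lines above are a Git LFS pointer: the repository itself stores only the SHA-256 digest and byte size, while the roughly 496 MB weight file lives in LFS storage. `from_pretrained` resolves this transparently; as a minimal sketch (assuming the `huggingface_hub` client is installed), the raw file can also be fetched directly:

```python
from huggingface_hub import hf_hub_download

# Downloads the actual weight file referenced by the LFS pointer
# above (495,662,867 bytes) and returns its local cache path.
path = hf_hub_download(repo_id="ainize/kobart-news", filename="pytorch_model.bin")
print(path)
```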
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "do_lower_case": false, "special_tokens_map_file": null, "tokenizer_class": "PreTrainedTokenizerFast"}
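These two files define the special tokens the tokenizer exposes. A minimal sketch, not part of the original repository, verifying that the loaded tokenizer's special tokens line up with the token ids declared in config.json:

```python
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("ainize/kobart-news")
# Values come from special_tokens_map.json / tokenizer_config.json.
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token, tokenizer.mask_token)
# Should match config.json: bos_token_id=0, eos_token_id=1, pad_token_id=3.
print(tokenizer.convert_tokens_to_ids(["<s>", "</s>", "<pad>"]))
```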