|
--- |
|
license: cc-by-nc-nd-4.0 |
|
--- |
|
|
|
Longformer 인코더를 사용한 KoBART로, AIHUB 금융 및 콜 상담 대화 데이터를 ChatGPT로 요약해 만든 학습 데이터를 학습한 모델입니다.
|
|
|
|
|
``` |
|
input = """고객: 안녕하세요, 제가 여기서 사용하는 신용카드에 대해 궁금한 게 있어요.
|
|
|
상담원: 안녕하세요! 네, 어떤 문의가 있으신가요?
|
|
|
고객: 제가 이번 달에 카드를 사용하면서 리워드 포인트를 얼마나 쌓았는지 확인하고 싶어요.
|
|
|
상담원: 네, 당신의 리워드 포인트 잔액을 확인해 드릴 수 있습니다. 제가 당신의 카드 번호를 입력하고 확인해볼게요. 번호를 알려주실 수 있을까요?
|
|
|
고객: 네, 제 카드 번호는 1234-5678-9012-3456입니다.
|
|
|
상담원: 감사합니다. 잠시만 기다려주세요. 확인 중이에요... 네, 현재 당신의 리워드 포인트 잔액은 3,250 포인트입니다.
|
|
|
고객: 알겠어요, 감사합니다! 그럼 추가적인 이용 혜택이나 할인에 관한 정보도 얻을 수 있을까요?
|
|
|
상담원: 물론이죠! 저희 카드사는 다양한 이용 혜택을 제공하고 있습니다. 예를 들어, 여행, 쇼핑, 식사 등 다양한 분야에서 할인 혜택을 받을 수 있거나, 리워드 포인트를 사용하여 상품이나 기프트 카드로 교환할 수 있습니다. 어떤 혜택에 관심이 있으신가요?
|
|
|
고객: 저는 여행 할인이나 마일리지 적립에 관심이 있어요.
|
|
|
상담원: 그런 경우에는 당신에게 적합한 여행 카드 혜택을 제공하는 카드를 추천해 드릴 수 있습니다. 여행 카드는 항공사 마일리지를 쌓을 수 있고, 호텔 할인 혜택을 받을 수도 있습니다. 제가 몇 가지 옵션을 제안해 볼까요?
|
|
|
고객: 네, 그러면 좋을 것 같아요. 감사합니다!
|
상담원: 말씀해 주셔서 감사합니다. 이제 제가 몇 가지 추천을 드리도록 하겠습니다. 어떤 항공사를 주로 이용하시나요?"""
|
``` |
|
|
|
|
|
``` |
|
output =""" |
|
- 고객이 신용카드에 대해 궁금한 사항 상담
|
- 리워드 포인트 확인 요청
|
- 상담원이 카드 번호와 잔액 확인 후 추가 이용 혜택 안내
|
- 고객이 여행 할인, 마일리지, 호텔 할인 등 다양한 혜택에 관심 표현
|
""" |
|
``` |
|
|
|
|
|
해당 모델을 활용하기 위해서는 다음과 같은 class 정의가 필요합니다.
|
``` |
|
class LongformerSelfAttentionForBart(nn.Module):
    """Adapter exposing ``LongformerSelfAttention`` through a BART-style attention call.

    The module wraps a ``LongformerSelfAttention`` instance, converts the
    incoming attention mask into the per-token form that instance is called
    with, and applies a linear output projection to its result.
    """

    def __init__(self, config, layer_id):
        """Create the wrapped attention module and the output projection.

        Args:
            config: Model configuration. ``config.d_model`` sets the width of
                the output projection; the whole object is also forwarded to
                ``LongformerSelfAttention``.
            layer_id: Index of the layer this module is installed in,
                forwarded to ``LongformerSelfAttention``.
        """
        super().__init__()
        self.embed_dim = config.d_model
        self.longformer_self_attn = LongformerSelfAttention(config, layer_id=layer_id)
        self.output = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run Longformer self-attention and project the result.

        Args:
            hidden_states: 3-D input tensor (unpacked as batch, sequence
                length, embedding size).
            key_value_states: Accepted for signature compatibility; unused.
            past_key_value: Accepted for signature compatibility; unused.
            attention_mask: Expanded mask whose dim 1 is squeezed and whose
                first remaining row is taken per batch item. Negative entries
                mark masked positions, positive entries mark global-attention
                positions. ``None`` is treated as an all-zero mask.
            layer_head_mask: Accepted for signature compatibility; the
                wrapped module is always called with ``layer_head_mask=None``.
            output_attentions: Forwarded to the wrapped attention module.

        Returns:
            A 3-tuple ``(attn_output, attn_weights, None)``. ``attn_weights``
            is the second item returned by the wrapped module, or ``None``
            when it returns only the attention output.
        """
        bsz, seq_len, _ = hidden_states.size()

        if attention_mask is None:
            # The parameter is optional: fall back to a mask of zeros, which is
            # neither "masked" (< 0) nor "global" (> 0) under the tests below.
            attention_mask = hidden_states.new_zeros((bsz, seq_len))
        else:
            # Reduce bs x seq_len x seq_len to bs x seq_len.
            attention_mask = attention_mask.squeeze(dim=1)
            attention_mask = attention_mask[:, 0]

        is_index_masked = attention_mask < 0
        is_index_global_attn = attention_mask > 0
        is_global_attn = is_index_global_attn.flatten().any().item()

        outputs = self.longformer_self_attn(
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=None,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=output_attentions,
        )

        attn_output = self.output(outputs[0])

        # Always return three items so the result matches the declared return
        # type regardless of how many extras the wrapped module produced.
        attn_weights = outputs[1] if len(outputs) > 1 else None
        return attn_output, attn_weights, None
|
``` |
|
|
|
``` |
|
class LongformerEncoderDecoderForConditionalGeneration(BartForConditionalGeneration):
    """BART conditional-generation model with Longformer attention in the encoder.

    Unless ``config.attention_mode`` is ``'n2'``, construction replaces the
    encoder and decoder positional embeddings and swaps every encoder layer's
    ``self_attn`` for a ``LongformerSelfAttentionForBart``.
    """

    def __init__(self, config):
        """Build the parent model, then patch it according to ``config.attention_mode``.

        Args:
            config: Configuration providing ``attention_mode``, ``d_model``,
                ``max_encoder_position_embeddings`` and
                ``max_decoder_position_embeddings``.
        """
        super().__init__(config)

        # 'n2' keeps the modules created by the parent class untouched.
        if config.attention_mode == 'n2':
            return

        encoder = self.model.encoder
        decoder = self.model.decoder

        # Fresh learned positional embeddings sized from the config limits.
        encoder.embed_positions = BartLearnedPositionalEmbedding(
            config.max_encoder_position_embeddings, config.d_model
        )
        decoder.embed_positions = BartLearnedPositionalEmbedding(
            config.max_decoder_position_embeddings, config.d_model
        )

        # Only the encoder's self-attention is replaced.
        for layer_id, encoder_layer in enumerate(encoder.layers):
            encoder_layer.self_attn = LongformerSelfAttentionForBart(config, layer_id=layer_id)
|
``` |
|
|
|
``` |
|
class LongformerEncoderDecoderConfig(BartConfig):
    """``BartConfig`` extended with the Longformer attention settings."""

    def __init__(self, attention_window: List[int] = None, attention_dilation: List[int] = None,
                 autoregressive: bool = False, attention_mode: str = 'sliding_chunks',
                 gradient_checkpointing: bool = False, **kwargs):
        """
        Args:
            attention_window: list of attention window sizes of length = number of layers.
                window size = number of attention locations on each side.
                For an effective window size of 512, use `attention_window=[256]*num_layers`
                which is 256 on each side.
            attention_dilation: list of attention dilation of length = number of layers.
                attention dilation of `1` means no dilation.
            autoregressive: do autoregressive attention or have attention of both sides
            attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for TVM implementation of Longformer
                selfattention, 'sliding_chunks' for another implementation of Longformer selfattention
            gradient_checkpointing: stored on the config unchanged.
            **kwargs: forwarded to ``BartConfig.__init__``.

        Raises:
            ValueError: if `attention_mode` is not one of 'tvm', 'sliding_chunks' or 'n2'.
        """
        # Validate explicitly rather than with `assert`, which is stripped
        # when Python runs with -O, and do it before any state is set.
        if attention_mode not in ('tvm', 'sliding_chunks', 'n2'):
            raise ValueError(
                "attention_mode must be one of 'tvm', 'sliding_chunks' or 'n2', "
                f"got {attention_mode!r}"
            )

        super().__init__(**kwargs)
        self.attention_window = attention_window
        self.attention_dilation = attention_dilation
        self.autoregressive = autoregressive
        self.attention_mode = attention_mode
        self.gradient_checkpointing = gradient_checkpointing
|
``` |
|
모델 오브젝트를 로드한 후, weight 파일을 별도로 다운받아서 load_state_dict로 웨이트를 불러와야 합니다.
|
``` |
|
# Tokenizer and model skeleton come from the published checkpoint id; the
# fine-tuned weights are loaded separately from a local file below.
tokenizer = AutoTokenizer.from_pretrained("cocoirun/longforemr-kobart-summary-v1")
model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained("cocoirun/longforemr-kobart-summary-v1")

# Use the GPU when one is available, otherwise fall back to CPU instead of
# failing on hosts without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# map_location lets a checkpoint saved from GPU tensors load on a CPU-only host.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# source you trust.
model.load_state_dict(torch.load("summary weight.ckpt", map_location=device))
model.to(device)
|
``` |
|
|
|
모델 요약 함수
|
``` |
|
def summarize(text, max_len):
    """Summarize ``text`` with the module-level ``model`` and ``tokenizer``.

    The text is tokenized, wrapped in ``<s>`` / ``</s>``, then padded or
    truncated to exactly 4096 token ids before beam-search generation.

    Args:
        text: Source text to summarize.
        max_len: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The first decoded sequence, with special tokens skipped and each
        double newline replaced by a single newline.
    """
    max_seq_len = 4096

    pieces = ['<s>'] + tokenizer.tokenize(text) + ['</s>']
    input_ids = tokenizer.convert_tokens_to_ids(pieces)

    missing = max_seq_len - len(input_ids)
    if missing > 0:
        # Too short: right-pad up to the fixed length.
        input_ids = input_ids + [tokenizer.pad_token_id] * missing
    else:
        # Full or too long: keep the head and force EOS as the last id.
        input_ids = input_ids[:max_seq_len - 1] + [tokenizer.eos_token_id]

    batch = torch.tensor([input_ids]).to(device)
    generated = model.generate(
        batch,
        max_length=max_len,
        num_beams=5,
        no_repeat_ngram_size=3,
        eos_token_id=tokenizer.eos_token_id,
        bad_words_ids=[[tokenizer.unk_token_id]],
    )

    decoded = tokenizer.batch_decode(generated.tolist(), skip_special_tokens=True)
    return decoded[0].replace("\n\n", "\n")
|
``` |