# -*- coding: utf-8 -*-
"""utils(2).ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1snWVRieogxGIRp-UsTCZWjLM5ir5KQxB
"""
import re
import nltk
import torch
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
tokenizer_B = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
# Use the GPU when available; fall back to CPU so the module still imports without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Map a word to its part of speech (POS) using the wordnet module
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,   # adjective
                "N": wordnet.NOUN,  # noun
                "V": wordnet.VERB,  # verb
                "R": wordnet.ADV}   # adverb
    return tag_dict.get(tag, wordnet.NOUN)
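# Illustrative usage (not part of the original notebook); the tag comes from
# NLTK's averaged perceptron tagger downloaded above:
#   get_wordnet_pos("running")                                  -> wordnet.VERB
#   lemmatizer.lemmatize("running", get_wordnet_pos("running")) -> "run"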
# Preprocessing function
def pre_data(data):
    # Lowercase and strip surrounding whitespace
    df2 = data.lower().strip()
    # Tokenize with TweetTokenizer
    df_token = tokenizer.tokenize(df2)
    # Remove @mentions
    df_IDdel = []
    for word in df_token:
        if '@' not in word:
            df_IDdel.append(word)
    # Join the remaining tokens back into a sentence
    df_IDdel_sen = ' '.join(df_IDdel)
    # Replace non-alphabetic characters with spaces
    df_eng = re.sub("[^a-zA-Z]", " ", df_IDdel_sen)
    # Remove repeated characters (keep at most two in a row)
    df_rep_list = []
    for i, e in enumerate(df_eng):
        if i > 1 and e == df_eng[i - 2] and e == df_eng[i - 1]:
            df_rep_list.append('')
        else:
            df_rep_list.append(e)
    df_rep = ''.join(df_rep_list)
    # Collapse consecutive whitespace
    df_rep = re.sub(r'\s+', ' ', df_rep)
    # Lemmatize each token using its POS tag
    df_lemma = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(df_rep)]
    # Remove stopwords ('no' and 'not' were kept in the list above)
    df_clean = [w for w in df_lemma if w not in stopword_list]
    if len(df_clean) == 0:
        df_clean = 'NC'  # NC = "No Category": placeholder used when nothing survives cleaning, not a meaningful word
    else:
        df_clean = ' '.join(df_clean)
    return df_clean
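# Illustrative behaviour (not from the original notebook); exact results depend
# on the NLTK resources downloaded above:
#   pre_data("@user I loooove it!!!") -> "loove"  (mention, punctuation and stopwords
#                                                  removed, repeated letters capped at two)
#   pre_data("@user 123")             -> "NC"     (nothing survives cleaning)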
# Convert raw sentences into BERT model inputs
def convert_input_data(sentences):
    # Split each sentence into tokens with the BERT tokenizer
    tokenized_texts = [tokenizer_B.tokenize(sent) for sent in sentences]
    # Maximum sequence length of the input tokens
    MAX_LEN = 80
    # Convert tokens to their vocabulary indices
    input_ids = [tokenizer_B.convert_tokens_to_ids(x) for x in tokenized_texts]
    # Truncate each sentence to MAX_LEN and fill the remainder with 0 padding
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    # Build the attention masks
    attention_masks = []
    # Set the mask to 1 for real tokens and 0 for padding;
    # BERT skips attention over padded positions, which speeds up inference
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    # Convert the data to PyTorch tensors
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    return inputs, masks
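# Illustrative shapes (not from the original notebook): for a batch of two
# sentences, convert_input_data(["i love it", "no good"]) returns an input-id
# tensor of shape (2, 80) and an attention-mask tensor of the same shape.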
# Run the model on a batch of sentences and return the raw logits
def test_sentences(sentences, load_model):
    # Convert the sentences into model inputs
    inputs, masks = convert_input_data(sentences)
    # Move the data to the target device (GPU when available)
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
    # No gradient computation is needed for inference
    with torch.no_grad():
        # Forward pass
        outputs = load_model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask)
    # Take the logits from the model output
    logits = outputs[0]
    # Move the result back to the CPU as a NumPy array
    logits = logits.detach().cpu().numpy()
    return logits
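
# A minimal end-to-end sketch (not in the original notebook). It assumes a
# fine-tuned BertForSequenceClassification checkpoint saved at "./model",
# which is a hypothetical path; swap in the model actually used with these utils.
if __name__ == "__main__":
    from transformers import BertForSequenceClassification

    examples = ["I really loved this movie!!!", "@user this is not good at all"]
    cleaned = [pre_data(s) for s in examples]

    model = BertForSequenceClassification.from_pretrained("./model")  # hypothetical checkpoint path
    model.to(device)
    model.eval()

    logits = test_sentences(cleaned, model)
    # The highest logit per row is the predicted class index
    print(np.argmax(logits, axis=1))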