Spaces:

S2gamzaS2
/

Twitter_nlp

Paused

File size: 4,598 Bytes

d37b5f8

# -*- coding: utf-8 -*-
"""utils(2).ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1snWVRieogxGIRp-UsTCZWjLM5ir5KQxB
"""

import re
import nltk
import torch
import numpy as np

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Fetch the NLTK stop-word corpus before it is read below.
nltk.download('stopwords')

# English stop words, with the negations 'no' and 'not' taken out of the list
# so that they survive stop-word removal in pre_data().
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

# Remaining NLTK resources needed by the tokenizing, POS-tagging and
# lemmatizing steps further down in this module.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
# NOTE(review): a *cased* checkpoint is loaded with do_lower_case=True --
# confirm this matches the way the model was fine-tuned.
tokenizer_B = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)

# Use the GPU when one is available; otherwise fall back to the CPU so that
# the later `.to(device)` calls do not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Look up a word's part of speech (POS) as a WordNet constant.
def get_wordnet_pos(word):
    """Map a word's POS tag to the WordNet constant ``lemmatize()`` accepts.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) is used for the lookup.

    Args:
        word: A single token to tag.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV`` for tags starting with J, N, V or R respectively;
        ``wordnet.NOUN`` for any other tag.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,   # adjective
                "N": wordnet.NOUN,  # noun
                "V": wordnet.VERB,  # verb
                "R": wordnet.ADV}   # adverb

    # Anything that is not J/N/V/R is lemmatized as a noun.
    return tag_dict.get(tag, wordnet.NOUN)

# Preprocessing function
def pre_data(data):
    """Clean one raw text string and return it as a space-joined token string.

    Steps, in order: lowercase and strip; tokenize with the module-level
    TweetTokenizer; drop every token containing '@'; replace every character
    that is not an English letter with a space; cut runs of three or more
    identical characters down to two; collapse whitespace; lemmatize each
    word using the POS from ``get_wordnet_pos``; remove words found in
    ``stopword_list``.

    Args:
        data: The raw text (must support ``.lower()``).

    Returns:
        The remaining words joined by single spaces, or the string ``'NC'``
        ("No Category") when no word is left.
    """
    # Lowercase, trim, and split with the tweet-aware tokenizer.
    tokens = tokenizer.tokenize(data.lower().strip())

    # Remove @mentions (any token containing '@') and rebuild the sentence.
    without_mentions = ' '.join(tok for tok in tokens if '@' not in tok)

    # Turn everything that is not an English letter into a space.
    letters_only = re.sub("[^a-zA-Z]", " ", without_mentions)

    # Allow at most two consecutive occurrences of the same character.
    squeezed = re.sub(r'(.)\1{2,}', r'\1\1', letters_only)
    # Collapse consecutive whitespace into a single space.
    squeezed = re.sub(r'\s+', ' ', squeezed)

    # Lemmatize each word with its POS.
    lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(squeezed)]

    # Remove stop words.
    kept = [w for w in lemmas if w not in stopword_list]

    # 'NC' = No Category: placeholder used when no meaningful word remains.
    if not kept:
        return 'NC'
    return ' '.join(kept)

# Convert input sentences into model-ready tensors
def convert_input_data(sentences, max_len=80):
    """Turn sentences into padded BERT token-id and attention-mask tensors.

    Args:
        sentences: Iterable of sentence strings. A single string is treated
            as one sentence instead of being iterated character by character.
        max_len: Fixed sequence length. Longer sequences are cut at the end
            and shorter ones are padded at the end with 0. Defaults to 80.

    Returns:
        A tuple ``(inputs, masks)``:
            inputs: long tensor of token ids, one row of ``max_len`` ids per
                sentence.
            masks: float tensor of the same shape, 1.0 where the id is
                greater than 0 and 0.0 on padding.
    """
    # NOTE(review): no [CLS]/[SEP] markers are added here; presumably the
    # caller already includes them in the text -- confirm.
    if isinstance(sentences, str):
        sentences = [sentences]

    # Split each sentence into tokens with the BERT tokenizer.
    tokenized_texts = [tokenizer_B.tokenize(sent) for sent in sentences]

    # Convert tokens to their integer vocabulary indices.
    input_ids = [tokenizer_B.convert_tokens_to_ids(x) for x in tokenized_texts]

    # Truncate to max_len and fill the missing positions with padding id 0.
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

    # Convert to a PyTorch tensor; the dtype is forced so the ids are 64-bit
    # regardless of what "long" means to NumPy on the current platform.
    inputs = torch.tensor(input_ids, dtype=torch.long)

    # Attention mask: 1 for real tokens, 0 for padding, so that BERT does not
    # attend to padded positions. Deriving it from the tensor keeps its shape
    # equal to `inputs` even when there are no sentences.
    masks = (inputs > 0).float()

    return inputs, masks

# Run sentences through the model
def test_sentences(sentences, load_model):
    """Run ``load_model`` on ``sentences`` and return its first output as NumPy.

    Args:
        sentences: Sentences to score; passed unchanged to
            ``convert_input_data``.
        load_model: Callable model invoked as
            ``load_model(input_ids, token_type_ids=None, attention_mask=...)``.

    Returns:
        ``outputs[0]`` of the model call (treated as the logits), detached,
        moved to the CPU and converted to a NumPy array.
    """
    # NOTE(review): the model is not switched to eval mode here; presumably
    # the caller has already called load_model.eval() -- confirm.

    # Convert the sentences into token-id and attention-mask tensors.
    inputs, masks = convert_input_data(sentences)

    # Put the inputs on the device the model's weights live on, so inputs and
    # weights always match; fall back to the module-level device when the
    # model exposes no parameters.
    try:
        target_device = next(load_model.parameters()).device
    except (AttributeError, StopIteration):
        target_device = device

    b_input_ids = inputs.to(target_device)
    b_input_mask = masks.to(target_device)

    # No gradient computation during inference.
    with torch.no_grad():
        # Forward pass
        outputs = load_model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask)

    # The first element of the output is taken as the logits (no labels are
    # passed to the model, so no loss is requested here).
    logits = outputs[0]

    # Move the result to the CPU and hand it back as a NumPy array.
    logits = logits.detach().cpu().numpy()

    return logits