# File size: 3,308 Bytes  (web-scrape artifact, not source — commented out: bare text is a SyntaxError)
# fb13f9f  (commit-hash artifact — commented out: an undefined bare name raises NameError at import)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import re
import string
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *

# Slang dictionary: kamus.txt is a tab-separated file with two columns,
# the slang form and its normalised replacement. Loaded once at import time
# and exposed as two parallel lists used by TextProcess/TextProcess2.
kamus = pd.read_csv('kamus.txt', sep="\t", header=None, names=['slang', 'fix'])
slang_list = list(kamus['slang'])
fix_list = list(kamus['fix'])

def TextProcess(text):
  """Clean a raw text string and return its unigrams, bigrams and trigrams.

  Pipeline: lowercase; strip mentions, hashtags, newlines, links, the
  retweet marker and non-alphabetic characters; tokenize; normalise slang
  via kamus.txt; then append 2-grams and 3-grams to the token list.

  Args:
    text: raw input string (presumably a scraped review/tweet — confirm).

  Returns:
    numpy.ndarray of str: tokens followed by their 2-grams and 3-grams.
  """
  # 1. Lowercase first so every later pattern can assume lower-case input.
  text = text.lower()

  # 2. Removing Mentions
  text = re.sub(r"@[A-Za-z0-9_]+", " ", text)

  # 3. Removing Hashtags
  text = re.sub(r"#[A-Za-z0-9_]+", " ", text)

  # 4. Removing newlines — both a real '\n' and the literal two-character
  # sequence "\n" (the original pattern r"\\n" only matched the latter).
  text = re.sub(r"\\n|\n", " ", text)

  # 5. Removing surrounding whitespace
  text = text.strip()

  # 6. Removing links; the dot in "www." is escaped (unescaped it matched
  # any character, e.g. "wwwx...").
  text = re.sub(r"http\S+", " ", text)
  text = re.sub(r"www\.\S+", " ", text)

  # 7. Keep only letters, whitespace and apostrophes (drops emojis, digits,
  # mathematical symbols, ...).
  text = re.sub(r"[^A-Za-z\s']", " ", text)

  # 8. Remove the retweet marker "rt" as a whole word only. The original
  # pattern "rt" also deleted the substring inside words ("start" -> "sta t").
  text = re.sub(r"\brt\b", " ", text)

  # 9. Removing punctuation (apostrophes survived step 7).
  text = text.translate(str.maketrans('', '', string.punctuation))

  # 10. Tokenization
  token = word_tokenize(text)

  # 11. Slang normalisation via O(1) dict lookup — the original scanned all
  # of slang_list for every token, and a replaced token could be matched
  # again by a later slang entry.
  slang_map = dict(zip(slang_list, fix_list))
  token = [slang_map.get(t, t) for t in token]

  # 12. Append bigrams and trigrams to the unigrams.
  from nltk.util import ngrams
  _2gram = [' '.join(e) for e in ngrams(token, 2)]
  _3gram = [' '.join(e) for e in ngrams(token, 3)]

  return np.array(token + _2gram + _3gram)

def TextProcess2(text):
  """Clean a raw text string and return one stemmed, slang-normalised string.

  Same cleaning pipeline as TextProcess, but instead of n-grams the tokens
  are Porter-stemmed and re-joined into a single space-separated string.

  Args:
    text: raw input string (presumably a scraped review/tweet — confirm).

  Returns:
    str: cleaned, slang-normalised, stemmed text.
  """
  # 1. Lowercase first so every later pattern can assume lower-case input.
  text = text.lower()

  # 2. Removing Mentions
  text = re.sub(r"@[A-Za-z0-9_]+", " ", text)

  # 3. Removing Hashtags
  text = re.sub(r"#[A-Za-z0-9_]+", " ", text)

  # 4. Removing newlines — both a real '\n' and the literal two-character
  # sequence "\n" (the original pattern r"\\n" only matched the latter).
  text = re.sub(r"\\n|\n", " ", text)

  # 5. Removing surrounding whitespace
  text = text.strip()

  # 6. Removing links; the dot in "www." is escaped (unescaped it matched
  # any character).
  text = re.sub(r"http\S+", " ", text)
  text = re.sub(r"www\.\S+", " ", text)

  # 7. Keep only letters, whitespace and apostrophes.
  text = re.sub(r"[^A-Za-z\s']", " ", text)

  # 8. Remove the retweet marker "rt" as a whole word only (the original
  # pattern "rt" also deleted the substring inside words).
  text = re.sub(r"\brt\b", " ", text)

  # 9. Tokenization
  tokens = word_tokenize(text)

  # 10. Slang normalisation via O(1) dict lookup instead of scanning all of
  # slang_list for every token.
  slang_map = dict(zip(slang_list, fix_list))
  tokens = [slang_map.get(t, t) for t in tokens]

  # 11. Stemming — per token. The original called stemmer.stem() on the
  # whole joined sentence, which treats the sentence as a single word and
  # only alters the suffix of the final token.
  stemmer = PorterStemmer()
  return ' '.join(stemmer.stem(w) for w in tokens)

def Label(num):
  """Return the Indonesian description for a topic-cluster id.

  Args:
    num: integer cluster id, expected in range 0-9.

  Returns:
    str: the topic description, or None for an unknown id. (The original
    if/elif chain left `topic` unbound on the else branch, so any id
    outside 0-9 raised UnboundLocalError.)
  """
  # Dict lookup replaces the 10-way if/elif chain; strings are unchanged.
  topics = {
    0: 'Baterai cepat habis',
    1: 'hp tidak berfungsi, tidak sesuai, tidak nyala',
    2: 'barang tidak sesuai deskripsi, hp mati',
    3: 'positif',
    4: 'barang tidak sesuai pesanan',
    5: 'barang rusak',
    6: 'barang tidak sesuai, suara tidak berfungsi',
    7: 'warna tidak sesuai, atau barang tidak sesuai gambar',
    8: 'barang tidak sesuai deskripsi, pengiriman lama',
    9: 'barang kosong, cancel, retur',
  }
  return topics.get(num)