File size: 3,308 Bytes
fb13f9f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import re
import string
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
# Slang dictionary loaded once at import time: kamus.txt has two
# space-separated columns mapping a slang token to its normalized form.
# Assumes kamus.txt is in the current working directory -- TODO confirm
# the deployment path; pd.read_csv raises FileNotFoundError otherwise.
kamus = pd.read_csv('kamus.txt', sep=" ", header=None,names=['slang', 'fix'])
# Parallel lists (same index = same row) consumed by TextProcess and
# TextProcess2 for slang -> standard-word replacement.
slang_list = kamus['slang'].tolist()
fix_list = kamus['fix'].tolist()
def TextProcess(text):
    """Clean a raw review/tweet string and return unigram+bigram+trigram features.

    Pipeline: lowercase, strip mentions/hashtags/escaped newlines/links/RT
    markers/non-letter characters/punctuation, tokenize, normalize slang via
    the module-level kamus lists, then append 2-grams and 3-grams.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    numpy.ndarray
        1-D array of unigram, bigram and trigram strings.
    """
    from nltk.util import ngrams

    # 1. Lowercase first so every later pattern only needs lowercase forms.
    text = text.lower()
    # 2. Remove @mentions.
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
    # 3. Remove #hashtags.
    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)
    # 4. Remove literal "\n" sequences (escaped newlines in scraped text).
    text = re.sub(r"\\n", " ", text)
    # 5. Trim surrounding whitespace.
    text = text.strip()
    # 6. Remove links. Dot in "www." is escaped to match a literal dot
    #    (bug fix: the old pattern's bare "." matched any character).
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)
    # 7. Remove non-letter characters (emojis, digits, math symbols, ...).
    text = re.sub(r"[^A-Za-z\s']", " ", text)
    # 8. Remove the retweet marker "rt" as a whole word only.
    #    (Bug fix: the old pattern "rt" also mangled words containing
    #    "rt", e.g. "smart" -> "sma ".)
    text = re.sub(r"\brt\b", " ", text)
    # 9. Remove remaining punctuation in one C-level pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # 10. Tokenize.
    token = word_tokenize(text)
    # 11. Normalize slang with a dict lookup: O(1) per token instead of
    #     scanning the whole slang list for every token.
    slang_map = dict(zip(slang_list, fix_list))
    token = [slang_map.get(word, word) for word in token]
    # 12. Build bigram/trigram strings and concatenate with the unigrams.
    _2gram = [' '.join(e) for e in ngrams(token, 2)]
    _3gram = [' '.join(e) for e in ngrams(token, 3)]
    return np.array(token + _2gram + _3gram)
def TextProcess2(text):
    """Clean a raw review/tweet string and return one stemmed, space-joined string.

    Same cleaning pipeline as TextProcess (minus punctuation stripping and
    n-gram expansion): lowercase, strip mentions/hashtags/escaped
    newlines/links/RT markers/non-letter characters, tokenize, normalize
    slang via the module-level kamus lists, then Porter-stem each token.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Space-joined string of normalized, stemmed tokens.
    """
    # 1. Lowercase first so every later pattern only needs lowercase forms.
    text = text.lower()
    # 2. Remove @mentions.
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
    # 3. Remove #hashtags.
    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)
    # 4. Remove literal "\n" sequences (escaped newlines in scraped text).
    text = re.sub(r"\\n", " ", text)
    # 5. Trim surrounding whitespace.
    text = text.strip()
    # 6. Remove links. Dot in "www." is escaped to match a literal dot
    #    (bug fix: the old pattern's bare "." matched any character).
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)
    # 7. Remove non-letter characters (emojis, digits, math symbols, ...).
    text = re.sub(r"[^A-Za-z\s']", " ", text)
    # 8. Remove the retweet marker "rt" as a whole word only.
    #    (Bug fix: the old pattern "rt" also mangled words containing
    #    "rt", e.g. "smart" -> "sma ".)
    text = re.sub(r"\brt\b", " ", text)
    # 9. Tokenize.
    tokens = word_tokenize(text)
    # 10. Normalize slang with a dict lookup: O(1) per token instead of
    #     scanning the whole slang list for every token.
    slang_map = dict(zip(slang_list, fix_list))
    # (Stopword removal was intentionally disabled in the original code.)
    # 11. Stem each token individually and re-join.
    #     (Bug fix: the old code called stemmer.stem() on the entire joined
    #     sentence, which PorterStemmer treats as a single word, so only the
    #     final word's suffix was ever stemmed.)
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(slang_map.get(word, word)) for word in tokens)
def Label(num):
    """Return the Indonesian topic description for a topic-model index.

    Parameters
    ----------
    num : int
        Topic index in the range 0-9.

    Returns
    -------
    str
        Human-readable topic label.

    Raises
    ------
    ValueError
        If ``num`` is not a known topic index. (Bug fix: the old if/elif
        chain fell through its ``else: pass`` branch with ``topic`` unbound,
        so ``return topic`` raised a confusing UnboundLocalError.)
    """
    topics = {
        0: 'Baterai cepat habis',
        1: 'hp tidak berfungsi, tidak sesuai, tidak nyala',
        2: 'barang tidak sesuai deskripsi, hp mati',
        3: 'positif',
        4: 'barang tidak sesuai pesanan',
        5: 'barang rusak',
        6: 'barang tidak sesuai, suara tidak berfungsi',
        7: 'warna tidak sesuai, atau barang tidak sesuai gambar',
        8: 'barang tidak sesuai deskripsi, pengiriman lama',
        9: 'barang kosong, cancel, retur',
    }
    try:
        return topics[num]
    except KeyError:
        raise ValueError(f"unknown topic index: {num!r}") from None
|