ThanaphonJoe committed on
Commit
43452d4
1 Parent(s): 6200f99
Files changed (1) hide show
  1. app.py +26 -1
app.py CHANGED
@@ -7,6 +7,9 @@ from sklearn.model_selection import train_test_split
7
  from sklearn.metrics import confusion_matrix
8
  import matplotlib.pyplot as plt
9
  import re
 
 
 
10
 
11
 
12
  def deEmojify(text):
@@ -34,12 +37,32 @@ def deEmojify(text):
34
 
35
 
36
  def clean_me(data):
37
- data['clean_text'] = data.str.replace(r'<[^<>]*>', '', regex=True)
 
 
 
 
 
 
 
38
  data['clean2_text']= data['clean_text'].str.strip().str.lower().str.replace('\r+', ' ').str.replace('\n+',' ').str.replace('\t+',' ')
39
  data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
 
 
 
 
 
 
 
 
40
  return(data)
41
 
 
42
  def combine(a, b):
 
 
 
 
43
  return a + " " + b
44
 
45
 
@@ -79,5 +102,7 @@ with gr.Blocks() as demo:
79
  cache_examples=True,
80
  )
81
 
 
 
82
  if __name__ == "__main__":
83
  demo.launch()
 
7
  from sklearn.metrics import confusion_matrix
8
  import matplotlib.pyplot as plt
9
  import re
10
+ from pythainlp.util import normalize
11
+ from pythainlp.corpus import thai_stopwords
12
+ from pythainlp.tokenize import word_tokenize
13
 
14
 
15
  def deEmojify(text):
 
37
 
38
 
39
  def clean_me(data):
40
+ stopwords = list(thai_stopwords())
41
+ stopwords.append("nan")
42
+ stopwords.append("-")
43
+ stopwords.append("_")
44
+ stopwords.append("")
45
+ stopwords.append(" ")
46
+
47
+ data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
48
  data['clean2_text']= data['clean_text'].str.strip().str.lower().str.replace('\r+', ' ').str.replace('\n+',' ').str.replace('\t+',' ')
49
  data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
50
+ # Normalize text
51
+ data['clean4_text'] = data.apply(lambda row: normalize(row['clean3_text']), axis=1)
52
+ # Word segmentation: it will take a while....
53
+ data['wordseged_text'] = data.apply(lambda row: word_tokenize(row['clean4_text'], engine="newmm-safe"), axis=1)
54
+ # Join the wordsegged with space
55
+ data['wordseged_space_text'] = data.apply(lambda row: " ".join(row["wordseged_text"]), axis=1)
56
+
57
+
58
  return(data)
59
 
60
+
61
  def combine(a, b):
62
+ data = pd.DataFrame()
63
+ data['text'] = [a]
64
+ data = clean_me(data)
65
+ a = data['wordseged_space_text'][0] + '123'
66
  return a + " " + b
67
 
68
 
 
102
  cache_examples=True,
103
  )
104
 
105
+
106
+
107
  if __name__ == "__main__":
108
  demo.launch()