kadabengaran committed on
Commit
e186fb5
·
0 Parent(s):

initial commit

Browse files
Files changed (7) hide show
  1. .gitattributes +34 -0
  2. .gitignore +2 -0
  3. README.md +12 -0
  4. app/main.py +139 -0
  5. app/model.py +88 -0
  6. howto.txt +7 -0
  7. requirements.txt +6 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /venv
2
+ /app/__pycache__
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Useful Review Classification
3
+ emoji: 🔥
4
+ colorFrom: green
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.21.0
8
+ app_file: app/main.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app/main.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import re
3
+ import streamlit as st
4
+ from transformers import BertTokenizer, BertModel
5
+ from model import IndoBERTBiLSTM, IndoBERTModel
6
+
7
# Configuration
MAX_SEQ_LEN = 128  # token length every encoded review is padded to
bert_path = 'indolem/indobert-base-uncased'  # base checkpoint for tokenizer and encoder

# Hub repo ids of the fine-tuned classifiers (IndoBERT first, IndoBERT-BiLSTM second).
MODELS_PATH = [
    "kadabengaran/IndoBERT-Useful-App-Review",
    "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review",
]

# Hyper-parameters handed to the classifier constructors.
HIDDEN_DIM = 768
OUTPUT_DIM = 2  # 2 if binary
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
19
+
20
+ # Get the Keys
21
def get_key(val, my_dict):
    """Return the first key of ``my_dict`` whose value equals ``val``.

    Keys are examined in the dictionary's iteration order. ``None`` is
    returned when no value matches.
    """
    return next((key for key, value in my_dict.items() if val == value), None)
25
+
26
+
27
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
32
+
33
+
34
def load_tokenizer(model_path):
    """Build and return a ``BertTokenizer`` from the checkpoint at ``model_path``."""
    return BertTokenizer.from_pretrained(model_path)
37
+
38
+
39
def remove_special_characters(text):
    """Normalize raw review text before tokenization.

    Removes every character that is not an ASCII letter, digit or whitespace,
    replaces each digit with a space, collapses runs of whitespace into a
    single space and lowercases the result.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    # Drop punctuation and any other non-alphanumeric, non-whitespace character.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Replace digits with spaces so words joined by numbers stay separated.
    text = re.sub(r'[0-9]', ' ', text)

    # Collapse whitespace last: the digit replacement above can create new
    # runs of spaces, which an earlier collapse would leave in the output.
    text = re.sub(r"\s+", " ", text)

    return text.lower()
51
+
52
+
53
def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
    """Encode ``text`` into fixed-length PyTorch tensors.

    Args:
        text: The (already cleaned) string to encode.
        tokenizer: A tokenizer exposing ``encode_plus``.
        max_seq: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns; requested with attention
        mask and ``'pt'`` tensors.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_seq,
        # Explicit replacement for the deprecated ``pad_to_max_length=True``.
        padding='max_length',
        # Request truncation explicitly instead of relying on the tokenizer's
        # implicit fallback when only ``max_length`` is given.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
59
+
60
def load_model():
    """Load both fine-tuned classifiers on top of the base encoder.

    Each checkpoint is loaded into the architecture its repository name
    refers to: ``MODELS_PATH[0]`` into ``IndoBERTModel`` and
    ``MODELS_PATH[1]`` into ``IndoBERTBiLSTM``. Loading them the other way
    round leaves the head layers without matching weights.
    NOTE(review): the pairing is inferred from the repository names -
    confirm against the published checkpoints.

    Returns:
        A ``(model1, model2)`` tuple: the plain IndoBERT classifier followed
        by the IndoBERT-BiLSTM classifier, matching the order in which
        ``main`` maps the "IndoBERT" and "IndoBERT-BiLSTM" choices.
    """
    bert = BertModel.from_pretrained(bert_path)

    # Plain IndoBERT classifier.
    model1 = IndoBERTModel.from_pretrained(MODELS_PATH[0],
                                           bert,
                                           OUTPUT_DIM)
    # IndoBERT encoder followed by the BiLSTM head.
    model2 = IndoBERTBiLSTM.from_pretrained(MODELS_PATH[1],
                                            bert,
                                            HIDDEN_DIM,
                                            OUTPUT_DIM,
                                            N_LAYERS, BIDIRECTIONAL,
                                            DROPOUT)
    return model1, model2
74
+
75
+
76
def predict(text, model, tokenizer, device):
    """Classify a single review and return the predicted class index.

    Args:
        text: Raw review text; it is cleaned with
            ``remove_special_characters`` before encoding.
        model: Callable module taking ``(input_ids, attention_mask)`` and
            returning logits.
        tokenizer: Tokenizer passed on to ``preprocess``.
        device: ``torch.device`` on which inference runs.

    Returns:
        The index of the highest-scoring class as a Python ``int``.
    """
    # Move the model to the target device and make sure dropout is disabled,
    # regardless of how the caller obtained the model.
    model.to(device)
    model.eval()

    # Clean and encode the sentence into token ids and an attention mask.
    cleaned = remove_special_characters(text)
    encoding = preprocess(cleaned, tokenizer)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Forward pass without gradient tracking; only the logits are needed.
    with torch.no_grad():
        logits = model(input_ids, attention_mask)

    return torch.argmax(logits, dim=-1).item()
104
+
105
@st.cache_resource
def _load_resources():
    """Load tokenizer, device and both classifiers once and reuse them across reruns."""
    tokenizer = load_tokenizer(bert_path)
    device = get_device()
    model1, model2 = load_model()
    return tokenizer, device, model1, model2


def main():
    """Render the Streamlit UI for the app review classifier.

    Shows a text area and a model selector; when "Classify" is pressed the
    text is run through the selected model and the predicted label is shown.
    """
    html_temp = """
    <div style="background-color:blue;padding:10px">
    <h1 style="color:white;text-align:center;">Klasifikasi Ulasan Aplikasi yang Berguna</h1>
    </div>

    """
    st.markdown(html_temp, unsafe_allow_html=True)

    input_text = st.text_area("Enter Text Here", placeholder="Type Here")
    all_ml_models = ["IndoBERT", "IndoBERT-BiLSTM"]
    model_choice = st.selectbox("Select Model", all_ml_models)

    # Streamlit re-executes this script on every interaction; the cached
    # loader keeps the tokenizer and models from being reloaded each time.
    tokenizer, device, model1, model2 = _load_resources()
    models_by_name = {"IndoBERT": model1, "IndoBERT-BiLSTM": model2}

    prediction_labels = {'Not Useful': 0, 'Useful': 1}
    if st.button("Classify"):
        # Nothing meaningful can be predicted from blank input.
        if not input_text.strip():
            st.warning("Please enter some text to classify.")
            return
        st.text("Original Text:\n{}".format(input_text))
        prediction = predict(input_text, models_by_name[model_choice],
                             tokenizer, device)
        final_result = get_key(prediction, prediction_labels)
        st.success("Review Categorized as:: {}".format(final_result))
136
+ # st.sidebar.subheader("About")
137
+
138
# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
app/model.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from transformers import PreTrainedModel, BertConfig
3
+
4
+ USE_CUDA = False
5
+
6
class IndoBERTBiLSTM(PreTrainedModel):
    """BERT encoder followed by an LSTM, average pooling and a two-layer head.

    The encoder's token-level hidden states are fed through an LSTM, averaged
    over the sequence axis, passed through a ReLU-activated linear layer and
    finally projected to ``num_classes`` logits.
    """

    config_class = BertConfig

    def __init__(self, bert_config, bert_pretrained_path, hidden_dim, num_classes, n_layers, bidirectional, dropout):
        """Build the model.

        Args:
            bert_config: Configuration forwarded to ``PreTrainedModel``.
            bert_pretrained_path: Despite the name, an already instantiated
                encoder module; its ``config.hidden_size`` is read and it is
                called in ``forward``.
            hidden_dim: Hidden size of the LSTM (per direction).
            num_classes: Number of output logits.
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM is bidirectional.
            dropout: Dropout probability applied before and after the LSTM.
        """
        super().__init__(bert_config)
        self.output_dim = num_classes
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional

        self.bert = bert_pretrained_path
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
                            hidden_size=hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.global_pooling = nn.AdaptiveAvgPool1d(1)
        # A bidirectional LSTM concatenates both directions' features.
        lstm_out_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.hidden_layer = nn.Linear(lstm_out_dim, lstm_out_dim)
        self.output_layer = nn.Linear(lstm_out_dim, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch.

        Args:
            input_ids: Token id tensor whose first dimension is the batch.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            Logits with the batch dimension preserved, including for a batch
            of one.
        """
        hidden = self.init_hidden(input_ids.shape[0])
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Dropout on the token-level encoder output, then the LSTM.
        sequence_output = self.dropout(output.last_hidden_state)
        lstm_output, _ = self.lstm(sequence_output, hidden)
        lstm_output = self.dropout(lstm_output)

        # Move the sequence axis last so the pooling layer averages over it.
        lstm_output = lstm_output.permute(0, 2, 1)
        # Squeeze only the pooled axis; a bare ``squeeze()`` would also drop
        # the batch axis when the batch holds a single example.
        pooled_output = self.global_pooling(lstm_output).squeeze(-1)

        hidden_layer_output = self.relu(self.hidden_layer(pooled_output))
        return self.output_layer(hidden_layer_output)

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h0, c0)`` LSTM states for ``batch_size`` sequences.

        The states are created from one of the model's own parameters, so
        they are allocated on that parameter's device without needing a
        separate CUDA flag.
        """
        weight = next(self.parameters())
        num_directions = 2 if self.bidirectional else 1
        shape = (self.n_layers * num_directions, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape).float(),
                weight.new_zeros(shape).float())
73
+
74
+
75
class IndoBERTModel(PreTrainedModel):
    """BERT encoder with dropout and a single linear classification layer."""

    config_class = BertConfig

    def __init__(self, bert_config, bert_pretrained, num_classes):
        """Wrap the given encoder module and attach the classification head.

        Args:
            bert_config: Configuration forwarded to ``PreTrainedModel``.
            bert_pretrained: Encoder module; its ``config.hidden_size`` sets
                the input width of the linear layer.
            num_classes: Number of output logits.
        """
        super().__init__(bert_config)
        self.bert = bert_pretrained
        encoder_width = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(encoder_width, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))
howto.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ python -m venv --system-site-packages .\venv
2
+
3
+ .\venv\Scripts\activate
4
+
5
+ pip install -r requirements.txt
6
+
7
+ streamlit run app/main.py
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ torch
3
+ torchvision
4
+ transformers
5
+ tokenizers
6
+ pickleshare