from flask import Flask, request, render_template
import joblib
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

app = Flask(__name__)
# app.config['APPLICATION_ROOT'] = '/klasifikasi-berita'

# Load pre-trained model and vectorizer
def load_model():
    model = joblib.load("logistic_regression_model.pkl")  # Example model file
    vectorizer = joblib.load("content_vectorizer.pkl")  # Example vectorizer file
    return model, vectorizer

model, vectorizer = load_model()  # Load model and vectorizer once on startup

# Function to clean the input string
def clean_string(text):
    text = text.lower()  # Make text lowercase
    text = re.sub(r'\n', ' ', text)  # Remove line breaks
    translator = str.maketrans('', '', string.punctuation)  # Remove punctuation
    text = text.translate(translator)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    stop_words = set(stopwords.words('indonesian'))  # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text
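
# Illustrative example (assumes NLTK's Indonesian stopword list): the steps
# above lower-case the text, strip punctuation and digits, and drop stopwords,
# so clean_string("Harga BBM Naik 10%!") would typically return "harga bbm naik".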

# Function to stem the input string using Sastrawi
def sastrawi_stemmer(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # Stem each token individually; tqdm shows progress in the server console
    stemmed_text = ' '.join(stemmer.stem(word) for word in tqdm(text.split()))
    return stemmed_text
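
# Illustrative example: Sastrawi reduces inflected Indonesian words to their
# root form, so sastrawi_stemmer("perekonomian bertumbuh") would typically
# return "ekonomi tumbuh".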

# Function to classify news article
def classify_news(text, model, vectorizer):
    # Clean and preprocess the text
    cleaned_text = clean_string(text)
    stemmed_text = sastrawi_stemmer(cleaned_text)
    
    # Vectorize the text
    text_vectorized = vectorizer.transform([stemmed_text])
    
    # Get prediction and probabilities
    prediction = model.predict(text_vectorized)
    prediction_proba = model.predict_proba(text_vectorized)
    
    return prediction[0], prediction_proba[0]
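
# Illustrative usage (assumes a binary model with classes 0 and 1):
#   label, proba = classify_news("Harga BBM naik", model, vectorizer)
#   # label is 0 or 1; proba is an array like [p0, p1] with p0 + p1 == 1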

# Flask route for the main page
@app.route("/", methods=["GET", "POST"])
def home():
    category_name = None
    probabilities = None
    user_input = ""
    
    if request.method == "POST":
        user_input = request.form.get("news_text", "")
        if user_input.strip() != "":
            # Classify the text
            category, probabilities = classify_news(user_input, model, vectorizer)
            
            # Map category to string
            if category == 0:
                category_name = "Ekonomi"
            elif category == 1:
                category_name = "Politik"
    
    return render_template("index.html", category_name=category_name, probabilities=probabilities, user_input=user_input)

# Run the Flask app
if __name__ == "__main__":
    app.run(debug=True)
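
# Minimal manual check (sketch; assumes templates/index.html is present):
#   with app.test_client() as client:
#       resp = client.post("/", data={"news_text": "Harga BBM naik"})
#       print(resp.status_code)  # expect 200 and the rendered result page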