Spaces:
Sleeping
Sleeping
File size: 2,918 Bytes
e2d5ce2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import functools
import re
import string

import joblib
import nltk
from flask import Flask, request, render_template
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
# Fetch the NLTK 'stopwords' corpus when this module is imported;
# clean_string() reads the 'indonesian' list from it.
nltk.download('stopwords')
# Flask application object; the route below is registered on it.
app = Flask(__name__)
# Disabled setting, kept for reference:
# app.config['APPLICATION_ROOT'] = '/klasifikasi-berita'
# Load pre-trained model and vectorizer
def load_model():
    """Load the serialized classifier and text vectorizer from disk.

    Both files are read with ``joblib.load`` from paths relative to the
    current working directory, classifier first, vectorizer second.

    Returns:
        A ``(model, vectorizer)`` tuple, in that order.
    """
    # NOTE(review): joblib files are pickle-based; only load artifacts that
    # come from a trusted source.
    artifact_paths = (
        "logistic_regression_model.pkl",  # Example model file
        "content_vectorizer.pkl",  # Example vectorizer file
    )
    classifier, text_vectorizer = (joblib.load(path) for path in artifact_paths)
    return classifier, text_vectorizer
# Module-level globals: loaded a single time when the module is imported and
# then passed by home() to classify_news() on every request.
model, vectorizer = load_model()  # Load model and vectorizer once on startup
# Function to clean the input string
# Translation table that deletes every ASCII punctuation character; built once
# because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@functools.lru_cache(maxsize=1)
def _indonesian_stop_words():
    """Return NLTK's Indonesian stopwords as a frozenset, read only once."""
    # The corpus reader goes back to the stopword file on each call, so the
    # result is cached instead of being rebuilt for every request.
    return frozenset(stopwords.words('indonesian'))


def clean_string(text):
    """Normalize raw text before stemming and vectorizing.

    The steps run in this order:

    1. lowercase the text;
    2. turn line breaks into spaces;
    3. delete ASCII punctuation (characters are removed, not replaced, so
       ``"a,b"`` becomes ``"ab"``);
    4. delete runs of digits;
    5. collapse whitespace and trim both ends;
    6. replace runs of non-ASCII characters with a single space;
    7. drop words found in NLTK's Indonesian stopword list.

    Args:
        text: The raw input string.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    # NOTE(review): the steps presumably mirror the preprocessing used when
    # the vectorizer was fitted, so their order and semantics are preserved
    # exactly -- confirm before changing any of them.
    text = text.lower()  # Make text lowercase
    text = re.sub(r'\n', ' ', text)  # Remove line breaks
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    stop_words = _indonesian_stop_words()
    # split()/join also absorbs any spaces introduced by the previous step.
    return ' '.join(word for word in text.split() if word not in stop_words)
# Function to stem the input string using Sastrawi
@functools.lru_cache(maxsize=1)
def _get_stemmer():
    """Build the Sastrawi stemmer on first use and reuse it afterwards."""
    # NOTE(review): the instance is shared by every caller; confirm it is
    # safe for concurrent use if the app is served with multiple threads.
    return StemmerFactory().create_stemmer()


def sastrawi_stemmer(text):
    """Stem each whitespace-separated word of ``text`` with Sastrawi.

    Args:
        text: Text to stem; words are taken from ``text.split()``.

    Returns:
        The stemmed words joined by single spaces, in their original order.
    """
    stemmer = _get_stemmer()
    # Every word from text.split() is trivially a substring of text, so no
    # membership filter is needed; a progress bar is also omitted because
    # this runs inside a web request.
    return ' '.join(stemmer.stem(word) for word in text.split())
# Function to classify news article
def classify_news(text, model, vectorizer):
    """Preprocess ``text`` and classify it with the given model.

    The text goes through ``clean_string`` and then ``sastrawi_stemmer``; the
    result is transformed by ``vectorizer`` as a one-document batch and fed
    to ``model``.

    Args:
        text: Raw article text.
        model: Object providing ``predict`` and ``predict_proba``.
        vectorizer: Object providing ``transform``.

    Returns:
        A ``(label, probabilities)`` tuple: the first element of
        ``model.predict(...)`` and the first row of
        ``model.predict_proba(...)`` for the single input document.
    """
    preprocessed = sastrawi_stemmer(clean_string(text))
    # transform() receives a list so the single article is a batch of one.
    features = vectorizer.transform([preprocessed])
    labels = model.predict(features)
    class_probabilities = model.predict_proba(features)
    return labels[0], class_probabilities[0]
# Flask route for the main page
@app.route("/", methods=["GET", "POST"])
def home():
    """Serve the main page and classify submitted news text.

    On a POST, the ``news_text`` form field is read; if it contains anything
    other than whitespace it is classified with the module-level model and
    vectorizer. On a GET, or when the submitted text is blank, nothing is
    classified.

    Returns:
        The rendered ``index.html`` template, given ``category_name`` (a label
        string, or None when nothing was classified or the predicted code is
        not 0 or 1), ``probabilities`` (the value returned by
        ``classify_news``, or None) and ``user_input`` (the submitted text, or
        an empty string on GET).
    """
    # Predicted class code -> label shown to the user, checked in this order.
    code_labels = ((0, "Ekonomi"), (1, "Politik"))

    category_name = None
    probabilities = None
    user_input = request.form["news_text"] if request.method == "POST" else ""

    if user_input.strip():
        predicted, probabilities = classify_news(user_input, model, vectorizer)
        category_name = next(
            (label for code, label in code_labels if predicted == code),
            None,
        )

    return render_template(
        "index.html",
        category_name=category_name,
        probabilities=probabilities,
        user_input=user_input,
    )
# Run the Flask app
if __name__ == "__main__":
    # Local import: only the script entry point needs the environment lookup.
    import os

    # Flask's debug mode enables the Werkzeug interactive debugger, which lets
    # anyone who can reach the server execute arbitrary Python. It is therefore
    # opt-in: set FLASK_DEBUG=1 (or true/yes/on) for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(debug=debug_enabled)
|