ringkasan-berita / preprocessing.py
wchynto's picture
init
9afb8cd
import pickle as pkl
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string
import pickle as pkl
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def clean_text(text):
    """Normalize raw text to lowercase ASCII words separated by single spaces.

    The steps run in this order: lowercase, turn line breaks into spaces,
    delete ASCII punctuation, delete digit runs, replace runs of non-ASCII
    characters with a space, and finally collapse whitespace runs.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Leading or trailing whitespace is reduced to at
        most one space; it is not stripped.
    """
    # make text lowercase
    text = text.lower()
    # remove line breaks
    text = re.sub(r'\n', ' ', text)
    # remove punctuation (characters are deleted, not replaced by a space,
    # so "a,b" becomes "ab")
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # replace non-ascii characters with a space; this must happen before the
    # whitespace collapse below, otherwise the inserted spaces would leave
    # runs of consecutive spaces in the result
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    return text
# Lazily-populated cache of the Indonesian stopword set; filled on first use so
# importing this module does not require the NLTK corpus to be available.
_INDONESIAN_STOPWORDS = None


def _indonesian_stopwords():
    """Return the Indonesian stopword set, loading it from NLTK only once."""
    global _INDONESIAN_STOPWORDS
    if _INDONESIAN_STOPWORDS is None:
        _INDONESIAN_STOPWORDS = frozenset(stopwords.words('indonesian'))
    return _INDONESIAN_STOPWORDS


def clean_stopword(tokens):
    """Drop Indonesian stopwords from a sequence of tokens.

    Each token is lowercased for the comparison only; tokens that are kept
    are returned unchanged and in their original order.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A list of the tokens whose lowercase form is not an Indonesian
        stopword.
    """
    # Reuse the cached set instead of re-reading the corpus on every call.
    stopword_set = _indonesian_stopwords()
    return [word for word in tokens if word.lower() not in stopword_set]
def preprocess_text(content):
    """Clean a document, tokenize it, and drop stopwords.

    Args:
        content: The raw document text.

    Returns:
        A single string with the remaining tokens joined by one space.
    """
    # Normalize first so the tokenizer only sees the cleaned text.
    words = word_tokenize(clean_text(content))
    # Filter stopwords and stitch the survivors back into one string.
    return ' '.join(clean_stopword(words))