import functools
import pickle as pkl
import re
import string

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize raw text ahead of tokenization.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit
    runs, replace each run of non-ASCII characters with a space, then
    collapse every whitespace run (including line breaks) into a single
    space.

    Leading/trailing whitespace is collapsed to one space, not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Remove punctuation.
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove numbers.
    text = _DIGITS_RE.sub('', text)
    # Replace non-ASCII characters with a space.
    text = _NON_ASCII_RE.sub(' ', text)
    # Collapse whitespace last: the removals above can leave adjacent
    # spaces behind, and this also turns line breaks into single spaces.
    text = _WHITESPACE_RE.sub(' ', text)
    return text


@functools.lru_cache(maxsize=1)
def _indonesian_stopwords():
    """Return the NLTK Indonesian stopword list as a frozenset, loaded once."""
    return frozenset(stopwords.words('indonesian'))


def clean_stopword(tokens):
    """Drop Indonesian stopwords from a sequence of tokens.

    The comparison is case-insensitive; tokens that are kept retain their
    original casing and order.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list of the tokens that are not Indonesian stopwords.
    """
    stopword_set = _indonesian_stopwords()
    return [word for word in tokens if word.lower() not in stopword_set]


def preprocess_text(content):
    """Clean, tokenize and stopword-filter a document.

    Args:
        content: The raw document text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    cleaned_text = clean_text(content)
    tokens = word_tokenize(cleaned_text)
    return ' '.join(clean_stopword(tokens))