# NOTE(review): removed web-scrape artifact lines ("Spaces:" / "Sleeping" /
# "Sleeping") that were not valid Python and did not belong to the source.
import pickle as pkl
import re
import string

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
def clean_text(text):
    """Normalize raw text for downstream tokenization.

    Lowercases the input, replaces line breaks with spaces, strips
    punctuation, digits and non-ASCII characters, then collapses runs of
    whitespace into single spaces.

    Args:
        text: Raw input string.

    Returns:
        Cleaned lowercase string with single internal spaces and no
        leading/trailing whitespace.
    """
    # make text lowercase
    text = text.lower()
    # replace line breaks with spaces so words on adjacent lines stay separated
    text = re.sub(r'\n', ' ', text)
    # remove punctuation in one C-level pass
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # replace runs of non-ASCII characters (accents, emoji, ...) with a space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # collapse whitespace LAST: the original collapsed before the non-ASCII
    # step, so spaces introduced by that step could survive as double spaces;
    # also trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def clean_stopword(tokens):
    """Filter Indonesian stopwords out of a token list.

    Args:
        tokens: List of token strings; original casing is preserved in the
            output (comparison is case-insensitive).

    Returns:
        List of tokens whose lowercase form is not an Indonesian stopword.

    Note:
        Requires the NLTK 'stopwords' corpus to be available
        (``nltk.download('stopwords')``).
    """
    # build the set once per call: O(1) membership test per token
    listStopword = set(stopwords.words('indonesian'))
    filtered_words = [word for word in tokens if word.lower() not in listStopword]
    return filtered_words
def preprocess_text(content):
    """Run the full text-preprocessing pipeline on one document.

    Pipeline: clean_text -> NLTK word_tokenize -> Indonesian stopword
    removal -> re-join tokens with single spaces.

    Args:
        content: Raw document text.

    Returns:
        Space-joined string of the cleaned, stopword-filtered tokens.

    Note:
        Requires the NLTK 'punkt' tokenizer data for ``word_tokenize``.
    """
    cleaned_text = clean_text(content)
    tokens = word_tokenize(cleaned_text)
    cleaned_stopword = clean_stopword(tokens)
    return ' '.join(cleaned_stopword)