Spaces:

wchynto
/

ringkasan-berita

Sleeping

ringkasan-berita / preprocessing.py

init

9afb8cd 3 months ago

1.17 kB

	import pickle as pkl

	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	import re
	import string

	import pickle as pkl

	import pandas as pd
	import networkx as nx
	import matplotlib.pyplot as plt
	from tqdm import tqdm

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	def clean_text(text):
	    """Normalize raw text before tokenization.

	    The text is lowercased, line breaks become spaces, ASCII punctuation
	    (``string.punctuation``) and digit runs are deleted, runs of non-ASCII
	    characters are replaced by a space, and finally every run of whitespace
	    is collapsed to a single space.

	    Leading/trailing whitespace is not stripped; at most one space can
	    remain at either end.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string.
	    """
	    # make text lowercase
	    text = text.lower()

	    # remove line breaks
	    text = re.sub(r'\n', ' ', text)

	    # remove punctuation
	    translator = str.maketrans('', '', string.punctuation)
	    text = text.translate(translator)

	    # remove numbers
	    text = re.sub(r'\d+', '', text)

	    # remove non-ascii characters
	    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

	    # Collapse whitespace last: the removals above (digits, non-ASCII)
	    # can leave or introduce adjacent spaces.
	    text = re.sub(r'\s+', ' ', text)

	    return text

	# Lazily populated cache of the Indonesian stopword list, so the corpus is
	# read once instead of on every clean_stopword() call.
	_INDONESIAN_STOPWORDS = None


	def _get_indonesian_stopwords():
	    """Return the Indonesian stopwords as a frozenset, loading them once."""
	    global _INDONESIAN_STOPWORDS
	    if _INDONESIAN_STOPWORDS is None:
	        # If loading raises, the cache stays None and the next call retries.
	        _INDONESIAN_STOPWORDS = frozenset(stopwords.words('indonesian'))
	    return _INDONESIAN_STOPWORDS


	def clean_stopword(tokens):
	    """Drop Indonesian stopwords from a sequence of tokens.

	    Matching is case-insensitive (each token is lowercased for the lookup),
	    but kept tokens are returned with their original casing and order.

	    Args:
	        tokens: Iterable of string tokens.

	    Returns:
	        A list of the tokens that are not Indonesian stopwords.
	    """
	    stopword_set = _get_indonesian_stopwords()
	    return [word for word in tokens if word.lower() not in stopword_set]

	def preprocess_text(content):
	    """Clean, tokenize and stopword-filter a piece of text.

	    The input is passed through ``clean_text``, split with
	    ``word_tokenize``, filtered with ``clean_stopword``, and the surviving
	    tokens are joined back together with single spaces.

	    Args:
	        content: The raw text to preprocess.

	    Returns:
	        A single space-separated string of the remaining tokens.
	    """
	    kept_tokens = clean_stopword(word_tokenize(clean_text(content)))
	    return ' '.join(kept_tokens)