import re
import string

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import ConfusionMatrixDisplay

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
def download_if_non_existent(res_path, res_name):
    """Download an NLTK resource only if it is not already available locally."""
    try:
        nltk.data.find(res_path)
    except LookupError:
        print(f'resource {res_path} not found. Downloading now...')
        nltk.download(res_name)


download_if_non_existent('corpora/stopwords', 'stopwords')
download_if_non_existent('tokenizers/punkt', 'punkt')  # needed by word_tokenize
download_if_non_existent('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger')
download_if_non_existent('corpora/wordnet', 'wordnet')  # needed by WordNetLemmatizer
def fit_model(pipeline, x_train, y_train, x_test, y_test):
    """Fit the pipeline on the training data and return a row-normalized
    confusion-matrix plot for the test data."""
    pipeline.fit(x_train, y_train)
    return ConfusionMatrixDisplay.from_estimator(pipeline, x_test, y_test, normalize="true")
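
# A minimal usage sketch for fit_model (not part of the original module): it
# expects a scikit-learn Pipeline; the vectorizer/classifier chosen below is
# an assumption for illustration only.
#
#     from sklearn.pipeline import Pipeline
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from sklearn.linear_model import LogisticRegression
#
#     clf = Pipeline([
#         ("tfidf", TfidfVectorizer()),
#         ("model", LogisticRegression(max_iter=1000)),
#     ])
#     fit_model(clf, X_train, y_train, X_test, y_test)  # returns a ConfusionMatrixDisplay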
class LinguisticPreprocessor(TransformerMixin, BaseEstimator):
    """Scikit-learn compatible text cleaner: strips HTML tags, punctuation and
    extra whitespace, lemmatizes, and removes English stopwords."""

    def __init__(self):
        super().__init__()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        X = self._remove_html_tags(X)
        X = self._remove_all_punctuations(X)
        X = self._remove_double_spaces(X)
        X = self._lemmatize(X)
        X = self._remove_stopwords(X)
        return X
    def _remove_html_tags(self, X):
        # Parse each document and keep only the visible text.
        return list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text(), X))

    def _remove_all_punctuations(self, X):
        # Delete every character found in string.punctuation.
        return list(map(lambda text: re.sub('[%s]' % re.escape(string.punctuation), '', text), X))

    def _remove_double_spaces(self, X):
        # Collapse runs of spaces left behind by the previous steps.
        return list(map(lambda text: re.sub(" +", " ", text), X))

    def _remove_stopwords(self, X):
        return list(map(
            lambda text: " ".join(
                word for word in text.split() if word not in self.stop_words
            ),
            X
        ))

    def _lemmatize(self, X):
        return list(map(self._lemmatize_one_sentence, X))

    def _lemmatize_one_sentence(self, sentence):
        # Tokenize, lemmatize each token, then re-join into a single string.
        tokens = word_tokenize(sentence)
        lemmas = [self.lemmatizer.lemmatize(word) for word in tokens]
        return " ".join(lemmas)
def training_data(dataset_1, dataset_2, dataset_3):
    """Build train/test splits from three datasets: combine the three train
    splits, drop duplicate texts, and remove any training example whose text
    also appears in dataset_1's test split (to avoid train/test leakage)."""
    X_test = dataset_1['test']['text']
    y_test = dataset_1['test']['label']
    test_df = pd.DataFrame({
        'text': X_test,
        'label': y_test
    })
    combined_train_df = pd.DataFrame({
        'text': dataset_1['train']['text'] + dataset_2['train']['text'] + dataset_3['train']['text'],
        'label': dataset_1['train']['label'] + dataset_2['train']['label'] + dataset_3['train']['label']
    })
    combined_train_df.drop_duplicates(subset=['text'], inplace=True)
    # Left-join against the test set; rows found only on the left side are leakage-free.
    merged_df = pd.merge(combined_train_df, test_df, on='text', how='left', indicator=True)
    result_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
    X_train = result_df['text'].tolist()
    y_train = result_df['label_x'].tolist()  # 'label_x' is the train-side label after the merge
    X_test = np.array(X_test)
    X_train = np.array(X_train)
    return X_train, y_train, X_test, y_test
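
# End-to-end sketch (assumptions: the three inputs are Hugging Face datasets
# with 'train'/'test' splits whose columns index as plain lists; the dataset
# names and classifier below are illustrative, not from the original code):
#
#     from datasets import load_dataset
#     from sklearn.pipeline import Pipeline
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from sklearn.linear_model import LogisticRegression
#
#     ds1 = load_dataset("imdb")             # hypothetical choices; column
#     ds2 = load_dataset("rotten_tomatoes")  # names must match 'text'/'label'
#     ds3 = load_dataset("sst2")
#
#     X_train, y_train, X_test, y_test = training_data(ds1, ds2, ds3)
#     pipe = Pipeline([
#         ("preprocess", LinguisticPreprocessor()),
#         ("tfidf", TfidfVectorizer()),
#         ("model", LogisticRegression(max_iter=1000)),
#     ])
#     fit_model(pipe, X_train, y_train, X_test, y_test)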