Spaces:
Sleeping
Sleeping
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from sklearn.preprocessing import LabelEncoder | |
import spacy | |
# Read the emotion label names, one per line, stripping surrounding whitespace.
with open('label_names.txt', 'r') as f:
    labels = [raw_line.strip() for raw_line in f]

# Fit the label encoder on the label names read above; it is used to turn
# label strings into integer ids.
encoder = LabelEncoder().fit(labels)

# spaCy pipeline shared by the text preprocessing helpers in this module.
nlp = spacy.load("en_core_web_sm")
def plot_pie_chart(data_frame: pd.DataFrame, title: str) -> None:
    """
    Draw a pie chart showing how often each label occurs in a DataFrame.

    Args:
        data_frame (pd.DataFrame): Data to visualize; the values of its
            'label' column are counted.
        title (str): Prefix of the chart title, rendered as
            "<title> Label Distribution".

    Returns:
        None
    """
    counts = data_frame['label'].value_counts()
    slice_names = counts.index

    plt.figure(figsize=(8, 8))
    # Apply the seaborn style once the figure exists, before drawing the wedges.
    sns.set_style("whitegrid")

    # One "hls" palette colour per distinct label.
    slice_colors = sns.color_palette("hls", len(slice_names))
    plt.pie(
        counts,
        labels=slice_names,
        colors=slice_colors,
        autopct='%1.1f%%',
        startangle=90,
    )
    plt.title(f"{title} Label Distribution")
    plt.show()
    plt.close()
def preprocess_text(df: pd.DataFrame, emotions: "list | None" = None) -> pd.DataFrame:
    """
    Preprocess the text data in a DataFrame.

    Rows whose label is one of ``emotions`` are removed, the remaining text is
    lemmatized with ``preprocess_single_sentence``, and the labels are encoded
    with the module-level ``encoder``. The input DataFrame is never modified.

    Args:
        df (pd.DataFrame): DataFrame containing 'text' and 'label' columns.
        emotions (list | None): Labels whose rows are dropped from the
            DataFrame. Defaults to ['love', 'surprise'] when None.

    Returns:
        pd.DataFrame: A new DataFrame in which 'text' and 'label' are replaced
        by 'processed_text' (lemmatized text) and 'label_num' (encoded label).
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if emotions is None:
        emotions = ['love', 'surprise']

    # Filter with a single mask and take an explicit copy, so the column
    # assignments below write to an independent frame instead of a slice of
    # the caller's data (which raises SettingWithCopyWarning).
    kept = df[~df['label'].isin(list(emotions))].copy()

    # Reuse the single-sentence helper so both code paths lemmatize identically.
    kept['processed_text'] = kept['text'].apply(preprocess_single_sentence)
    kept['label_num'] = encoder.transform(kept['label'])

    # Return a new frame rather than dropping in place.
    return kept.drop(columns=['text', 'label'])
def preprocess_single_sentence(sentence):
    """
    Lemmatize one sentence and strip out uninformative tokens.

    Args:
        sentence (str): Input sentence.

    Returns:
        str: The lemmas of the remaining tokens joined by single spaces;
        stop words, punctuation and whitespace tokens are left out.
    """
    kept_lemmas = []
    for tok in nlp(sentence):
        # Skip tokens that spaCy flags as stop word, punctuation or whitespace.
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)