""" This module contains classes and methods for words distribution analysis """ import pandas as pd import numpy as np class WordsDistributionClass: ''' This class is for creating a dataframe with the frequencies of the words in the text column of the input file, in addition to the file's original columns. ''' def __init__(self,input_file_path,output_file_path,text_column='text'): self.input_file_path = input_file_path self.output_file_path = output_file_path self.text_column = text_column def initialize_data(self): # read dataframe from the input CSV file path self.df = pd.read_csv(self.input_file_path,encoding='cp1255') # add frequencies of the words in the text column as columns # for the dataframe which was previously read # Impl. Note: all_words is a dictionary for the words' frequencies # to be used during the calculation. It's a local variable. # for word in all_words.keys(): # all_words[word] == # videos which contain word # as part of the text in in the text column all_words = {} self.df['freq'] = self.df.apply(lambda x: WordsDistributionClass.get_words_freq_in_text(x[self.text_column],all_words),axis=1) for word in all_words.keys(): if all_words[word] >= 10: self.df['freq_'+word] = self.df.apply(lambda x: 0 if word not in x['freq'].keys() else x['freq'][word],axis=1) del all_words del self.df['freq'] def get_words_freq_in_text(text,all_words): # static public function freq = {} # our calcuation is not sensitive to CAPS-LOCK characters text = text.lower() # our calcuation is not sensitive to the characters: ";",",","." # NOTE: we are sensitive to other characters, including question marks # and '"', "'" etc. text = text.replace(";","") text = text.replace(",","") text = text.replace(".","") words = text.split(" ") # algorithm for assigning words distribution # for given all_words dictionary for word in words: if word not in all_words: all_words[word] = 0 if word not in freq.keys(): freq[word] = 1 all_words[word] += 1 else: freq[word] += 1 return freq def save_output(self): #export dataframe to output CSV file path self.df.to_csv(self.output_file_path,index=False) if __name__ == "__main__": # Arguments INPUT_FILE_NAME = 'tagging_MMD_db_with_face_sentiment_extracted.csv' OUTPUT_FILE_NAME = 'tagging_MMD_db_with_face_sentiment_extracted_and_words_distributions.csv' # Run WordsDistributionClass on the given input wdc = WordsDistributionClass(INPUT_FILE_NAME,OUTPUT_FILE_NAME) wdc.initialize_data() wdc.save_output()