{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":[],"machine_shape":"hm"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"gpuClass":"standard"},"cells":[{"cell_type":"markdown","source":["# Imports and Setup"],"metadata":{"id":"6GbZy4iAXEVe"}},{"cell_type":"code","source":["import pandas as pd\n","pd.set_option('display.max_colwidth', 150)\n","import numpy as np\n","import os\n","from datetime import datetime as dt\n","from IPython.core.interactiveshell import InteractiveShell\n","InteractiveShell.ast_node_interactivity = \"all\"  # allow multiple outputs in a cell\n","import warnings\n","warnings.filterwarnings(\"ignore\")\n","import pickle"],"metadata":{"id":"5tPVn0h1R7cD"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Loading NLTK modules\n","import nltk\n","nltk.download('stopwords')\n","nltk.download('punkt')\n","nltk.download('wordnet')\n","nltk.download('omw-1.4')\n","nltk.download('averaged_perceptron_tagger')\n","from nltk.stem import WordNetLemmatizer\n","from nltk.corpus import stopwords"],"metadata":{"id":"yf7zlt-enNO2"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["\n","# Loading the raw news article data\n","\n","This section downloads the raw data used in the GLG project from an external source. If you are running the notebook locally, do not run the download cell below; instead, define the path to your local copy of the data when creating the cleaner object."],"metadata":{"id":"8vqW-z5cazlr"}},{"cell_type":"code","source":["def download_dataset():\n","    '''Download and unpack the all-the-news-2-1 dataset into /content/data (Colab only).'''\n","\n","    if not os.path.isfile('all-the-news-2-1.zip?dl=0'):\n","        # Download the all-the-news-2-1 dataset archive\n","        !wget https://www.dropbox.com/s/cn2utnr5ipathhh/all-the-news-2-1.zip?dl=0\n","\n","    if not os.path.exists(\"/content/data\"):\n","        # Make a data folder to store the data\n","        !mkdir data\n","\n","    !unzip /content/all-the-news-2-1.zip?dl=0\n","    !mv /content/all-the-news-2-1.csv ./data\n","\n","    !rm /content/all-the-news-2-1.zip?dl=0\n","\n","download_dataset()"],"metadata":{"id":"5AaLSoi7R7ew"},"execution_count":null,"outputs":[]},
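{"cell_type":"markdown","source":["A quick, optional sanity check before running the full cleaning pipeline: the cell below reads only the first few rows of the raw CSV to confirm the download succeeded and that the columns the cleaner relies on (`section`, `article`, `date`) are present. The path assumes the Colab download location used above; adjust it if you are working locally."],"metadata":{"id":"sanity-check-md"}},{"cell_type":"code","source":["# Optional sanity check: peek at a few rows without loading the full CSV,\n","# which is large. The path assumes the Colab download location above.\n","raw_peek = pd.read_csv('/content/data/all-the-news-2-1.csv', nrows=5)\n","raw_peek.columns.tolist()\n","raw_peek[['date', 'section', 'article']].head()"],"metadata":{"id":"sanity-check-code"},"execution_count":null,"outputs":[]},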
{"cell_type":"code","source":["class newsArticleDataCleaner:\n","\n","    '''\n","    This class can be used online (in Colab) or offline (locally):\n","\n","    1. Online:\n","        If using this class in Colab and downloading the data from the external source\n","        with the code in this notebook, only run the download_dataset function above\n","        and create the object without arguments.\n","    2. Offline:\n","        If using this class to process news article data available in a local directory,\n","        the \"data_path\" parameter should be defined, where \"data_path\" is the path to\n","        the 'all-the-news-2-1.csv' file.\n","\n","    Parameters:\n","    -----------\n","\n","    data_path: str\n","        the path to the 'all-the-news-2-1.csv' data if it was downloaded from GDrive or another location.\n","    '''\n","\n","    def __init__(self, data_path=None):\n","\n","        self.data_path = data_path\n","        self.path = self.get_file_path()\n","\n","    def get_file_path(self):\n","        '''\n","        Get the file path of the news article data.\n","        '''\n","\n","        if self.data_path is None:\n","            file_path = '/content/data/all-the-news-2-1.csv'\n","        else:\n","            file_path = self.data_path\n","\n","        return file_path\n","\n","    def filter_section(self, section):\n","        '''Map a raw section name to 'technology', 'health' or 'other'.'''\n","\n","        if str(section).lower().startswith('tech'):\n","            return 'technology'\n","        elif str(section).lower().startswith('health'):\n","            return 'health'\n","\n","        return 'other'\n","\n","    def extract_health_tech_data(self):\n","\n","        \"\"\"\n","        Read the dataframe, then:\n","        1. Add an extra column 'tech_health_tag' in order to identify tech and health documents based on the section column\n","        2. Keep only the news articles from the technology and health sections\n","        \"\"\"\n","\n","        data = pd.read_csv(self.path, encoding=\"utf-8\")\n","\n","        # Add the tech_health_tag column, which identifies whether a document belongs\n","        # to the health or the technology section\n","        data['tech_health_tag'] = data['section'].apply(self.filter_section)\n","        # Keep only the news articles from the technology and health sections\n","        data_tech_health = data[(data['tech_health_tag']=='technology') | (data['tech_health_tag']=='health')]\n","        self.data_tech_health = data_tech_health\n","\n","        return data_tech_health\n","\n","    def clean(self):\n","\n","        \"\"\"\n","        1. Call the extract_health_tech_data() function\n","        2. Filter health and tech data based on document word length\n","        3. Delete rows with a null value in the article column\n","        4. Remove columns with more than 20% null values\n","        5. Reset the index\n","        6. Make all column names lower case\n","        7. Draw a balanced sample of 5,500 articles per section\n","        \"\"\"\n","\n","        data = self.extract_health_tech_data()\n","        data['article_word_len'] = data['article'].apply(lambda x: len(str(x).split()))\n","        # From the data exploration steps we know 95% of our article data is under 1340 words\n","        data = data[(data['article_word_len']>=50) & (data['article_word_len']<=1340)]\n","        # Delete all rows where the 'article' column is null\n","        indexArticle = data[data['article'].isnull()].index\n","        data.drop(indexArticle, inplace=True)\n","        # Delete columns with more than 20% null values\n","        missing_cols = data.isnull().sum()\n","        drop_missing_cols = missing_cols[missing_cols > len(data)*0.20].sort_values()\n","        data = data.drop(drop_missing_cols.index, axis=1)\n","        data = data.dropna()\n","        data['date'] = pd.to_datetime(data['date'])\n","        # Reset the index\n","        data = data.reset_index(drop=True)\n","        # Make all column names lower case\n","        data.columns = data.columns.str.lower()\n","        # Draw a balanced working sample of 5,500 articles per section\n","        tech_data_sample = data[data['tech_health_tag']=='technology'].sample(n=5500, random_state=1)\n","        health_data_sample = data[data['tech_health_tag']=='health'].sample(n=5500, random_state=1)\n","        working_data = pd.concat([tech_data_sample, health_data_sample], ignore_index=True)\n","        return working_data"],"metadata":{"id":"vnab3ToAR7o2"},"execution_count":null,"outputs":[]},
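{"cell_type":"markdown","source":["Note that `clean()` does not itself apply the NLTK lemmatizer and stop word list loaded in the setup cell; that normalization is left for a later stage. The cell below is a minimal sketch of such a step, assuming whitespace-joined output. The helper name `normalize_text` is illustrative and not part of the class."],"metadata":{"id":"normalize-sketch-md"}},{"cell_type":"code","source":["# A minimal sketch of lemmatization, punctuation and stop word removal using\n","# the NLTK modules loaded above. normalize_text is an illustrative helper,\n","# not part of newsArticleDataCleaner.\n","import string\n","from nltk.tokenize import word_tokenize\n","\n","lemmatizer = WordNetLemmatizer()\n","stop_words = set(stopwords.words('english'))\n","\n","def normalize_text(text):\n","    # Tokenize and lower-case, drop stop words and punctuation, then lemmatize\n","    tokens = word_tokenize(str(text).lower())\n","    kept = [t for t in tokens if t not in stop_words and t not in string.punctuation]\n","    return ' '.join(lemmatizer.lemmatize(t) for t in kept)\n","\n","normalize_text('Doctors are testing new technologies in hospitals.')"],"metadata":{"id":"normalize-sketch-code"},"execution_count":null,"outputs":[]},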
{"cell_type":"markdown","source":["# News Article Data Cleaner Class"],"metadata":{"id":"PUTBtrUOQ7Mn"}},{"cell_type":"markdown","source":["1. Create a newsArticleDataCleaner class object\n","\n","- Note: when creating the class object you should specify the path where the all-the-news-2-1.csv raw data was downloaded and stored. If you are working on Google Colab you can specify `path_file = '/content/data/all-the-news-2-1.csv'`.\n","\n","\n"],"metadata":{"id":"UcY7ucQ3RaUc"}},{"cell_type":"code","source":["# Creating the class object\n","path_file = '/content/data/all-the-news-2-1.csv'\n","article_obj = newsArticleDataCleaner(path_file)"],"metadata":{"id":"GMFcOZsTgvc0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Calling the clean method of the class;\n","# returns a processed dataframe\n","df = article_obj.clean()"],"metadata":{"id":"BAVsix7ThEdb"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Specify the location according to your working environment to save the dataframe\n","df.to_csv('/content/drive/MyDrive/data_tech_health.csv', sep=',', index=False)"],"metadata":{"id":"0d4TA0YlyKSZ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df.head()"],"metadata":{"id":"VXFwGzEL6SM7"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df['article_word_len'].describe([0.1,0.25,0.5,0.75,0.95])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WZ08bmtdh8x3","outputId":"47279a2e-3af9-4a99-fcec-7fff084d8b39","executionInfo":{"status":"ok","timestamp":1666972961614,"user_tz":240,"elapsed":158,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}}},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["count    11000.000000\n","mean       442.630636\n","std        312.645494\n","min         50.000000\n","10%         81.000000\n","25%        190.000000\n","50%        385.000000\n","75%        623.000000\n","95%       1092.000000\n","max       1340.000000\n","Name: article_word_len, dtype: float64"]},"metadata":{},"execution_count":11}]},
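{"cell_type":"markdown","source":["`pickle` is imported in the setup cell but never used. As a minimal alternative to the CSV export above, the cell below persists the cleaned dataframe in pickle format, which preserves dtypes such as the parsed `date` column. The Drive path mirrors the CSV cell and is an assumption about your environment."],"metadata":{"id":"pickle-sketch-md"}},{"cell_type":"code","source":["# Optional: persist the cleaned dataframe with pickle instead of CSV.\n","# Unlike CSV, this preserves dtypes (e.g. the parsed 'date' column).\n","# Adjust the path to your own environment.\n","df.to_pickle('/content/drive/MyDrive/data_tech_health.pkl')\n","# Reload later with:\n","# df = pd.read_pickle('/content/drive/MyDrive/data_tech_health.pkl')"],"metadata":{"id":"pickle-sketch-code"},"execution_count":null,"outputs":[]}]}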