{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "collapsed_sections": [], "machine_shape": "hm" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "qIFLx0_wimTB" }, "outputs": [], "source": [ "# Standard library\n", "import os\n", "import re\n", "import warnings\n", "from datetime import datetime as dt\n", "from string import punctuation\n", "\n", "# Third-party\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import seaborn as sns\n", "from IPython.core.interactiveshell import InteractiveShell\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "# Notebook-wide configuration\n", "pd.set_option('display.max_colwidth', 150)  # show up to 150 characters per column\n", "pd.options.plotting.backend = \"plotly\"  # pandas .plot() calls render through plotly\n", "InteractiveShell.ast_node_interactivity = \"all\"  # allow multiple outputs in a cell\n", "warnings.filterwarnings(\"ignore\")  # suppress all warnings from here on\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "source": [ "# Download and Extract the Datasets" ], "metadata": { "id": "QqvaLRjVjIj3" } }, { "cell_type": "code", "source": [ "# Downloading all-the-news-2-news-articles-dataset \n", "! 
wget -O /content/all-the-news-2-1.zip \"https://www.dropbox.com/s/cn2utnr5ipathhh/all-the-news-2-1.zip?dl=0\"\n", "# (-O saves the archive under a plain name; the URL is quoted so the shell never treats '?' as a glob)\n", "\n", "# Downloading Annotated Corpus for Named Entity Recognition dataset\n", "!gdown \"https://drive.google.com/uc?id=13y8JNgL5TQ4x-yufpBOv3QBsEiE051sE\"\n", "\n", "# Make a data folder to store the data (-p: no error if it already exists)\n", "!mkdir -p data\n", "\n", "# Extract the archive into the data folder (-o: overwrite without prompting on a re-run)\n", "!unzip -o /content/all-the-news-2-1.zip -d ./data/\n", "\n", "# Move ner.csv into the data folder\n", "!mv /content/ner.csv ./data\n", "\n", "# The archive is no longer needed once extracted\n", "!rm /content/all-the-news-2-1.zip\n", "\n" ], "metadata": { "id": "VYvJeKsujCFY" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Load Data" ], "metadata": { "id": "liJiX3Xf2hQh" } }, { "cell_type": "code", "source": [ "# Specify the path to the data location\n", "filepath = '/content/data/all-the-news-2-1.csv'\n", "\n", "# Read the articles CSV as UTF-8\n", "data = pd.read_csv(filepath, encoding=\"utf-8\")\n" ], "metadata": { "id": "LMwtt2rJnNhB" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "# Verify that the data is loaded correctly\n", "data.head(3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "g4VoxOSnnOs9", "outputId": "4f0dea96-29e8-4f80-f009-12e9ef6e0c05" }, "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " date year month day author \\\n", "0 2016-12-09 18:31:00 2016 12.0 9 Lee Drutman \n", "1 2016-10-07 21:26:46 2016 10.0 7 Scott Davis \n", "2 2018-01-26 00:00:00 2018 1.0 26 NaN \n", "\n", " title \\\n", "0 We should take concerns about the health of liberal democracy seriously \n", "1 Colts GM Ryan Grigson says Andrew Luck's contract makes it difficult to build the team \n", "2 Trump denies report he ordered Mueller fired \n", "\n", " article \\\n", "0 This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to de... 
\n", "1 The Indianapolis Colts made Andrew Luck the highest-paid player in NFL history this offseason with a five-year, $122-million contract with $89 mi... \n", "2 DAVOS, Switzerland (Reuters) - U.S. President Donald Trump denied a report on Friday that he had ordered Special Counsel Robert Mueller fired last... \n", "\n", " url \\\n", "0 https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs \n", "1 https://www.businessinsider.com/colts-gm-ryan-grigson-andrew-luck-contract-2016-10 \n", "2 https://www.reuters.com/article/us-davos-meeting-trump-mueller/trump-denies-report-he-ordered-mueller-fired-idUSKBN1FF12A \n", "\n", " section publication \n", "0 NaN Vox \n", "1 NaN Business Insider \n", "2 Davos Reuters " ], "text/html": [ "\n", "
\n", " | date | \n", "year | \n", "month | \n", "day | \n", "author | \n", "title | \n", "article | \n", "url | \n", "section | \n", "publication | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2016-12-09 18:31:00 | \n", "2016 | \n", "12.0 | \n", "9 | \n", "Lee Drutman | \n", "We should take concerns about the health of liberal democracy seriously | \n", "This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to de... | \n", "https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs | \n", "NaN | \n", "Vox | \n", "
1 | \n", "2016-10-07 21:26:46 | \n", "2016 | \n", "10.0 | \n", "7 | \n", "Scott Davis | \n", "Colts GM Ryan Grigson says Andrew Luck's contract makes it difficult to build the team | \n", "The Indianapolis Colts made Andrew Luck the highest-paid player in NFL history this offseason with a five-year, $122-million contract with $89 mi... | \n", "https://www.businessinsider.com/colts-gm-ryan-grigson-andrew-luck-contract-2016-10 | \n", "NaN | \n", "Business Insider | \n", "
2 | \n", "2018-01-26 00:00:00 | \n", "2018 | \n", "1.0 | \n", "26 | \n", "NaN | \n", "Trump denies report he ordered Mueller fired | \n", "DAVOS, Switzerland (Reuters) - U.S. President Donald Trump denied a report on Friday that he had ordered Special Counsel Robert Mueller fired last... | \n", "https://www.reuters.com/article/us-davos-meeting-trump-mueller/trump-denies-report-he-ordered-mueller-fired-idUSKBN1FF12A | \n", "Davos | \n", "Reuters | \n", "
\n", " | date | \n", "year | \n", "month | \n", "day | \n", "author | \n", "title | \n", "article | \n", "url | \n", "section | \n", "publication | \n", "tech_health_tag | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2018-05-02 17:09:00 | \n", "2018 | \n", "5.0 | \n", "2 | \n", "Caroline Williams | \n", "You Can Trick Your Brain Into Being More Focused | \n", "If only every day could be like this. You can’t put your finger on why: Maybe you had just the right amount of sleep. Maybe the stars are somehow ... | \n", "https://www.vice.com/en_us/article/9kgp4v/how-to-improve-focus-be-more-creative | \n", "Health | \n", "Vice | \n", "health | \n", "
1 | \n", "2018-10-05 19:35:00 | \n", "2018 | \n", "10.0 | \n", "5 | \n", "Caroline Haskins | \n", "Trash Geyser Spews Garbage In Yellowstone National Park | \n", "Geyser eruptions are known as one of the most beautiful events to occur in nature. Not anymore! On September 15, Yellowstone Park’s Ear Spring ge... | \n", "https://www.vice.com/en_us/article/evwq47/ear-spring-geyser-spews-trash-in-yellowstone-national-park | \n", "Tech by VICE | \n", "Vice | \n", "technology | \n", "
2 | \n", "2019-06-20 00:00:00 | \n", "2019 | \n", "6.0 | \n", "20 | \n", "Gergely Szakacs | \n", "Hungary has no evidence of Huawei threat, plans rapid 5G rollout: minister | \n", "BUDAPEST (Reuters) - Hungary has no evidence that equipment from Chinese telecoms giant Huawei poses a security threat, a government minister said... | \n", "https://www.reuters.com/article/us-hungary-telecoms-5g-huawei/hungary-has-no-evidence-of-huawei-threat-plans-rapid-5g-rollout-minister-idUSKCN1TL2AP | \n", "Technology News | \n", "Reuters | \n", "technology | \n", "