{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "gpuClass": "standard" }, "cells": [ { "cell_type": "markdown", "source": [ "# Install required Libraries" ], "metadata": { "id": "TrV0i1Vk3_cE" } }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eNA2FS2VPSwI", "outputId": "f453cf29-5b42-4497-9199-2c39dfefcfca" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "source": [ "# Install tomotopy\n", "! pip install tomotopy" ], "metadata": { "id": "alH-oKMa4EfV", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "1e67eab0-baab-4bcf-c571-bfb840e291a7" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting tomotopy\n", " Downloading tomotopy-0.12.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (16.5 MB)\n", "\u001b[K |████████████████████████████████| 16.5 MB 11.3 MB/s \n", "\u001b[?25hRequirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from tomotopy) (1.21.6)\n", "Installing collected packages: tomotopy\n", "Successfully installed tomotopy-0.12.3\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Imports and Setup" ], "metadata": { "id": "6GbZy4iAXEVe" } }, { "cell_type": "code", "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import numpy as np\n", "import os\n", "import re\n", "from 
IPython.core.interactiveshell import InteractiveShell\n", "InteractiveShell.ast_node_interactivity = \"all\" # allow multiple outputs in a cell\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "%matplotlib inline\n", "\n", "\n", "# import tomptopy\n", "import tomotopy as tp\n", "import pickle" ], "metadata": { "id": "5tPVn0h1R7cD" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Loading NLTK Modules\n", "import nltk\n", "# nltk.download('all')\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "nltk.download('wordnet')\n", "nltk.download('omw-1.4')\n", "nltk.download('averaged_perceptron_tagger')\n", "from nltk.stem import PorterStemmer\n", "from nltk.corpus import stopwords" ], "metadata": { "id": "jVzSV7KaoN8C", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "1d9eec7e-e4a0-4239-ba8c-fe2e4fa9ec38" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 78 }, { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 78 }, { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 78 }, { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n", "[nltk_data] Package omw-1.4 is already 
class topicModel:
    """Helpers to clean article text and to train, label and apply tomotopy LDA models.

    The class keeps no per-instance state; the methods are grouped on one
    object so the notebook can call them through a single handle.
    """

    # Filler words removed in addition to NLTK's English stop-word list.
    EXTRA_STOP_WORDS = ("said", "says", "just", "like", "would", "could", "use",
                        "told", "new", "also", "thats", "even", "dont")

    # ASCII punctuation plus the curly apostrophe and the em dash.
    _PUNCTUATION_RE = re.compile(r"""[!"#$%&'()’*+,\-./:;<=>?—@\[\]^_`{|}~]""")
    _DIGITS_RE = re.compile(r"\d+")

    # Lazily built cache so the NLTK corpus is read once, not once per article.
    _stop_words = None

    @classmethod
    def _get_stop_words(cls):
        """Return the cached set of NLTK English stop words plus EXTRA_STOP_WORDS."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english')) | frozenset(cls.EXTRA_STOP_WORDS)
        return cls._stop_words

    def preprocess_article_text(self, doc_article):
        """Normalise one article into a space-separated string of content words.

        Steps: lower-case, tokenize with NLTK, strip punctuation from every
        token, then drop stop words, tokens of three characters or fewer and
        tokens made only of digits. No stemming or lemmatization is applied.

        Parameters
        ----------
        doc_article : str
            Raw article text. Any non-string value (e.g. None or a NaN from a
            missing CSV cell) yields an empty string.

        Returns
        -------
        str
            The surviving tokens joined by single spaces.
        """
        if not isinstance(doc_article, str):
            return ''

        stop_words = topicModel._get_stop_words()
        kept = []
        for token in nltk.word_tokenize(doc_article.lower()):
            # Strip punctuation BEFORE filtering: "don’t" must become "dont" so
            # the stop-word entry can match, and "u.s." must become "us" so the
            # length filter sees its real length.
            token = self._PUNCTUATION_RE.sub('', token.replace('\xa0', ''))
            if len(token) <= 3 or token in stop_words:
                continue
            # Checking per token removes every number, including adjacent ones
            # that a whitespace-anchored regex would skip.
            if self._DIGITS_RE.fullmatch(token):
                continue
            kept.append(token)
        return ' '.join(kept)

    def LdaModel_train(self, doc_list, k=5, total_iterations=10000, step=100, seed=None, verbose=True):
        """Train an LDA model on pre-processed documents and label its topics.

        Parameters
        ----------
        doc_list : list of str
            Pre-processed documents; each one is split on whitespace.
        k : int
            Number of topics (default 5).
        total_iterations : int
            Total number of training iterations (default 10000).
        step : int
            Iterations per ``train`` call, i.e. how often progress is reported
            (default 100).
        seed : int or None
            Random seed forwarded to ``tp.LDAModel`` when given, to make a run
            repeatable. ``None`` leaves the library default in place.
        verbose : bool
            Print the per-word log-likelihood after each chunk.

        Returns
        -------
        (dict, tp.LDAModel)
            The topic summary produced by ``extract_topic`` and the trained model.
        """
        num_doc = len(doc_list)
        # Vocabulary pruning thresholds scale with corpus size: a word must
        # occur at least 0.25 * num_doc times and in at least 0.33 * num_doc
        # documents (both truncated to int).
        model_kwargs = dict(k=k, min_cf=int(num_doc * 0.25), min_df=int(num_doc * 0.33))
        if seed is not None:
            model_kwargs['seed'] = seed
        mdl = tp.LDAModel(**model_kwargs)

        for document in doc_list:
            words = document.split()
            if words:  # an empty document contributes nothing to the model
                mdl.add_doc(words)

        for i in range(0, total_iterations, step):
            # The last chunk is shortened so the total never exceeds the request.
            mdl.train(min(step, total_iterations - i))
            if verbose:
                print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

        result_dict_train = self.extract_topic(mdl)
        return result_dict_train, mdl

    def extract_topic(self, mdl, top_n=5):
        """Summarise every topic of ``mdl`` as label phrases and top words.

        Parameters
        ----------
        mdl : tp.LDAModel
            A trained model.
        top_n : int
            Number of labels and number of words reported per topic (default 5).

        Returns
        -------
        dict
            ``{"topic#<k>": {"labels": "<a, b, ...>", "topics": "<w1, w2, ...>"}}``
            with one entry per topic; both values are comma-separated strings.
        """
        # Candidate label phrases mined from the model's corpus via PMI.
        extractor = tp.label.PMIExtractor(max_len=5, max_cand=10000)
        cands = extractor.extract(mdl)
        # Ranks the candidates for each topic.
        labeler = tp.label.FoRelevance(mdl, cands, smoothing=1e-2, mu=0.25)

        result_dict = {}
        for k in range(mdl.k):
            labels = [label for label, _score in labeler.get_topic_labels(k, top_n=top_n)]
            words = [word for word, _prob in mdl.get_topic_words(k, top_n=top_n)]
            result_dict["topic#" + str(k)] = {
                "labels": ', '.join(labels),
                # Same ", " separator as the labels (was " ,", which rendered
                # as "virus ,spread ,health").
                "topics": ', '.join(words),
            }
        return result_dict

    def LdaModel_predict(self, doc_list, mdl):
        """Infer the topics of a group of documents, ordered by probability.

        All documents are concatenated into a single bag of words before
        inference, so the result describes the group as a whole.

        Parameters
        ----------
        doc_list : list of str
            Pre-processed documents.
        mdl : tp.LDAModel
            A trained model.

        Returns
        -------
        dict
            ``{"topic#<rank>": <entry from extract_topic>}`` where rank 0 is
            the most probable topic. Topics with zero probability are omitted.
            Empty when ``doc_list`` contains no words.
        """
        pred_result = {}
        docs_words = []
        for doc in doc_list:
            docs_words.extend(doc.strip().split())
        if not docs_words:
            return pred_result

        doc_inst = mdl.make_doc(docs_words)
        topic_dist, ll = mdl.infer(doc_inst)

        # Topic indices from most to least probable.
        topic_dist_idx = np.array(topic_dist).argsort()[::-1]
        mdl_topic = self.extract_topic(mdl)

        rank = 0
        for i in topic_dist_idx:
            if topic_dist[i] > 0:
                pred_result["topic#" + str(rank)] = mdl_topic["topic#" + str(i)]
                rank += 1
        return pred_result
# Labeled training articles (one row per article).
data_path = '/content/drive/MyDrive/GLG_project/data/GLG_train_data_labeled.csv'
df_train = pd.read_csv(data_path, sep=',')

# Cluster hierarchy table; later cells use its parent / child / cluster_label columns.
hierarchical_data_path = '/content/drive/MyDrive/GLG_project/data/hierarchial_cluster.csv'
df_hierarchical = pd.read_csv(hierarchical_data_path, sep=',')

# Single helper object that exposes the preprocessing and LDA methods.
topic_object = topicModel()

# Clean every article once and keep the result next to the raw text.
clean_article = topic_object.preprocess_article_text
df_train['preprocessed_article'] = df_train['article'].map(clean_article)

df_train.head(2)
health \n", "\n", " publication tech_health_tag article_word_len cluster_label \\\n", "0 The New York Times health 700 22 \n", "1 CNN health 889 9 \n", "\n", " preprocessed_article \n", "0 mind members american psychological associatio... \n", "1 superstar prince died accidental overdose opio... " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateyearmonthdaytitlearticleurlsectionpublicationtech_health_tagarticle_word_lencluster_labelpreprocessed_article
02018-08-09 09:11:1420188.09Psychologists’ Group Maintains Ban on Work at ...MIND Members of the American Psychological Ass...https://www.nytimes.com/2018/08/09/health/inte...healthThe New York Timeshealth70022mind members american psychological associatio...
12016-04-26 00:00:0020164.026Prince autopsy: What examiners looked for(CNN)Pop superstar Prince died from an accide...https://www.cnn.com/2016/04/26/health/prince-d...healthCNNhealth8899superstar prince died accidental overdose opio...
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
# Spot-check the hierarchy: list every row whose parent is node 9909.
is_child_of_9909 = df_hierarchical['parent'].eq(9909)
df_hierarchical.loc[is_child_of_9909]
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
parentchildlambda_valchild_sizecluster_label
954990910883.3162301-1
959990999133.333467575P
960990999143.333467381P
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
import json
import os

# Train two LDA models per cluster label: a "global" model and a "local" model,
# each on a document set selected from the cluster hierarchy. Results are
# collected in model_result_train and every model is saved to MODEL_DIR.
MODEL_DIR = '/content/drive/MyDrive/GLG_project/GLG_topic_model'
os.makedirs(MODEL_DIR, exist_ok=True)  # saving fails if the folder is missing

model_result_train = {"global": {}, "local": {}}

# Labels are compared as strings against df_hierarchical['cluster_label'].
cluster_labels = [str(i) for i in df_train['cluster_label'].unique()]
for cluster_label in cluster_labels:
    df_hierarchical_ = df_hierarchical[df_hierarchical['cluster_label'] == cluster_label]
    print('Starting training model {}'.format(cluster_label))
    parent_docs = df_hierarchical_['parent'].unique()
    print(parent_docs)

    if len(parent_docs) == 0:
        # Without this guard parent_docs[0] below raises IndexError.
        print('No hierarchy rows for cluster {}; skipping'.format(cluster_label))
        continue

    if len(parent_docs) > 1:
        # Several parents: the smallest parent id supplies the global
        # documents, the remaining parents supply the local ones.
        parent_docs = sorted(parent_docs)
        global_p = parent_docs[:1]
        global_docs_indx = df_hierarchical_[df_hierarchical_['parent'].isin(global_p)]['child']
        global_docs = df_train.iloc[global_docs_indx]['preprocessed_article'].tolist()
        local_p = parent_docs[1:]
    else:
        # Single parent: go one level up and use every child of that
        # grandparent that is not labeled "P".
        # NOTE(review): "P" rows look like internal cluster nodes rather than
        # documents -- confirm against the code that builds the CSV.
        global_p = df_hierarchical[df_hierarchical['child'] == parent_docs[0]]['parent'].tolist()
        global_docs_indx = df_hierarchical[
            (df_hierarchical['parent'].isin(global_p)) & (df_hierarchical['cluster_label'] != "P")
        ]['child']
        global_docs = df_train.iloc[global_docs_indx]['preprocessed_article'].tolist()
        local_p = parent_docs

    # 'child' values are used as positional row numbers into df_train.
    local_docs_indx = df_hierarchical_[df_hierarchical_['parent'].isin(local_p)]['child']
    local_docs = df_train.iloc[local_docs_indx]['preprocessed_article'].tolist()

    model_result_train['global'][cluster_label], mdl_g = topic_object.LdaModel_train(global_docs)
    mdl_g.save(os.path.join(MODEL_DIR, 'mdl_topic_model_global_' + str(cluster_label) + '.bin'))

    model_result_train['local'][cluster_label], mdl_l = topic_object.LdaModel_train(local_docs)
    mdl_l.save(os.path.join(MODEL_DIR, 'mdl_topic_model_local_' + str(cluster_label) + '.bin'))

# Report the labels and top words of every global model.
print(json.dumps(model_result_train['global'], sort_keys=True, indent=4))
million versus, versus million\",\n", " \"topics\": \"million ,world ,reuters ,last ,reporting\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"herd, african swine, swine fever, african swine fever, agriculture rural\",\n", " \"topics\": \"china ,ministry ,disease ,agriculture ,reuters\"\n", " }\n", " },\n", " \"1\": {\n", " \"topic#0\": {\n", " \"labels\": \"bain, nuclear, consortium, toshiba corp, chip unit\",\n", " \"topics\": \"reuters ,reporting ,editing ,march ,last\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"cure, south sudan, immune system, guinea, humans\",\n", " \"topics\": \"virus ,world ,first ,people ,reuters\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"south sudan, without borders, doctors without borders, doctors without, probable\",\n", " \"topics\": \"outbreak ,health ,people ,world ,virus\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"health insurance, kaiser, insurance coverage, cancers, diagnoses\",\n", " \"topics\": \"health ,first ,statement ,people ,reuters\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"stigma, insurance coverage, diagnoses, kaiser, preexposure\",\n", " \"topics\": \"people ,health ,year ,virus ,last\"\n", " }\n", " },\n", " \"10\": {\n", " \"topic#0\": {\n", " \"labels\": \"autoinjector, epipens, epinephrine, patients, drug\",\n", " \"topics\": \"generic ,drug ,patients ,epinephrine ,products\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"epipens, epinephrine, autoinjector, emergency, pfizer\",\n", " \"topics\": \"epipens ,shortage ,supply ,lifesaving ,used\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"emergency, pfizer, product, epipens, generic\",\n", " \"topics\": \"pfizer ,emergency ,united ,states ,medical\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"us, autoinjector, health, patients, price\",\n", " \"topics\": \"mylan ,epipen ,company ,price ,product\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"price, health, us, treatment, allergy\",\n", " \"topics\": \"us 
,grassley ,federal ,department ,reuters\"\n", " }\n", " },\n", " \"11\": {\n", " \"topic#0\": {\n", " \"labels\": \"autoinjector, epinephrine, patients, us, epipens\",\n", " \"topics\": \"epipen ,emergency ,lifesaving ,last ,device\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"pfizer, epipens, emergency, epinephrine, autoinjector\",\n", " \"topics\": \"epipens ,shortage ,supply ,pfizer ,available\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"autoinjector, epipens, epinephrine, patients, drug\",\n", " \"topics\": \"generic ,drug ,patients ,epinephrine ,products\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"health, price, us, allergy, generic\",\n", " \"topics\": \"mylan ,price ,us ,health ,grassley\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"pfizer, emergency, product, price, allergy\",\n", " \"topics\": \"company ,product ,allergy ,states ,united\"\n", " }\n", " },\n", " \"12\": {\n", " \"topic#0\": {\n", " \"labels\": \"risk, medicine, doctor, atrial fibrillation, fibrillation\",\n", " \"topics\": \"health ,people ,data ,device ,used\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"heart rate, atrial fibrillation, fibrillation, atrial, heart\",\n", " \"topics\": \"heart ,rate ,devices ,fitness ,used\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"watch series, apple watch series, samsung, series, apple watch\",\n", " \"topics\": \"watch ,apple ,life ,used ,fitbit\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"started, popular, industry, silicon, developing\",\n", " \"topics\": \"company ,year ,fitness ,first ,time\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"versa, healthcare, fitbit, sales, started\",\n", " \"topics\": \"fitbit ,users ,fitness ,used ,according\"\n", " }\n", " },\n", " \"13\": {\n", " \"topic#0\": {\n", " \"labels\": \"watch series, apple watch series, image, series, apple watch\",\n", " \"topics\": \"watch ,apple ,time ,used ,fitbit\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"measures, person, 
risk, medicine, doctor\",\n", " \"topics\": \"health ,people ,data ,devices ,device\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"started, silicon, industry, offering, developing\",\n", " \"topics\": \"company ,year ,first ,fitness ,last\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"heart rate, atrial fibrillation, fibrillation, heart, atrial\",\n", " \"topics\": \"heart ,rate ,first ,fitness ,used\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"versa, healthcare, fitbit, started, goals\",\n", " \"topics\": \"fitbit ,company ,fitness ,users ,used\"\n", " }\n", " },\n", " \"14\": {\n", " \"topic#0\": {\n", " \"labels\": \"takeaway, food delivery, food, delivery, competition\",\n", " \"topics\": \"food ,delivery ,right ,meal ,whether\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"takeaway, shares, competition, share, first\",\n", " \"topics\": \"takeaway ,shares ,competition ,amazon ,sales\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"right, last, year, percent, customers\",\n", " \"topics\": \"apron ,blue ,service ,time ,according\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"share, company, shares, billion, last\",\n", " \"topics\": \"company ,market ,percent ,share ,last\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"customers, million, billion, share, first\",\n", " \"topics\": \"million ,year ,billion ,customers ,business\"\n", " }\n", " },\n", " \"15\": {\n", " \"topic#0\": {\n", " \"labels\": \"\",\n", " \"topics\": \"deal ,group ,percent ,products ,payment\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"\",\n", " \"topics\": \"medianet ,company ,technology ,united ,microsoft\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"\",\n", " \"topics\": \"turakhia ,based ,states ,digital ,msfto\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"\",\n", " \"topics\": \"chinese ,advertising ,miteno ,users ,amount\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"\",\n", " \"topics\": \"tech ,million ,reporting ,revenue 
,interview\"\n", " }\n", " },\n", " \"16\": {\n", " \"topic#0\": {\n", " \"labels\": \"\",\n", " \"topics\": \"company ,facebook ,compete ,watch ,important\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"\",\n", " \"topics\": \"content ,people ,want ,things ,spend\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"\",\n", " \"topics\": \"mayer ,world ,team ,chief ,delivered\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"\",\n", " \"topics\": \"yahoo ,mobile ,million ,khalaf ,revenue\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"\",\n", " \"topics\": \"free ,access ,slim ,firm ,offer\"\n", " }\n", " },\n", " \"17\": {\n", " \"topic#0\": {\n", " \"labels\": \"ibes, cents share, third quarter, analysts average, cents\",\n", " \"topics\": \"year ,last ,company ,billion ,people\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"selfdriving, technology companies, third quarter, taxes, tech industry\",\n", " \"topics\": \"company ,companies ,technology ,last ,make\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"obsessed, tech product, carry, editor, print\",\n", " \"topics\": \"time ,people ,make ,first ,technology\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"free android, interface, headphones, carry, obsessed\",\n", " \"topics\": \"people ,make ,companies ,company ,year\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"ibes, analysts average, cents share, adjusted, cents\",\n", " \"topics\": \"billion ,company ,year ,first ,last\"\n", " }\n", " },\n", " \"18\": {\n", " \"topic#0\": {\n", " \"labels\": \"match, lets, profiles, studies, influencers\",\n", " \"topics\": \"people ,time ,make ,social ,media\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"fees, influencers, digital media, pelosi, media companies\",\n", " \"topics\": \"content ,media ,companies ,online ,last\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"lets, donate, european, influencers, birthday\",\n", " \"topics\": \"facebook ,social ,people ,companies ,company\"\n", 
" },\n", " \"topic#3\": {\n", " \"labels\": \"profiles, prior, earnings report, daily active, daily active users\",\n", " \"topics\": \"users ,company ,year ,last ,platform\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"twitter accounts, tweeting, phone number, twitter account, jack dorsey\",\n", " \"topics\": \"twitter ,social ,including ,facebook ,many\"\n", " }\n", " },\n", " \"19\": {\n", " \"topic#0\": {\n", " \"labels\": \"credit, consumers, companies, banks, financial\",\n", " \"topics\": \"banks ,financial ,consumers ,statement ,credit\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"market, tuesday, twitter, last, company\",\n", " \"topics\": \"company ,last ,reuters ,reporting ,twitter\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"market, banking, fintech, britain, access\",\n", " \"topics\": \"fintech ,banking ,services ,britain ,since\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"information, access, data, companies, consumers\",\n", " \"topics\": \"data ,information ,companies ,access ,according\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"added, bank, customer, digital, online\",\n", " \"topics\": \"customers ,bank ,customer ,online ,issue\"\n", " }\n", " },\n", " \"2\": {\n", " \"topic#0\": {\n", " \"labels\": \"promote, providing, education, obamacare, programs\",\n", " \"topics\": \"health ,reproductive ,care ,services ,trump\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"medication, introduced, tests, taken, home\",\n", " \"topics\": \"women ,abortion ,abortions ,states ,clinics\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"abortion restrictions, remains, texas, attorney general, legislature\",\n", " \"topics\": \"abortion ,state ,rights ,court ,supreme\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"medication, fetus, introduced, tests, sign\",\n", " \"topics\": \"bill ,pregnancy ,abortions ,parenthood ,since\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"providing, editing, reporting, reuters, 
abortion restrictions\",\n", " \"topics\": \"federal ,planned ,us ,parenthood ,legal\"\n", " }\n", " },\n", " \"20\": {\n", " \"topic#0\": {\n", " \"labels\": \"footage, link, connected, imsi, first time\",\n", " \"topics\": \"security ,company ,year ,time ,used\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"tech companies, personal data, industries, campaign, processors\",\n", " \"topics\": \"data ,companies ,company ,information ,year\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"recognition technology, facial recognition technology, rekognition, recognition software, facial recognition software\",\n", " \"topics\": \"technology ,people ,used ,make ,companies\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"robocall, robocalls, caller, spam, phone number\",\n", " \"topics\": \"phone ,companies ,using ,used ,year\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"dozens, events, ring, footage, staff\",\n", " \"topics\": \"information ,people ,year ,without ,first\"\n", " }\n", " },\n", " \"21\": {\n", " \"topic#0\": {\n", " \"labels\": \"childhood, severely obese, risks, develop, height\",\n", " \"topics\": \"people ,percent ,health ,years ,data\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"greater, protein, kind, meal, calorie\",\n", " \"topics\": \"food ,less ,found ,diet ,eating\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"morning, pick, kind, specific, workouts\",\n", " \"topics\": \"body ,people ,exercise ,much ,time\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"involved study, male, exposure, early, reuters health\",\n", " \"topics\": \"study ,researchers ,risk ,health ,research\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"weight gain, mass index, body mass index, body mass, index\",\n", " \"topics\": \"weight ,obesity ,obese ,women ,study\"\n", " }\n", " },\n", " \"22\": {\n", " \"topic#0\": {\n", " \"labels\": \"percent higher, jones, previous studies, study researchers examined, researchers examined data\",\n", " 
\"topics\": \"study ,researchers ,likely ,research ,found\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"medical students, trainees, medical school, medicine mount, medicine mount sinai\",\n", " \"topics\": \"school ,medical ,medicine ,lead ,university\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"humor, fever, sexual behavior, ride, sensation\",\n", " \"topics\": \"people ,time ,many ,years ,first\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"elevated, percent higher, soil, smoking, increase risk\",\n", " \"topics\": \"health ,risk ,medical ,found ,including\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"human cells, injected, pigs, geneediting, ethical\",\n", " \"topics\": \"research ,work ,university ,used ,medicine\"\n", " }\n", " },\n", " \"3\": {\n", " \"topic#0\": {\n", " \"labels\": \"\",\n", " \"topics\": \"cusack ,stop ,delivery ,help ,simply\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"\",\n", " \"topics\": \"people ,team ,regular ,lansing ,michigan\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"\",\n", " \"topics\": \"customers ,lake ,snowmobile ,want ,delivering\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"\",\n", " \"topics\": \"prescriptions ,posted ,pick ,enlisted ,grand\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"\",\n", " \"topics\": \"odessa ,snow ,pharmacy ,danger ,able\"\n", " }\n", " },\n", " \"4\": {\n", " \"topic#0\": {\n", " \"labels\": \"\",\n", " \"topics\": \"\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"\",\n", " \"topics\": \"\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"\",\n", " \"topics\": \"\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"\",\n", " \"topics\": \"\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"\",\n", " \"topics\": \"\"\n", " }\n", " },\n", " \"5\": {\n", " \"topic#0\": {\n", " \"labels\": \"side effects, statins, participants, effect, women\",\n", " \"topics\": \"drug ,drugs ,used ,many ,medicine\"\n", " },\n", " \"topic#1\": {\n", " 
\"labels\": \"participants, attack stroke, heart attack, attack, heart attack stroke\",\n", " \"topics\": \"study ,patients ,risk ,heart ,taking\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"parents, fever, common, acetaminophen, child\",\n", " \"topics\": \"according ,doctors ,symptoms ,research ,doctor\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"hospitals, india, private, system, billion\",\n", " \"topics\": \"health ,treatment ,year ,percent ,increase\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"heart attack stroke, attack stroke, heart attack, attack, effect\",\n", " \"topics\": \"disease ,people ,years ,time ,found\"\n", " }\n", " },\n", " \"6\": {\n", " \"topic#0\": {\n", " \"labels\": \"\",\n", " \"topics\": \"musk ,tunnels ,challenges ,construction ,many\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"\",\n", " \"topics\": \"boring ,potential ,depth ,likely ,digging\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"\",\n", " \"topics\": \"tunnels ,deep ,company ,tunnel ,university\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"\",\n", " \"topics\": \"urban ,civil ,professor ,city ,state\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"\",\n", " \"topics\": \"feet ,engineering ,evans ,weakened ,layers\"\n", " }\n", " },\n", " \"7\": {\n", " \"topic#0\": {\n", " \"labels\": \"\",\n", " \"topics\": \"engineering ,civil ,need ,geotechnical ,state\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"\",\n", " \"topics\": \"company ,tunnel ,challenges ,mason ,future\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"\",\n", " \"topics\": \"tunnels ,construction ,university ,professor ,impossible\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"\",\n", " \"topics\": \"musk ,feet ,time ,city ,building\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"\",\n", " \"topics\": \"deep ,boring ,angeles ,engineers ,evans\"\n", " }\n", " },\n", " \"8\": {\n", " \"topic#0\": {\n", " \"labels\": \"internet, united, growth, companies, 
import json

# Pretty-print the per-cluster ("local") topic results held in
# model_result_train as indented JSON so they can be inspected by eye.
# NOTE: sort_keys=True orders the keys as strings, so in the printed
# output cluster "10" is listed before cluster "2".
print(
    json.dumps(
        model_result_train['local'],
        indent=4,
        sort_keys=True,
    )
)
,reporting ,editing ,company ,coronavirus\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"eikon company coverage gdansk newsroom, text eikon company coverage gdansk, eikon company coverage gdansk, coverage gdansk newsroom, company coverage gdansk\",\n", " \"topics\": \"company ,march ,reuters ,source ,text\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"year financial results, full year financial, quarter full year financial results, fourth quarter full year financial, quarter full year financial\",\n", " \"topics\": \"company ,reuters ,source ,coverage ,text\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"alex richardson, richardson, editing alex, editing alex richardson, state news\",\n", " \"topics\": \"coronavirus ,reuters ,reporting ,editing ,march\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"business update, year financial results, full year financial, quarter full year financial, full year financial results\",\n", " \"topics\": \"company ,reuters ,source ,coverage ,text\"\n", " }\n", " },\n", " \"10\": {\n", " \"topic#0\": {\n", " \"labels\": \"individual plans, special enrollment, periods, jan, csrs\",\n", " \"topics\": \"obamacare ,plans ,insurers ,percent ,individual\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"medicare advantage, private health, network, sanders, centers medicare medicaid services\",\n", " \"topics\": \"medicare ,healthcare ,health ,care ,government\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"graham, house bill, majority leader, senate bill, cassidy\",\n", " \"topics\": \"bill ,senate ,house ,repeal ,republicans\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"urban, household income, household, type, adult\",\n", " \"topics\": \"health ,insurance ,people ,coverage ,plan\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"conference, issued, immediate, meeting, press\",\n", " \"topics\": \"trump ,obamacare ,president ,administration ,congress\"\n", " }\n", " },\n", " \"11\": {\n", " \"topic#0\": {\n", 
" \"labels\": \"spinraza, muscular, avexis, physician, zolgensma\",\n", " \"topics\": \"patients ,treatment ,drug ,cancer ,drugs\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"glyphosate, personalized, firstline treatment, cancer institute, cancer center\",\n", " \"topics\": \"cancer ,patients ,treatment ,year ,drug\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"warning letter, shareholders, biogen, californiabased, us district\",\n", " \"topics\": \"company ,us ,reuters ,drug ,reporting\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"democrats, lobbying, lowering, negotiate drug, campaign\",\n", " \"topics\": \"drug ,drugs ,year ,administration ,patients\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"democrats, harvoni, lobbying, lowering, high drug\",\n", " \"topics\": \"drugs ,drug ,patients ,reporting ,cancer\"\n", " }\n", " },\n", " \"12\": {\n", " \"topic#0\": {\n", " \"labels\": \"incredibly, class, simulation, letter, editor\",\n", " \"topics\": \"game ,world ,time ,games ,company\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"simulation, editor, gameplay, weapons, publishers\",\n", " \"topics\": \"games ,game ,video ,last ,company\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"incredibly, editor, night, living, literally\",\n", " \"topics\": \"people ,first ,time ,still ,around\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"cloud gaming, game streaming, rivals, gaming market, million units\",\n", " \"topics\": \"gaming ,games ,year ,last ,game\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"magic leap, investment, standalone, ship, snap\",\n", " \"topics\": \"company ,year ,game ,games ,people\"\n", " }\n", " },\n", " \"13\": {\n", " \"topic#0\": {\n", " \"labels\": \"amazoncom, echo devices, edition, prime, fire stick\",\n", " \"topics\": \"amazon ,devices ,google ,alexa ,home\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"google translate, translated, sentences, pixel, restaurant\",\n", " \"topics\": \"google 
,available ,make ,amazon ,alexa\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"musicians, vinyl, translated, kickstarter, noisecanceling\",\n", " \"topics\": \"company ,first ,make ,time ,devices\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"alexa voice, amazon announced, echo devices, assistant alexa, wake word\",\n", " \"topics\": \"alexa ,amazon ,voice ,home ,google\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"movies, home security, google home, door, lock\",\n", " \"topics\": \"home ,google ,devices ,amazon ,alexa\"\n", " }\n", " },\n", " \"14\": {\n", " \"topic#0\": {\n", " \"labels\": \"theory, ethereum, hash, quantum computer, calculations\",\n", " \"topics\": \"computer ,used ,time ,research ,according\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"colleges, employee, entrepreneurs, hired, cloud computing\",\n", " \"topics\": \"company ,companies ,technology ,year ,work\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"privacy, cloud computing, bias, tencent, employee\",\n", " \"topics\": \"intelligence ,artificial ,data ,human ,technology\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"mckinsey, warehouse, replaced, prototype, robots\",\n", " \"topics\": \"robots ,human ,need ,years ,still\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"effect, computations, quantum computer, list, employee\",\n", " \"topics\": \"people ,make ,world ,years ,time\"\n", " }\n", " },\n", " \"15\": {\n", " \"topic#0\": {\n", " \"labels\": \"visit, sanders, senator, political, york business\",\n", " \"topics\": \"amazon ,time ,according ,companies ,including\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"coronavirus, deliver packages, brazilian, grocery delivery, delivery service\",\n", " \"topics\": \"delivery ,customers ,companies ,including ,first\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"sanders, sale, percent year, senator, closing\",\n", " \"topics\": \"percent ,year ,market ,time ,first\"\n", " },\n", " \"topic#3\": {\n", " 
\"labels\": \"political, york business, finance, advertising business, deliver packages\",\n", " \"topics\": \"company ,business ,companies ,people ,years\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"sale, ebay, buyers, venture, visit\",\n", " \"topics\": \"billion ,year ,million ,online ,ecommerce\"\n", " }\n", " },\n", " \"16\": {\n", " \"topic#0\": {\n", " \"labels\": \"nadella, jumped percent, percent year, patents, dropped\",\n", " \"topics\": \"percent ,market ,shares ,according ,last\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"million pounds, aviv reuters, tova, tova cohen, cohen\",\n", " \"topics\": \"million ,reuters ,reporting ,editing ,financial\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"billion valuation, adam neumann, adam, neumann, saudi\",\n", " \"topics\": \"billion ,company ,year ,last ,shares\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"netflix, saudi, paperwork, buybacks, generation\",\n", " \"topics\": \"companies ,tech ,investors ,capital ,year\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"pichai, paperwork, patents, windows, netflix\",\n", " \"topics\": \"company ,business ,companies ,technology ,years\"\n", " }\n", " },\n", " \"17\": {\n", " \"topic#0\": {\n", " \"labels\": \"extradition, lawyers, wanzhou, meng wanzhou, meng\",\n", " \"topics\": \"china ,last ,company ,year ,market\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"year model, bezels, stylus, battery life, pixel\",\n", " \"topics\": \"year ,last ,company ,market ,china\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"dollar, razr, price target, market value, challenging\",\n", " \"topics\": \"market ,company ,china ,year ,last\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"15inch, release date, bezels, ereader, port\",\n", " \"topics\": \"company ,last ,china ,year ,market\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"south african, rome, vivendi, conglomerate, genish\",\n", " \"topics\": \"reuters ,reporting ,editing 
,market ,company\"\n", " }\n", " },\n", " \"18\": {\n", " \"topic#0\": {\n", " \"labels\": \"gdpr, oculus, virtual reality, collins, zuckerberg facebook\",\n", " \"topics\": \"zuckerberg ,facebook ,mark ,company ,time\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"crowder, label, infowars, wojcicki, alex jones\",\n", " \"topics\": \"content ,company ,people ,platform ,including\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"banning, discord, milner, white supremacists, supremacists\",\n", " \"topics\": \"twitter ,media ,social ,people ,users\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"india, trending topics, harvested, gdpr, journalists\",\n", " \"topics\": \"facebook ,company ,people ,social ,media\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"gdpr, harvested, privacy practices, collins, browsing\",\n", " \"topics\": \"data ,facebook ,users ,information ,companies\"\n", " }\n", " },\n", " \"19\": {\n", " \"topic#0\": {\n", " \"labels\": \"german, dublin, chee, brussels reuters, data united\",\n", " \"topics\": \"reuters ,us ,european ,reporting ,commission\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"american tech, pressure, facebook amazon, privacy rules, tough\",\n", " \"topics\": \"companies ,facebook ,tech ,european ,states\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"default, motherboard, version, actually, activities\",\n", " \"topics\": \"internet ,users ,company ,service ,access\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"care, privacy legislation, americans, privacy rules, activities\",\n", " \"topics\": \"data ,privacy ,information ,protection ,personal\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"gmail, american tech, dominant, google account, show\",\n", " \"topics\": \"google ,company ,people ,years ,services\"\n", " }\n", " },\n", " \"2\": {\n", " \"topic#0\": {\n", " \"labels\": \"among high, middle high school students, middle high school, eliquid, schuchat\",\n", " \"topics\": \"vaping 
,ecigarettes ,products ,health ,people\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"psychosis, cells, stop smoking, pain management, psychedelics\",\n", " \"topics\": \"study ,smoking ,found ,people ,health\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"packs, philip morris, morris international, philip morris international, youth tobacco\",\n", " \"topics\": \"tobacco ,products ,health ,public ,reuters\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"toronto, voters, legalizing, dispensaries, legalized recreational\",\n", " \"topics\": \"marijuana ,medical ,states ,drug ,health\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"psychedelic, psychedelics, psilocybin, clinical trials, magic\",\n", " \"topics\": \"drug ,people ,many ,according ,first\"\n", " }\n", " },\n", " \"20\": {\n", " \"topic#0\": {\n", " \"labels\": \"hedge, hedge fund, lawsuit, involvement, consortium\",\n", " \"topics\": \"company ,companies ,year ,last ,around\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"cryptography, child pornography site, pornography site, friends, influence\",\n", " \"topics\": \"people ,used ,first ,time ,still\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"hashed, dates, warner, personal data, property\",\n", " \"topics\": \"data ,information ,company ,users ,including\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"pornography site, child pornography site, greatest mysteries, motherboard show, time writing\",\n", " \"topics\": \"motherboard ,according ,used ,time ,companies\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"krebs, us government, council, fancy, lawsuit\",\n", " \"topics\": \"security ,according ,last ,including ,information\"\n", " }\n", " },\n", " \"21\": {\n", " \"topic#0\": {\n", " \"labels\": \"transplant, microbiome, determined, leads, parasite\",\n", " \"topics\": \"people ,disease ,time ,might ,make\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"enjoy, crops, determined, space, angeles\",\n", " \"topics\": 
\"food ,eating ,health ,year ,many\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"sprouts, purchased, recalling, people recover, food safety inspection service\",\n", " \"topics\": \"products ,food ,cases ,last ,control\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"norovirus, chipotle, park, swimming, health department\",\n", " \"topics\": \"people ,according ,cases ,health ,states\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"fractures, risk cancer, placebo, stress, responses\",\n", " \"topics\": \"study ,researchers ,found ,risk ,health\"\n", " }\n", " },\n", " \"22\": {\n", " \"topic#0\": {\n", " \"labels\": \"tweets, savings, porn, dick, something wrong\",\n", " \"topics\": \"feel ,really ,going ,want ,think\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"squats, getting back, tattoo, hard work, strength training\",\n", " \"topics\": \"body ,back ,work ,know ,still\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"survival odds, recent study suggests, researchers note, montreal, percent higher\",\n", " \"topics\": \"health ,university ,found ,many ,help\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"breakup, creativity, rhythms, time people, circadian rhythms\",\n", " \"topics\": \"people ,time ,work ,make ,much\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"black women, died suicide, reproduction, every years, devastating\",\n", " \"topics\": \"years ,life ,year ,first ,every\"\n", " }\n", " },\n", " \"3\": {\n", " \"topic#0\": {\n", " \"labels\": \"deployed, space exploration, miles hour, rover, soviet\",\n", " \"topics\": \"year ,world ,around ,every ,according\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"table, spaces, videos, designer, many people\",\n", " \"topics\": \"people ,make ,many ,world ,still\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"poor climate change voting record, poor climate change, climate change voting, poor climate, poor climate change voting\",\n", " \"topics\": \"change ,many ,around 
,make ,motherboard\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"descent, rover, operational, deployed, soviet\",\n", " \"topics\": \"first ,time ,years ,much ,last\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"interstellar, miles hour, oldest, dark matter, extraterrestrial\",\n", " \"topics\": \"research ,scientists ,years ,found ,university\"\n", " }\n", " },\n", " \"4\": {\n", " \"topic#0\": {\n", " \"labels\": \"falcon heavy, spacecraft, north korea, korean, blue origin\",\n", " \"topics\": \"test ,flight ,system ,drone ,drones\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"blue origin, delays, crew dragon, engine, spacecraft\",\n", " \"topics\": \"company ,first ,time ,year ,last\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"incident, controller, device, commercial drones, mavic\",\n", " \"topics\": \"drones ,drone ,according ,people ,company\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"mavic, controller, smart, device, incident\",\n", " \"topics\": \"drone ,drones ,company ,space ,first\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"spaceflight, spacecraft, launch vehicle, falcon heavy, astronauts\",\n", " \"topics\": \"space ,test ,drone ,drones ,company\"\n", " }\n", " },\n", " \"5\": {\n", " \"topic#0\": {\n", " \"labels\": \"projects, verily, medical records, drugstores, regulators\",\n", " \"topics\": \"people ,health ,services ,according ,years\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"chase, bezos, verily, jp morgan, jp\",\n", " \"topics\": \"health ,care ,companies ,healthcare ,time\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"fiscal, cents share, cents, weight, forecast\",\n", " \"topics\": \"company ,million ,year ,last ,first\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"medical records, cancer, researchers, ventures, glucose\",\n", " \"topics\": \"patients ,medical ,technology ,us ,help\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"fiscal, drugstore chain, adjusted, cents, 
forecast\",\n", " \"topics\": \"billion ,percent ,company ,business ,year\"\n", " }\n", " },\n", " \"6\": {\n", " \"topic#0\": {\n", " \"labels\": \"model production, tweeted, musk tweeted, secured, tweet\",\n", " \"topics\": \"tesla ,company ,according ,vehicle ,editing\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"model production, settlement, quarterly, tweeted, tweet\",\n", " \"topics\": \"company ,year ,first ,last ,make\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"energy vehicles, plugin, hybrid, producing, factories\",\n", " \"topics\": \"electric ,vehicles ,reuters ,vehicle ,reporting\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"safety driver, testing selfdriving, aurora, taxi, selfdriving vehicle\",\n", " \"topics\": \"selfdriving ,cars ,vehicles ,technology ,vehicle\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"texting, distracted driving, device, save, safety driver\",\n", " \"topics\": \"driving ,technology ,according ,make ,editing\"\n", " }\n", " },\n", " \"7\": {\n", " \"topic#0\": {\n", " \"labels\": \"okerstrom, graves, ryan, software, electric scooter\",\n", " \"topics\": \"company ,business ,last ,first ,time\"\n", " },\n", " \"topic#1\": {\n", " \"labels\": \"passed, wage, independent contractors, drivers uber, drivers\",\n", " \"topics\": \"drivers ,ridehailing ,including ,uber ,time\"\n", " },\n", " \"topic#2\": {\n", " \"labels\": \"lime, buses, toronto, labs, software\",\n", " \"topics\": \"public ,service ,companies ,people ,around\"\n", " },\n", " \"topic#3\": {\n", " \"labels\": \"airbnb, adjusted, year earlier, went public, sales\",\n", " \"topics\": \"million ,companies ,ridehailing ,year ,around\"\n", " },\n", " \"topic#4\": {\n", " \"labels\": \"huffington, arianna, arianna huffington, ryan, graves\",\n", " \"topics\": \"uber ,people ,ridehailing ,including ,time\"\n", " }\n", " },\n", " \"8\": {\n", " \"topic#0\": {\n", " \"labels\": \"assets, creating, arrangement, rural areas, competing\",\n", " 
# Save the model_result_train dictionary to disk using the pickle module.
path_file = '/content/drive/MyDrive/GLG_project/GLG_topic_model/train_doc_result.pkl'

# Write through a context manager so the file is flushed and closed as soon
# as the dump finishes (or fails); passing a bare open(...) to pickle.dump
# leaves the handle open until it happens to be garbage-collected.
with open(path_file, 'wb') as pkl_file:
    pickle.dump(model_result_train, pkl_file)