# Install required Libraries

In [None]:
# Mount Google Drive at /content/drive; the data and model paths used later
# in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install tomotopy
! pip install tomotopy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tomotopy
  Downloading tomotopy-0.12.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (16.5 MB)
[K     |████████████████████████████████| 16.5 MB 11.3 MB/s 
Installing collected packages: tomotopy
Successfully installed tomotopy-0.12.3


# Imports and Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import re
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # allow multiple outputs in a cell

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


# import tomotopy
import tomotopy as tp
import pickle

In [None]:
# Load NLTK and fetch the resources this notebook relies on.
# Each nltk.download(...) call echoes True in the cell output on success.
import nltk
# nltk.download('all')  # alternative: fetch every NLTK resource (left disabled)
nltk.download('stopwords')  # backs stopwords.words('english') in topicModel.preprocess_article_text
nltk.download('punkt')  # presumably required by nltk.word_tokenize -- confirm against NLTK docs
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
# NOTE(review): PorterStemmer is imported but not referenced in the visible cells.
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
class topicModel:
  """Text clean-up plus training, labelling and inference helpers for tomotopy LDA models.

  The class keeps no per-instance state. It relies on the module-level names
  ``nltk``, ``stopwords``, ``re``, ``np`` and ``tp`` (tomotopy) being imported
  before any method is called.
  """

  # Corpus-specific filler words dropped in addition to NLTK's English stop words.
  EXTRA_STOP_WORDS = ("said", "says", "just", "like", "would", "could", "use", "told", "new", "also", "thats", "even", "dont")
  # Character class of the punctuation that is stripped from the text.
  PUNCTUATION_PATTERN = '[!"#$%&\'()’*+,-./:;<=>?—@[\\]^_`{|}~’]'
  # Lazily built stop-word set shared by all instances (filled by _get_stop_words).
  _stop_word_cache = None

  @classmethod
  def _get_stop_words(cls):
    """Return the combined stop-word set, building it once on first use."""
    if cls._stop_word_cache is None:
      # A frozenset gives O(1) membership tests; the original rebuilt a list
      # (and re-read the NLTK corpus) for every single article.
      cls._stop_word_cache = frozenset(stopwords.words('english')) | frozenset(cls.EXTRA_STOP_WORDS)
    return cls._stop_word_cache

  @staticmethod
  def _strip_noise(text):
    """Remove non-breaking spaces, punctuation and purely numeric tokens from ``text``.

    Returns the surviving tokens joined by single spaces.
    """
    text = text.replace('\xa0', '')
    text = re.sub(topicModel.PUNCTUATION_PATTERN, '', text)
    # Drop every token made only of digits. The previous pattern
    # "^\d+\s|\s\d+\s|\s\d+$" consumed the separating whitespace, so the second
    # of two adjacent numbers survived, and a text consisting of a single
    # number was never matched. It was also a non-raw string with \d escapes.
    return ' '.join(tok for tok in text.split() if not re.fullmatch(r"\d+", tok))

  def preprocess_article_text(self, doc_article):
    """Clean one article for topic modelling.

    Steps: lower-case, tokenize with NLTK, drop stop words and tokens of three
    characters or fewer, then strip punctuation and purely numeric tokens.
    No stemming or lemmatization is applied.

    Args:
      doc_article: raw article text as a single string (e.g. one element of a
        pandas Series when used through ``Series.apply``).

    Returns:
      The cleaned text as one space-separated string.
    """
    doc_article = doc_article.lower()
    stop_words = self._get_stop_words()
    # NOTE(review): stop words are filtered before punctuation is stripped, so
    # entries such as "thats"/"dont" only match tokens that already lack an
    # apostrophe. Kept as-is to preserve the existing tokenization.
    words = [word for word in nltk.word_tokenize(doc_article)
             if word not in stop_words and len(word) > 3]
    return self._strip_noise(' '.join(words))

  def LdaModel_train(self, doc_list, k=5, total_iterations=10000, log_every=100,
                     min_cf_ratio=0.25, min_df_ratio=0.33, seed=None):
    """Train a tomotopy LDA model on pre-processed documents.

    Args:
      doc_list: list of whitespace-tokenizable document strings.
      k: number of topics.
      total_iterations: total number of training iterations.
      log_every: iterations per training step; the log-likelihood per word is
        printed after each step.
      min_cf_ratio: ``min_cf`` passed to the model is ``int(len(doc_list) * min_cf_ratio)``.
      min_df_ratio: ``min_df`` passed to the model is ``int(len(doc_list) * min_df_ratio)``.
      seed: optional integer random seed; when None the library default is used.

    Returns:
      ``(result_dict_train, mdl)``: the topic summary produced by
      ``extract_topic`` and the trained model.
    """
    num_doc = len(doc_list)
    model_kwargs = {
        "k": k,
        "min_cf": int(num_doc * min_cf_ratio),
        "min_df": int(num_doc * min_df_ratio),
    }
    if seed is not None:
      # Only forward the seed when given so the default call is unchanged.
      model_kwargs["seed"] = seed
    mdl = tp.LDAModel(**model_kwargs)
    for document in doc_list:
      mdl.add_doc(document.split())

    for i in range(0, total_iterations, log_every):
      # The last step is shortened when total_iterations is not a multiple of log_every.
      mdl.train(min(log_every, total_iterations - i))
      print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
    result_dict_train = self.extract_topic(mdl)
    return result_dict_train, mdl

  def extract_topic(self, mdl, top_n=5):
    """Summarize every topic of ``mdl`` as labels plus top words.

    Args:
      mdl: a trained tomotopy topic model.
      top_n: number of labels and number of top words kept per topic.

    Returns:
      dict mapping ``"topic#<k>"`` to ``{"labels": str, "topics": str}``, where
      labels are joined with ``', '`` and top words with ``' ,'``.
    """
    # Candidate label phrases are extracted from the model with PMI.
    extractor = tp.label.PMIExtractor(max_len=5, max_cand=10000)
    cands = extractor.extract(mdl)

    # Rank the label candidates for each topic.
    labeler = tp.label.FoRelevance(mdl, cands, smoothing=1e-2, mu=0.25)

    result_dict = {}
    for k in range(mdl.k):
      cur_topic = "topic#" + str(k)
      result_dict[cur_topic] = {
          "labels": ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=top_n)),
          # The ' ,' separator is kept so stored results keep the same format.
          "topics": ' ,'.join(word for word, prob in mdl.get_topic_words(k, top_n=top_n)),
      }
    return result_dict

  def LdaModel_predict(self, doc_list, mdl):
    """Infer the topic mix of ``doc_list`` (treated as ONE document) under ``mdl``.

    Args:
      doc_list: list of pre-processed document strings; their tokens are
        concatenated into a single document before inference.
      mdl: a trained tomotopy topic model.

    Returns:
      dict mapping ``"topic#<rank>"`` (rank 0 = most probable) to the matching
      entry of ``extract_topic(mdl)``; topics with zero probability are omitted.
    """
    docs_words = []
    for doc in doc_list:
      # extend() avoids re-copying the growing list for every document.
      docs_words.extend(doc.strip().split())
    doc_inst = mdl.make_doc(docs_words)
    topic_dist, ll = mdl.infer(doc_inst)

    # Topic ids ordered from most to least probable.
    ranked_topic_ids = np.array(topic_dist).argsort()[::-1]
    mdl_topic = self.extract_topic(mdl)

    pred_result = {}
    for rank, topic_id in enumerate(ranked_topic_ids):
      if topic_dist[topic_id] > 0:
        pred_result["topic#" + str(rank)] = mdl_topic["topic#" + str(topic_id)]
    return pred_result



In [None]:
# Labeled training articles (CSV stored on the mounted Drive).
data_path = '/content/drive/MyDrive/GLG_project/data/GLG_train_data_labeled.csv'
df_train = pd.read_csv(data_path)  # comma is read_csv's default separator

In [None]:
# Parent/child table describing the cluster hierarchy (CSV stored on the mounted Drive).
hierarchical_data_path = '/content/drive/MyDrive/GLG_project/data/hierarchial_cluster.csv'
df_hierarchical = pd.read_csv(hierarchical_data_path)  # comma is read_csv's default separator

In [None]:
# Create a topicModel instance; its methods are used below for preprocessing,
# training and topic extraction.
topic_object = topicModel()

In [None]:
# Preprocess the training data: run preprocess_article_text on every value of
# the 'article' column and store the cleaned text in 'preprocessed_article'.
# NOTE(review): this adds a column to df_train in place; re-running the cell
# recomputes the same column from 'article'.
df_train['preprocessed_article'] =  df_train['article'].apply(topic_object.preprocess_article_text)

In [None]:
# Spot-check the first two rows, including the preprocessed_article column.
df_train.head(2)

Unnamed: 0,date,year,month,day,title,article,url,section,publication,tech_health_tag,article_word_len,cluster_label,preprocessed_article
0,2018-08-09 09:11:14,2018,8.0,9,Psychologists’ Group Maintains Ban on Work at ...,MIND Members of the American Psychological Ass...,https://www.nytimes.com/2018/08/09/health/inte...,health,The New York Times,health,700,22,mind members american psychological associatio...
1,2016-04-26 00:00:00,2016,4.0,26,Prince autopsy: What examiners looked for,(CNN)Pop superstar Prince died from an accide...,https://www.cnn.com/2016/04/26/health/prince-d...,health,CNN,health,889,9,superstar prince died accidental overdose opio...


In [None]:
# Inspect the hierarchy rows whose parent node id is 9909.
df_hierarchical.loc[df_hierarchical['parent'].eq(9909)]

Unnamed: 0,parent,child,lambda_val,child_size,cluster_label
954,9909,1088,3.31623,1,-1
959,9909,9913,3.333467,575,P
960,9909,9914,3.333467,381,P


In [None]:
# Train two LDA models per cluster label -- a "global" one and a "local" one --
# and collect their topic summaries in model_result_train.
model_result_train = {"global": {}, "local":{}}
# Directory (on the mounted Drive) where the trained models are saved.
model_dir = '/content/drive/MyDrive/GLG_project/GLG_topic_model/'

# df_hierarchical['cluster_label'] is compared against strings, so the labels
# taken from df_train are converted with str().
cluster_labels = [str(i) for i in df_train['cluster_label'].unique()]
for cluster_label in cluster_labels:
  df_hierarchical_ = df_hierarchical[df_hierarchical['cluster_label']==cluster_label]
  print('Starting training model {}'.format(cluster_label))
  parent_docs = df_hierarchical_['parent'].unique()
  print(parent_docs)
  if len(parent_docs) == 0:
    # Without this guard the else-branch below fails with IndexError on
    # parent_docs[0] and aborts the remaining clusters.
    print('No rows in df_hierarchical for cluster {}; skipping'.format(cluster_label))
    continue
  if len(parent_docs) > 1:
    # Several parents: the smallest parent id supplies the global documents,
    # the remaining parents supply the local documents.
    parent_docs = sorted(parent_docs)
    global_p = parent_docs[:1]
    global_docs_indx = df_hierarchical_[df_hierarchical_['parent'].isin(global_p)]['child']
    global_docs = df_train.iloc[global_docs_indx]['preprocessed_article'].tolist()
    local_p = parent_docs[1:]
  else:
    # Single parent: go one level up (the node(s) whose child is that parent)
    # and use every child of those nodes not labelled "P" as global documents.
    global_p = df_hierarchical[df_hierarchical['child']== parent_docs[0]]['parent'].tolist()
    global_docs_indx = df_hierarchical[(df_hierarchical['parent'].isin(global_p)) & (df_hierarchical['cluster_label']!="P")]['child']
    global_docs = df_train.iloc[global_docs_indx]['preprocessed_article'].tolist()
    local_p = parent_docs

  # NOTE(review): 'child' values are used as positional row indices into
  # df_train (iloc) -- assumes they are row positions; confirm.
  local_docs_indx = df_hierarchical_[df_hierarchical_['parent'].isin(local_p)]['child']
  local_docs = df_train.iloc[local_docs_indx]['preprocessed_article'].tolist()

  model_result_train['global'][cluster_label], mdl_g = topic_object.LdaModel_train(global_docs)
  # save the model file
  mdl_g.save(model_dir + 'mdl_topic_model_global_' + str(cluster_label) + '.bin')

  model_result_train['local'][cluster_label], mdl_l = topic_object.LdaModel_train(local_docs)
  # save the model file
  mdl_l.save(model_dir + 'mdl_topic_model_local_' + str(cluster_label) + '.bin')


In [None]:
import json
# Pretty-print the topic summaries of the global models, keyed by cluster label.
global_topics_json = json.dumps(model_result_train['global'], indent=4, sort_keys=True)
print(global_topics_json)


{
    "-1": {
        "topic#0": {
            "labels": "contact, symptoms, organization, boars, wild boars",
            "topics": "virus ,spread ,health ,world ,reporting"
        },
        "topic#1": {
            "labels": "boars, wild boars, african swine fever, african swine, swine fever",
            "topics": "outbreak ,ministry ,disease ,agriculture ,reuters"
        },
        "topic#2": {
            "labels": "total compensation million versus million, total compensation, versus, million versus, versus million",
            "topics": "million ,reuters ,reporting ,beijing ,last"
        },
        "topic#3": {
            "labels": "cull, usda, h5n8, flocks, strains",
            "topics": "poultry ,bird ,birds ,highly ,china"
        },
        "topic#4": {
            "labels": "monitoring, china reported, agriculture rural, beijing reuters china, ministry agriculture rural",
            "topics": "china ,reported ,beijing ,editing ,reporting"
        }
    },
    "0": {

In [None]:
import json
# Pretty-print the topic summaries of the local models, keyed by cluster label.
local_topics_json = json.dumps(model_result_train['local'], indent=4, sort_keys=True)
print(local_topics_json)


{
    "-1": {
        "topic#0": {
            "labels": "science times, attendance, mental illness, taught, phrase",
            "topics": "work ,time ,first ,make ,years"
        },
        "topic#1": {
            "labels": "dominant, nasa, operating system, tech industry, search engine",
            "topics": "year ,last ,according ,time ,still"
        },
        "topic#2": {
            "labels": "short videos, douyin, merger, gopro, passwords",
            "topics": "company ,last ,year ,according ,people"
        },
        "topic#3": {
            "labels": "percent less likely, researchers examined data, online january, mercury, micrograms",
            "topics": "health ,found ,according ,people ,company"
        },
        "topic#4": {
            "labels": "study period, otolaryngology, geriatrician, geriatric, people take",
            "topics": "people ,found ,many ,years ,make"
        }
    },
    "0": {
        "topic#0": {
            "labels": "rubella, scans, born 

In [None]:
# Save the model_result_train dictionary with the pickle package.
path_file = '/content/drive/MyDrive/GLG_project/GLG_topic_model/train_doc_result.pkl'
# The context manager closes (and flushes) the file even if dump() raises;
# the bare open() used previously left the handle open.
with open(path_file, 'wb') as result_file:
  pickle.dump(model_result_train, result_file)