# Install required Libraries

In [None]:
# Mount Google Drive so the notebook can read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings.

!pip install -U sentence-transformers
!pip install umap-learn
!pip install joblib==1.2.0

# Imports and Setup

In [None]:
import pandas as pd
pd.set_option('max_colwidth',150)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
import numpy as np
import os
import re
from string import punctuation
from datetime import datetime as dt
from sklearn.model_selection import train_test_split
%matplotlib inline

#importing sentence transformer
from sentence_transformers import SentenceTransformer

from umap import UMAP


import pickle

In [None]:
# NLTK setup: fetch the required data packages, then import the helpers.
import nltk

for nltk_package in ('stopwords', 'punkt', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
class documentEmbedding:

  '''

  Turn the articles of a DataFrame into sentence-transformer embeddings and
  reduce those embeddings with UMAP.

  The frame is passed in directly; loading it (for example from a CSV on a
  mounted Google Drive or from a local directory) is the caller's job.

  Parameters:
  -----------

  df: pandas.DataFrame
  the articles to embed. `generate_embedding` reads the text from its
  'article' column and writes the result to an 'article_embd' column.

  device: str, optional
  device forwarded to `SentenceTransformer.encode` (e.g. 'cuda' or 'cpu').
  With the default, None, the choice is left to the model, so the class is
  no longer tied to a GPU runtime.

  '''

  def __init__(self, df, device=None):
    self.data = df
    self.sentence_model = SentenceTransformer("all-mpnet-base-v2")
    self.device = device
    # Number of documents processed by the current generate_embedding run.
    self.count = 0

  def doc_clean(self, text):
    """Lower-case `text`, drop non-breaking spaces and strip punctuation."""
    text = text.lower()
    text = text.replace('\xa0', '')
    # Inside the character class `,-/` is a range, so '-', '.' and '/' are
    # removed together with the characters that are listed explicitly.
    text = re.sub('[!"#$%&\'()’*+,-/:;<=>?—@[\\]^_`{|}~’]', '', text)
    return text

  def sentence_to_vector(self, sent):
    """Return the embedding of one sentence as produced by the model's `encode`."""
    return self.sentence_model.encode(sent, show_progress_bar=False, device=self.device)

  def doc_to_vectors(self, doc):
    """
    Embed one document: clean it, split it into sentences, encode every
    sentence and combine the sentence vectors with an element-wise product.

    Also advances the progress counter and prints a line every 1000 documents.
    """
    self.count += 1
    if self.count % 1000 == 0:
      print ("Processed {} documents".format(self.count))
    doc = self.doc_clean(str(doc))
    # NOTE(review): doc_clean has already removed '.', '!' and '?', so the
    # tokenizer presumably returns most documents as one single sentence -
    # confirm whether tokenizing before cleaning was intended.
    # An empty document tokenizes to no sentences, and np.multiply.reduce([])
    # would then yield the scalar 1.0 instead of a vector. Encoding the text
    # itself keeps every row the same length.
    sentences = sent_tokenize(doc) or [doc]
    # sentence to vector representation
    vector = [self.sentence_to_vector(sent) for sent in sentences]
    # NOTE(review): sentence vectors are combined by element-wise product;
    # confirm this is intended rather than e.g. a mean over sentences.
    doc_embd = np.multiply.reduce(vector)

    return doc_embd

  def generate_embedding(self):
    """
    Embed every row of `self.data['article']`.

    The vectors are stored in a new 'article_embd' column that is added to
    `self.data` in place; that same frame is returned.
    """
    print("Generating embedding vectors ...")
    # Restart the progress counter so that a second data set (assigned to
    # self.data after an earlier run) is counted from zero again.
    self.count = 0
    df = self.data
    df['article_embd'] = df['article'].apply(self.doc_to_vectors)
    print ("Processed {} documents. Done!".format(self.count))
    return df

  def feature_reduction(self, embd_vector, umap_reducer=False):
    """
    Reduce `embd_vector` with UMAP.

    umap_reducer: a reducer to re-use (its `transform` is applied as is).
    When it is left at the default False, a new UMAP model is fitted on
    `embd_vector` first.

    Returns a tuple (DataFrame with the reduced features, reducer used).
    """
    if umap_reducer:
      reducer = umap_reducer
    else:
      reducer = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
      reducer.fit(embd_vector)
    data_umap = reducer.transform(embd_vector)
    return pd.DataFrame(data_umap), reducer

  def save_data(self, path_file, data):
    """
    Pickle `data` to the file at `path_file` and print a confirmation.
    """
    with open(path_file, "wb") as f:
      pickle.dump(data, f)

    print('The file is saved')

In [None]:
# Read the source CSV from the mounted Drive (comma is read_csv's default separator).
data_path = '/content/drive/MyDrive/data_tech_health.csv'
df = pd.read_csv(data_path)

In [None]:
# Hold out 10% of the rows as the test split, stratified on 'tech_health_tag'.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df['tech_health_tag'],
    random_state=41,
)

In [None]:
# Give both splits a fresh 0..n-1 index; the old index is dropped, not kept as a column.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Sanity check: (rows, columns) of the train and test splits.
train_df.shape, test_df.shape

((9900, 11), (1100, 11))

- Save the data

In [None]:
# Persist both splits to Drive as comma-separated files without the index column.
train_df.to_csv('/content/drive/MyDrive/GLG_train_data.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/GLG_test_data.csv', index=False)

- Note: change the Colab runtime type to GPU before running the embedding cells below.

In [None]:
# Build the embedder around the training split (the constructor also loads the sentence model).
embd_obj = documentEmbedding(df=train_df)

In [None]:
# Adds an 'article_embd' column to the frame held by embd_obj and returns that same frame.
train_df = embd_obj.generate_embedding()

Generating embedding vectors ...
Processed 1000 documents
Processed 2000 documents
Processed 3000 documents
Processed 4000 documents
Processed 5000 documents
Processed 6000 documents
Processed 7000 documents
Processed 8000 documents
Processed 9000 documents
Processed 9900 documents. Done!


In [None]:
# Training dataset feature reduction using UMAP:
# expand the per-article vectors into one row per article, then fit the reducer on them.
doc_embd = pd.DataFrame(list(map(list, train_df['article_embd'].values)))
reduced_feature_embd, umap_reducer = embd_obj.feature_reduction(embd_vector=doc_embd)

In [None]:
# Store the reduced training features on Drive (comma-separated, no index column).
reduced_feature_embd.to_csv('/content/drive/MyDrive/GLG_train_data_emb.csv', index=False)

In [None]:
# Preview the first rows of the reduced training features.
reduced_feature_embd.head()

Unnamed: 0,0,1,2,3,4
0,5.829379,7.188879,5.9052,-0.412281,8.19195
1,6.449986,6.30754,7.369937,-0.007682,7.78177
2,6.641083,8.50664,6.149587,-0.332711,7.944887
3,7.147574,6.795663,5.663146,0.219597,7.048291
4,1.92625,7.740953,5.823658,0.978812,7.03587


In [None]:
# Test dataset document embedding
# Reuse the existing embedder (its model is already loaded) by pointing it at the test split.
embd_obj.data = test_df
test_df = embd_obj.generate_embedding()

Generating embedding vectors ...
Processed 10000 documents
Processed 11000 documents
Processed 11000 documents. Done!


In [None]:
# Test dataset feature reduction using UMAP:
# the reducer fitted on the training data is passed in, so it is only applied here.
doc_embd_test = pd.DataFrame(list(map(list, test_df['article_embd'].values)))
reduced_feature_embd_test, _ = embd_obj.feature_reduction(doc_embd_test, umap_reducer=umap_reducer)

In [None]:
# Preview the first rows of the reduced test features.
reduced_feature_embd_test.head()

Unnamed: 0,0,1,2,3,4
0,6.384241,6.152116,6.909705,0.143703,7.433092
1,4.364654,2.928921,4.393867,1.090112,7.379026
2,6.726593,8.498932,6.248105,-0.239759,7.818388
3,7.36931,5.42725,4.332436,0.281037,7.733836
4,6.765358,4.768935,4.028739,0.633608,7.600544


In [None]:
# Store the reduced test features on Drive (comma-separated, no index column).
reduced_feature_embd_test.to_csv('/content/drive/MyDrive/GLG_test_data_emb.csv', index=False)

# Dump reducer model to be used for prediction

In [None]:
import pickle
reducer_file = "/content/drive/MyDrive/umap_reducer_model.sav"
# A context manager guarantees the file is flushed and closed once the dump finishes.
with open(reducer_file, 'wb') as reducer_fh:
    pickle.dump(umap_reducer, reducer_fh)

In [None]:
# To load the reducer model back later (only unpickle files you trust):
# saved_reducer = pickle.load(open(reducer_file, 'rb'))