{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":[],"machine_shape":"hm"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"gpuClass":"standard","accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["# Install required Libraries"],"metadata":{"id":"TrV0i1Vk3_cE"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"eNA2FS2VPSwI"},"outputs":[],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["# SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings.\n","\n","!pip install -U sentence-transformers\n","!pip install umap-learn\n","!pip install joblib==1.2.0"],"metadata":{"id":"alH-oKMa4EfV"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Imports and Setup"],"metadata":{"id":"6GbZy4iAXEVe"}},{"cell_type":"code","source":["import pandas as pd\n","pd.set_option('max_colwidth',150)\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","import numpy as np\n","import nltk\n","import numpy as np\n","import os\n","import re\n","from string import punctuation\n","from datetime import datetime as dt\n","from sklearn.model_selection import train_test_split\n","%matplotlib inline\n","\n","#importing sentence transformer\n","from sentence_transformers import SentenceTransformer\n","\n","from umap import UMAP\n","\n","\n","import pickle"],"metadata":{"id":"5tPVn0h1R7cD"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Loading NLTK Modules\n","import nltk\n","# nltk.download('all')\n","nltk.download('stopwords')\n","nltk.download('punkt')\n","nltk.download('wordnet')\n","nltk.download('omw-1.4')\n","nltk.download('averaged_perceptron_tagger')\n","from nltk.stem import WordNetLemmatizer\n","from nltk.corpus import stopwords\n","from nltk.tokenize import sent_tokenize"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yf7zlt-enNO2","outputId":"288bfbb2-1335-4cdf-8bc9-a5a4a6cbb76a","executionInfo":{"status":"ok","timestamp":1666973924906,"user_tz":240,"elapsed":1407,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}}},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data] Unzipping corpora/stopwords.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Unzipping tokenizers/punkt.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package wordnet to /root/nltk_data...\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package averaged_perceptron_tagger to\n","[nltk_data] /root/nltk_data...\n","[nltk_data] Unzipping 
taggers/averaged_perceptron_tagger.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4}]},
{"cell_type":"code","source":["class documentEmbedding:\n","\n",
"    '''\n",
"    Generate document-level embeddings for a dataframe of news articles.\n",
"\n",
"    Each article is cleaned, split into sentences and encoded with the\n",
"    \"all-mpnet-base-v2\" SentenceTransformer model; the sentence vectors are\n",
"    then combined into a single document vector. UMAP is used to reduce the\n",
"    embedding dimensionality, and the results can be pickled for later use.\n",
"\n",
"    Parameters:\n",
"    -----------\n",
"\n",
"    df: pandas.DataFrame\n",
"        dataframe of news articles; the text is expected in an 'article' column.\n",
"    '''\n",
"\n",
"    def __init__(self, df):\n",
"        self.data = df\n",
"        self.sentence_model = SentenceTransformer(\"all-mpnet-base-v2\")\n",
"        self.count = 0\n",
"\n",
"    def doc_clean(self, text):\n",
"        # Lowercase, drop non-breaking spaces and strip punctuation\n",
"        text = text.lower()\n",
"        text = text.replace('\\xa0', '')\n",
"        text = re.sub('[!\"#$%&\\'()’*+,-/:;<=>?—@[\\\\]^_`{|}~’]', '', text)\n",
"        return text\n",
"\n",
"    def sentence_to_vector(self, sent):\n",
"        # Encode the sentence on the GPU\n",
"        embeddings = self.sentence_model.encode(sent, show_progress_bar=False, device='cuda')\n",
"        return embeddings\n",
"\n",
"    def doc_to_vectors(self, doc):\n",
"        self.count += 1\n",
"        if self.count % 1000 == 0:\n",
"            print(\"Processed {} documents\".format(self.count))\n",
"        doc = self.doc_clean(str(doc))\n",
"        sentences = sent_tokenize(doc)\n",
"        # sentence to vector representation\n",
"        vector = [self.sentence_to_vector(sent) for sent in sentences]\n",
"        # combine the sentence vectors into a single document vector (element-wise product)\n",
"        doc_embd = np.multiply.reduce(vector)\n",
"        return doc_embd\n",
"\n",
"    def generate_embedding(self):\n",
"        print(\"Generating embedding vectors ...\")\n",
"        df = self.data\n",
"        df['article_embd'] = df['article'].apply(self.doc_to_vectors)\n",
"        print(\"Processed {} documents. Done!\".format(self.count))\n",
"        return df\n",
"\n",
"    def feature_reduction(self, embd_vector, umap_reducer=None):\n",
"        # Reuse a fitted reducer when one is passed in; otherwise fit a new one\n",
"        if umap_reducer is not None:\n",
"            reducer = umap_reducer\n",
"            data_umap = reducer.transform(embd_vector)\n",
"        else:\n",
"            reducer = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n",
"            reducer.fit(embd_vector)\n",
"            data_umap = reducer.transform(embd_vector)\n",
"        return pd.DataFrame(data_umap), reducer\n",
"\n",
"    def save_data(self, path_file, data):\n",
"        \"\"\"\n",
"        Save the given object to path_file as a pickle.\n",
"        \"\"\"\n",
"        with open(path_file, \"wb\") as f:\n",
"            pickle.dump(data, f)\n",
"\n",
"        print('The file is saved')"],"metadata":{"id":"vnab3ToAR7o2"},"execution_count":null,"outputs":[]},
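{"cell_type":"markdown","source":["- Quick sanity check (illustrative sketch, not part of the original pipeline): run `doc_to_vectors` on a single made-up document and inspect the shape of the resulting vector. Assumes the class cell above has been executed on a GPU runtime."],"metadata":{}},
{"cell_type":"code","source":["# Illustrative sketch: `sample_text` is a made-up example, not a row from the dataset\n","sample_text = 'Apple released a new health sensor. Analysts expect strong demand.'\n","sketch_obj = documentEmbedding(pd.DataFrame({'article': [sample_text]}))\n","sample_vector = sketch_obj.doc_to_vectors(sample_text)\n","# all-mpnet-base-v2 produces 768-dimensional sentence embeddings\n","print(sample_vector.shape)"],"metadata":{},"execution_count":null,"outputs":[]},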
{"cell_type":"code","source":["data_path = '/content/drive/MyDrive/data_tech_health.csv'\n","df = pd.read_csv(data_path, sep=',')"],"metadata":{"id":"0yfvTUhdMMZv"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":["train_df, test_df = train_test_split(df, test_size=0.1, random_state=41, stratify=df['tech_health_tag'])"],"metadata":{"id":"Pza1fFjjQPz7"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":["train_df, test_df = train_df.reset_index(drop=True), test_df.reset_index(drop=True)"],"metadata":{"id":"Me2jqP3W9m7C"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":["train_df.shape, test_df.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"a98igIPIXcfU","executionInfo":{"status":"ok","timestamp":1666975574313,"user_tz":240,"elapsed":140,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"d854162f-f155-45bf-c8b7-8defacb0a5ac"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((9900, 11), (1100, 11))"]},"metadata":{},"execution_count":23}]},
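{"cell_type":"markdown","source":["- Optional check (added sketch, not part of the original notebook): confirm that the stratified split keeps roughly the same `tech_health_tag` proportions in the train and test sets."],"metadata":{}},
{"cell_type":"code","source":["# Added sketch: compare tag proportions after the stratified split\n","print(train_df['tech_health_tag'].value_counts(normalize=True))\n","print(test_df['tech_health_tag'].value_counts(normalize=True))"],"metadata":{},"execution_count":null,"outputs":[]},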
{"cell_type":"markdown","source":["- Save the data"],"metadata":{"id":"4zTpoPFI-IXk"}},
{"cell_type":"code","source":["train_df.to_csv('/content/drive/MyDrive/GLG_train_data.csv', sep=',', index=False)\n","test_df.to_csv('/content/drive/MyDrive/GLG_test_data.csv', sep=',', index=False)"],"metadata":{"id":"nocwHdUp99ij"},"execution_count":null,"outputs":[]},
{"cell_type":"markdown","source":["- Note: change runtime type to GPU"],"metadata":{"id":"EM1qCUaFZBBC"}},
{"cell_type":"code","source":["# Training dataset document embedding\n","embd_obj = documentEmbedding(train_df)"],"metadata":{"id":"GMFcOZsTgvc0"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":["train_df = embd_obj.generate_embedding()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jWRAa6b4mXKE","executionInfo":{"status":"ok","timestamp":1666984006106,"user_tz":240,"elapsed":273544,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"558ad1bf-747c-4b89-8bfd-9fd5b048ffe5"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Generating embedding vectors ...\n","Processed 1000 documents\n","Processed 2000 documents\n","Processed 3000 documents\n","Processed 4000 documents\n","Processed 5000 documents\n","Processed 6000 documents\n","Processed 7000 documents\n","Processed 8000 documents\n","Processed 9000 documents\n","Processed 9900 documents. Done!\n"]}]},
{"cell_type":"code","source":["# Training dataset feature reduction using UMAP\n","doc_embd = pd.DataFrame([list(emb) for emb in train_df['article_embd'].values])\n","reduced_feature_embd, umap_reducer = embd_obj.feature_reduction(doc_embd)"],"metadata":{"id":"4zMrkX6IM8Vu"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":["reduced_feature_embd.to_csv('/content/drive/MyDrive/GLG_train_data_emb.csv', sep=',', index=False)"],"metadata":{"id":"avnMucH5sUYp"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":["reduced_feature_embd.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"ZvOJIz0evMg2","executionInfo":{"status":"ok","timestamp":1666984040691,"user_tz":240,"elapsed":66,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"4b335658-13d6-45ac-fc66-2db3466bf431"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["          0         1         2         3         4\n","0  5.829379  7.188879  5.905200 -0.412281  8.191950\n","1  6.449986  6.307540  7.369937 -0.007682  7.781770\n","2  6.641083  8.506640  6.149587 -0.332711  7.944887\n","3  7.147574  6.795663  5.663146  0.219597  7.048291\n","4  1.926250  7.740953  5.823658  0.978812  7.035870"]},"metadata":{},"execution_count":74}]},
{"cell_type":"code","source":["# Test dataset document embedding\n","embd_obj.data = test_df\n","test_df = embd_obj.generate_embedding()"],"metadata":{"id":"q-v5pnfQR68e","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1666984073037,"user_tz":240,"elapsed":32384,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"c62b6e37-5839-47f0-a544-e7c38c7ee3ba"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Generating embedding vectors ...\n","Processed 10000 documents\n","Processed 11000 documents\n","Processed 11000 documents. Done!\n"]}]},
{"cell_type":"code","source":["# Test dataset feature reduction using UMAP\n","doc_embd_test = pd.DataFrame([list(emb) for emb in test_df['article_embd'].values])\n","reduced_feature_embd_test, _ = embd_obj.feature_reduction(doc_embd_test, umap_reducer)"],"metadata":{"id":"pLg7nXlrOJQW"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":["reduced_feature_embd_test.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"3LxJO69qvRJF","executionInfo":{"status":"ok","timestamp":1666984079357,"user_tz":240,"elapsed":48,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"0388aba4-2197-4e25-d650-c16bc468a10f"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["          0         1         2         3         4\n","0  6.384241  6.152116  6.909705  0.143703  7.433092\n","1  4.364654  2.928921  4.393867  1.090112  7.379026\n","2  6.726593  8.498932  6.248105 -0.239759  7.818388\n","3  7.369310  5.427250  4.332436  0.281037  7.733836\n","4  6.765358  4.768935  4.028739  0.633608  7.600544"]},"metadata":{},"execution_count":77}]},
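{"cell_type":"markdown","source":["- Illustrative sketch (not part of the original notebook): map one unseen, made-up article into the same 5-dimensional UMAP space at prediction time, reusing `embd_obj` and the fitted `umap_reducer` from above."],"metadata":{}},
{"cell_type":"code","source":["# Added sketch: embed a made-up article and project it with the fitted reducer\n","new_article = 'Researchers describe a wearable device that tracks blood pressure.'\n","new_doc_vector = embd_obj.doc_to_vectors(new_article)  # 768-dim document vector\n","new_doc_reduced = umap_reducer.transform(new_doc_vector.reshape(1, -1))  # shape (1, 5)\n","print(new_doc_reduced)"],"metadata":{},"execution_count":null,"outputs":[]},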
{"cell_type":"code","source":["reduced_feature_embd_test.to_csv('/content/drive/MyDrive/GLG_test_data_emb.csv', sep=',', index=False)"],"metadata":{"id":"9fQdi95vcU7P"},"execution_count":null,"outputs":[]},
{"cell_type":"markdown","source":["# Dump reducer model to be used for prediction"],"metadata":{"id":"fTpOhQ-n7nHw"}},
{"cell_type":"code","source":["import pickle\n","reducer_file = \"/content/drive/MyDrive/umap_reducer_model.sav\"\n","with open(reducer_file, 'wb') as f:\n","    pickle.dump(umap_reducer, f)"],"metadata":{"id":"x5B-OhY47lyI"},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":["# Load the reducer model back like this:\n","# saved_reducer = pickle.load(open(reducer_file, 'rb'))"],"metadata":{"id":"IOf_Tyf78RTX"},"execution_count":null,"outputs":[]}]}