Spaces:

KedirAhmed
/

Topic-modeling-and-NER

No application file

File size: 22,706 Bytes

21b78eb

{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":[],"machine_shape":"hm"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"gpuClass":"standard","accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["# Install required Libraries"],"metadata":{"id":"TrV0i1Vk3_cE"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"eNA2FS2VPSwI"},"outputs":[],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["# SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings.\n","\n","!pip install -U sentence-transformers\n","!pip install umap-learn\n","!pip install joblib==1.2.0"],"metadata":{"id":"alH-oKMa4EfV"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Imports and Setup"],"metadata":{"id":"6GbZy4iAXEVe"}},{"cell_type":"code","source":["import pandas as pd\n","pd.set_option('max_colwidth',150)\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","import numpy as np\n","import nltk\n","import numpy as np\n","import os\n","import re\n","from string import punctuation\n","from datetime import datetime as dt\n","from sklearn.model_selection import train_test_split\n","%matplotlib inline\n","\n","#importing sentence transformer\n","from sentence_transformers import SentenceTransformer\n","\n","from umap import UMAP\n","\n","\n","import pickle"],"metadata":{"id":"5tPVn0h1R7cD"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Loading NLTK Modules\n","import nltk\n","# nltk.download('all')\n","nltk.download('stopwords')\n","nltk.download('punkt')\n","nltk.download('wordnet')\n","nltk.download('omw-1.4')\n","nltk.download('averaged_perceptron_tagger')\n","from nltk.stem import WordNetLemmatizer\n","from nltk.corpus import stopwords\n","from nltk.tokenize import 
sent_tokenize"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yf7zlt-enNO2","outputId":"288bfbb2-1335-4cdf-8bc9-a5a4a6cbb76a","executionInfo":{"status":"ok","timestamp":1666973924906,"user_tz":240,"elapsed":1407,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}}},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data]   Unzipping corpora/stopwords.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data]   Unzipping tokenizers/punkt.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package wordnet to /root/nltk_data...\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package averaged_perceptron_tagger to\n","[nltk_data]     /root/nltk_data...\n","[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4}]},{"cell_type":"code","source":[
"class documentEmbedding:\n",
"\n",
"  '''\n",
"  Build one embedding vector per article and reduce the vectors with UMAP.\n",
"\n",
"  Each article is split into sentences, every sentence is cleaned and encoded\n",
"  with a SentenceTransformer model, and the sentence vectors are averaged into\n",
"  a single document vector.\n",
"\n",
"  Parameters:\n",
"  -----------\n",
"\n",
"  df: pandas.DataFrame\n",
"  the articles to embed; the text is read from its 'article' column.\n",
"\n",
"  model_name: str\n",
"  name of the SentenceTransformer model to load.\n",
"\n",
"  device: str or None\n",
"  device handed to SentenceTransformer, e.g. 'cuda' or 'cpu'. None leaves the\n",
"  choice to the library, so the class also runs on a machine without a GPU.\n",
"\n",
"  '''\n",
"\n",
"  def __init__(self, df, model_name=\"all-mpnet-base-v2\", device=None):\n",
"    self.data = df\n",
"    self.sentence_model = SentenceTransformer(model_name, device=device)\n",
"    self.count = 0\n",
"\n",
"  def doc_clean(self, text):\n",
"    '''Lower-case the text, turn non-breaking spaces into spaces and drop punctuation.'''\n",
"    text = text.lower()\n",
"    # a non-breaking space separates words, so it becomes a plain space instead of vanishing\n",
"    text = text.replace('\\xa0', ' ')\n",
"    # '-' is escaped: a bare ',-/' inside [...] is a range that also covers '.'\n",
"    text = re.sub('[!\"#$%&\\'()’*+,\\\\-./:;<=>?—@\\\\[\\\\]^_`{|}~]', '', text)\n",
"    return text\n",
"\n",
"  def sentence_to_vector(self, sent):\n",
"    '''Encode one sentence, or a list of sentences, with the sentence model.'''\n",
"    # no device override here: the model runs on the device chosen in __init__\n",
"    embeddings = self.sentence_model.encode(sent, show_progress_bar=False)\n",
"\n",
"    return embeddings\n",
"\n",
"  def doc_to_vectors(self, doc):\n",
"    '''Return one vector for a document: the mean of its sentence embeddings.'''\n",
"    self.count += 1\n",
"    if self.count % 1000 == 0:\n",
"      print (\"Processed {} documents\".format(self.count))\n",
"    # split into sentences BEFORE cleaning: doc_clean strips '.', '!' and '?',\n",
"    # which are the boundaries sent_tokenize relies on\n",
"    sentences = [self.doc_clean(sent) for sent in sent_tokenize(str(doc))]\n",
"    sentences = [sent for sent in sentences if sent.strip()]\n",
"    if not sentences:\n",
"      # an empty document still yields a vector of the usual length\n",
"      sentences = ['']\n",
"    # one batched call; the result holds one row per sentence\n",
"    vectors = np.asarray(self.sentence_to_vector(sentences))\n",
"    # average the rows: an element-wise product of many values below 1\n",
"    # shrinks towards zero as the number of sentences grows\n",
"    doc_embd = vectors.mean(axis=0)\n",
"\n",
"    return doc_embd\n",
"\n",
"  def generate_embedding(self):\n",
"    '''Add an 'article_embd' column to self.data (modified in place) and return the frame.'''\n",
"    print(\"Generating embedding vectors ...\")\n",
"    # restart the progress counter so a later run on another frame counts from zero\n",
"    self.count = 0\n",
"    df = self.data\n",
"    df['article_embd'] = df['article'].apply(self.doc_to_vectors)\n",
"    print (\"Processed {} documents. Done!\".format(self.count))\n",
"    return df\n",
"\n",
"  def feature_reduction(self, embd_vector, umap_reducer=False):\n",
"    '''\n",
"    Project embedding vectors to 5 dimensions with UMAP.\n",
"\n",
"    When no reducer is passed, a new UMAP model is fitted on embd_vector;\n",
"    otherwise the passed, already fitted reducer is applied to embd_vector.\n",
"    Returns the reduced vectors as a DataFrame and the reducer that produced them.\n",
"    '''\n",
"    # explicit sentinel test instead of relying on the truthiness of an estimator object\n",
"    if umap_reducer is None or umap_reducer is False:\n",
"      reducer = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n",
"      data_umap = reducer.fit_transform(embd_vector)\n",
"    else:\n",
"      reducer = umap_reducer\n",
"      data_umap = reducer.transform(embd_vector)\n",
"    return pd.DataFrame(data_umap), reducer\n",
"\n",
"  def save_data(self, path_file, data):\n",
"    \"\"\"\n",
"    Pickle data (for example a dataframe) to the file at path_file.\n",
"    \"\"\"\n",
"    with open(path_file, \"wb\") as f:\n",
"        pickle.dump(data, f)\n",
"\n",
"    print('The file is saved')"
],"metadata":{"id":"vnab3ToAR7o2"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["data_path = '/content/drive/MyDrive/data_tech_health.csv'\n","df = pd.read_csv(data_path, sep=',')"],"metadata":{"id":"0yfvTUhdMMZv"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_df, test_df = train_test_split(df, test_size=0.1, random_state=41, stratify=df['tech_health_tag'])"],"metadata":{"id":"Pza1fFjjQPz7"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_df, test_df = train_df.reset_index(drop=True), test_df.reset_index(drop=True)"],"metadata":{"id":"Me2jqP3W9m7C"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_df.shape, test_df.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"a98igIPIXcfU","executionInfo":{"status":"ok","timestamp":1666975574313,"user_tz":240,"elapsed":140,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"d854162f-f155-45bf-c8b7-8defacb0a5ac"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((9900, 11), (1100, 11))"]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","source":["- Save the 
data"],"metadata":{"id":"4zTpoPFI-IXk"}},{"cell_type":"code","source":["train_df.to_csv('/content/drive/MyDrive/GLG_train_data.csv', sep=',', index=False)\n","test_df.to_csv('/content/drive/MyDrive/GLG_test_data.csv', sep=',', index=False)"],"metadata":{"id":"nocwHdUp99ij"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["- Note: change runtime type to GPU"],"metadata":{"id":"EM1qCUaFZBBC"}},{"cell_type":"code","source":["# Training dataset document embedding\n","embd_obj = documentEmbedding(train_df)"],"metadata":{"id":"GMFcOZsTgvc0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_df = embd_obj.generate_embedding()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jWRAa6b4mXKE","executionInfo":{"status":"ok","timestamp":1666984006106,"user_tz":240,"elapsed":273544,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"558ad1bf-747c-4b89-8bfd-9fd5b048ffe5"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Generating embedding vectors ...\n","Processed 1000 documents\n","Processed 2000 documents\n","Processed 3000 documents\n","Processed 4000 documents\n","Processed 5000 documents\n","Processed 6000 documents\n","Processed 7000 documents\n","Processed 8000 documents\n","Processed 9000 documents\n","Processed 9900 documents. 
Done!\n"]}]},{"cell_type":"code","source":["# Training dataset feature reduction using UMAP\n","doc_embd = pd.DataFrame([list(emb) for emb in train_df['article_embd'].values])\n","reduced_feature_embd, umap_reducer = embd_obj.feature_reduction(doc_embd)"],"metadata":{"id":"4zMrkX6IM8Vu"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["reduced_feature_embd.to_csv('/content/drive/MyDrive/GLG_train_data_emb.csv', sep=',', index=False)"],"metadata":{"id":"avnMucH5sUYp"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["reduced_feature_embd.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"ZvOJIz0evMg2","executionInfo":{"status":"ok","timestamp":1666984040691,"user_tz":240,"elapsed":66,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"4b335658-13d6-45ac-fc66-2db3466bf431"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["          0         1         2         3         4\n","0  5.829379  7.188879  5.905200 -0.412281  8.191950\n","1  6.449986  6.307540  7.369937 -0.007682  7.781770\n","2  6.641083  8.506640  6.149587 -0.332711  7.944887\n","3  7.147574  6.795663  5.663146  0.219597  7.048291\n","4  1.926250  7.740953  5.823658  0.978812  7.035870"],"text/html":["\n","  <div id=\"df-ca1660a1-f0b7-423d-879a-b5e020e4b495\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>0</th>\n","      <th>1</th>\n","      <th>2</th>\n","      <th>3</th>\n","      <th>4</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    
<tr>\n","      <th>0</th>\n","      <td>5.829379</td>\n","      <td>7.188879</td>\n","      <td>5.905200</td>\n","      <td>-0.412281</td>\n","      <td>8.191950</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>6.449986</td>\n","      <td>6.307540</td>\n","      <td>7.369937</td>\n","      <td>-0.007682</td>\n","      <td>7.781770</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>6.641083</td>\n","      <td>8.506640</td>\n","      <td>6.149587</td>\n","      <td>-0.332711</td>\n","      <td>7.944887</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>7.147574</td>\n","      <td>6.795663</td>\n","      <td>5.663146</td>\n","      <td>0.219597</td>\n","      <td>7.048291</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>1.926250</td>\n","      <td>7.740953</td>\n","      <td>5.823658</td>\n","      <td>0.978812</td>\n","      <td>7.035870</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-ca1660a1-f0b7-423d-879a-b5e020e4b495')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      
display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-ca1660a1-f0b7-423d-879a-b5e020e4b495 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-ca1660a1-f0b7-423d-879a-b5e020e4b495');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? 
Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":74}]},{"cell_type":"code","source":["# Test dataset document embedding\n","embd_obj.data = test_df\n","test_df = embd_obj.generate_embedding()"],"metadata":{"id":"q-v5pnfQR68e","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1666984073037,"user_tz":240,"elapsed":32384,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"c62b6e37-5839-47f0-a544-e7c38c7ee3ba"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Generating embedding vectors ...\n","Processed 10000 documents\n","Processed 11000 documents\n","Processed 11000 documents. 
Done!\n"]}]},{"cell_type":"code","source":["# Test dataset feature reduction using UMAP\n","doc_embd_test = pd.DataFrame([list(emb) for emb in test_df['article_embd'].values])\n","reduced_feature_embd_test, _ = embd_obj.feature_reduction(doc_embd_test, umap_reducer)                             "],"metadata":{"id":"pLg7nXlrOJQW"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["reduced_feature_embd_test.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"3LxJO69qvRJF","executionInfo":{"status":"ok","timestamp":1666984079357,"user_tz":240,"elapsed":48,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"0388aba4-2197-4e25-d650-c16bc468a10f"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["          0         1         2         3         4\n","0  6.384241  6.152116  6.909705  0.143703  7.433092\n","1  4.364654  2.928921  4.393867  1.090112  7.379026\n","2  6.726593  8.498932  6.248105 -0.239759  7.818388\n","3  7.369310  5.427250  4.332436  0.281037  7.733836\n","4  6.765358  4.768935  4.028739  0.633608  7.600544"],"text/html":["\n","  <div id=\"df-58146203-dcda-4eff-9aef-4b349cb4a723\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>0</th>\n","      <th>1</th>\n","      <th>2</th>\n","      <th>3</th>\n","      <th>4</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>6.384241</td>\n","      <td>6.152116</td>\n","      <td>6.909705</td>\n","      <td>0.143703</td>\n","      
<td>7.433092</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>4.364654</td>\n","      <td>2.928921</td>\n","      <td>4.393867</td>\n","      <td>1.090112</td>\n","      <td>7.379026</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>6.726593</td>\n","      <td>8.498932</td>\n","      <td>6.248105</td>\n","      <td>-0.239759</td>\n","      <td>7.818388</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>7.369310</td>\n","      <td>5.427250</td>\n","      <td>4.332436</td>\n","      <td>0.281037</td>\n","      <td>7.733836</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>6.765358</td>\n","      <td>4.768935</td>\n","      <td>4.028739</td>\n","      <td>0.633608</td>\n","      <td>7.600544</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-58146203-dcda-4eff-9aef-4b349cb4a723')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: 
none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-58146203-dcda-4eff-9aef-4b349cb4a723 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-58146203-dcda-4eff-9aef-4b349cb4a723');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? 
Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":77}]},{"cell_type":"code","source":["reduced_feature_embd_test.to_csv('/content/drive/MyDrive/GLG_test_data_emb.csv', sep=',', index=False)"],"metadata":{"id":"9fQdi95vcU7P"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Dump reducer model to be used for prediction"],"metadata":{"id":"fTpOhQ-n7nHw"}},{"cell_type":"code","source":["import pickle \n","reducer_file = \"/content/drive/MyDrive/umap_reducer_model.sav\"\n","pickle.dump(umap_reducer, open(reducer_file, 'wb'))"],"metadata":{"id":"x5B-OhY47lyI"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Do this to Load back the reducer model\n","# saved_reducer = pickle.load(open(reducer_file, 'rb'))"],"metadata":{"id":"IOf_Tyf78RTX"},"execution_count":null,"outputs":[]}]}