Spaces:
No application file
No application file
File size: 22,706 Bytes
21b78eb |
1 |
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":[],"machine_shape":"hm"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"gpuClass":"standard","accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["# Install required Libraries"],"metadata":{"id":"TrV0i1Vk3_cE"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"eNA2FS2VPSwI"},"outputs":[],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["# SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings.\n","\n","!pip install -U sentence-transformers\n","!pip install umap-learn\n","!pip install joblib==1.2.0"],"metadata":{"id":"alH-oKMa4EfV"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Imports and Setup"],"metadata":{"id":"6GbZy4iAXEVe"}},{"cell_type":"code","source":["import pandas as pd\n","pd.set_option('max_colwidth',150)\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","import numpy as np\n","import nltk\n","import numpy as np\n","import os\n","import re\n","from string import punctuation\n","from datetime import datetime as dt\n","from sklearn.model_selection import train_test_split\n","%matplotlib inline\n","\n","#importing sentence transformer\n","from sentence_transformers import SentenceTransformer\n","\n","from umap import UMAP\n","\n","\n","import pickle"],"metadata":{"id":"5tPVn0h1R7cD"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Loading NLTK Modules\n","import nltk\n","# nltk.download('all')\n","nltk.download('stopwords')\n","nltk.download('punkt')\n","nltk.download('wordnet')\n","nltk.download('omw-1.4')\n","nltk.download('averaged_perceptron_tagger')\n","from nltk.stem import WordNetLemmatizer\n","from nltk.corpus import stopwords\n","from nltk.tokenize import sent_tokenize"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yf7zlt-enNO2","outputId":"288bfbb2-1335-4cdf-8bc9-a5a4a6cbb76a","executionInfo":{"status":"ok","timestamp":1666973924906,"user_tz":240,"elapsed":1407,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}}},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data] Unzipping corpora/stopwords.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Unzipping tokenizers/punkt.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package wordnet to /root/nltk_data...\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4},{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package averaged_perceptron_tagger to\n","[nltk_data] /root/nltk_data...\n","[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":4}]},{"cell_type":"code","source":["class documentEmbedding:\n","\n"," '''\n","\n"," This class can be used online (in colab) or offline (locally):\n"," 1. Online:\n"," If using this class in Colab and downloading the data from external source using the code\n"," in the notebook only run download_dataset function above in the code cell.\n"," 2. Offline:\n"," If using this class to process news article data available in a local directory,\n"," \"data_path\" parameter should be defined.\n"," Where \"data_path\" is the path to the folder containing all news articles datasets\n"," datasets:\n","\n"," Parameters:\n"," -----------\n","\n"," data_path: str\n"," the path to 'all-the-news-2-1.csv' data if the data is downloaded from GDrive or other location.\n","\n"," '''\n","\n"," def __init__(self, df):\n"," self.data = df\n"," self.sentence_model = SentenceTransformer(\"all-mpnet-base-v2\")\n"," self.count = 0\n","\n"," def doc_clean(self, text):\n"," text = text.lower()\n"," text = text.replace('\\xa0', '')\n"," text = re.sub('[!\"#$%&\\'()β*+,-/:;<=>?β@[\\\\]^_`{|}~β]', '', text)\n"," return text\n","\n"," def sentence_to_vector(self, sent):\n"," # Encode the sentence\n"," embeddings = self.sentence_model.encode(sent, show_progress_bar=False, device='cuda')\n","\n"," return embeddings\n"," \n"," def doc_to_vectors(self, doc):\n"," self.count += 1\n"," if self.count % 1000 == 0:\n"," print (\"Processed {} documents\".format(self.count))\n"," doc = self.doc_clean(str(doc))\n"," sentences = sent_tokenize(doc)\n"," # sentence to vector representation\n"," vector = [self.sentence_to_vector(sent) for sent in sentences]\n"," doc_embd = np.multiply.reduce(vector)\n","\n"," return doc_embd\n","\n"," def generate_embedding(self):\n"," print(\"Generating embedding vectors ...\")\n"," df = self.data\n"," df['article_embd'] = df['article'].apply(self.doc_to_vectors)\n"," print (\"Processed {} documents. Done!\".format(self.count))\n"," return df\n","\n"," def feature_reduction(self, embd_vector, umap_reducer=False):\n"," if umap_reducer:\n"," reducer = umap_reducer\n"," data_umap = reducer.transform(embd_vector)\n"," else:\n"," reducer = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n"," reducer.fit(embd_vector)\n"," data_umap = reducer.transform(embd_vector)\n"," return pd.DataFrame(data_umap), reducer \n","\n"," def save_data(self, path_file, data):\n"," \"\"\"\n"," 1. Accept dataframe and save in specific path location as a pickle \n"," \"\"\" \n"," with open(path_file, \"wb\") as f:\n"," pickle.dump(data, f)\n","\n"," print('The file is saved')"],"metadata":{"id":"vnab3ToAR7o2"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["data_path = '/content/drive/MyDrive/data_tech_health.csv'\n","df = pd.read_csv(data_path, sep=',')"],"metadata":{"id":"0yfvTUhdMMZv"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_df, test_df = train_test_split(df, test_size=0.1, random_state=41, stratify=df['tech_health_tag'])"],"metadata":{"id":"Pza1fFjjQPz7"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_df, test_df = train_df.reset_index(drop=True), test_df.reset_index(drop=True)"],"metadata":{"id":"Me2jqP3W9m7C"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_df.shape, test_df.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"a98igIPIXcfU","executionInfo":{"status":"ok","timestamp":1666975574313,"user_tz":240,"elapsed":140,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"d854162f-f155-45bf-c8b7-8defacb0a5ac"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((9900, 11), (1100, 11))"]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","source":["- Save the data"],"metadata":{"id":"4zTpoPFI-IXk"}},{"cell_type":"code","source":["train_df.to_csv('/content/drive/MyDrive/GLG_train_data.csv', sep=',', index=False)\n","test_df.to_csv('/content/drive/MyDrive/GLG_test_data.csv', sep=',', index=False)"],"metadata":{"id":"nocwHdUp99ij"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["- Note: change runtime type to GPU"],"metadata":{"id":"EM1qCUaFZBBC"}},{"cell_type":"code","source":["# Training dataset document embedding\n","embd_obj = documentEmbedding(train_df)"],"metadata":{"id":"GMFcOZsTgvc0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_df = embd_obj.generate_embedding()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jWRAa6b4mXKE","executionInfo":{"status":"ok","timestamp":1666984006106,"user_tz":240,"elapsed":273544,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"558ad1bf-747c-4b89-8bfd-9fd5b048ffe5"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Generating embedding vectors ...\n","Processed 1000 documents\n","Processed 2000 documents\n","Processed 3000 documents\n","Processed 4000 documents\n","Processed 5000 documents\n","Processed 6000 documents\n","Processed 7000 documents\n","Processed 8000 documents\n","Processed 9000 documents\n","Processed 9900 documents. Done!\n"]}]},{"cell_type":"code","source":["# Training dataset feature reduction using UMAP\n","doc_embd = pd.DataFrame([list(emb) for emb in train_df['article_embd'].values])\n","reduced_feature_embd, umap_reducer = embd_obj.feature_reduction(doc_embd)"],"metadata":{"id":"4zMrkX6IM8Vu"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["reduced_feature_embd.to_csv('/content/drive/MyDrive/GLG_train_data_emb.csv', sep=',', index=False)"],"metadata":{"id":"avnMucH5sUYp"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["reduced_feature_embd.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"ZvOJIz0evMg2","executionInfo":{"status":"ok","timestamp":1666984040691,"user_tz":240,"elapsed":66,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"4b335658-13d6-45ac-fc66-2db3466bf431"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" 0 1 2 3 4\n","0 5.829379 7.188879 5.905200 -0.412281 8.191950\n","1 6.449986 6.307540 7.369937 -0.007682 7.781770\n","2 6.641083 8.506640 6.149587 -0.332711 7.944887\n","3 7.147574 6.795663 5.663146 0.219597 7.048291\n","4 1.926250 7.740953 5.823658 0.978812 7.035870"],"text/html":["\n"," <div id=\"df-ca1660a1-f0b7-423d-879a-b5e020e4b495\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>0</th>\n"," <th>1</th>\n"," <th>2</th>\n"," <th>3</th>\n"," <th>4</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>5.829379</td>\n"," <td>7.188879</td>\n"," <td>5.905200</td>\n"," <td>-0.412281</td>\n"," <td>8.191950</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>6.449986</td>\n"," <td>6.307540</td>\n"," <td>7.369937</td>\n"," <td>-0.007682</td>\n"," <td>7.781770</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>6.641083</td>\n"," <td>8.506640</td>\n"," <td>6.149587</td>\n"," <td>-0.332711</td>\n"," <td>7.944887</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>7.147574</td>\n"," <td>6.795663</td>\n"," <td>5.663146</td>\n"," <td>0.219597</td>\n"," <td>7.048291</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>1.926250</td>\n"," <td>7.740953</td>\n"," <td>5.823658</td>\n"," <td>0.978812</td>\n"," <td>7.035870</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-ca1660a1-f0b7-423d-879a-b5e020e4b495')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-ca1660a1-f0b7-423d-879a-b5e020e4b495 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-ca1660a1-f0b7-423d-879a-b5e020e4b495');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":74}]},{"cell_type":"code","source":["# Test dataset document embedding\n","embd_obj.data = test_df\n","test_df = embd_obj.generate_embedding()"],"metadata":{"id":"q-v5pnfQR68e","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1666984073037,"user_tz":240,"elapsed":32384,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"c62b6e37-5839-47f0-a544-e7c38c7ee3ba"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Generating embedding vectors ...\n","Processed 10000 documents\n","Processed 11000 documents\n","Processed 11000 documents. Done!\n"]}]},{"cell_type":"code","source":["# Test dataset feature reduction using UMAP\n","doc_embd_test = pd.DataFrame([list(emb) for emb in test_df['article_embd'].values])\n","reduced_feature_embd_test, _ = embd_obj.feature_reduction(doc_embd_test, umap_reducer) "],"metadata":{"id":"pLg7nXlrOJQW"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["reduced_feature_embd_test.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"3LxJO69qvRJF","executionInfo":{"status":"ok","timestamp":1666984079357,"user_tz":240,"elapsed":48,"user":{"displayName":"Hedra Seid","userId":"08577190734824555879"}},"outputId":"0388aba4-2197-4e25-d650-c16bc468a10f"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" 0 1 2 3 4\n","0 6.384241 6.152116 6.909705 0.143703 7.433092\n","1 4.364654 2.928921 4.393867 1.090112 7.379026\n","2 6.726593 8.498932 6.248105 -0.239759 7.818388\n","3 7.369310 5.427250 4.332436 0.281037 7.733836\n","4 6.765358 4.768935 4.028739 0.633608 7.600544"],"text/html":["\n"," <div id=\"df-58146203-dcda-4eff-9aef-4b349cb4a723\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>0</th>\n"," <th>1</th>\n"," <th>2</th>\n"," <th>3</th>\n"," <th>4</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>6.384241</td>\n"," <td>6.152116</td>\n"," <td>6.909705</td>\n"," <td>0.143703</td>\n"," <td>7.433092</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>4.364654</td>\n"," <td>2.928921</td>\n"," <td>4.393867</td>\n"," <td>1.090112</td>\n"," <td>7.379026</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>6.726593</td>\n"," <td>8.498932</td>\n"," <td>6.248105</td>\n"," <td>-0.239759</td>\n"," <td>7.818388</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>7.369310</td>\n"," <td>5.427250</td>\n"," <td>4.332436</td>\n"," <td>0.281037</td>\n"," <td>7.733836</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>6.765358</td>\n"," <td>4.768935</td>\n"," <td>4.028739</td>\n"," <td>0.633608</td>\n"," <td>7.600544</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-58146203-dcda-4eff-9aef-4b349cb4a723')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-58146203-dcda-4eff-9aef-4b349cb4a723 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-58146203-dcda-4eff-9aef-4b349cb4a723');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":77}]},{"cell_type":"code","source":["reduced_feature_embd_test.to_csv('/content/drive/MyDrive/GLG_test_data_emb.csv', sep=',', index=False)"],"metadata":{"id":"9fQdi95vcU7P"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Dump reducer model to be used for prediction"],"metadata":{"id":"fTpOhQ-n7nHw"}},{"cell_type":"code","source":["import pickle \n","reducer_file = \"/content/drive/MyDrive/umap_reducer_model.sav\"\n","pickle.dump(umap_reducer, open(reducer_file, 'wb'))"],"metadata":{"id":"x5B-OhY47lyI"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Do this to Load back the reducer model\n","# saved_reducer = pickle.load(open(reducer_file, 'rb'))"],"metadata":{"id":"IOf_Tyf78RTX"},"execution_count":null,"outputs":[]}]} |