Azarthehulk
/

text_processing

Model card Files Files and versions Community

Azarthehulk commited on Feb 7, 2023

Commit

42ed916

•

1 Parent(s): 5533322

Upload text processing.ipynb (#1)

Browse files

- Upload text processing.ipynb (2eac9cd0758a95206488e298e9d854b0c17f81f0)

Files changed (1) hide show

text processing.ipynb +1852 -0

text processing.ipynb ADDED Viewed

	@@ -0,0 +1,1852 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fc8cef68",
+   "metadata": {},
+   "source": [
+    "# text processing:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "61636845",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b7cc480a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data=pd.read_csv(\"Reviews.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "2b61b374",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',\n",
+       "       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ffe1915d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(568454, 10)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "7658aeba",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "201034                           Raspberry taste TOO strong\n",
+       "511879                        The whole family loves these!\n",
+       "8685                                  Well-rounded but weak\n",
+       "385899    1/10 as strong as other brands, won't disolve ...\n",
+       "47587                                        Perfect fudge!\n",
+       "5222                                  Great snack, however,\n",
+       "560412    Looks, smells and probably tastes like real ch...\n",
+       "479143                       Delicious way to start the day\n",
+       "307484                                  A dessert in a cup!\n",
+       "189729                            SUPERB! GREAT FOR BAKING!\n",
+       "Name: Summary, dtype: object"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.Summary.sample(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "379273cc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.Text[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "38332201",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Good Quality Dog Food'"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.Summary[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "73a4c938",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import string\n",
+    "string.punctuation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "93270bf2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove every punctuation character listed in string.punctuation (shown above) from the text.\n",
+    "def remove_punctuation(text):\n",
+    "    punctuationfree=\"\".join([i for i in text if i not in string.punctuation])\n",
+    "    return punctuationfree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "95437c3b",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Id</th>\n",
+       "      <th>ProductId</th>\n",
+       "      <th>UserId</th>\n",
+       "      <th>ProfileName</th>\n",
+       "      <th>HelpfulnessNumerator</th>\n",
+       "      <th>HelpfulnessDenominator</th>\n",
+       "      <th>Score</th>\n",
+       "      <th>Time</th>\n",
+       "      <th>Summary</th>\n",
+       "      <th>Text</th>\n",
+       "      <th>clean_msg</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>B001E4KFG0</td>\n",
+       "      <td>A3SGXH7AUHU8GW</td>\n",
+       "      <td>delmartian</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1303862400</td>\n",
+       "      <td>Good Quality Dog Food</td>\n",
+       "      <td>I have bought several of the Vitality canned d...</td>\n",
+       "      <td>I have bought several of the Vitality canned d...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>B00813GRG4</td>\n",
+       "      <td>A1D87F6ZCVE5NK</td>\n",
+       "      <td>dll pa</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1346976000</td>\n",
+       "      <td>Not as Advertised</td>\n",
+       "      <td>Product arrived labeled as Jumbo Salted Peanut...</td>\n",
+       "      <td>Product arrived labeled as Jumbo Salted Peanut...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>B000LQOCH0</td>\n",
+       "      <td>ABXLMWJIXXAIN</td>\n",
+       "      <td>Natalia Corres \"Natalia Corres\"</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>1219017600</td>\n",
+       "      <td>\"Delight\" says it all</td>\n",
+       "      <td>This is a confection that has been around a fe...</td>\n",
+       "      <td>This is a confection that has been around a fe...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>B000UA0QIQ</td>\n",
+       "      <td>A395BORC6FGVXV</td>\n",
+       "      <td>Karl</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1307923200</td>\n",
+       "      <td>Cough Medicine</td>\n",
+       "      <td>If you are looking for the secret ingredient i...</td>\n",
+       "      <td>If you are looking for the secret ingredient i...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>B006K2ZZ7K</td>\n",
+       "      <td>A1UQRSCLF8GW1T</td>\n",
+       "      <td>Michael D. Bigham \"M. Wassir\"</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1350777600</td>\n",
+       "      <td>Great taffy</td>\n",
+       "      <td>Great taffy at a great price.  There was a wid...</td>\n",
+       "      <td>Great taffy at a great price  There was a wide...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Id   ProductId          UserId                      ProfileName  \\\n",
+       "0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   \n",
+       "1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   \n",
+       "2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres \"Natalia Corres\"   \n",
+       "3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   \n",
+       "4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham \"M. Wassir\"   \n",
+       "\n",
+       "   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \\\n",
+       "0                     1                       1      5  1303862400   \n",
+       "1                     0                       0      1  1346976000   \n",
+       "2                     1                       1      4  1219017600   \n",
+       "3                     3                       3      2  1307923200   \n",
+       "4                     0                       0      5  1350777600   \n",
+       "\n",
+       "                 Summary                                               Text  \\\n",
+       "0  Good Quality Dog Food  I have bought several of the Vitality canned d...   \n",
+       "1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...   \n",
+       "2  \"Delight\" says it all  This is a confection that has been around a fe...   \n",
+       "3         Cough Medicine  If you are looking for the secret ingredient i...   \n",
+       "4            Great taffy  Great taffy at a great price.  There was a wid...   \n",
+       "\n",
+       "                                           clean_msg  \n",
+       "0  I have bought several of the Vitality canned d...  \n",
+       "1  Product arrived labeled as Jumbo Salted Peanut...  \n",
+       "2  This is a confection that has been around a fe...  \n",
+       "3  If you are looking for the secret ingredient i...  \n",
+       "4  Great taffy at a great price  There was a wide...  "
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Store the punctuation-free text in a new column\n",
+    "data['clean_msg']= data['Text'].apply(lambda x:remove_punctuation(x))\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "7f6436d5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                                clean_msg  \\\n",
+      "43350   Great tasting convenient coffeejust a little p...   \n",
+      "136363  Really good coffee Good flavor and would buy a...   \n",
+      "\n",
+      "                                                     Text  \n",
+      "43350   Great tasting, convenient coffee..just a littl...  \n",
+      "136363  Really good coffee. Good flavor and would buy ...  \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data[['clean_msg','Text']].sample(2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "954459ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data['text_lower']=data['clean_msg'].apply(lambda x: x.lower())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "29d12dbc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Text</th>\n",
+       "      <th>clean_msg</th>\n",
+       "      <th>text_lower</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>330905</th>\n",
+       "      <td>Excellent, easy, and tasty. It is usually chea...</td>\n",
+       "      <td>Excellent easy and tasty It is usually cheaper...</td>\n",
+       "      <td>excellent easy and tasty it is usually cheaper...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>291052</th>\n",
+       "      <td>This water is very good for the immune system....</td>\n",
+       "      <td>This water is very good for the immune system ...</td>\n",
+       "      <td>this water is very good for the immune system ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>389090</th>\n",
+       "      <td>I enjoy these. I'm not sure the price is a gre...</td>\n",
+       "      <td>I enjoy these Im not sure the price is a great...</td>\n",
+       "      <td>i enjoy these im not sure the price is a great...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>486142</th>\n",
+       "      <td>Is Wellness relatively expensive? Yes. Is it w...</td>\n",
+       "      <td>Is Wellness relatively expensive Yes Is it wor...</td>\n",
+       "      <td>is wellness relatively expensive yes is it wor...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>496694</th>\n",
+       "      <td>Maybe the claims are true but who can really t...</td>\n",
+       "      <td>Maybe the claims are true but who can really t...</td>\n",
+       "      <td>maybe the claims are true but who can really t...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                     Text  \\\n",
+       "330905  Excellent, easy, and tasty. It is usually chea...   \n",
+       "291052  This water is very good for the immune system....   \n",
+       "389090  I enjoy these. I'm not sure the price is a gre...   \n",
+       "486142  Is Wellness relatively expensive? Yes. Is it w...   \n",
+       "496694  Maybe the claims are true but who can really t...   \n",
+       "\n",
+       "                                                clean_msg  \\\n",
+       "330905  Excellent easy and tasty It is usually cheaper...   \n",
+       "291052  This water is very good for the immune system ...   \n",
+       "389090  I enjoy these Im not sure the price is a great...   \n",
+       "486142  Is Wellness relatively expensive Yes Is it wor...   \n",
+       "496694  Maybe the claims are true but who can really t...   \n",
+       "\n",
+       "                                               text_lower  \n",
+       "330905  excellent easy and tasty it is usually cheaper...  \n",
+       "291052  this water is very good for the immune system ...  \n",
+       "389090  i enjoy these im not sure the price is a great...  \n",
+       "486142  is wellness relatively expensive yes is it wor...  \n",
+       "496694  maybe the claims are true but who can really t...  "
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[['Text','clean_msg','text_lower']].sample(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "69f9cc57",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to\n",
+      "[nltk_data]     /Users/azarmohammad/nltk_data...\n",
+      "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "nltk.download('punkt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "e1005f2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#defining function for tokenization\n",
+    "import re\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "def tokenization(text):\n",
+    "    tokens = word_tokenize(text)\n",
+    "    #tokens = re.split('W+',text)\n",
+    "    return tokens\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "065bd725",
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Id</th>\n",
+       "      <th>ProductId</th>\n",
+       "      <th>UserId</th>\n",
+       "      <th>ProfileName</th>\n",
+       "      <th>HelpfulnessNumerator</th>\n",
+       "      <th>HelpfulnessDenominator</th>\n",
+       "      <th>Score</th>\n",
+       "      <th>Time</th>\n",
+       "      <th>Summary</th>\n",
+       "      <th>Text</th>\n",
+       "      <th>clean_msg</th>\n",
+       "      <th>text_lower</th>\n",
+       "      <th>msg_tokenied</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>B001E4KFG0</td>\n",
+       "      <td>A3SGXH7AUHU8GW</td>\n",
+       "      <td>delmartian</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1303862400</td>\n",
+       "      <td>Good Quality Dog Food</td>\n",
+       "      <td>I have bought several of the Vitality canned d...</td>\n",
+       "      <td>I have bought several of the Vitality canned d...</td>\n",
+       "      <td>i have bought several of the vitality canned d...</td>\n",
+       "      <td>[i, have, bought, several, of, the, vitality, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>B00813GRG4</td>\n",
+       "      <td>A1D87F6ZCVE5NK</td>\n",
+       "      <td>dll pa</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1346976000</td>\n",
+       "      <td>Not as Advertised</td>\n",
+       "      <td>Product arrived labeled as Jumbo Salted Peanut...</td>\n",
+       "      <td>Product arrived labeled as Jumbo Salted Peanut...</td>\n",
+       "      <td>product arrived labeled as jumbo salted peanut...</td>\n",
+       "      <td>[product, arrived, labeled, as, jumbo, salted,...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>B000LQOCH0</td>\n",
+       "      <td>ABXLMWJIXXAIN</td>\n",
+       "      <td>Natalia Corres \"Natalia Corres\"</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>1219017600</td>\n",
+       "      <td>\"Delight\" says it all</td>\n",
+       "      <td>This is a confection that has been around a fe...</td>\n",
+       "      <td>This is a confection that has been around a fe...</td>\n",
+       "      <td>this is a confection that has been around a fe...</td>\n",
+       "      <td>[this, is, a, confection, that, has, been, aro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>B000UA0QIQ</td>\n",
+       "      <td>A395BORC6FGVXV</td>\n",
+       "      <td>Karl</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1307923200</td>\n",
+       "      <td>Cough Medicine</td>\n",
+       "      <td>If you are looking for the secret ingredient i...</td>\n",
+       "      <td>If you are looking for the secret ingredient i...</td>\n",
+       "      <td>if you are looking for the secret ingredient i...</td>\n",
+       "      <td>[if, you, are, looking, for, the, secret, ingr...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>B006K2ZZ7K</td>\n",
+       "      <td>A1UQRSCLF8GW1T</td>\n",
+       "      <td>Michael D. Bigham \"M. Wassir\"</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1350777600</td>\n",
+       "      <td>Great taffy</td>\n",
+       "      <td>Great taffy at a great price.  There was a wid...</td>\n",
+       "      <td>Great taffy at a great price  There was a wide...</td>\n",
+       "      <td>great taffy at a great price  there was a wide...</td>\n",
+       "      <td>[great, taffy, at, a, great, price, there, was...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Id   ProductId          UserId                      ProfileName  \\\n",
+       "0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   \n",
+       "1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   \n",
+       "2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres \"Natalia Corres\"   \n",
+       "3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   \n",
+       "4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham \"M. Wassir\"   \n",
+       "\n",
+       "   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \\\n",
+       "0                     1                       1      5  1303862400   \n",
+       "1                     0                       0      1  1346976000   \n",
+       "2                     1                       1      4  1219017600   \n",
+       "3                     3                       3      2  1307923200   \n",
+       "4                     0                       0      5  1350777600   \n",
+       "\n",
+       "                 Summary                                               Text  \\\n",
+       "0  Good Quality Dog Food  I have bought several of the Vitality canned d...   \n",
+       "1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...   \n",
+       "2  \"Delight\" says it all  This is a confection that has been around a fe...   \n",
+       "3         Cough Medicine  If you are looking for the secret ingredient i...   \n",
+       "4            Great taffy  Great taffy at a great price.  There was a wid...   \n",
+       "\n",
+       "                                           clean_msg  \\\n",
+       "0  I have bought several of the Vitality canned d...   \n",
+       "1  Product arrived labeled as Jumbo Salted Peanut...   \n",
+       "2  This is a confection that has been around a fe...   \n",
+       "3  If you are looking for the secret ingredient i...   \n",
+       "4  Great taffy at a great price  There was a wide...   \n",
+       "\n",
+       "                                          text_lower  \\\n",
+       "0  i have bought several of the vitality canned d...   \n",
+       "1  product arrived labeled as jumbo salted peanut...   \n",
+       "2  this is a confection that has been around a fe...   \n",
+       "3  if you are looking for the secret ingredient i...   \n",
+       "4  great taffy at a great price  there was a wide...   \n",
+       "\n",
+       "                                        msg_tokenied  \n",
+       "0  [i, have, bought, several, of, the, vitality, ...  \n",
+       "1  [product, arrived, labeled, as, jumbo, salted,...  \n",
+       "2  [this, is, a, confection, that, has, been, aro...  \n",
+       "3  [if, you, are, looking, for, the, secret, ingr...  \n",
+       "4  [great, taffy, at, a, great, price, there, was...  "
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#applying function to the column\n",
+    "data['msg_tokenied']= data['text_lower'].apply(lambda x: tokenization(x))\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "ad786064",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>msg_tokenied</th>\n",
+       "      <th>clean_msg</th>\n",
+       "      <th>text_lower</th>\n",
+       "      <th>Text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>321594</th>\n",
+       "      <td>[these, treats, are, my, 18, month, old, bosto...</td>\n",
+       "      <td>These treats are my 18 month old Boston Terrie...</td>\n",
+       "      <td>these treats are my 18 month old boston terrie...</td>\n",
+       "      <td>These treats are my 18 month old Boston Terrie...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                             msg_tokenied  \\\n",
+       "321594  [these, treats, are, my, 18, month, old, bosto...   \n",
+       "\n",
+       "                                                clean_msg  \\\n",
+       "321594  These treats are my 18 month old Boston Terrie...   \n",
+       "\n",
+       "                                               text_lower  \\\n",
+       "321594  these treats are my 18 month old boston terrie...   \n",
+       "\n",
+       "                                                     Text  \n",
+       "321594  These treats are my 18 month old Boston Terrie...  "
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[['msg_tokenied','clean_msg','text_lower','Text']].sample(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "02f94734",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data.Text[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "3640c44e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I have bought several of the Vitality canned dog food products and have found them all to be of good quality The product looks more like a stew than a processed meat and it smells better My Labrador is finicky and she appreciates this product better than  most\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data.clean_msg[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "cef1cd92",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meat and it smells better my labrador is finicky and she appreciates this product better than  most\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data.text_lower[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "e9f164ba",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['i', 'have', 'bought', 'several', 'of', 'the', 'vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality', 'the', 'product', 'looks', 'more', 'like', 'a', 'stew', 'than', 'a', 'processed', 'meat', 'and', 'it', 'smells', 'better', 'my', 'labrador', 'is', 'finicky', 'and', 'she', 'appreciates', 'this', 'product', 'better', 'than', 'most']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data.msg_tokenied[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "905d96e6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     /Users/azarmohammad/nltk_data...\n",
+      "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "#Stop words present in the library\n",
+    "nltk.download('stopwords')\n",
+    "stopwords = nltk.corpus.stopwords.words('english')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "b6a95420",
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(stopwords)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f49f4947",
+   "metadata": {},
+   "source": [
+    "Words that are generally filtered out before processing natural-language text are called stop words. They are the most common words in a language (articles, prepositions, pronouns, conjunctions, etc.) and do not add much information to the text. Examples of English stop words are “the”, “a”, “an”, “so”, and “what”."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "2f5b0bff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_stopwords(text):\n",
+    "    \"\"\"Return the tokens of `text` that are not NLTK English stop words.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    text : iterable of str\n",
+    "        Tokens of one document. The stop-word list is lower-case, so\n",
+    "        tokens should be lower-cased before calling this function.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list of str\n",
+    "        The tokens that are not stop words, in their original order.\n",
+    "    \"\"\"\n",
+    "    # Set membership is O(1) per token; `x in list` rescans the whole\n",
+    "    # stop-word list for every token of every review.\n",
+    "    stopword_set = set(stopwords)\n",
+    "    return [token for token in text if token not in stopword_set]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "id": "2652f6a3",
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Id</th>\n",
+       "      <th>ProductId</th>\n",
+       "      <th>UserId</th>\n",
+       "      <th>ProfileName</th>\n",
+       "      <th>HelpfulnessNumerator</th>\n",
+       "      <th>HelpfulnessDenominator</th>\n",
+       "      <th>Score</th>\n",
+       "      <th>Time</th>\n",
+       "      <th>Summary</th>\n",
+       "      <th>Text</th>\n",
+       "      <th>clean_msg</th>\n",
+       "      <th>text_lower</th>\n",
+       "      <th>msg_tokenied</th>\n",
+       "      <th>no_stopwords</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>B001E4KFG0</td>\n",
+       "      <td>A3SGXH7AUHU8GW</td>\n",
+       "      <td>delmartian</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1303862400</td>\n",
+       "      <td>Good Quality Dog Food</td>\n",
+       "      <td>I have bought several of the Vitality canned d...</td>\n",
+       "      <td>I have bought several of the Vitality canned d...</td>\n",
+       "      <td>i have bought several of the vitality canned d...</td>\n",
+       "      <td>[i, have, bought, several, of, the, vitality, ...</td>\n",
+       "      <td>[bought, several, vitality, canned, dog, food,...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>B00813GRG4</td>\n",
+       "      <td>A1D87F6ZCVE5NK</td>\n",
+       "      <td>dll pa</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1346976000</td>\n",
+       "      <td>Not as Advertised</td>\n",
+       "      <td>Product arrived labeled as Jumbo Salted Peanut...</td>\n",
+       "      <td>Product arrived labeled as Jumbo Salted Peanut...</td>\n",
+       "      <td>product arrived labeled as jumbo salted peanut...</td>\n",
+       "      <td>[product, arrived, labeled, as, jumbo, salted,...</td>\n",
+       "      <td>[product, arrived, labeled, jumbo, salted, pea...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>B000LQOCH0</td>\n",
+       "      <td>ABXLMWJIXXAIN</td>\n",
+       "      <td>Natalia Corres \"Natalia Corres\"</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>1219017600</td>\n",
+       "      <td>\"Delight\" says it all</td>\n",
+       "      <td>This is a confection that has been around a fe...</td>\n",
+       "      <td>This is a confection that has been around a fe...</td>\n",
+       "      <td>this is a confection that has been around a fe...</td>\n",
+       "      <td>[this, is, a, confection, that, has, been, aro...</td>\n",
+       "      <td>[confection, around, centuries, light, pillowy...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>B000UA0QIQ</td>\n",
+       "      <td>A395BORC6FGVXV</td>\n",
+       "      <td>Karl</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1307923200</td>\n",
+       "      <td>Cough Medicine</td>\n",
+       "      <td>If you are looking for the secret ingredient i...</td>\n",
+       "      <td>If you are looking for the secret ingredient i...</td>\n",
+       "      <td>if you are looking for the secret ingredient i...</td>\n",
+       "      <td>[if, you, are, looking, for, the, secret, ingr...</td>\n",
+       "      <td>[looking, secret, ingredient, robitussin, beli...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>B006K2ZZ7K</td>\n",
+       "      <td>A1UQRSCLF8GW1T</td>\n",
+       "      <td>Michael D. Bigham \"M. Wassir\"</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1350777600</td>\n",
+       "      <td>Great taffy</td>\n",
+       "      <td>Great taffy at a great price.  There was a wid...</td>\n",
+       "      <td>Great taffy at a great price  There was a wide...</td>\n",
+       "      <td>great taffy at a great price  there was a wide...</td>\n",
+       "      <td>[great, taffy, at, a, great, price, there, was...</td>\n",
+       "      <td>[great, taffy, great, price, wide, assortment,...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Id   ProductId          UserId                      ProfileName  \\\n",
+       "0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   \n",
+       "1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   \n",
+       "2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres \"Natalia Corres\"   \n",
+       "3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   \n",
+       "4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham \"M. Wassir\"   \n",
+       "\n",
+       "   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \\\n",
+       "0                     1                       1      5  1303862400   \n",
+       "1                     0                       0      1  1346976000   \n",
+       "2                     1                       1      4  1219017600   \n",
+       "3                     3                       3      2  1307923200   \n",
+       "4                     0                       0      5  1350777600   \n",
+       "\n",
+       "                 Summary                                               Text  \\\n",
+       "0  Good Quality Dog Food  I have bought several of the Vitality canned d...   \n",
+       "1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...   \n",
+       "2  \"Delight\" says it all  This is a confection that has been around a fe...   \n",
+       "3         Cough Medicine  If you are looking for the secret ingredient i...   \n",
+       "4            Great taffy  Great taffy at a great price.  There was a wid...   \n",
+       "\n",
+       "                                           clean_msg  \\\n",
+       "0  I have bought several of the Vitality canned d...   \n",
+       "1  Product arrived labeled as Jumbo Salted Peanut...   \n",
+       "2  This is a confection that has been around a fe...   \n",
+       "3  If you are looking for the secret ingredient i...   \n",
+       "4  Great taffy at a great price  There was a wide...   \n",
+       "\n",
+       "                                          text_lower  \\\n",
+       "0  i have bought several of the vitality canned d...   \n",
+       "1  product arrived labeled as jumbo salted peanut...   \n",
+       "2  this is a confection that has been around a fe...   \n",
+       "3  if you are looking for the secret ingredient i...   \n",
+       "4  great taffy at a great price  there was a wide...   \n",
+       "\n",
+       "                                        msg_tokenied  \\\n",
+       "0  [i, have, bought, several, of, the, vitality, ...   \n",
+       "1  [product, arrived, labeled, as, jumbo, salted,...   \n",
+       "2  [this, is, a, confection, that, has, been, aro...   \n",
+       "3  [if, you, are, looking, for, the, secret, ingr...   \n",
+       "4  [great, taffy, at, a, great, price, there, was...   \n",
+       "\n",
+       "                                        no_stopwords  \n",
+       "0  [bought, several, vitality, canned, dog, food,...  \n",
+       "1  [product, arrived, labeled, jumbo, salted, pea...  \n",
+       "2  [confection, around, centuries, light, pillowy...  \n",
+       "3  [looking, secret, ingredient, robitussin, beli...  \n",
+       "4  [great, taffy, great, price, wide, assortment,...  "
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data['no_stopwords'] = data['msg_tokenied'].apply(remove_stopwords)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "id": "1d880d9f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>no_stopwords</th>\n",
+       "      <th>clean_msg</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>47115</th>\n",
+       "      <td>[3rd, case, jello, instant, sugar, free, puddi...</td>\n",
+       "      <td>This is the 3rd case of Jello Instant Sugar Fr...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            no_stopwords  \\\n",
+       "47115  [3rd, case, jello, instant, sugar, free, puddi...   \n",
+       "\n",
+       "                                               clean_msg  \n",
+       "47115  This is the 3rd case of Jello Instant Sugar Fr...  "
+      ]
+     },
+     "execution_count": 78,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[['no_stopwords','clean_msg']].sample()\n",
+    "#data[['msg_tokenied','clean_msg','text_lower','Text']].sample(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "id": "e1a4f052",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['bought', 'several', 'vitality', 'canned', 'dog', 'food', 'products', 'found', 'good', 'quality', 'product', 'looks', 'like', 'stew', 'processed', 'meat', 'smells', 'better', 'labrador', 'finicky', 'appreciates', 'product', 'better']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data.no_stopwords[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "id": "8a7ecdda",
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data.Text[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "id": "03cd965e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "#importing the Stemming function from nltk library\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "#defining the object for stemming\n",
+    "porter_stemmer = PorterStemmer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "id": "c983954d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<PorterStemmer>"
+      ]
+     },
+     "execution_count": 84,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "porter_stemmer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b57e251",
+   "metadata": {},
+   "source": [
+    "Stemming is the process of reducing a word to its stem by stripping affixes (suffixes and prefixes). The stem need not be a dictionary word (for example, \"quality\" becomes \"qualiti\"), unlike the root forms known as \"lemmas\" that lemmatization returns. Stemming is important in natural language understanding (NLU) and natural language processing (NLP)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "id": "54d61f4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def stemming(text):\n",
+    "    \"\"\"Apply the Porter stemmer to every token in `text` and return the stems as a list.\"\"\"\n",
+    "    stems = []\n",
+    "    for token in text:\n",
+    "        stems.append(porter_stemmer.stem(token))\n",
+    "    return stems\n",
+    "\n",
+    "# Stem the stop-word-free tokens of each review.\n",
+    "data['msg_stemmed'] = data['no_stopwords'].apply(stemming)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "5f797052",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['bought', 'sever', 'vital', 'can', 'dog', 'food', 'product', 'found', 'good', 'qualiti', 'product', 'look', 'like', 'stew', 'process', 'meat', 'smell', 'better', 'labrador', 'finicki', 'appreci', 'product', 'better']\n"
+     ]
+    }
+   ],
+   "source": [
+    "\"\"\"\n",
+    "Inflection morphemes are suffixes that are added to a word to assign a particular grammatical property to that word.\n",
+    "Inflectional morphemes are considered to be grammatical markers that indicate tense, number,\n",
+    "POS, and so on. So, in more simple language, we can say that inflectional morphemes\n",
+    "are identified as types of morpheme that modify the verb tense, aspect, mood, person,\n",
+    "number (singular and plural), gender, or case, without affecting the word's meaning or POS.\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "print(data.msg_stemmed[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "66c04a5c",
+   "metadata": {},
+   "source": [
+    "Lemmatization in NLTK is the algorithmic process of finding the lemma of a word depending on its meaning and context. Lemmatization usually refers to the morphological analysis of words, which aims to remove inflectional endings. It helps in returning the base or dictionary form of a word known as the lemma.\n",
+    "\n",
+    "\n",
+    "\n",
+    "WordNet lemmatizer with NLTK: WordNet is a large, freely and publicly available lexical database for the English language that aims to establish structured semantic relationships between words. It also offers lemmatization capabilities and is one of the earliest and most commonly used lemmatizers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "id": "95c6ab11",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 96,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "# Download only the corpora the WordNet lemmatizer needs. A bare\n",
+    "# nltk.download() opens the interactive downloader and blocks Run All.\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('omw-1.4')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "id": "1a6116ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.stem import WordNetLemmatizer\n",
+    "#defining the object for Lemmatization\n",
+    "wordnet_lemmatizer = WordNetLemmatizer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "id": "13674f48",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def lemmatizer(text):\n",
+    "    \"\"\"Lemmatize every token in `text` with the WordNet lemmatizer and return them as a list.\"\"\"\n",
+    "    # lemmatize() is called without a POS tag, so NLTK treats each token as a\n",
+    "    # noun (its default); verb forms such as 'bought' come back unchanged.\n",
+    "    lemmas = []\n",
+    "    for token in text:\n",
+    "        lemmas.append(wordnet_lemmatizer.lemmatize(token))\n",
+    "    return lemmas\n",
+    "\n",
+    "# Lemmatize the stop-word-free tokens of each review.\n",
+    "data['msg_lemmatized'] = data['no_stopwords'].apply(lemmatizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "id": "83ee5f90",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['bought', 'several', 'vitality', 'canned', 'dog', 'food', 'product', 'found', 'good', 'quality', 'product', 'look', 'like', 'stew', 'processed', 'meat', 'smell', 'better', 'labrador', 'finicky', 'appreciates', 'product', 'better']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data.msg_lemmatized[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "id": "ba7f4834",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(568454, 16)"
+      ]
+     },
+     "execution_count": 102,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 124,
+   "id": "9e6dc658",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>clean_msg</th>\n",
+       "      <th>text_lower</th>\n",
+       "      <th>msg_tokenied</th>\n",
+       "      <th>no_stopwords</th>\n",
+       "      <th>msg_stemmed</th>\n",
+       "      <th>msg_lemmatized</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>I have bought several of the Vitality canned d...</td>\n",
+       "      <td>i have bought several of the vitality canned d...</td>\n",
+       "      <td>[i, have, bought, several, of, the, vitality, ...</td>\n",
+       "      <td>[bought, several, vitality, canned, dog, food,...</td>\n",
+       "      <td>[bought, sever, vital, can, dog, food, product...</td>\n",
+       "      <td>[bought, several, vitality, canned, dog, food,...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Product arrived labeled as Jumbo Salted Peanut...</td>\n",
+       "      <td>product arrived labeled as jumbo salted peanut...</td>\n",
+       "      <td>[product, arrived, labeled, as, jumbo, salted,...</td>\n",
+       "      <td>[product, arrived, labeled, jumbo, salted, pea...</td>\n",
+       "      <td>[product, arriv, label, jumbo, salt, peanutsth...</td>\n",
+       "      <td>[product, arrived, labeled, jumbo, salted, pea...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>This is a confection that has been around a fe...</td>\n",
+       "      <td>this is a confection that has been around a fe...</td>\n",
+       "      <td>[this, is, a, confection, that, has, been, aro...</td>\n",
+       "      <td>[confection, around, centuries, light, pillowy...</td>\n",
+       "      <td>[confect, around, centuri, light, pillowi, cit...</td>\n",
+       "      <td>[confection, around, century, light, pillowy, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>If you are looking for the secret ingredient i...</td>\n",
+       "      <td>if you are looking for the secret ingredient i...</td>\n",
+       "      <td>[if, you, are, looking, for, the, secret, ingr...</td>\n",
+       "      <td>[looking, secret, ingredient, robitussin, beli...</td>\n",
+       "      <td>[look, secret, ingredi, robitussin, believ, fo...</td>\n",
+       "      <td>[looking, secret, ingredient, robitussin, beli...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Great taffy at a great price  There was a wide...</td>\n",
+       "      <td>great taffy at a great price  there was a wide...</td>\n",
+       "      <td>[great, taffy, at, a, great, price, there, was...</td>\n",
+       "      <td>[great, taffy, great, price, wide, assortment,...</td>\n",
+       "      <td>[great, taffi, great, price, wide, assort, yum...</td>\n",
+       "      <td>[great, taffy, great, price, wide, assortment,...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>I got a wild hair for taffy and ordered this f...</td>\n",
+       "      <td>i got a wild hair for taffy and ordered this f...</td>\n",
+       "      <td>[i, got, a, wild, hair, for, taffy, and, order...</td>\n",
+       "      <td>[got, wild, hair, taffy, ordered, five, pound,...</td>\n",
+       "      <td>[got, wild, hair, taffi, order, five, pound, b...</td>\n",
+       "      <td>[got, wild, hair, taffy, ordered, five, pound,...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>This saltwater taffy had great flavors and was...</td>\n",
+       "      <td>this saltwater taffy had great flavors and was...</td>\n",
+       "      <td>[this, saltwater, taffy, had, great, flavors, ...</td>\n",
+       "      <td>[saltwater, taffy, great, flavors, soft, chewy...</td>\n",
+       "      <td>[saltwat, taffi, great, flavor, soft, chewi, c...</td>\n",
+       "      <td>[saltwater, taffy, great, flavor, soft, chewy,...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>This taffy is so good  It is very soft and che...</td>\n",
+       "      <td>this taffy is so good  it is very soft and che...</td>\n",
+       "      <td>[this, taffy, is, so, good, it, is, very, soft...</td>\n",
+       "      <td>[taffy, good, soft, chewy, flavors, amazing, w...</td>\n",
+       "      <td>[taffi, good, soft, chewi, flavor, amaz, would...</td>\n",
+       "      <td>[taffy, good, soft, chewy, flavor, amazing, wo...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Right now Im mostly just sprouting this so my ...</td>\n",
+       "      <td>right now im mostly just sprouting this so my ...</td>\n",
+       "      <td>[right, now, im, mostly, just, sprouting, this...</td>\n",
+       "      <td>[right, im, mostly, sprouting, cats, eat, gras...</td>\n",
+       "      <td>[right, im, mostli, sprout, cat, eat, grass, l...</td>\n",
+       "      <td>[right, im, mostly, sprouting, cat, eat, grass...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>This is a very healthy dog food Good for their...</td>\n",
+       "      <td>this is a very healthy dog food good for their...</td>\n",
+       "      <td>[this, is, a, very, healthy, dog, food, good, ...</td>\n",
+       "      <td>[healthy, dog, food, good, digestion, also, go...</td>\n",
+       "      <td>[healthi, dog, food, good, digest, also, good,...</td>\n",
+       "      <td>[healthy, dog, food, good, digestion, also, go...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                           clean_msg  \\\n",
+       "0  I have bought several of the Vitality canned d...   \n",
+       "1  Product arrived labeled as Jumbo Salted Peanut...   \n",
+       "2  This is a confection that has been around a fe...   \n",
+       "3  If you are looking for the secret ingredient i...   \n",
+       "4  Great taffy at a great price  There was a wide...   \n",
+       "5  I got a wild hair for taffy and ordered this f...   \n",
+       "6  This saltwater taffy had great flavors and was...   \n",
+       "7  This taffy is so good  It is very soft and che...   \n",
+       "8  Right now Im mostly just sprouting this so my ...   \n",
+       "9  This is a very healthy dog food Good for their...   \n",
+       "\n",
+       "                                          text_lower  \\\n",
+       "0  i have bought several of the vitality canned d...   \n",
+       "1  product arrived labeled as jumbo salted peanut...   \n",
+       "2  this is a confection that has been around a fe...   \n",
+       "3  if you are looking for the secret ingredient i...   \n",
+       "4  great taffy at a great price  there was a wide...   \n",
+       "5  i got a wild hair for taffy and ordered this f...   \n",
+       "6  this saltwater taffy had great flavors and was...   \n",
+       "7  this taffy is so good  it is very soft and che...   \n",
+       "8  right now im mostly just sprouting this so my ...   \n",
+       "9  this is a very healthy dog food good for their...   \n",
+       "\n",
+       "                                        msg_tokenied  \\\n",
+       "0  [i, have, bought, several, of, the, vitality, ...   \n",
+       "1  [product, arrived, labeled, as, jumbo, salted,...   \n",
+       "2  [this, is, a, confection, that, has, been, aro...   \n",
+       "3  [if, you, are, looking, for, the, secret, ingr...   \n",
+       "4  [great, taffy, at, a, great, price, there, was...   \n",
+       "5  [i, got, a, wild, hair, for, taffy, and, order...   \n",
+       "6  [this, saltwater, taffy, had, great, flavors, ...   \n",
+       "7  [this, taffy, is, so, good, it, is, very, soft...   \n",
+       "8  [right, now, im, mostly, just, sprouting, this...   \n",
+       "9  [this, is, a, very, healthy, dog, food, good, ...   \n",
+       "\n",
+       "                                        no_stopwords  \\\n",
+       "0  [bought, several, vitality, canned, dog, food,...   \n",
+       "1  [product, arrived, labeled, jumbo, salted, pea...   \n",
+       "2  [confection, around, centuries, light, pillowy...   \n",
+       "3  [looking, secret, ingredient, robitussin, beli...   \n",
+       "4  [great, taffy, great, price, wide, assortment,...   \n",
+       "5  [got, wild, hair, taffy, ordered, five, pound,...   \n",
+       "6  [saltwater, taffy, great, flavors, soft, chewy...   \n",
+       "7  [taffy, good, soft, chewy, flavors, amazing, w...   \n",
+       "8  [right, im, mostly, sprouting, cats, eat, gras...   \n",
+       "9  [healthy, dog, food, good, digestion, also, go...   \n",
+       "\n",
+       "                                         msg_stemmed  \\\n",
+       "0  [bought, sever, vital, can, dog, food, product...   \n",
+       "1  [product, arriv, label, jumbo, salt, peanutsth...   \n",
+       "2  [confect, around, centuri, light, pillowi, cit...   \n",
+       "3  [look, secret, ingredi, robitussin, believ, fo...   \n",
+       "4  [great, taffi, great, price, wide, assort, yum...   \n",
+       "5  [got, wild, hair, taffi, order, five, pound, b...   \n",
+       "6  [saltwat, taffi, great, flavor, soft, chewi, c...   \n",
+       "7  [taffi, good, soft, chewi, flavor, amaz, would...   \n",
+       "8  [right, im, mostli, sprout, cat, eat, grass, l...   \n",
+       "9  [healthi, dog, food, good, digest, also, good,...   \n",
+       "\n",
+       "                                      msg_lemmatized  \n",
+       "0  [bought, several, vitality, canned, dog, food,...  \n",
+       "1  [product, arrived, labeled, jumbo, salted, pea...  \n",
+       "2  [confection, around, century, light, pillowy, ...  \n",
+       "3  [looking, secret, ingredient, robitussin, beli...  \n",
+       "4  [great, taffy, great, price, wide, assortment,...  \n",
+       "5  [got, wild, hair, taffy, ordered, five, pound,...  \n",
+       "6  [saltwater, taffy, great, flavor, soft, chewy,...  \n",
+       "7  [taffy, good, soft, chewy, flavor, amazing, wo...  \n",
+       "8  [right, im, mostly, sprouting, cat, eat, grass...  \n",
+       "9  [healthy, dog, food, good, digestion, also, go...  "
+      ]
+     },
+     "execution_count": 124,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.iloc[:10,10:16]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 125,
+   "id": "7666b9a9",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'huggingface_hub'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Input \u001b[0;32mIn [125]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhuggingface_hub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m login\n\u001b[1;32m      2\u001b[0m login()\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'huggingface_hub'"
+     ]
+    }
+   ],
+   "source": [
+    "from huggingface_hub import login\n",
+    "login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "id": "1d4e9483",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "invalid syntax (1355118443.py, line 1)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m  Input \u001b[0;32mIn [126]\u001b[0;36m\u001b[0m\n\u001b[0;31m    huggingface-cli login\u001b[0m\n\u001b[0m                    ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Shell command: it needs the IPython `!` prefix, otherwise the cell is\n",
+    "# parsed as Python and fails with a SyntaxError.\n",
+    "!huggingface-cli login"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30fffe0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Upload the contents of a local folder to a dataset repo on the Hugging Face Hub.\n",
+    "# Requires the huggingface_hub package in the kernel's environment; without it\n",
+    "# this cell fails with ModuleNotFoundError.\n",
+    "# NOTE: folder_path and repo_id below are placeholders -- replace them with a\n",
+    "# real local directory and an existing Hub repo before running.\n",
+    "from huggingface_hub import HfApi\n",
+    "\n",
+    "api = HfApi()\n",
+    "api.upload_folder(\n",
+    "    folder_path=\"/path/to/local/folder\",\n",
+    "    path_in_repo=\"my-dataset/train\",\n",
+    "    repo_id=\"username/test-dataset\",\n",
+    "    repo_type=\"dataset\",\n",
+    "    # Skip log text files found under any logs/ directory.\n",
+    "    ignore_patterns=\"**/logs/*.txt\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e66e1d40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install pinned versions of transformers and tokenizers from the huggingface\n",
+    "# channel. The explicit %conda magic is used so the cell does not depend on\n",
+    "# IPython's automagic setting to turn a bare `conda ...` line into a command.\n",
+    "# Restart the kernel afterwards so the newly installed packages can be imported.\n",
+    "%conda install -c huggingface transformers==4.14.1 tokenizers==0.10.3 -y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c35fe28",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}