{ "cells": [ { "cell_type": "markdown", "id": "fc8cef68", "metadata": {}, "source": [ "# text processing:" ] }, { "cell_type": "code", "execution_count": 1, "id": "61636845", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 6, "id": "b7cc480a", "metadata": {}, "outputs": [], "source": [ "data=pd.read_csv(\"Reviews.csv\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "2b61b374", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',\n", " 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],\n", " dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.columns" ] }, { "cell_type": "code", "execution_count": 7, "id": "ffe1915d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(568454, 10)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "code", "execution_count": 10, "id": "7658aeba", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "201034 Raspberry taste TOO strong\n", "511879 The whole family loves these!\n", "8685 Well-rounded but weak\n", "385899 1/10 as strong as other brands, won't disolve ...\n", "47587 Perfect fudge!\n", "5222 Great snack, however,\n", "560412 Looks, smells and probably tastes like real ch...\n", "479143 Delicious way to start the day\n", "307484 A dessert in a cup!\n", "189729 SUPERB! GREAT FOR BAKING!\n", "Name: Summary, dtype: object" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.Summary.sample(10)" ] }, { "cell_type": "code", "execution_count": 13, "id": "379273cc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. 
import string

# Deletion table built once at definition time: every character listed in
# string.punctuation is mapped to None, so str.translate drops it in a single
# pass instead of testing each character in a Python-level loop.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII symbols listed in ``string.punctuation``
    (``!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~``) are stripped; whitespace, digits,
    letters and any non-ASCII punctuation (e.g. curly quotes) are kept.

    Note that characters are deleted, not replaced by a space, so words that
    are separated only by punctuation are fused ("peanuts...the" becomes
    "peanutsthe").

    Parameters
    ----------
    text : str or iterable of str
        The text to clean. A ``str`` takes the fast ``str.translate`` path;
        any other iterable of strings is filtered element by element, exactly
        as the original implementation did.

    Returns
    -------
    str
        The input with punctuation removed (``""`` for empty input).

    Raises
    ------
    TypeError
        If ``text`` is not iterable (for example ``None`` or a float ``NaN``).
    """
    if isinstance(text, str):
        return text.translate(_PUNCTUATION_TABLE)
    # Non-str iterables: keep the original element-wise filter so existing
    # callers that pass a list of characters/tokens get the same result.
    return "".join([piece for piece in text if piece not in string.punctuation])
\n", " | Id | \n", "ProductId | \n", "UserId | \n", "ProfileName | \n", "HelpfulnessNumerator | \n", "HelpfulnessDenominator | \n", "Score | \n", "Time | \n", "Summary | \n", "Text | \n", "clean_msg | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "B001E4KFG0 | \n", "A3SGXH7AUHU8GW | \n", "delmartian | \n", "1 | \n", "1 | \n", "5 | \n", "1303862400 | \n", "Good Quality Dog Food | \n", "I have bought several of the Vitality canned d... | \n", "I have bought several of the Vitality canned d... | \n", "
1 | \n", "2 | \n", "B00813GRG4 | \n", "A1D87F6ZCVE5NK | \n", "dll pa | \n", "0 | \n", "0 | \n", "1 | \n", "1346976000 | \n", "Not as Advertised | \n", "Product arrived labeled as Jumbo Salted Peanut... | \n", "Product arrived labeled as Jumbo Salted Peanut... | \n", "
2 | \n", "3 | \n", "B000LQOCH0 | \n", "ABXLMWJIXXAIN | \n", "Natalia Corres \"Natalia Corres\" | \n", "1 | \n", "1 | \n", "4 | \n", "1219017600 | \n", "\"Delight\" says it all | \n", "This is a confection that has been around a fe... | \n", "This is a confection that has been around a fe... | \n", "
3 | \n", "4 | \n", "B000UA0QIQ | \n", "A395BORC6FGVXV | \n", "Karl | \n", "3 | \n", "3 | \n", "2 | \n", "1307923200 | \n", "Cough Medicine | \n", "If you are looking for the secret ingredient i... | \n", "If you are looking for the secret ingredient i... | \n", "
4 | \n", "5 | \n", "B006K2ZZ7K | \n", "A1UQRSCLF8GW1T | \n", "Michael D. Bigham \"M. Wassir\" | \n", "0 | \n", "0 | \n", "5 | \n", "1350777600 | \n", "Great taffy | \n", "Great taffy at a great price. There was a wid... | \n", "Great taffy at a great price There was a wide... | \n", "
\n", " | Text | \n", "clean_msg | \n", "text_lower | \n", "
---|---|---|---|
330905 | \n", "Excellent, easy, and tasty. It is usually chea... | \n", "Excellent easy and tasty It is usually cheaper... | \n", "excellent easy and tasty it is usually cheaper... | \n", "
291052 | \n", "This water is very good for the immune system.... | \n", "This water is very good for the immune system ... | \n", "this water is very good for the immune system ... | \n", "
389090 | \n", "I enjoy these. I'm not sure the price is a gre... | \n", "I enjoy these Im not sure the price is a great... | \n", "i enjoy these im not sure the price is a great... | \n", "
486142 | \n", "Is Wellness relatively expensive? Yes. Is it w... | \n", "Is Wellness relatively expensive Yes Is it wor... | \n", "is wellness relatively expensive yes is it wor... | \n", "
496694 | \n", "Maybe the claims are true but who can really t... | \n", "Maybe the claims are true but who can really t... | \n", "maybe the claims are true but who can really t... | \n", "
\n", " | Id | \n", "ProductId | \n", "UserId | \n", "ProfileName | \n", "HelpfulnessNumerator | \n", "HelpfulnessDenominator | \n", "Score | \n", "Time | \n", "Summary | \n", "Text | \n", "clean_msg | \n", "text_lower | \n", "msg_tokenied | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "B001E4KFG0 | \n", "A3SGXH7AUHU8GW | \n", "delmartian | \n", "1 | \n", "1 | \n", "5 | \n", "1303862400 | \n", "Good Quality Dog Food | \n", "I have bought several of the Vitality canned d... | \n", "I have bought several of the Vitality canned d... | \n", "i have bought several of the vitality canned d... | \n", "[i, have, bought, several, of, the, vitality, ... | \n", "
1 | \n", "2 | \n", "B00813GRG4 | \n", "A1D87F6ZCVE5NK | \n", "dll pa | \n", "0 | \n", "0 | \n", "1 | \n", "1346976000 | \n", "Not as Advertised | \n", "Product arrived labeled as Jumbo Salted Peanut... | \n", "Product arrived labeled as Jumbo Salted Peanut... | \n", "product arrived labeled as jumbo salted peanut... | \n", "[product, arrived, labeled, as, jumbo, salted,... | \n", "
2 | \n", "3 | \n", "B000LQOCH0 | \n", "ABXLMWJIXXAIN | \n", "Natalia Corres \"Natalia Corres\" | \n", "1 | \n", "1 | \n", "4 | \n", "1219017600 | \n", "\"Delight\" says it all | \n", "This is a confection that has been around a fe... | \n", "This is a confection that has been around a fe... | \n", "this is a confection that has been around a fe... | \n", "[this, is, a, confection, that, has, been, aro... | \n", "
3 | \n", "4 | \n", "B000UA0QIQ | \n", "A395BORC6FGVXV | \n", "Karl | \n", "3 | \n", "3 | \n", "2 | \n", "1307923200 | \n", "Cough Medicine | \n", "If you are looking for the secret ingredient i... | \n", "If you are looking for the secret ingredient i... | \n", "if you are looking for the secret ingredient i... | \n", "[if, you, are, looking, for, the, secret, ingr... | \n", "
4 | \n", "5 | \n", "B006K2ZZ7K | \n", "A1UQRSCLF8GW1T | \n", "Michael D. Bigham \"M. Wassir\" | \n", "0 | \n", "0 | \n", "5 | \n", "1350777600 | \n", "Great taffy | \n", "Great taffy at a great price. There was a wid... | \n", "Great taffy at a great price There was a wide... | \n", "great taffy at a great price there was a wide... | \n", "[great, taffy, at, a, great, price, there, was... | \n", "
\n", " | msg_tokenied | \n", "clean_msg | \n", "text_lower | \n", "Text | \n", "
---|---|---|---|---|
321594 | \n", "[these, treats, are, my, 18, month, old, bosto... | \n", "These treats are my 18 month old Boston Terrie... | \n", "these treats are my 18 month old boston terrie... | \n", "These treats are my 18 month old Boston Terrie... | \n", "
\n", " | Id | \n", "ProductId | \n", "UserId | \n", "ProfileName | \n", "HelpfulnessNumerator | \n", "HelpfulnessDenominator | \n", "Score | \n", "Time | \n", "Summary | \n", "Text | \n", "clean_msg | \n", "text_lower | \n", "msg_tokenied | \n", "no_stopwords | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "B001E4KFG0 | \n", "A3SGXH7AUHU8GW | \n", "delmartian | \n", "1 | \n", "1 | \n", "5 | \n", "1303862400 | \n", "Good Quality Dog Food | \n", "I have bought several of the Vitality canned d... | \n", "I have bought several of the Vitality canned d... | \n", "i have bought several of the vitality canned d... | \n", "[i, have, bought, several, of, the, vitality, ... | \n", "[bought, several, vitality, canned, dog, food,... | \n", "
1 | \n", "2 | \n", "B00813GRG4 | \n", "A1D87F6ZCVE5NK | \n", "dll pa | \n", "0 | \n", "0 | \n", "1 | \n", "1346976000 | \n", "Not as Advertised | \n", "Product arrived labeled as Jumbo Salted Peanut... | \n", "Product arrived labeled as Jumbo Salted Peanut... | \n", "product arrived labeled as jumbo salted peanut... | \n", "[product, arrived, labeled, as, jumbo, salted,... | \n", "[product, arrived, labeled, jumbo, salted, pea... | \n", "
2 | \n", "3 | \n", "B000LQOCH0 | \n", "ABXLMWJIXXAIN | \n", "Natalia Corres \"Natalia Corres\" | \n", "1 | \n", "1 | \n", "4 | \n", "1219017600 | \n", "\"Delight\" says it all | \n", "This is a confection that has been around a fe... | \n", "This is a confection that has been around a fe... | \n", "this is a confection that has been around a fe... | \n", "[this, is, a, confection, that, has, been, aro... | \n", "[confection, around, centuries, light, pillowy... | \n", "
3 | \n", "4 | \n", "B000UA0QIQ | \n", "A395BORC6FGVXV | \n", "Karl | \n", "3 | \n", "3 | \n", "2 | \n", "1307923200 | \n", "Cough Medicine | \n", "If you are looking for the secret ingredient i... | \n", "If you are looking for the secret ingredient i... | \n", "if you are looking for the secret ingredient i... | \n", "[if, you, are, looking, for, the, secret, ingr... | \n", "[looking, secret, ingredient, robitussin, beli... | \n", "
4 | \n", "5 | \n", "B006K2ZZ7K | \n", "A1UQRSCLF8GW1T | \n", "Michael D. Bigham \"M. Wassir\" | \n", "0 | \n", "0 | \n", "5 | \n", "1350777600 | \n", "Great taffy | \n", "Great taffy at a great price. There was a wid... | \n", "Great taffy at a great price There was a wide... | \n", "great taffy at a great price there was a wide... | \n", "[great, taffy, at, a, great, price, there, was... | \n", "[great, taffy, great, price, wide, assortment,... | \n", "
\n", " | no_stopwords | \n", "clean_msg | \n", "
---|---|---|
47115 | \n", "[3rd, case, jello, instant, sugar, free, puddi... | \n", "This is the 3rd case of Jello Instant Sugar Fr... | \n", "
\n", " | clean_msg | \n", "text_lower | \n", "msg_tokenied | \n", "no_stopwords | \n", "msg_stemmed | \n", "msg_lemmatized | \n", "
---|---|---|---|---|---|---|
0 | \n", "I have bought several of the Vitality canned d... | \n", "i have bought several of the vitality canned d... | \n", "[i, have, bought, several, of, the, vitality, ... | \n", "[bought, several, vitality, canned, dog, food,... | \n", "[bought, sever, vital, can, dog, food, product... | \n", "[bought, several, vitality, canned, dog, food,... | \n", "
1 | \n", "Product arrived labeled as Jumbo Salted Peanut... | \n", "product arrived labeled as jumbo salted peanut... | \n", "[product, arrived, labeled, as, jumbo, salted,... | \n", "[product, arrived, labeled, jumbo, salted, pea... | \n", "[product, arriv, label, jumbo, salt, peanutsth... | \n", "[product, arrived, labeled, jumbo, salted, pea... | \n", "
2 | \n", "This is a confection that has been around a fe... | \n", "this is a confection that has been around a fe... | \n", "[this, is, a, confection, that, has, been, aro... | \n", "[confection, around, centuries, light, pillowy... | \n", "[confect, around, centuri, light, pillowi, cit... | \n", "[confection, around, century, light, pillowy, ... | \n", "
3 | \n", "If you are looking for the secret ingredient i... | \n", "if you are looking for the secret ingredient i... | \n", "[if, you, are, looking, for, the, secret, ingr... | \n", "[looking, secret, ingredient, robitussin, beli... | \n", "[look, secret, ingredi, robitussin, believ, fo... | \n", "[looking, secret, ingredient, robitussin, beli... | \n", "
4 | \n", "Great taffy at a great price There was a wide... | \n", "great taffy at a great price there was a wide... | \n", "[great, taffy, at, a, great, price, there, was... | \n", "[great, taffy, great, price, wide, assortment,... | \n", "[great, taffi, great, price, wide, assort, yum... | \n", "[great, taffy, great, price, wide, assortment,... | \n", "
5 | \n", "I got a wild hair for taffy and ordered this f... | \n", "i got a wild hair for taffy and ordered this f... | \n", "[i, got, a, wild, hair, for, taffy, and, order... | \n", "[got, wild, hair, taffy, ordered, five, pound,... | \n", "[got, wild, hair, taffi, order, five, pound, b... | \n", "[got, wild, hair, taffy, ordered, five, pound,... | \n", "
6 | \n", "This saltwater taffy had great flavors and was... | \n", "this saltwater taffy had great flavors and was... | \n", "[this, saltwater, taffy, had, great, flavors, ... | \n", "[saltwater, taffy, great, flavors, soft, chewy... | \n", "[saltwat, taffi, great, flavor, soft, chewi, c... | \n", "[saltwater, taffy, great, flavor, soft, chewy,... | \n", "
7 | \n", "This taffy is so good It is very soft and che... | \n", "this taffy is so good it is very soft and che... | \n", "[this, taffy, is, so, good, it, is, very, soft... | \n", "[taffy, good, soft, chewy, flavors, amazing, w... | \n", "[taffi, good, soft, chewi, flavor, amaz, would... | \n", "[taffy, good, soft, chewy, flavor, amazing, wo... | \n", "
8 | \n", "Right now Im mostly just sprouting this so my ... | \n", "right now im mostly just sprouting this so my ... | \n", "[right, now, im, mostly, just, sprouting, this... | \n", "[right, im, mostly, sprouting, cats, eat, gras... | \n", "[right, im, mostli, sprout, cat, eat, grass, l... | \n", "[right, im, mostly, sprouting, cat, eat, grass... | \n", "
9 | \n", "This is a very healthy dog food Good for their... | \n", "this is a very healthy dog food good for their... | \n", "[this, is, a, very, healthy, dog, food, good, ... | \n", "[healthy, dog, food, good, digestion, also, go... | \n", "[healthi, dog, food, good, digest, also, good,... | \n", "[healthy, dog, food, good, digestion, also, go... | \n", "