File size: 8,533 Bytes

32347fd

{
 "cells": [
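  {
   "cell_type": "markdown",
   "id": "750fed8c",
   "metadata": {},
   "source": [
    "**Setup (required).** Run the following cell first: it clones the EmTract repository, changes into it, and installs its dependencies."
   ]
  },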
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ccad76ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\n"
     ]
    }
   ],
   "source": [
    "# Fetch the EmTract sources and make the repository the working directory,\n",
    "# so that requirements.txt is found and the `emtract` package can be imported below.\n",
    "!git clone https://github.com/dvamossy/EmTract.git\n",
    "%cd EmTract\n",
    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
    "%pip install -r requirements.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2551adee",
   "metadata": {},
   "source": [
    "**Text cleaner.** Use `clean_text` to clean raw (unprocessed) text before passing it to the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "687995ef",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\\emtract\\processors\\cleaning.py:68: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      "  symspell_list = pd.read_csv(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'soo well'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from emtract.processors.cleaning import clean_text\n",
    "\n",
    "# Demo: run the cleaner on a raw string with segment_words disabled\n",
    "raw_example = \"soooooo well\"\n",
    "clean_text(raw_example, segment_words=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b81c0cd",
   "metadata": {},
   "source": [
    "**Option I:** classify text directly with the Hugging Face `pipeline` API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ca68eb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# Build a text-classification pipeline that reports a score for every label.\n",
    "# NOTE(review): newer transformers releases deprecate return_all_scores in favour of top_k=None - confirm before upgrading.\n",
    "classifier = pipeline(\n",
    "    \"text-classification\",\n",
    "    model=\"vamossyd/emtract-distilbert-base-uncased-emotion\",\n",
    "    return_all_scores=True,\n",
    ")\n",
    "classifier(\"i love this!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b9cd58f",
   "metadata": {},
   "source": [
    "**Option II:** run the model through `Trainer.predict` and collect the per-emotion scores in a DataFrame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "524cb5d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer\n",
    "\n",
    "\n",
    "class SimpleDataset:\n",
    "    \"\"\"Map-style dataset over a tokenizer output (a mapping of field name -> per-text values).\"\"\"\n",
    "\n",
    "    def __init__(self, tokenized_texts):\n",
    "        # Kept as-is; must provide an \"input_ids\" entry and an .items() method\n",
    "        self.tokenized_texts = tokenized_texts\n",
    "\n",
    "    def __len__(self):\n",
    "        \"\"\"Number of examples, taken from the length of the \"input_ids\" entry.\"\"\"\n",
    "        input_ids = self.tokenized_texts[\"input_ids\"]\n",
    "        return len(input_ids)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        \"\"\"Return a dict holding the idx-th value of every field.\"\"\"\n",
    "        sample = {}\n",
    "        for field, values in self.tokenized_texts.items():\n",
    "            sample[field] = values[idx]\n",
    "        return sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f9f01f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the input data (placeholder - replace with a real path)\n",
    "input_path = \"PROVIDE_PATH_TO_DATA\"\n",
    "\n",
    "# To use your own data (assumed to be a CSV file), uncomment:\n",
    "# data = pd.read_csv(input_path)\n",
    "\n",
    "# ... then, if the text is already cleaned:\n",
    "# texts = data.text.tolist()\n",
    "\n",
    "# ... otherwise clean it first:\n",
    "# texts = data['text'].apply(clean_text).tolist()\n",
    "\n",
    "# Example input used by the cells below\n",
    "texts = [\n",
    "    \"i love this\",\n",
    "    \"i do not love you\",\n",
    "    \"to the moon 🚀\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04ce5528",
   "metadata": {},
   "outputs": [],
   "source": [
    "# If the model cannot be loaded from the Hugging Face Hub by name, clone it with git (uncomment the command below)\n",
    "# and use emtract-distilbert-base-uncased-emotion (the cloned folder) in the model_name field.\n",
    "\n",
    "#!git clone https://huggingface.co/vamossyd/emtract-distilbert-base-uncased-emotion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "839cd230",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checkpoint used for both the tokenizer and the classifier\n",
    "model_name = \"vamossyd/emtract-distilbert-base-uncased-emotion\"\n",
    "# model_name = \"emtract-distilbert-base-uncased-emotion\" # in case the model does not load\n",
    "\n",
    "# Tokenize the texts and wrap them in a dataset for prediction\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "tokenized_texts = tokenizer(texts, padding=True, truncation=True)\n",
    "pred_dataset = SimpleDataset(tokenized_texts)\n",
    "\n",
    "# Load the classifier and run prediction through a Trainer built with default arguments\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
    "trainer = Trainer(model=model)\n",
    "predictions = trainer.predict(pred_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d903549",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw model outputs (logits) returned by trainer.predict\n",
    "logits = predictions.predictions\n",
    "\n",
    "# Softmax over the last (class) axis. Subtracting the row-wise maximum first does not\n",
    "# change the result but keeps np.exp from overflowing on large logits.\n",
    "shifted = logits - logits.max(-1, keepdims=True)\n",
    "temp = np.exp(shifted) / np.exp(shifted).sum(-1, keepdims=True)\n",
    "\n",
    "# Most likely class per row, mapped to its label name\n",
    "preds = logits.argmax(-1)\n",
    "labels = pd.Series(preds).map(model.config.id2label)\n",
    "\n",
    "# Per-emotion score lists, one entry per row of temp.\n",
    "# NOTE(review): the class indices are hard-coded - verify them against model.config.id2label.\n",
    "anger = [row[3] for row in temp]\n",
    "disgust = [row[4] for row in temp]\n",
    "fear = [row[6] for row in temp]\n",
    "happy = [row[1] for row in temp]\n",
    "neutral = [row[0] for row in temp]\n",
    "sadness = [row[2] for row in temp]\n",
    "surprise = [row[5] for row in temp]\n",
    "\n",
    "# One row per text: the text, its predicted label and the seven scores\n",
    "df = pd.DataFrame(\n",
    "    list(zip(texts, labels, anger, disgust, fear, happy, neutral, sadness, surprise)),\n",
    "    columns=['text', 'pred_label', 'anger', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'],\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "577f10b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save results to CSV\n",
    "output_path = \"YOUR_FILENAME_EMOTIONS.csv\"  # name your output file\n",
    "# Uncomment to write the results DataFrame to output_path:\n",
    "# df.to_csv(output_path)"
   ]
  },
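  {
   "cell_type": "markdown",
   "id": "ddd22317",
   "metadata": {},
   "source": [
    "Option III\n",
    "\n",
    "Batch prediction: process the texts in chunks when the data set is too large to handle in a single pass. This reuses `texts`, `tokenizer`, `trainer` and `SimpleDataset` from Option II, so run those cells first."
   ]
  },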
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f39375b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Progress bar for the batch loop\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "# Number of texts tokenized and predicted per chunk\n",
    "batch_size = 100000\n",
    "\n",
    "# Split the texts into batches\n",
    "text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]\n",
    "\n",
    "# Store the predictions (one row of logits per text)\n",
    "all_predictions = []\n",
    "\n",
    "# Batch-local names are used so that the tokenized_texts / pred_dataset / predictions\n",
    "# variables of the single-pass workflow are not overwritten by this loop.\n",
    "for batch in tqdm(text_batches):\n",
    "    batch_tokens = tokenizer(batch, truncation=True, padding=True)\n",
    "    batch_dataset = SimpleDataset(batch_tokens)\n",
    "    batch_logits = trainer.predict(batch_dataset).predictions\n",
    "    all_predictions.extend(batch_logits)\n",
    "\n",
    "all_predictions = np.array(all_predictions)\n",
    "\n",
    "# Softmax over the last (class) axis. Subtracting the row-wise maximum first does not\n",
    "# change the result but keeps np.exp from overflowing on large logits.\n",
    "if all_predictions.size:\n",
    "    shifted = all_predictions - all_predictions.max(-1, keepdims=True)\n",
    "    temp = np.exp(shifted) / np.exp(shifted).sum(-1, keepdims=True)\n",
    "else:\n",
    "    temp = all_predictions  # no texts, nothing to score\n",
    "\n",
    "# Per-emotion score lists, one entry per row of temp.\n",
    "# NOTE(review): the class indices are hard-coded - verify them against model.config.id2label.\n",
    "anger = [row[3] for row in temp]\n",
    "disgust = [row[4] for row in temp]\n",
    "fear = [row[6] for row in temp]\n",
    "happy = [row[1] for row in temp]\n",
    "neutral = [row[0] for row in temp]\n",
    "sadness = [row[2] for row in temp]\n",
    "surprise = [row[5] for row in temp]\n",
    "\n",
    "# One row per text: the text and the seven scores\n",
    "df = pd.DataFrame(\n",
    "    list(zip(texts, anger, disgust, fear, happy, neutral, sadness, surprise)),\n",
    "    columns=['text', 'anger', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'],\n",
    ")\n",
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}