{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from haystack.nodes import PreProcessor, EmbeddingRetriever\n",
"from haystack.document_stores import FAISSDocumentStore\n",
"from haystack.utils import convert_files_to_docs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preprocess Documents"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### BLAB-Wiki"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'PreProcessor' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/luid/Projetos/Fact_Checking_Blue_Amazon/ETL/embeddings_base.ipynb Célula 5\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m preprocessor \u001b[39m=\u001b[39m PreProcessor(\n\u001b[1;32m 2\u001b[0m clean_empty_lines\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 3\u001b[0m clean_whitespace\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 4\u001b[0m clean_header_footer\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 5\u001b[0m split_by\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39msentence\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m split_length\u001b[39m=\u001b[39m\u001b[39m2\u001b[39m,\n\u001b[1;32m 7\u001b[0m split_overlap\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m,\n\u001b[1;32m 8\u001b[0m split_respect_sentence_boundary\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 10\u001b[0m all_docs \u001b[39m=\u001b[39m convert_files_to_docs(dir_path\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m./Fontes/Wiki_Pages/\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 11\u001b[0m docs_default \u001b[39m=\u001b[39m preprocessor\u001b[39m.\u001b[39mprocess(all_docs)\n",
"\u001b[0;31mNameError\u001b[0m: name 'PreProcessor' is not defined"
]
}
],
"source": [
"preprocessor = PreProcessor(\n",
" clean_empty_lines=True,\n",
" clean_whitespace=True,\n",
" clean_header_footer=False,\n",
" split_by=\"sentence\",\n",
" split_length=2,\n",
" split_overlap=1,\n",
" split_respect_sentence_boundary=False)\n",
"\n",
"all_docs = convert_files_to_docs(dir_path=\"./Fontes/Wiki_Pages/\")\n",
"docs_default = preprocessor.process(all_docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### QA Source"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# QA sentences\n",
"QA_path = \"./Fontes/QA_Base/\"\n",
"\n",
"train = pd.read_parquet(QA_path + 'train.parquet')['new_long_answers']\n",
"test = pd.read_parquet(QA_path + 'test.parquet')['new_long_answers']\n",
"validation = pd.read_parquet(QA_path + 'validation.parquet')['new_long_answers']\n",
"\n",
"answers = pd.concat([train,test,validation])\n",
"\n",
"docs_list = [{\"content\": v, \"content_type\": \"text\", \"score\":None, \"meta\":None} for i,v in answers.items()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create DocumentsStore and calculate Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"document_store = FAISSDocumentStore(similarity=\"dot_product\", embedding_dim=512)\n",
"document_store.write_documents(docs_default + docs_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"retriever = EmbeddingRetriever(\n",
" document_store=document_store, \n",
" embedding_model=\"sentence-transformers/distiluse-base-multilingual-cased-v1\")\n",
"\n",
"document_store.update_embeddings(retriever, batch_size=10000)"
]
},
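{
"cell_type": "markdown",
"metadata": {},
"source": [
"The FAISS index built above lives in memory, with document metadata in the default SQLite database. If it is meant to be reused by the fact-checking pipeline, it has to be saved explicitly. A minimal sketch, assuming Haystack 1.x's `FAISSDocumentStore.save()`; the file name `faiss_index.faiss` is a placeholder, not taken from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Persist the index so it can be reloaded later with FAISSDocumentStore.load()\n",
"# (the file name is an assumption; adjust it to the project's layout)\n",
"document_store.save(index_path=\"faiss_index.faiss\")"
]
},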
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/luid/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /home/luid/nltk_data...\n",
"[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n"
]
},
{
"ename": "NotImplementedError",
"evalue": "Currently, NLTK pos_tag only supports English and Russian (i.e. lang='eng' or lang='rus')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/luid/Projetos/Fact_Checking_Blue_Amazon/ETL/embeddings_base.ipynb Célula 12\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 11\u001b[0m palavras \u001b[39m=\u001b[39m word_tokenize(sentenca, language\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mportuguese\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 13\u001b[0m \u001b[39m# POS-tagging das palavras\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m pos_tags \u001b[39m=\u001b[39m pos_tag(palavras, lang\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mpor\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 16\u001b[0m \u001b[39m# Exibindo os resultados\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[39mprint\u001b[39m(pos_tags)\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/nltk/tag/__init__.py:166\u001b[0m, in \u001b[0;36mpos_tag\u001b[0;34m(tokens, tagset, lang)\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 142\u001b[0m \u001b[39mUse NLTK's currently recommended part of speech tagger to\u001b[39;00m\n\u001b[1;32m 143\u001b[0m \u001b[39mtag the given list of tokens.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[39m:rtype: list(tuple(str, str))\u001b[39;00m\n\u001b[1;32m 164\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 165\u001b[0m tagger \u001b[39m=\u001b[39m _get_tagger(lang)\n\u001b[0;32m--> 166\u001b[0m \u001b[39mreturn\u001b[39;00m _pos_tag(tokens, tagset, tagger, lang)\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/nltk/tag/__init__.py:114\u001b[0m, in \u001b[0;36m_pos_tag\u001b[0;34m(tokens, tagset, tagger, lang)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_pos_tag\u001b[39m(tokens, tagset\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, tagger\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, lang\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[1;32m 112\u001b[0m \u001b[39m# Currently only supports English and Russian.\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[39mif\u001b[39;00m lang \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m [\u001b[39m\"\u001b[39m\u001b[39meng\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mrus\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[0;32m--> 114\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 115\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mCurrently, NLTK pos_tag only supports English and Russian \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 116\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m(i.e. lang=\u001b[39m\u001b[39m'\u001b[39m\u001b[39meng\u001b[39m\u001b[39m'\u001b[39m\u001b[39m or lang=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mrus\u001b[39m\u001b[39m'\u001b[39m\u001b[39m)\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 117\u001b[0m )\n\u001b[1;32m 118\u001b[0m \u001b[39m# Throws Error if tokens is of string type\u001b[39;00m\n\u001b[1;32m 119\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(tokens, \u001b[39mstr\u001b[39m):\n",
"\u001b[0;31mNotImplementedError\u001b[0m: Currently, NLTK pos_tag only supports English and Russian (i.e. lang='eng' or lang='rus')"
]
}
],
"source": [
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk import pos_tag\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"\n",
"# Sentença de exemplo\n",
"sentenca = \"O gato está no telhado.\"\n",
"\n",
"# Tokenização da sentença em palavras\n",
"palavras = word_tokenize(sentenca, language='portuguese')\n",
"\n",
"# POS-tagging das palavras\n",
"pos_tags = pos_tag(palavras, lang='por')\n",
"\n",
"# Exibindo os resultados\n",
"print(pos_tags)"
]
},
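{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cells use `tokenizer` and `model`, which are never defined in this notebook (they were presumably created in a cell that was removed). The cell below is a hypothetical reconstruction so that those cells can run: it loads a Hugging Face tokenizer and a token-classification head with two labels (verb / not verb). `MODEL_NAME` is an assumption, not the checkpoint that produced the outputs shown further down, and a freshly initialised head will not reproduce them."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
"\n",
"# Hypothetical checkpoint: the model actually used here is not recorded in the notebook\n",
"MODEL_NAME = \"neuralmind/bert-base-portuguese-cased\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
"# num_labels=2 assumes a binary verb / non-verb tagging head (randomly initialised here)\n",
"model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=2)"
]
},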
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Eu gosto de programar em Python.\"\n",
"inputs = tokenizer(sentence, return_tensors=\"pt\")\n",
"outputs = model(**inputs)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"predicted_labels = torch.argmax(outputs.logits, dim=2)\n",
"verb_indices = [(i,label) for i, label in enumerate(predicted_labels[0])]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0, tensor(1)),\n",
" (1, tensor(1)),\n",
" (2, tensor(1)),\n",
" (3, tensor(1)),\n",
" (4, tensor(0)),\n",
" (5, tensor(0)),\n",
" (6, tensor(1)),\n",
" (7, tensor(1)),\n",
" (8, tensor(0)),\n",
" (9, tensor(1)),\n",
" (10, tensor(1))]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"verb_indices"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Verbos na sentença: ['gosto', 'de', '##r', 'em', '##thon']\n"
]
}
],
"source": [
"predicted_labels = torch.argmax(outputs.logits, dim=2)\n",
"verb_indices = [i for i, label in enumerate(predicted_labels[0]) if label == 1]\n",
"\n",
"verbs = [tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][i].item()) for i in verb_indices]\n",
"print(\"Verbos na sentença:\", verbs)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-11-28 18:26:39.155987: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
"2023-11-28 18:26:39.300399: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
"2023-11-28 18:26:39.300771: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n"
]
}
],
"source": [
"import spacy\n",
"from spacy.lang.pt.examples import sentences "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares \n",
"\n",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes.São Francisco considera banir os robôs de entrega que andam pelas calçadas \n",
"\n",
"Londres é a maior cidade do Reino Unido \n",
"\n"
]
}
],
"source": [
"\n",
"# Alguns exemplos fornecidos pela própria biblioteca\n",
"for s in sentences:\n",
" print(s, '\\n')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares\n"
]
}
],
"source": [
"# Criando o objeto spacy\n",
"nlp = spacy.load(\"pt_core_news_lg\")\n",
"doc = nlp(sentences[0])\n",
"print(doc.text)\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"doc = nlp(\"A amazonia azul e a defesa maritma\")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"for token in doc:\n",
" verb_count = 0\n",
" if token.pos_ == 'VERB':\n",
" verb_count +=1"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"verb_count"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A DET\n",
"amazonia NOUN\n",
"azul ADJ\n",
"e CCONJ\n",
"a DET\n",
"defesa NOUN\n",
"maritma NOUN\n"
]
}
],
"source": [
"for token in doc:\n",
" print(token.text, token.pos_)\n"
]
},
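{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a follow-up sketch (not part of the original notebook): the same `token.pos_` check can pull the verbs out of a sentence that actually contains some, which is what the earlier transformer-based attempt was trying to do."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract verb tokens with the spaCy pipeline loaded above\n",
"example_doc = nlp(sentences[0])\n",
"spacy_verbs = [token.text for token in example_doc if token.pos_ == 'VERB']\n",
"print(spacy_verbs)"
]
},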
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}