{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from haystack.nodes import PreProcessor, EmbeddingRetriever\n", "from haystack.document_stores import FAISSDocumentStore\n", "from haystack.utils import convert_files_to_docs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocess Documents" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### BLAB-Wiki" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'PreProcessor' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m/home/luid/Projetos/Fact_Checking_Blue_Amazon/ETL/embeddings_base.ipynb Célula 5\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m preprocessor \u001b[39m=\u001b[39m PreProcessor(\n\u001b[1;32m 2\u001b[0m clean_empty_lines\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 3\u001b[0m clean_whitespace\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 4\u001b[0m clean_header_footer\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 5\u001b[0m split_by\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39msentence\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m split_length\u001b[39m=\u001b[39m\u001b[39m2\u001b[39m,\n\u001b[1;32m 7\u001b[0m split_overlap\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m,\n\u001b[1;32m 8\u001b[0m split_respect_sentence_boundary\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 10\u001b[0m all_docs \u001b[39m=\u001b[39m convert_files_to_docs(dir_path\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m./Fontes/Wiki_Pages/\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 11\u001b[0m docs_default \u001b[39m=\u001b[39m preprocessor\u001b[39m.\u001b[39mprocess(all_docs)\n", "\u001b[0;31mNameError\u001b[0m: name 'PreProcessor' is not defined" ] } ], "source": [ "preprocessor = PreProcessor(\n", " clean_empty_lines=True,\n", " clean_whitespace=True,\n", " clean_header_footer=False,\n", " split_by=\"sentence\",\n", " split_length=2,\n", " split_overlap=1,\n", " split_respect_sentence_boundary=False)\n", "\n", "all_docs = convert_files_to_docs(dir_path=\"./Fontes/Wiki_Pages/\")\n", "docs_default = preprocessor.process(all_docs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### QA Source" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# QA sentences\n", "QA_path = \"./Fontes/QA_Base/\"\n", "\n", "train = pd.read_parquet(QA_path + 'train.parquet')['new_long_answers']\n", "test = pd.read_parquet(QA_path + 'test.parquet')['new_long_answers']\n", "validation = pd.read_parquet(QA_path + 'validation.parquet')['new_long_answers']\n", "\n", "answers = pd.concat([train,test,validation])\n", "\n", "docs_list = [{\"content\": v, \"content_type\": \"text\", \"score\":None, \"meta\":None} for i,v in answers.items()]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create DocumentsStore and calculate Embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "document_store = FAISSDocumentStore(similarity=\"dot_product\", embedding_dim=512)\n", "document_store.write_documents(docs_default + 
docs_list)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "retriever = EmbeddingRetriever(\n", " document_store=document_store, \n", " embedding_model=\"sentence-transformers/distiluse-base-multilingual-cased-v1\")\n", "\n", "document_store.update_embeddings(retriever, batch_size=10000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /home/luid/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] /home/luid/nltk_data...\n", "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n" ] }, { "ename": "NotImplementedError", "evalue": "Currently, NLTK pos_tag only supports English and Russian (i.e. lang='eng' or lang='rus')", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m/home/luid/Projetos/Fact_Checking_Blue_Amazon/ETL/embeddings_base.ipynb Célula 12\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 11\u001b[0m palavras \u001b[39m=\u001b[39m word_tokenize(sentenca, language\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mportuguese\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 13\u001b[0m \u001b[39m# POS-tagging das palavras\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m pos_tags \u001b[39m=\u001b[39m pos_tag(palavras, lang\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mpor\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 16\u001b[0m \u001b[39m# Exibindo os resultados\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[39mprint\u001b[39m(pos_tags)\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/nltk/tag/__init__.py:166\u001b[0m, in \u001b[0;36mpos_tag\u001b[0;34m(tokens, tagset, lang)\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 142\u001b[0m \u001b[39mUse NLTK's currently recommended part of speech tagger to\u001b[39;00m\n\u001b[1;32m 143\u001b[0m \u001b[39mtag the given list of tokens.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[39m:rtype: list(tuple(str, str))\u001b[39;00m\n\u001b[1;32m 164\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 165\u001b[0m tagger \u001b[39m=\u001b[39m _get_tagger(lang)\n\u001b[0;32m--> 166\u001b[0m \u001b[39mreturn\u001b[39;00m _pos_tag(tokens, tagset, tagger, lang)\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/nltk/tag/__init__.py:114\u001b[0m, in \u001b[0;36m_pos_tag\u001b[0;34m(tokens, tagset, tagger, lang)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_pos_tag\u001b[39m(tokens, tagset\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, tagger\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, lang\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[1;32m 112\u001b[0m \u001b[39m# Currently only supports English and Russian.\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[39mif\u001b[39;00m lang \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m [\u001b[39m\"\u001b[39m\u001b[39meng\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mrus\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[0;32m--> 114\u001b[0m \u001b[39mraise\u001b[39;00m 
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /home/luid/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] /home/luid/nltk_data...\n", "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n" ] }, { "ename": "NotImplementedError", "evalue": "Currently, NLTK pos_tag only supports English and Russian (i.e. lang='eng' or lang='rus')", "output_type": "error", "traceback": [ "NotImplementedError: Currently, NLTK pos_tag only supports English and Russian (i.e. lang='eng' or lang='rus')" ] } ], "source": [ "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk import pos_tag\n", "nltk.download('punkt')\n", "nltk.download('averaged_perceptron_tagger')\n", "\n", "# Example sentence\n", "sentenca = \"O gato está no telhado.\"\n", "\n", "# Tokenize the sentence into words\n", "palavras = word_tokenize(sentenca, language='portuguese')\n", "\n", "# POS-tag the words (fails: NLTK's pos_tag ships no Portuguese tagger)\n", "pos_tags = pos_tag(palavras, lang='por')\n", "\n", "# Print the results\n", "print(pos_tags)" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# NOTE: `tokenizer` and `model` are not defined anywhere in this notebook; they are\n", "# assumed to be a Hugging Face tokenizer / token-classification model pair loaded in\n", "# a session cell that was not saved (the exact checkpoint is unknown).\n", "sentence = \"Eu gosto de programar em Python.\"\n", "inputs = tokenizer(sentence, return_tensors=\"pt\")\n", "outputs = model(**inputs)" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", "predicted_labels = torch.argmax(outputs.logits, dim=2)\n", "verb_indices = [(i, label) for i, label in enumerate(predicted_labels[0])]" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, tensor(1)),\n", " (1, tensor(1)),\n", " (2, tensor(1)),\n", " (3, tensor(1)),\n", " (4, tensor(0)),\n", " (5, tensor(0)),\n", " (6, tensor(1)),\n", " (7, tensor(1)),\n", " (8, tensor(0)),\n", " (9, tensor(1)),\n", " (10, tensor(1))]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "verb_indices" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Verbos na sentença: ['gosto', 'de', '##r', 'em', '##thon']\n" ] } ], "source": [ "predicted_labels = torch.argmax(outputs.logits, dim=2)\n", "verb_indices = [i for i, label in enumerate(predicted_labels[0]) if label == 1]\n", "\n", "# The '##' fragments in the output are WordPiece sub-tokens, so label id 1 clearly\n", "# does not correspond to a clean verb tag for this (unknown) checkpoint.\n", "verbs = [tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][i].item()) for i in verb_indices]\n", "print(\"Verbos na sentença:\", verbs)" ] },
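{ "cell_type": "markdown", "metadata": {}, "source": [ "Added sketch (not part of the original notebook): rather than hard-coding label id 1 as \"verb\", the model's own `id2label` mapping (a standard `transformers` config attribute) can be printed next to each sub-token. This reuses the `tokenizer`, `model`, `inputs` and `outputs` objects assumed above." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch under the same assumption that `tokenizer`/`model` are a Hugging Face\n", "# token-classification pair loaded earlier in the session.\n", "import torch\n", "\n", "predicted_labels = torch.argmax(outputs.logits, dim=2)[0]\n", "tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0].tolist())\n", "for token, label_id in zip(tokens, predicted_labels):\n", "    # id2label maps integer label ids to the model's label names\n", "    print(token, model.config.id2label[label_id.item()])" ] },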
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import spacy\n", "from spacy.lang.pt.examples import sentences" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares \n", "\n", "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.São Francisco considera banir os robôs de entrega que andam pelas calçadas \n", "\n", "Londres é a maior cidade do Reino Unido \n", "\n" ] } ], "source": [ "# A few Portuguese example sentences shipped with spaCy itself\n", "for s in sentences:\n", "    print(s, '\\n')" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares\n" ] } ], "source": [ "# Create the spaCy pipeline object (large Portuguese model)\n", "nlp = spacy.load(\"pt_core_news_lg\")\n", "doc = nlp(sentences[0])\n", "print(doc.text)\n" ] },
{ "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "doc = nlp(\"A amazonia azul e a defesa maritma\")" ] },
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# Count VERB-tagged tokens; the counter must be initialised once, outside the loop\n", "verb_count = 0\n", "for token in doc:\n", "    if token.pos_ == 'VERB':\n", "        verb_count += 1" ] },
{ "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "verb_count" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A DET\n", "amazonia NOUN\n", "azul ADJ\n", "e CCONJ\n", "a DET\n", "defesa NOUN\n", "maritma NOUN\n" ] } ], "source": [ "for token in doc:\n", "    print(token.text, token.pos_)\n" ] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }