{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from haystack.nodes import PreProcessor, EmbeddingRetriever\n",
"from haystack.document_stores import FAISSDocumentStore\n",
"from haystack.utils import convert_files_to_docs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preprocess Documents"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### BLAB-Wiki"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'PreProcessor' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/luid/Projetos/Fact_Checking_Blue_Amazon/ETL/embeddings_base.ipynb Célula 5\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m preprocessor \u001b[39m=\u001b[39m PreProcessor(\n\u001b[1;32m 2\u001b[0m clean_empty_lines\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 3\u001b[0m clean_whitespace\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 4\u001b[0m clean_header_footer\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 5\u001b[0m split_by\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39msentence\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m split_length\u001b[39m=\u001b[39m\u001b[39m2\u001b[39m,\n\u001b[1;32m 7\u001b[0m split_overlap\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m,\n\u001b[1;32m 8\u001b[0m split_respect_sentence_boundary\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 10\u001b[0m all_docs \u001b[39m=\u001b[39m convert_files_to_docs(dir_path\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m./Fontes/Wiki_Pages/\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 11\u001b[0m docs_default \u001b[39m=\u001b[39m preprocessor\u001b[39m.\u001b[39mprocess(all_docs)\n",
"\u001b[0;31mNameError\u001b[0m: name 'PreProcessor' is not defined"
]
}
],
"source": [
"preprocessor = PreProcessor(\n",
" clean_empty_lines=True,\n",
" clean_whitespace=True,\n",
" clean_header_footer=False,\n",
" split_by=\"sentence\",\n",
" split_length=2,\n",
" split_overlap=1,\n",
" split_respect_sentence_boundary=False)\n",
"\n",
"all_docs = convert_files_to_docs(dir_path=\"./Fontes/Wiki_Pages/\")\n",
"docs_default = preprocessor.process(all_docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### QA Source"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# QA sentences\n",
"QA_path = \"./Fontes/QA_Base/\"\n",
"\n",
"train = pd.read_parquet(QA_path + 'train.parquet')['new_long_answers']\n",
"test = pd.read_parquet(QA_path + 'test.parquet')['new_long_answers']\n",
"validation = pd.read_parquet(QA_path + 'validation.parquet')['new_long_answers']\n",
"\n",
"answers = pd.concat([train,test,validation])\n",
"\n",
"docs_list = [{\"content\": v, \"content_type\": \"text\", \"score\":None, \"meta\":None} for i,v in answers.items()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create DocumentsStore and calculate Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"document_store = FAISSDocumentStore(similarity=\"dot_product\", embedding_dim=512)\n",
"document_store.write_documents(docs_default + docs_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"retriever = EmbeddingRetriever(\n",
" document_store=document_store, \n",
" embedding_model=\"sentence-transformers/distiluse-base-multilingual-cased-v1\")\n",
"\n",
"document_store.update_embeddings(retriever, batch_size=10000)"
]
},
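{
"cell_type": "markdown",
"metadata": {},
"source": [
"The FAISS index built above lives in memory, with document metadata in the default SQLite database. If it is meant to be reused by the fact-checking pipeline, it has to be saved explicitly. A minimal sketch, assuming Haystack 1.x's `FAISSDocumentStore.save()`; the file name `faiss_index.faiss` is a placeholder, not taken from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Persist the index so it can be reloaded later with FAISSDocumentStore.load()\n",
"# (the file name is an assumption; adjust it to the project's layout)\n",
"document_store.save(index_path=\"faiss_index.faiss\")"
]
},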
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/luid/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /home/luid/nltk_data...\n",
"[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n"
]
},
{
"ename": "NotImplementedError",
"evalue": "Currently, NLTK pos_tag only supports English and Russian (i.e. lang='eng' or lang='rus')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/luid/Projetos/Fact_Checking_Blue_Amazon/ETL/embeddings_base.ipynb Célula 12\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 11\u001b[0m palavras \u001b[39m=\u001b[39m word_tokenize(sentenca, language\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mportuguese\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 13\u001b[0m \u001b[39m# POS-tagging das palavras\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m pos_tags \u001b[39m=\u001b[39m pos_tag(palavras, lang\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mpor\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 16\u001b[0m \u001b[39m# Exibindo os resultados\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[39mprint\u001b[39m(pos_tags)\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/nltk/tag/__init__.py:166\u001b[0m, in \u001b[0;36mpos_tag\u001b[0;34m(tokens, tagset, lang)\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 142\u001b[0m \u001b[39mUse NLTK's currently recommended part of speech tagger to\u001b[39;00m\n\u001b[1;32m 143\u001b[0m \u001b[39mtag the given list of tokens.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[39m:rtype: list(tuple(str, str))\u001b[39;00m\n\u001b[1;32m 164\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 165\u001b[0m tagger \u001b[39m=\u001b[39m _get_tagger(lang)\n\u001b[0;32m--> 166\u001b[0m \u001b[39mreturn\u001b[39;00m _pos_tag(tokens, tagset, tagger, lang)\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/nltk/tag/__init__.py:114\u001b[0m, in \u001b[0;36m_pos_tag\u001b[0;34m(tokens, tagset, tagger, lang)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_pos_tag\u001b[39m(tokens, tagset\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, tagger\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, lang\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[1;32m 112\u001b[0m \u001b[39m# Currently only supports English and Russian.\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[39mif\u001b[39;00m lang \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m [\u001b[39m\"\u001b[39m\u001b[39meng\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mrus\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[0;32m--> 114\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 115\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mCurrently, NLTK pos_tag only supports English and Russian \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 116\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m(i.e. lang=\u001b[39m\u001b[39m'\u001b[39m\u001b[39meng\u001b[39m\u001b[39m'\u001b[39m\u001b[39m or lang=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mrus\u001b[39m\u001b[39m'\u001b[39m\u001b[39m)\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 117\u001b[0m )\n\u001b[1;32m 118\u001b[0m \u001b[39m# Throws Error if tokens is of string type\u001b[39;00m\n\u001b[1;32m 119\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(tokens, \u001b[39mstr\u001b[39m):\n",
"\u001b[0;31mNotImplementedError\u001b[0m: Currently, NLTK pos_tag only supports English and Russian (i.e. lang='eng' or lang='rus')"
]
}
],
"source": [
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk import pos_tag\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"\n",
"# Sentença de exemplo\n",
"sentenca = \"O gato está no telhado.\"\n",
"\n",
"# Tokenização da sentença em palavras\n",
"palavras = word_tokenize(sentenca, language='portuguese')\n",
"\n",
"# POS-tagging das palavras\n",
"pos_tags = pos_tag(palavras, lang='por')\n",
"\n",
"# Exibindo os resultados\n",
"print(pos_tags)"
]
},
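{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cells use `tokenizer` and `model`, which are never defined in this notebook (they were presumably created in a cell that was removed). The cell below is a hypothetical reconstruction so that those cells can run: it loads a Hugging Face tokenizer and a token-classification head with two labels (verb / not verb). `MODEL_NAME` is an assumption, not the checkpoint that produced the outputs shown further down, and a freshly initialised head will not reproduce them."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
"\n",
"# Hypothetical checkpoint: the model actually used here is not recorded in the notebook\n",
"MODEL_NAME = \"neuralmind/bert-base-portuguese-cased\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
"# num_labels=2 assumes a binary verb / non-verb tagging head (randomly initialised here)\n",
"model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=2)"
]
},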
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Eu gosto de programar em Python.\"\n",
"inputs = tokenizer(sentence, return_tensors=\"pt\")\n",
"outputs = model(**inputs)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"predicted_labels = torch.argmax(outputs.logits, dim=2)\n",
"verb_indices = [(i,label) for i, label in enumerate(predicted_labels[0])]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0, tensor(1)),\n",
" (1, tensor(1)),\n",
" (2, tensor(1)),\n",
" (3, tensor(1)),\n",
" (4, tensor(0)),\n",
" (5, tensor(0)),\n",
" (6, tensor(1)),\n",
" (7, tensor(1)),\n",
" (8, tensor(0)),\n",
" (9, tensor(1)),\n",
" (10, tensor(1))]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"verb_indices"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Verbos na sentença: ['gosto', 'de', '##r', 'em', '##thon']\n"
]
}
],
"source": [
"predicted_labels = torch.argmax(outputs.logits, dim=2)\n",
"verb_indices = [i for i, label in enumerate(predicted_labels[0]) if label == 1]\n",
"\n",
"verbs = [tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][i].item()) for i in verb_indices]\n",
"print(\"Verbos na sentença:\", verbs)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-11-28 18:26:39.155987: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
"2023-11-28 18:26:39.300399: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
"2023-11-28 18:26:39.300771: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n"
]
}
],
"source": [
"import spacy\n",
"from spacy.lang.pt.examples import sentences "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares \n",
"\n",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes.São Francisco considera banir os robôs de entrega que andam pelas calçadas \n",
"\n",
"Londres é a maior cidade do Reino Unido \n",
"\n"
]
}
],
"source": [
"\n",
"# Alguns exemplos fornecidos pela própria biblioteca\n",
"for s in sentences:\n",
" print(s, '\\n')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares\n"
]
}
],
"source": [
"# Criando o objeto spacy\n",
"nlp = spacy.load(\"pt_core_news_lg\")\n",
"doc = nlp(sentences[0])\n",
"print(doc.text)\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"doc = nlp(\"A amazonia azul e a defesa maritma\")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"for token in doc:\n",
" verb_count = 0\n",
" if token.pos_ == 'VERB':\n",
" verb_count +=1"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"verb_count"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A DET\n",
"amazonia NOUN\n",
"azul ADJ\n",
"e CCONJ\n",
"a DET\n",
"defesa NOUN\n",
"maritma NOUN\n"
]
}
],
"source": [
"for token in doc:\n",
" print(token.text, token.pos_)\n"
]
},
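{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a follow-up sketch (not part of the original notebook): the same `token.pos_` check can pull the verbs out of a sentence that actually contains some, which is what the earlier transformer-based attempt was trying to do."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract verb tokens with the spaCy pipeline loaded above\n",
"example_doc = nlp(sentences[0])\n",
"spacy_verbs = [token.text for token in example_doc if token.pos_ == 'VERB']\n",
"print(spacy_verbs)"
]
},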
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}