## Imports

In [None]:
import pandas as pd

from haystack.nodes import PreProcessor, EmbeddingRetriever
from haystack.document_stores import FAISSDocumentStore
from haystack.utils import convert_files_to_docs

## Preprocess Documents

### BLAB-Wiki

In [1]:
preprocessor = PreProcessor(
 clean_empty_lines=True,
 clean_whitespace=True,
 clean_header_footer=False,
 split_by="sentence",
 split_length=2,
 split_overlap=1,
 split_respect_sentence_boundary=False)

all_docs = convert_files_to_docs(dir_path="./Fontes/Wiki_Pages/")
docs_default = preprocessor.process(all_docs)

NameError: name 'PreProcessor' is not defined

### QA Source

In [None]:
# QA sentences
QA_path = "./Fontes/QA_Base/"

train = pd.read_parquet(QA_path + 'train.parquet')['new_long_answers']
test = pd.read_parquet(QA_path + 'test.parquet')['new_long_answers']
validation = pd.read_parquet(QA_path + 'validation.parquet')['new_long_answers']

answers = pd.concat([train,test,validation])

docs_list = [{"content": v, "content_type": "text", "score":None, "meta":None} for i,v in answers.items()]

## Create DocumentsStore and calculate Embeddings

In [None]:
document_store = FAISSDocumentStore(similarity="dot_product", embedding_dim=512)
document_store.write_documents(docs_default + docs_list)

In [None]:
retriever = EmbeddingRetriever(
 document_store=document_store, 
 embedding_model="sentence-transformers/distiluse-base-multilingual-cased-v1")

document_store.update_embeddings(retriever, batch_size=10000)

In [10]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Sentença de exemplo
sentenca = "O gato está no telhado."

# Tokenização da sentença em palavras
palavras = word_tokenize(sentenca, language='portuguese')

# POS-tagging das palavras
pos_tags = pos_tag(palavras, lang='por')

# Exibindo os resultados
print(pos_tags)

[nltk_data] Downloading package punkt to /home/luid/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /home/luid/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.


NotImplementedError: Currently, NLTK pos_tag only supports English and Russian (i.e. lang='eng' or lang='rus')

In [3]:
sentence = "Eu gosto de programar em Python."
inputs = tokenizer(sentence, return_tensors="pt")
outputs = model(**inputs)

In [8]:
predicted_labels = torch.argmax(outputs.logits, dim=2)
verb_indices = [(i,label) for i, label in enumerate(predicted_labels[0])]

In [9]:
verb_indices

[(0, tensor(1)),
 (1, tensor(1)),
 (2, tensor(1)),
 (3, tensor(1)),
 (4, tensor(0)),
 (5, tensor(0)),
 (6, tensor(1)),
 (7, tensor(1)),
 (8, tensor(0)),
 (9, tensor(1)),
 (10, tensor(1))]

In [7]:
predicted_labels = torch.argmax(outputs.logits, dim=2)
verb_indices = [i for i, label in enumerate(predicted_labels[0]) if label == 1]

verbs = [tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][i].item()) for i in verb_indices]
print("Verbos na sentença:", verbs)

Verbos na sentença: ['gosto', 'de', '##r', 'em', '##thon']


In [11]:
import spacy
from spacy.lang.pt.examples import sentences 

2023-11-28 18:26:39.155987: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-28 18:26:39.300399: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-28 18:26:39.300771: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [12]:

# Alguns exemplos fornecidos pela própria biblioteca
for s in sentences:
 print(s, '\n')



Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares 

Carros autônomos empurram a responsabilidade do seguro para os fabricantes.São Francisco considera banir os robôs de entrega que andam pelas calçadas 

Londres é a maior cidade do Reino Unido 



In [29]:
# Criando o objeto spacy
nlp = spacy.load("pt_core_news_lg")
doc = nlp(sentences[0])
print(doc.text)


Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares


In [34]:
doc = nlp("A amazonia azul e a defesa maritma")

In [36]:
for token in doc:
 verb_count = 0
 if token.pos_ == 'VERB':
 verb_count +=1

In [37]:
verb_count

0

In [35]:
for token in doc:
 print(token.text, token.pos_)


A DET
amazonia NOUN
azul ADJ
e CCONJ
a DET
defesa NOUN
maritma NOUN
