class nerModel:
    """Inference wrapper around a saved BERT token-classification model.

    The constructor loads three artifacts: the ``idx2tag.pkl`` label mapping,
    the ``bert-base-cased`` tokenizer, and the serialized ``model.pt``. The
    model is placed on the GPU when one is available, otherwise on the CPU,
    and switched to eval mode.
    """

    def __init__(self, model_path, idx2tag_path):
        """Load the label mapping, tokenizer and model.

        Args:
            model_path: Directory containing ``model.pt``.
            idx2tag_path: Directory containing ``idx2tag.pkl``.

        Raises:
            OSError: If either file cannot be opened.
        """
        self.ner_model = {}
        # Resolve the device once so every method puts its inputs on the same
        # device as the model.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # NOTE(security): pickle.load and torch.load can execute arbitrary code
        # from the file being read; only point this class at trusted artifacts.
        with open(os.path.join(idx2tag_path, "idx2tag.pkl"), 'rb') as idx2tag_file:
            self.idx2tag = pickle.load(idx2tag_file)

        # NOTE(review): do_lower_case=True is combined with the *cased*
        # checkpoint; presumably this matches how the model was trained --
        # confirm before changing, since it alters the produced token ids.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True)

        # map_location + .to() guarantee the weights end up on self.device no
        # matter which device the checkpoint was saved from.
        self.model = torch.load(os.path.join(model_path, "model.pt"), map_location=self.device)
        self.model.to(self.device)
        self.model.eval()

    def _forward(self, input_sentence):
        """Tokenize one sentence and run it through the model without gradients.

        Returns:
            ``(input_ids, output)`` where ``input_ids`` is the single-row id
            tensor on ``self.device`` and ``output`` is whatever the model
            returned for it.
        """
        token_ids = self.tokenizer.encode(input_sentence)
        input_ids = torch.tensor([token_ids]).to(self.device)
        with torch.no_grad():
            output = self.model(input_ids)
        return input_ids, output

    def do_pridict(self, input_sentence):
        """Tag every token of ``input_sentence``.

        Word pieces that start with ``##`` are glued back onto the preceding
        token; the merged token keeps the label predicted for its first piece.

        Args:
            input_sentence: Raw text to tag.

        Returns:
            ``(new_tokens, new_labels, output)``: the merged tokens, one label
            per merged token looked up in ``self.idx2tag``, and the raw model
            output.
        """
        input_ids, output = self._forward(input_sentence)
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0].to('cpu').tolist())
        new_tokens, new_labels = [], []
        for token, label_idx in zip(tokens, label_indices[0]):
            # The `new_tokens` guard keeps a leading "##" piece from raising
            # IndexError; such a piece is then treated as a token of its own.
            if token.startswith("##") and new_tokens:
                new_tokens[-1] = new_tokens[-1] + token[2:]
            else:
                new_labels.append(self.idx2tag[label_idx])
                new_tokens.append(token)
        return new_tokens, new_labels, output

    # Correctly spelled alias; `do_pridict` is kept for existing callers.
    do_predict = do_pridict

    def predict_proba(self, input_sentence):
        """Return the per-token score matrix for ``input_sentence``.

        Args:
            input_sentence: Raw text to score.

        Returns:
            ``numpy.ndarray`` holding ``output[0][0]``, i.e. the first element
            of the model output for the single sentence in the batch.

        NOTE(review): despite the name, no softmax is applied in this method;
        the values are returned exactly as the model emits them.
        """
        _, output = self._forward(input_sentence)
        return np.array(output[0][0].to('cpu').numpy())
# Build the tagger from the directories that hold the saved model.pt and the
# idx2tag dictionary.
ner_obj = nerModel('/content/model', '/content/asset')

# Sample input; the surrounding newlines are part of the string.
test_sentence = """
Kedir is an engineer work at Amazon.
"""

tokens, labels, output = ner_obj.do_pridict(test_sentence)

# Emit one "label<TAB>token" row per merged token. The [CLS] and [SEP]
# markers are printed as well; nothing is filtered out.
for label, token in zip(labels, tokens):
    print("{}\t{}".format(label, token))