File size: 10,463 Bytes
320f492
1
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":9661596,"sourceType":"datasetVersion","datasetId":5902909},{"sourceId":140348,"sourceType":"modelInstanceVersion","isSourceIdPinned":true,"modelInstanceId":118867,"modelId":142118}],"dockerImageVersionId":30787,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# # This Python 3 environment comes with many helpful analytics libraries installed\n# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# # For example, here's several helpful packages to load\n\n# import numpy as np # linear algebra\n# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# # Input data files are available in the read-only \"../input/\" directory\n# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\n# import os\n# for dirname, _, filenames in os.walk('/kaggle/input'):\n#     for filename in filenames:\n#         print(os.path.join(dirname, filename))\n\n# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"!pip install seqeval","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import pandas as pd\nimport re\nfrom transformers import BertTokenizer, BertForTokenClassification, AdamW, BertTokenizerFast\nfrom nltk.tokenize import sent_tokenize, word_tokenize\nimport torch.nn as nn\nimport torch\nimport tqdm","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import re\nimport nltk\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import word_tokenize\n\n# Download stopwords if not already downloaded\nnltk.download('punkt')\nnltk.download('stopwords')\n\n# Set of stop words (you can add more if needed)\nstop_words = set(stopwords.words('english'))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_file = \"/kaggle/input/miimansa/G1.xlsx\"\nmodel_path = \"/kaggle/input/ner_model/pytorch/default/1/model_weights1.pth\"","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df = pd.read_excel(test_file)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.dropna(inplace=True)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Define the label mapping\nlabel_map = {\n    \"O\": 0,\n    \"B-treatment\": 1, \"I-treatment\": 2,\n    \"B-chronic_disease\": 3, \"I-chronic_disease\": 4,\n    \"B-cancer\": 5, \"I-cancer\": 6,\n    \"B-allergy_name\": 7, \"I-allergy_name\": 8\n}\n\nnum_labels = len(label_map)\nmax_sent_len = 256\n\n# Hyperparameters\nbatch_size = 16\n\n# Define device: Use GPU (cuda) if available, else use CPU\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n\n# Initialize the tokenizer\n# tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")\ntokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')\nmodel = BertForTokenClassification.from_pretrained(\"bert-base-cased\", num_labels=len(label_map))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def preprocess_data(df):\n#     max_len=float('-inf')\n    all_input_ids = []\n    all_attention_masks = []\n    all_labels = []\n\n    for _, row in df.iterrows():\n        text = row['text']\n        entity = row['tags']\n\n        if not pd.isna(entity) and not pd.isna(text):\n\n            entity = entity.split(',')\n            # Remove all empty strings using filter\n            entities = list(filter(lambda x: x.strip(), entity))\n    #         print(\"entities: \", entities)\n\n            tokenized_input = tokenizer(text, truncation=True, padding='max_length', max_length=max_sent_len, return_offsets_mapping=True)\n            input_ids = tokenized_input['input_ids']\n            attention_mask = tokenized_input['attention_mask']\n            offset_mapping = tokenized_input['offset_mapping']\n\n            labels = ['O'] * len(input_ids)\n\n            for entity in entities:\n                start_idx, end_idx, label = entity.split(':')\n                start_idx, end_idx = int(start_idx)-1, int(end_idx)-1\n\n                entity_started = False\n                for idx, (start, end) in enumerate(offset_mapping):\n                    if start_idx <= start < end_idx and end != 0:\n                        if not entity_started:\n                            labels[idx] = f\"B-{label}\"\n                            entity_started = True\n                        else:\n                            labels[idx] = f\"I-{label}\"\n                    elif end < start_idx:\n                        entity_started = False\n\n            all_input_ids.append(input_ids)\n            all_attention_masks.append(attention_mask)\n            all_labels.append([label_map[label] for label in labels])\n\n    # Get processed data\n    processed_data = {\n        \"tokens\": all_input_ids,\n        \"attention_mask\": all_attention_masks,\n        \"labels\": all_labels\n    }\n    return processed_data\n\ntest_processed_data = preprocess_data(df)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"id2label = {v: k for k, v in label_map.items()}\n\ninput_ids = test_processed_data['tokens'][4]\nprint(\"Tokens:\", tokenizer.convert_ids_to_tokens(input_ids))\n\nlabels = test_processed_data['labels'][4]\nprint(\"Labels:\", labels)\nprint(\"Label names:\", [id2label[label] for label in labels])","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from torch.utils.data import DataLoader, Dataset\n\nclass NERDataset(Dataset):\n    def __init__(self, encodings, attention_masks, labels):\n        self.encodings = encodings\n        self.attention_masks = attention_masks\n        self.labels = labels\n\n    def __getitem__(self, idx):\n        item={}\n        item['input_ids'] = torch.tensor(self.encodings[idx])\n        item['labels'] = torch.tensor(self.labels[idx])\n        item['attention_mask'] = torch.tensor(self.attention_masks[idx])\n\n        return item\n\n    def __len__(self):\n        return len(self.labels)\n    \n# Create the test dataset and dataloader\nner_dataset = NERDataset(test_processed_data[\"tokens\"], test_processed_data[\"attention_mask\"], test_processed_data[\"labels\"])\ntest_dataloader = DataLoader(ner_dataset, batch_size=batch_size, shuffle=True)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.metrics import f1_score\nimport numpy as np\nfrom seqeval.metrics import classification_report\n\ndef evaluation(test_dataloaders, model):\n\n    # Evaluation on test dataset\n    model.eval()\n\n    correct_predictions = 0\n    total = 0\n\n    y_true = []\n    y_pred = []\n\n    with torch.no_grad():\n        for batch in tqdm.tqdm(test_dataloaders):\n            input_ids = batch['input_ids'].to(device)\n            labels = batch['labels'].to(device)\n\n            outputs = model(input_ids)\n            # Get predictions by taking the argmax of the logits\n            predictions = torch.argmax(outputs.logits, dim=-1)\n\n            # Convert to numpy arrays\n            labels = labels.cpu().numpy()\n            predictions = predictions.cpu().numpy()\n\n            for label, pred in zip(labels, predictions):\n                # Filter out -100 labels\n                y_true.append([id2label[l] for l in label if l != -100])\n                y_pred.append([id2label[p] for p, l in zip(pred, label) if l != -100])\n\n    print(classification_report(y_true, y_pred))\n    print(\"*\"*40)\n\n    report = classification_report(y_true, y_pred, output_dict=True)\n\n    # Extracting F1 scores for each entity type\n    entity_f1_scores = {}\n    for label in ['treatment', 'chronic_disease', 'cancer', 'allergy_name']:\n        entity_f1_scores[label] = report[label]['f1-score']\n\n    weighted_avg_f1 = report['weighted avg']['f1-score']\n\n    print(\"Entity-wise F1 scores:\")\n    for entity, score in entity_f1_scores.items():\n        print(f\"{entity}: {score:.4f}\")\n    print(f\"Weighted Average F1 score: {weighted_avg_f1:.4f}\")\n\n    return (entity_f1_scores, weighted_avg_f1)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model = BertForTokenClassification.from_pretrained(\"bert-base-cased\", num_labels=len(label_map))\nmodel.load_state_dict(torch.load(model_path))\nmodel.to(device)\n\nT1_results = evaluation(test_dataloader, model)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def get_all_scores(results):\n    score_dict = {'Weighted Average':[]}\n    for result in results:\n        for entity, score in result[0].items():\n            score_dict[entity] = score_dict.get(entity,[])\n            score_dict[entity].append(score)\n        score_dict['Weighted Average'].append(result[1])\n    score_df = pd.DataFrame(score_dict)\n    return score_df","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"all_scores_df = get_all_scores([T1_results]).T\nall_scores_df.columns = [\"Performance on the test set\"]\nall_scores_df","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"all_scores_df.to_csv('all_scores_df.csv')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}