{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"5\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XTREME has 183 configurations\n" ] } ], "source": [ "from datasets import get_dataset_config_names\n", "\n", "xtreme_subsets = get_dataset_config_names(\"xtreme\")\n", "print(f\"XTREME has {len(xtreme_subsets)} configurations\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "panx_subsets = [s for s in xtreme_subsets if s.startswith(\"PAN\")]\n", "panx_subsets[:3]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\"xtreme\", name=\"PAN-X.te\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "from datasets import DatasetDict \n", "\n", "langs = [\"hi\", \"te\", \"ta\", \"en\"]\n", "fracs = [0.5709, 0.0777, 0.6360, 0.1067]\n", "fracs = [frac / sum(fracs) for frac in fracs]\n", "# return a DatasetDict if a key does not exist\n", "panx_ch = defaultdict(DatasetDict)\n", "\n", "for lang, frac in zip(langs, fracs):\n", " # load multilingual corpus \n", " ds = load_dataset(\"xtreme\", name=f\"PAN-X.{lang}\")\n", " # shuffle and downsample each split according to spoken proportion\n", " for split in ds:\n", " panx_ch[lang][split] = (\n", " ds[split]\n", " .shuffle(seed=42)\n", " .select(range(int(frac * ds[split].num_rows)))\n", " )" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hitetaen
Number of training examples20515568561533
\n", "
" ], "text/plain": [ " hi te ta en\n", "Number of training examples 2051 55 6856 1533" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd \n", "\n", "pd.DataFrame({lang: [panx_ch[lang][\"train\"].num_rows] for lang in langs}, \n", " index=[\"Number of training examples\"])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tokens: ['प्रेम', 'चोपड़ा', '-', 'गिरिधारीलाल']\n", "ner_tags: [1, 2, 0, 0]\n", "langs: ['hi', 'hi', 'hi', 'hi']\n" ] } ], "source": [ "element = panx_ch[\"hi\"][\"train\"][0]\n", "for key, value in element.items():\n", " print(f\"{key}: {value}\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)\n", "ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)\n", "langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)\n" ] } ], "source": [ "for key, value in panx_ch[\"hi\"][\"train\"].features.items():\n", " print(f\"{key}: {value}\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)\n" ] } ], "source": [ "tags = panx_ch[\"hi\"][\"train\"].features[\"ner_tags\"].feature\n", "print(tags)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def create_tag_names(batch):\n", " return {\"ner_tags_str\": [tags.int2str(idx) for idx in batch['ner_tags']]}\n", "\n", "panx_hi = panx_ch[\"hi\"].map(create_tag_names)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
Tokensप्रेमचोपड़ा-गिरिधारीलाल
TagsB-PERI-PEROO
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "Tokens प्रेम चोपड़ा - गिरिधारीलाल\n", "Tags B-PER I-PER O O" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hi_example = panx_hi[\"train\"][0]\n", "pd.DataFrame([hi_example[\"tokens\"], hi_example[\"ner_tags_str\"]], ['Tokens', 'Tags'])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PERORGLOC
train945753790
validation177140185
test158151176
\n", "
" ], "text/plain": [ " PER ORG LOC\n", "train 945 753 790\n", "validation 177 140 185\n", "test 158 151 176" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import Counter \n", "\n", "split2freqs = defaultdict(Counter)\n", "for split, dataset in panx_hi.items():\n", " for row in dataset[\"ner_tags_str\"]:\n", " for tag in row:\n", " if tag.startswith(\"B\"):\n", " tag_type = tag.split(\"-\")[1]\n", " split2freqs[split][tag_type] += 1 \n", "\n", "pd.DataFrame.from_dict(split2freqs, orient=\"index\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Multilingual transformers" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer \n", "\n", "bert_model_name = \"bert-base-cased\"\n", "xlmr_model_name = \"xlm-roberta-base\"\n", "bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)\n", "xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "text = \"Jack Sparrow loves New York!\"\n", "bert_tokens = bert_tokenizer(text).tokens()\n", "xlmr_tokens = xlmr_tokenizer(text).tokens()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bert_tokens" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xlmr_tokens" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' Jack Sparrow loves New York!'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# whitespace is preserved by using xlmr which uses the SentencePiece tokenizer\n", "\"\".join(xlmr_tokens).replace(u\"\\u2581\", \" \")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Transformers for Named Entity Recognition" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import torch.nn as nn \n", "from transformers import XLMRobertaConfig\n", "from transformers.modeling_outputs import TokenClassifierOutput\n", "from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "class XLMRobertaForTokenClassification(RobertaPreTrainedModel):\n", " config_class = XLMRobertaConfig\n", "\n", " def __init__(self, config):\n", " super(XLMRobertaForTokenClassification, self).__init__(config)\n", " self.num_labels = config.num_labels\n", " # load model body\n", " self.roberta = RobertaModel(config, add_pooling_layer=False)\n", " # set up token classification head \n", " self.dropout = nn.Dropout(config.hidden_dropout_prob)\n", " self.classifier = nn.Linear(config.hidden_size, config.num_labels)\n", " # load and initialize weights \n", " self.init_weights()\n", "\n", " def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):\n", " # use model body to get encoder representations\n", " outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs)\n", " # apply classifier to encoder representation\n", " sequence_output = self.dropout(outputs[0])\n", " logits = self.classifier(sequence_output)\n", " # calculate losses \n", " loss = None \n", " if labels is not None:\n", " loss_fct = nn.CrossEntropyLoss()\n", " loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))\n", " # return model output object \n", " return TokenClassifierOutput(\n", " loss=loss,\n", " logits=logits,\n", " hidden_states=outputs.hidden_states,\n", " attentions=outputs.attentions\n", " )\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading a Custom Model" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "index2tag = {idx: tag for idx, tag in enumerate(tags.names)}\n", "tag2index = {tag: idx for idx, tag in enumerate(tags.names)}" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tags" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoConfig\n", "\n", "xlmr_config = AutoConfig.from_pretrained(xlmr_model_name, num_labels=tags.num_classes, id2label=index2tag, label2id=tag2index)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "import torch \n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
Tokens<s>▁Jack▁Sparrow▁loves▁New▁York!</s>
Input IDs02176337456155555161723565753382
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9\n", "Tokens ▁Jack ▁Spar row ▁love s ▁New ▁York ! \n", "Input IDs 0 21763 37456 15555 5161 7 2356 5753 38 2" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# test on a small sequence of known entities \n", "input_ids = xlmr_tokenizer.encode(text, return_tensors=\"pt\")\n", "pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=[\"Tokens\", \"Input IDs\"])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of tokens in sequence: 10\n", "Shape of outputs: torch.Size([1, 10, 7])\n" ] } ], "source": [ "outputs = xlmr_model(input_ids.to(device)).logits \n", "predictions = torch.argmax(outputs, dim=-1)\n", "print(f\"Number of tokens in sequence: {len(xlmr_tokens)}\")\n", "print(F\"Shape of outputs: {outputs.shape}\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
Tokens<s>▁Jack▁Sparrow▁loves▁New▁York!</s>
TagsI-ORGI-ORGI-ORGI-ORGI-ORGI-ORGI-ORGI-ORGI-ORGI-ORG
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9\n", "Tokens ▁Jack ▁Spar row ▁love s ▁New ▁York ! \n", "Tags I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preds = [tags.names[p] for p in predictions[0].cpu().numpy()]\n", "pd.DataFrame([xlmr_tokens, preds], index=[\"Tokens\", \"Tags\"])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "def tag_text(text, tags, model, tokenizer):\n", " # get tokens with special characters\n", " tokens = tokenizer(text).tokens()\n", " # encode the sequence into IDs\n", " input_ids = xlmr_tokenizer(text, return_tensors=\"pt\").input_ids.to(device)\n", " # get predictions as a distribution over 7 possible classes \n", " outputs = model(input_ids)[0]\n", " # take argmax to get most likely class per token \n", " predictions = torch.argmax(outputs, dim=2)\n", " # convert to dataframe\n", " preds = [tags.names[p] for p in predictions[0].cpu().numpy()]\n", " return pd.DataFrame([tokens, preds], index=[\"Tokens\", \"Tags\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenizing Texts for NER" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "words, labels = hi_example[\"tokens\"], hi_example[\"ner_tags\"]" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
Tokens<s>▁प्रेम▁चोपड़ा▁-▁गिरिधारीलाल</s>
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7\n", "Tokens ▁प्रेम ▁चोपड़ा ▁- ▁गिरि धारी लाल " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_input = xlmr_tokenizer(hi_example[\"tokens\"], is_split_into_words=True)\n", "tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input[\"input_ids\"])\n", "pd.DataFrame([tokens], index=[\"Tokens\"])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
Tokens<s>▁प्रेम▁चोपड़ा▁-▁गिरिधारीलाल</s>
Word IDsNone012333None
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7\n", "Tokens ▁प्रेम ▁चोपड़ा ▁- ▁गिरि धारी लाल \n", "Word IDs None 0 1 2 3 3 3 None" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# need to mask the subwords after the first subword\n", "word_ids = tokenized_input.word_ids() \n", "pd.DataFrame([tokens, word_ids], index=[\"Tokens\", \"Word IDs\"])" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
Tokens<s>▁प्रेम▁चोपड़ा▁-▁गिरिधारीलाल</s>
Word IDsNone012333None
Label IDs-1001200-100-100-100
LabelsB-PERI-PEROONoneNoneNoneNone
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7\n", "Tokens ▁प्रेम ▁चोपड़ा ▁- ▁गिरि धारी लाल \n", "Word IDs None 0 1 2 3 3 3 None\n", "Label IDs -100 1 2 0 0 -100 -100 -100\n", "Labels B-PER I-PER O O None None None None" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "previous_word_idx = None \n", "label_ids = []\n", "\n", "for word_idx in word_ids:\n", " if word_idx is None or word_idx == previous_word_idx:\n", " label_ids.append(-100)\n", " elif word_idx != previous_word_idx:\n", " label_ids.append(labels[word_idx])\n", " previous_word_idx = word_idx\n", "\n", "labels = [index2tag[l] if l != -100 else \"IGN\" for l in labels]\n", "index = [\"Tokens\", \"Word IDs\", \"Label IDs\", \"Labels\"]\n", "\n", "pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "def tokenize_and_align_labels(examples):\n", " tokenized_inputs = xlmr_tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n", " labels = [] \n", " for idx, label in enumerate(examples[\"ner_tags\"]):\n", " word_ids = tokenized_inputs.word_ids(batch_index=idx)\n", " previous_word_idx = None \n", " label_ids = [] \n", " for word_idx in word_ids:\n", " if word_idx is None or word_idx == previous_word_idx:\n", " label_ids.append(-100)\n", " elif word_idx != previous_word_idx:\n", " label_ids.append(label[word_idx])\n", " previous_word_idx = word_idx\n", " labels.append(label_ids)\n", "\n", " tokenized_inputs[\"labels\"] = labels \n", " return tokenized_inputs" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cab75f81ac8e410f8b2f18a734f39187", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/410 [00:00 \n", " \n", " \n", " [258/258 00:25, Epoch 3/3]\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
10.8404000.4126990.718690
20.3513000.3379110.772507
30.2341000.2847810.824338

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1fcdd1559a68434eaada8b69e921f467", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload file pytorch_model.bin: 0%| | 1.00/1.03G [00:00 main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "To https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-hi\n", " 09586e2..5ce7b8e main -> main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "text/plain": [ "'https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-hi/commit/09586e26dd7362ff4a1437703e88e2e4edd41162'" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.train()\n", "trainer.push_to_hub(commit_message=\"Training completed!\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678910111213
Tokens<s>▁तेजी▁बच्चन▁से▁अमिताभ▁तथा▁अजिता▁दो▁पुत्र▁हुए▁।</s>
TagsOB-PERI-PEROB-PEROB-PERI-PERI-PEROOOOO
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 \\\n", "Tokens ▁तेजी ▁बच्चन ▁से ▁अमिताभ ▁तथा ▁अज िता भ ▁दो \n", "Tags O B-PER I-PER O B-PER O B-PER I-PER I-PER O \n", "\n", " 10 11 12 13 \n", "Tokens ▁पुत्र ▁हुए ▁। \n", "Tags O O O O " ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_hi = \"तेजी बच्चन से अमिताभ तथा अजिताभ दो पुत्र हुए ।\"\n", "tag_text(text_hi, tags, trainer.model, xlmr_tokenizer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Error Analysis" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "tags": [] }, "outputs": [], "source": [ "from torch.nn.functional import cross_entropy\n", "\n", "def forward_pass_with_label(batch):\n", " # convert dict of lists of list of dicts suitable for data collator\n", " features = [dict(zip(batch, t)) for t in zip(*batch.values())]\n", " # pad inputs and labels and put all tensors on device\n", " batch = data_collator(features)\n", " input_ids = batch[\"input_ids\"].to(device)\n", " attention_mask = batch[\"attention_mask\"].to(device)\n", " labels = batch[\"labels\"].to(device)\n", " with torch.no_grad():\n", " # pass data through model\n", " output = trainer.model(input_ids, attention_mask)\n", " # logit.size [batch_size, sequence_length, classes]\n", " # predict class with largest logit value on classes axis \n", " predicted_label = torch.argmax(output.logits, axis=-1).cpu().numpy() \n", " \n", " # calculate loss per token after flattening batch dimension with view \n", " loss = cross_entropy(output.logits.view(-1, 7), labels.view(-1), reduction=\"none\") \n", " # unflatten batch dimension and convert to numpy array \n", " loss = loss.view(len(input_ids), -1).cpu().numpy() \n", " \n", " return {'loss': loss, 'predicted_label': predicted_label}" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f97950abdb1640e19ab6d23070c6390d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/410 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
input_idsattention_masklabelslosspredicted_labelinput_tokens
0[0, 2218, 14136, 5988, 67691, 460, 2][1, 1, 1, 1, 1, 1, 1][IGN, B-ORG, IGN, I-ORG, I-ORG, I-ORG, IGN][0.0, 0.031384714, 0.0, 0.03725087, 0.03313421...[I-LOC, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-LOC][<s>, ▁स, जन, ▁घर, ▁जाना, ▁है, </s>]
1[0, 93019, 7475, 976, 156711, 41612, 3558, 967...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1][IGN, O, IGN, IGN, IGN, B-PER, IGN, IGN, IGN, ...[0.0, 0.002951078, 0.0, 0.0, 0.0, 0.010400972,...[O, O, O, O, O, B-PER, B-PER, I-PER, B-PER, I-...[<s>, ▁पुनर्, प्र, े, षित, ▁फ़, ि, रो, ज़, ▁शा...
2[0, 11026, 3849, 8389, 1471, 871, 76302, 659, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1][IGN, B-LOC, IGN, IGN, IGN, I-LOC, I-LOC, IGN,...[0.0, 0.01155906, 0.0, 0.0, 0.0, 0.01743069, 0...[I-LOC, B-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-L...[<s>, ▁ला, ह, ौ, ल, ▁और, ▁स्प, ी, ति, ▁जिला, <...
3[0, 20571, 3282, 6, 150685, 20, 3813, 1187, 11...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1][IGN, B-PER, IGN, I-PER, IGN, O, O, IGN, IGN, ...[0.0, 0.010598798, 0.0, 0.012605397, 0.0, 0.00...[O, B-PER, I-PER, I-PER, I-PER, O, O, O, O, O][<s>, ▁अस, ित, ▁, सेन, ▁-, ▁था, ने, दार, </s>]
4[0, 9163, 2629, 76183, 1472, 6, 4, 46005, 1187...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1][IGN, B-LOC, IGN, IGN, IGN, I-LOC, IGN, I-LOC,...[0.0, 0.071242474, 0.0, 0.0, 0.0, 0.5964037, 0...[I-LOC, B-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-L...[<s>, ▁मे, म्, फि, स, ▁, ,, ▁टे, ने, सी, </s>]
\n", "" ], "text/plain": [ " input_ids \\\n", "0 [0, 2218, 14136, 5988, 67691, 460, 2] \n", "1 [0, 93019, 7475, 976, 156711, 41612, 3558, 967... \n", "2 [0, 11026, 3849, 8389, 1471, 871, 76302, 659, ... \n", "3 [0, 20571, 3282, 6, 150685, 20, 3813, 1187, 11... \n", "4 [0, 9163, 2629, 76183, 1472, 6, 4, 46005, 1187... \n", "\n", " attention_mask \\\n", "0 [1, 1, 1, 1, 1, 1, 1] \n", "1 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n", "2 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n", "3 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n", "4 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n", "\n", " labels \\\n", "0 [IGN, B-ORG, IGN, I-ORG, I-ORG, I-ORG, IGN] \n", "1 [IGN, O, IGN, IGN, IGN, B-PER, IGN, IGN, IGN, ... \n", "2 [IGN, B-LOC, IGN, IGN, IGN, I-LOC, I-LOC, IGN,... \n", "3 [IGN, B-PER, IGN, I-PER, IGN, O, O, IGN, IGN, ... \n", "4 [IGN, B-LOC, IGN, IGN, IGN, I-LOC, IGN, I-LOC,... \n", "\n", " loss \\\n", "0 [0.0, 0.031384714, 0.0, 0.03725087, 0.03313421... \n", "1 [0.0, 0.002951078, 0.0, 0.0, 0.0, 0.010400972,... \n", "2 [0.0, 0.01155906, 0.0, 0.0, 0.0, 0.01743069, 0... \n", "3 [0.0, 0.010598798, 0.0, 0.012605397, 0.0, 0.00... \n", "4 [0.0, 0.071242474, 0.0, 0.0, 0.0, 0.5964037, 0... \n", "\n", " predicted_label \\\n", "0 [I-LOC, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-LOC] \n", "1 [O, O, O, O, O, B-PER, B-PER, I-PER, B-PER, I-... \n", "2 [I-LOC, B-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-L... \n", "3 [O, B-PER, I-PER, I-PER, I-PER, O, O, O, O, O] \n", "4 [I-LOC, B-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-L... \n", "\n", " input_tokens \n", "0 [, ▁स, जन, ▁घर, ▁जाना, ▁है, ] \n", "1 [, ▁पुनर्, प्र, े, षित, ▁फ़, ि, रो, ज़, ▁शा... \n", "2 [, ▁ला, ह, ौ, ल, ▁और, ▁स्प, ी, ति, ▁जिला, <... \n", "3 [, ▁अस, ित, ▁, सेन, ▁-, ▁था, ने, दार, ] \n", "4 [, ▁मे, म्, फि, स, ▁, ,, ▁टे, ने, सी, ] " ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index2tag[-100] = \"IGN\"\n", "df[\"input_tokens\"] = df[\"input_ids\"].apply(\n", " lambda x: xlmr_tokenizer.convert_ids_to_tokens(x))\n", "df[\"predicted_label\"] = df[\"predicted_label\"].apply(\n", " lambda x: [index2tag[i] for i in x])\n", "df[\"labels\"] = df[\"labels\"].apply(\n", " lambda x: [index2tag[i] for i in x]) \n", "df[\"loss\"] = df.apply(\n", " lambda x: x[\"loss\"][:len(x[\"input_ids\"])], axis=1)\n", "df[\"predicted_label\"] = df.apply(\n", " lambda x: x[\"predicted_label\"][:len(x[\"input_ids\"])], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
input_idsattention_masklabelslosspredicted_labelinput_tokens
022181B-ORG0.03B-ORG▁स
059881I-ORG0.04I-ORG▁घर
0676911I-ORG0.03I-ORG▁जाना
04601I-ORG0.03I-ORG▁है
1930191O0.00O▁पुनर्
1416121B-PER0.01B-PER▁फ़
1517571I-PER0.02I-PER▁शाह
\n", "
" ], "text/plain": [ " input_ids attention_mask labels loss predicted_label input_tokens\n", "0 2218 1 B-ORG 0.03 B-ORG ▁स\n", "0 5988 1 I-ORG 0.04 I-ORG ▁घर\n", "0 67691 1 I-ORG 0.03 I-ORG ▁जाना\n", "0 460 1 I-ORG 0.03 I-ORG ▁है\n", "1 93019 1 O 0.00 O ▁पुनर्\n", "1 41612 1 B-PER 0.01 B-PER ▁फ़\n", "1 51757 1 I-PER 0.02 I-PER ▁शाह" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_tokens = df.apply(pd.Series.explode) \n", "df_tokens = df_tokens.query(\"labels != 'IGN'\")\n", "df_tokens[\"loss\"] = df_tokens[\"loss\"].astype(float).round(2) \n", "df_tokens.head(7)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
input_tokens▁)▁(▁का▁सी▁राज्य▁N▁डी▁र▁क्रिकेट
count2358080397916134
mean0.340.470.420.371.490.917.911.280.591.83
sum79.3637.833.3814.4110.448.167.917.687.637.32
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 \\\n", "input_tokens ▁ ▁) ▁( ▁का ▁सी ▁राज्य ▁N ▁डी ▁र \n", "count 235 80 80 39 7 9 1 6 13 \n", "mean 0.34 0.47 0.42 0.37 1.49 0.91 7.91 1.28 0.59 \n", "sum 79.36 37.8 33.38 14.41 10.44 8.16 7.91 7.68 7.63 \n", "\n", " 9 \n", "input_tokens ▁क्रिकेट \n", "count 4 \n", "mean 1.83 \n", "sum 7.32 " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(df_tokens.groupby(\"input_tokens\")[[\"loss\"]]\n", " .agg([\"count\", \"mean\", \"sum\"])\n", " .droplevel(level=0, axis=1)\n", " .sort_values(by=\"sum\", ascending=False)\n", " .reset_index()\n", " .round(2)\n", " .head(10)\n", " .T\n", ")" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456
labelsI-ORGI-LOCOB-LOCB-ORGI-PERB-PER
count407163976185140259177
mean0.380.710.120.410.520.280.26
sum153.78116.0113.9176.572.7571.8945.95
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6\n", "labels I-ORG I-LOC O B-LOC B-ORG I-PER B-PER\n", "count 407 163 976 185 140 259 177\n", "mean 0.38 0.71 0.12 0.41 0.52 0.28 0.26\n", "sum 153.78 116.0 113.91 76.5 72.75 71.89 45.95" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(\n", " df_tokens.groupby(\"labels\")[[\"loss\"]]\n", " .agg([\"count\", \"mean\", \"sum\"])\n", " .droplevel(level=0, axis=1)\n", " .sort_values(by=\"sum\", ascending=False)\n", " .reset_index()\n", " .round(2)\n", " .T\n", ")" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "tags": [] }, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix \n", "\n", "def plot_confusion_matrix(y_preds, y_true, labels):\n", " cm = confusion_matrix(y_true, y_preds, normalize=\"true\")\n", " fig, ax = plt.subplots(figsize=(6, 6))\n", " disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)\n", " disp.plot(cmap=\"Blues\", values_format=\".2f\", ax=ax, colorbar=False)\n", " plt.title(\"Normalized confusion matrix\")\n", " plt.show() \n", " \n", "plot_confusion_matrix(df_tokens[\"labels\"], df_tokens[\"predicted_label\"], tags.names)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456
tokens▁चिन▁राज्य▁(▁प्राचीन▁चीन▁)</s>
labelsB-ORGI-ORGI-ORGI-ORGI-ORGI-ORGIGN
predsB-LOCI-LOCI-LOCI-LOCI-LOCI-LOCI-LOC
losses3.975.405.004.905.364.880.00
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6\n", "tokens ▁चिन ▁राज्य ▁( ▁प्राचीन ▁चीन ▁)
\n", "labels B-ORG I-ORG I-ORG I-ORG I-ORG I-ORG IGN\n", "preds B-LOC I-LOC I-LOC I-LOC I-LOC I-LOC I-LOC\n", "losses 3.97 5.40 5.00 4.90 5.36 4.88 0.00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
tokens▁एनएस▁(▁NSE▁)</s>
labelsB-ORGIGNIGNI-ORGI-ORGIGNI-ORGIGN
predsOOOOOOOO
losses1.660.000.006.987.910.007.100.00
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7\n", "tokens ▁एन एस ई ▁( ▁N SE ▁)
\n", "labels B-ORG IGN IGN I-ORG I-ORG IGN I-ORG IGN\n", "preds O O O O O O O O\n", "losses 1.66 0.00 0.00 6.98 7.91 0.00 7.10 0.00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678910
tokens▁मु▁बादशाों▁की▁सूची</s>
labelsB-PERIGNIGNIGNI-PERIGNIGNIGNI-PERI-PERIGN
predsB-ORGI-ORGI-ORGI-ORGI-ORGI-ORGI-ORGI-ORGI-ORGI-ORGI-ORG
losses5.800.000.000.004.820.000.000.005.315.510.00
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 \\\n", "tokens ▁मु ग ़ ल ▁बाद शा ह ों ▁की ▁सूची \n", "labels B-PER IGN IGN IGN I-PER IGN IGN IGN I-PER I-PER \n", "preds B-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG \n", "losses 5.80 0.00 0.00 0.00 4.82 0.00 0.00 0.00 5.31 5.51 \n", "\n", " 10 \n", "tokens \n", "labels IGN \n", "preds I-ORG \n", "losses 0.00 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def get_samples(df):\n", " for _, row in df.iterrows():\n", " labels, preds, tokens, losses = [], [], [], [] \n", " for i, mask in enumerate(row[\"attention_mask\"]):\n", " if i not in {0, len(row[\"attention_mask\"])}:\n", " labels.append(row[\"labels\"][i])\n", " preds.append(row[\"predicted_label\"][i])\n", " tokens.append(row[\"input_tokens\"][i])\n", " losses.append(f\"{row['loss'][i]:.2f}\")\n", " df_tmp = pd.DataFrame({\"tokens\": tokens, \"labels\":labels, \"preds\": preds, \"losses\": losses}).T\n", "\n", " yield df_tmp\n", "\n", "df[\"total_loss\"] = df[\"loss\"].apply(sum)\n", "df_tmp = df.sort_values(by=\"total_loss\", ascending=False).head(3)\n", "\n", "for sample in get_samples(df_tmp):\n", " display(sample)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567891011
tokens▁मिलरा▁मज़िक▁(▁सर्बिया▁)</s>
labelsOIGNIGNIGNOIGNIGNOB-LOCIGNOIGN
predsB-LOCI-LOCI-LOCI-LOCI-LOCI-LOCI-LOCOB-LOCI-LOCOO
losses1.380.000.000.001.780.000.000.030.050.000.120.00
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 \\\n", "tokens ▁मिल ो रा ड ▁म ज़ िक ▁( ▁सर् बिया \n", "labels O IGN IGN IGN O IGN IGN O B-LOC IGN \n", "preds B-LOC I-LOC I-LOC I-LOC I-LOC I-LOC I-LOC O B-LOC I-LOC \n", "losses 1.38 0.00 0.00 0.00 1.78 0.00 0.00 0.03 0.05 0.00 \n", "\n", " 10 11 \n", "tokens ▁) \n", "labels O IGN \n", "preds O O \n", "losses 0.12 0.00 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
tokens▁राजेन्द्र▁सिंह▁(▁रज्जू▁भैया▁)</s>
labelsB-PERI-PERI-PERI-PERIGNIGNI-PERIGNI-PERIGN
predsB-PERI-PEROOOI-PERI-PERI-PEROO
losses0.020.012.003.400.000.000.270.001.300.00
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 \\\n", "tokens ▁राजेन्द्र ▁सिंह ▁( ▁र ज् जू ▁भै या ▁) \n", "labels B-PER I-PER I-PER I-PER IGN IGN I-PER IGN I-PER \n", "preds B-PER I-PER O O O I-PER I-PER I-PER O \n", "losses 0.02 0.01 2.00 3.40 0.00 0.00 0.27 0.00 1.30 \n", "\n", " 9 \n", "tokens \n", "labels IGN \n", "preds O \n", "losses 0.00 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_tmp = df.loc[df[\"input_tokens\"].apply(lambda x: u\"\\u2581(\" in x)].head(2)\n", "for sample in get_samples(df_tmp):\n", " display(sample) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cross Lingual Transfer" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "tags": [] }, "outputs": [], "source": [ "def get_f1_score(trainer, dataset):\n", " return trainer.predict(dataset).metrics[\"test_f1\"]" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "F1-score of [hi] model on [hi] dataset: 0.798\n" ] } ], "source": [ "f1_scores = defaultdict(dict)\n", "f1_scores[\"hi\"][\"hi\"] = get_f1_score(trainer, panx_hi_encoded[\"test\"])\n", "print(f\"F1-score of [hi] model on [hi] dataset: {f1_scores['hi']['hi']:.3f}\")" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678910
Tokens<s>▁నా▁అమ్మ▁కళ్లకు▁నీళ్ళుపోయాయి</s>
TagsOB-PERI-PERI-ORGI-ORGOI-ORGI-ORGOOO
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 10\n", "Tokens ▁నా ▁అమ్మ ▁క ళ్ల కు ▁నీ ళ్ళు ▁ పోయాయి \n", "Tags O B-PER I-PER I-ORG I-ORG O I-ORG I-ORG O O O" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_te = \"నా అమ్మ కళ్లకు నీళ్ళు పోయాయి\"\n", "tag_text(text_te, tags, trainer.model, xlmr_tokenizer)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "tags": [] }, "outputs": [], "source": [ "def evaluate_lang_performance(lang, trainer):\n", " panx_ds = encode_panx_dataset(panx_ch[lang])\n", " return get_f1_score(trainer, panx_ds[\"test\"])" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f6541b0db0ff48c581bb593d1a0918a2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/55 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6e28b43098b64415862bff66128ccb16", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/6856 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ec43bbb1914d4dfbafca85b309e77b03", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1533 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "F1-score of [hi] model pn [te] dataset: 0.525\n", "F1-score of [hi] model pn [ta] dataset: 0.552\n", "F1-score of [hi] model pn [en] dataset: 0.529\n" ] } ], "source": [ "f1_scores[\"hi\"][\"te\"] = evaluate_lang_performance(\"te\", trainer)\n", "f1_scores[\"hi\"][\"ta\"] = evaluate_lang_performance(\"ta\", trainer)\n", "f1_scores[\"hi\"][\"en\"] = evaluate_lang_performance(\"en\", trainer)\n", "print(f\"F1-score of [hi] model pn [te] dataset: {f1_scores['hi']['te']:.3f}\")\n", "print(f\"F1-score of [hi] model pn [ta] dataset: {f1_scores['hi']['ta']:.3f}\")\n", "print(f\"F1-score of [hi] model pn [en] dataset: {f1_scores['hi']['en']:.3f}\")" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "tags": [] }, "outputs": [], "source": [ "def train_on_subset(dataset, num_samples):\n", " train_ds = dataset[\"train\"].shuffle(seed=42).select(range(num_samples)) \n", " valid_ds = dataset[\"validation\"]\n", " test_ds = dataset[\"test\"]\n", " training_args.logging_steps = len(train_ds) // batch_size \n", " \n", " trainer = Trainer(model_init=model_init, args=training_args,\n", " data_collator=data_collator, compute_metrics=compute_metrics,\n", " train_dataset=train_ds, eval_dataset=valid_ds, tokenizer=xlmr_tokenizer)\n", " trainer.train() \n", " if training_args.push_to_hub:\n", " trainer.push_to_hub(commit_message=\"Training completed!\") \n", " \n", " f1_score = get_f1_score(trainer, test_ds)\n", " return pd.DataFrame.from_dict(\n", " {\"num_samples\": [len(train_ds)], \"f1_score\": [f1_score]})" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/data/ssd2/abhiroop/misc/nlp-with-transformers/env/lib/python3.8/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [33/33 00:04, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
11.7380001.1433640.000000
21.1895000.9400890.109989
31.0587000.8537410.205519

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num_samplesf1_score
02500.182665
\n", "
" ], "text/plain": [ " num_samples f1_score\n", "0 250 0.182665" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "panx_ta_encoded = encode_panx_dataset(panx_ch[\"ta\"])\n", "training_args.push_to_hub = False \n", "metrics_df = train_on_subset(panx_ta_encoded, 250)\n", "metrics_df" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/data/ssd2/abhiroop/misc/nlp-with-transformers/env/lib/python3.8/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [63/63 00:07, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
11.4587000.8958340.156504
20.8011000.6399080.489311
30.6169000.5603480.525695

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [126/126 00:14, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
11.1495000.6057090.445705
20.4765000.4139040.620573
30.3330000.3805780.683262

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [252/252 00:28, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
10.8233000.3789210.660228
20.3351000.3009350.726016
30.2269000.2990770.750000

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [501/501 00:55, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
10.5809000.2971010.725979
20.2641000.2390290.749110
30.1773000.2407490.776014

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "for num_samples in [500, 1000, 2000, 4000]:\n", " result = train_on_subset(panx_ta_encoded, num_samples)\n", " metrics_df = pd.concat([metrics_df, result], ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "tags": [] }, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAG2CAYAAACDLKdOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAABS10lEQVR4nO3deVxU5f4H8M+wzMCwDCgwgCKoEOKGC4poqRWJ2vVq3X6aWqKZZVpqpLncEs0S7VbXFsvSQitLW8xMja6RWCru4i4JLpAKiMqwyTbz/P4wjowsggycYfi8X695xZzzzMz3YUbm0/M85xyFEEKAiIiIyEJYyV0AERERkSkx3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFkTXc/P777xg2bBi8vb2hUCiwcePGOz4mISEBPXr0gEqlgr+/P1avXt3gdRIREVHTIWu4KSgoQHBwMJYvX16r9ufOncPDDz+M+++/H0lJSZgxYwaefvpp/PLLLw1cKRERETUVCnO5cKZCocAPP/yAESNGVNtm9uzZ2LJlC44fPy5te/zxx5GTk4O4uLhGqJKIiIjMnY3cBdRFYmIiwsPDjbZFRERgxowZ1T6muLgYxcXF0n2DwYBr166hZcuWUCgUDVUqERERmZAQAnl5efD29oaVVc0TT00q3GRkZECr1Rpt02q1yM3NxY0bN2Bvb1/pMTExMVi4cGFjlUhEREQNKD09Ha1bt66xTZMKN3dj7ty5iIqKku7rdDq0adMG6enpcHZ2lrEyIiIiqq3c3Fz4+PjAycnpjm2bVLjx9PREZmam0bbMzEw4OztXOWoDACqVCiqVqtJ2Z2dnhhsiIqImpjZLSprUeW7CwsIQHx9vtG3btm0ICwuTqSIiIiIyN7KGm/z8fCQlJSEpKQnAzUO9k5KSkJaWBuDmlNK4ceOk9pMnT8bZs2fx8ssv4/Tp0/jwww/xzTff4MUXX5SjfCIiIjJDsoabAwcOoHv37ujevTsAICoqCt27d8f8+fMBAJcvX5aCDgC0bdsWW7ZswbZt2xAcHIy3334bq1atQkREhCz1ExERkfkxm/PcNJbc3FxoNBrodDquuSEiImoi6vL93aTW3BARERHdCcMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIiiyJ7uFm+fDn8/PxgZ2eH0NBQ7Nu3r8b2y5YtQ2BgIOzt7eHj44MXX3wRRUVFjVQtERERmTtZw8369esRFRWF6OhoHDp0CMHBwYiIiEBWVlaV7b/66ivMmTMH0dHROHXqFD799FOsX78e8+bNa+TKiYiIyFzJGm7eeecdTJo0CRMmTEDHjh2xYsUKqNVqfPbZZ1W23717N/r164cxY8bAz88PgwYNwujRo+842kNERETNh2zhpqSkBAcPHkR4ePitYqysEB4ejsTExCof07dvXxw8eFAKM2fPnsXWrVsxdOjQal+nuLgYubm5RjciIiKyXDZyvXB2djb0ej20Wq3Rdq1Wi9OnT1f5mDFjxiA7Oxv33nsvhBAoKyvD5MmTa5yWiomJwcKFC01aOxEREZkv2RcU10VCQgIWL16MDz/8EIcOHcKGDRuwZcsWLFq0qNrHzJ07FzqdTrqlp6c3YsVERETU2GQbuXFzc4O1tTUyMzONtmdmZsLT07PKx7z66qt48skn8fTTTwMAunTpgoKCAjzzzDP497//DSuryllNpVJBpVKZvgNERERklmQbuVEqlejZsyfi4+OlbQaDAfHx8QgLC6vyMYWFhZUCjLW1NQBACNFwxRIREVGTIdvIDQBERUUhMjISISEh6N27N5YtW4aCggJMmDABADBu3Di0atUKMTExAIBhw4bhnXfeQffu3REaGoqUlBS8+uqrGDZsmBRyiIiIqHmTNdyMGjUKV65cwfz585GRkYFu3bohLi5OWmSclpZmNFLzyiuvQKFQ4JVXXsHFixfh7u6OYcOG4Y033pCrC0RERGRmFKKZzefk5uZCo9FAp9PB2dlZ7nKIiIioFury/d2kjpYiIiIiuhOGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkUWS9cCYRERE1PWV6A3Q3SqG7UYqcv/+be6MUOYU3f3Z3UmF07zay1cdwQ0RE1AwZDAJ5xWXQFZaHlJJbgaXwZljRVQgsFW/5xWU1PnePNi4MN0RERFR3QggUluhvCyHGIeX2YCKFl6JSCFG/13dS2cDZ3hYualtoKvy3vbujaTp4lxhuiIiIZFZUqr81UnKjFLrCW9M9uhul0BWWVDsNVGaoX0Kxt7WGxv5mKNH8HU409rZwsb8VWJyln5XSfmc7G9hYm+fSXYYbIiIiE6i4DqU8hNw+tVP+39zbpoGKSg31em1bawU09kpo7G0qhZCKt4ojLOWBRWVjbaLfgPlguCEiIvrb7etQbl+Loiu8+3Uod2KlwM0pnvKRESmk/B1Y7JUVthsHFntbaygUChP9Fpo+hhsiIrIoVa9DufNalJzCUuQVlaKeszzSOpTqRkrKQ0rF/c72tnBS2cDKigHFFBhuiIjILFW1DqXympOSKqeBSvX1Syh2tlZGIaTSWhT17dM9SrNfh9KcMNwQEVGDKdMbkFtUJoWQqtahVDr0+O8RFtOsQ6kmhFSY/qk4glIeWixxHUpzwnBDREQ1Kl+HUvEkbVWtQ6lquqe+61AUCtS4MLZ8mqeqaSCuQ2m+GG6IiJqBiutQ6npOlNwb9V+H4qiyqTqc3BZSKu7nOhS6Www3RERNSHGZvtqRkvIpn4Zch3Kno3aqmgbiOhRqbAw3RESN7PZ1KEa3Cidva4h1KDZWCmlUxOX2EHLbyds0auPDku1suQ6FmgaGGyKiu1DVOpTarkUxxToUZ7vqRkoq3q98yLFayXUoZPkYboio2apuHcrtZ4+tai1KQ6xDqc1aFI2a61CI7oThhoiavOIyfbVnjzU+9LjyNJCp1qFUd9ROxcWxFaeBnO1tYct1KEQNguGGiMxGqd6ArLxiZOUWVbsOpdJ0z40Sk6xDqThaUjGEaCpco8fltjZch0JknhhuiKhRFJXqkaErwmVdETJyb+CyrgiZ0v2b/83OL4a4y4GU8nUo1R+1U8ValL8XzHIdCpFlYbghonrLKyqtEFyKbv2s+zvE5BbhemFprZ7L1loBd0cVNGplpTPIVjr0uMJaFCc7rkMhopsYboioWkIIXC+8GVzKR1tuBZdbQaa2R//Y2VrBW2MPrbMdvDR28NSU/9cens4377d0UDKkEFG9MNwQNVN6g8DV/GJcrjjKklt5qqikrHbrWZzsbKSg4uV8K7ho//6vl7M9nO1tOP1DRA2O4YbIApWUGZCVd2uUJTO3qMKoyw1k6IqQlVeMsloey9zSQVlhlMXu71EWe6P7Dir+OSEi88C/RkRNzI0SvTQdVN1UUW0X5lopAA+nCqMsRtNFN8OLh7OKV0gmoiaF4YbIjBgtzK1wZFHFxbo5dViYW2mUxdl4qsjdUcVr/hCRxWG4IWoE5Qtzy6eEqpsqKijR1+r57G2tb00J3bYot3x7CzUX5hJR88RwQ1RPeoNA9t8LczPKw0tu5ami2i7MdbazgZfGvtqpIk+NHZztuDCXiKg6DDdENSgpMyAz9/ZRlltTRRm6ImTmFUNfy4W5bo7Kv6eK7OGpUd0MKxXCi6fGDmol/1kSEdUH/4pSs1W+MLf6qaKbC3Nrw0oBaJ1vH2UxXuvChblERI2D4YYsjhACecVlRmfJzdAVVzqySHejdgtzldZW0GpU8HK2r+Jw6JtTRW6OSi7MJSIyEww31OQUleqRkpVf41RRbRfmqpXWtwKLs/2tI4kqnISuhYOS61uIiJoQhhtqUs5lF2Dsyj24pCu6Y1uNvW0Voyy3poq0zlyYS0RkiRhuqMlIv1aIMSv34LKuCE52NvBtqZZGW6qaLuLCXCKi5ol//alJuKy7gTGrbgYbfw9HrHumD9wcVXKXRUREZogrIMnsZeUVYezKvUi/dgO+LdVY+3Qogw0REVWL4YbM2rWCEjyxai/OZheglYs9vprUB1pnO7nLIiIiM8ZwQ2ZLV1iKJ1btxZ+Z+dA6q/DVpFC0crGXuywiIjJzDDdklvKKSjEudh9OXs6Fm6MSa5/uA9+WDnKXRURETQDDDZmdwpIyPLV6P46k58BFbYsvnw6Fv4ej3GUREVETwXBDZqWoVI9Jnx/A/vPX4WRngy8nhqKDp7PcZRERURPCcENmo6TMgOe+PIhdKVfhoLTGmqd6o3MrjdxlERFRE8NwQ2ahVG/AC18fwvbkK7CztcJn43uhRxtXucsiIqImiOGGZKc3CER9cwS/nMiE0sYKK8eFILRdS7nLIiKiJorhhmRlMAjM/v4ofjpyCTZWCnw0tgfuC3CXuywiImrCGG5INkIIzN90HN8d/AvWVgq8P7o7HgzSyl0WERE1cQw3JAshBF7fcgpf7kmDQgG8MzIYQ7p4yV0WERFZAIYbksVb/0vGpzvPAQCWPtoVw7u1krkiIiKyFAw31Ojejz+D5dtTAQCvDe+Ekb18ZK6IiIgsCcMNNaqVv5/F29v+BAD8e2gQxoX5yVsQERFZHIYbajSfJ57HG1tPAQBeeugeTOrfTuaKiIjIEjHcUKNYvz8N8388AQCYen97vPBggMwVERGRpWK4oQa38fBFzNlwDAAw8d62mDkoUOaKiIjIkjHcUIP6+dhlvPTtEQgBPNGnDV55OAgKhULusoiIyILJHm6WL18OPz8/2NnZITQ0FPv27auxfU5ODqZOnQovLy+oVCrcc8892Lp1ayNVS3URfyoTL3x9GHqDwP/1bI3X/tmZwYaIiBqcjZwvvn79ekRFRWHFihUIDQ3FsmXLEBERgeTkZHh4eFRqX1JSgoceeggeHh747rvv0KpVK1y4cAEuLi6NXzzV6I8zV/Dcl4dQZhD4Z7A3lvyrK6ysGGyIiKjhKYQQQq4XDw0NRa9evfDBBx8AAAwGA3x8fPDCCy9gzpw5ldqvWLEC//nPf3D69GnY2tre1Wvm5uZCo9FAp9PB2dm5XvVT1facvYrxsftQVGpARCctPhjTA7bWsg8SEhFRE1aX72/ZvnFKSkpw8OBBhIeH3yrGygrh4eFITEys8jGbNm1CWFgYpk6dCq1Wi86dO2Px4sXQ6/XVvk5xcTFyc3ONbtRwDl64jomr96Oo1ID7A93x/mgGGyIialyyfetkZ2dDr9dDqzW+UKJWq0VGRkaVjzl79iy+++476PV6bN26Fa+++irefvttvP7669W+TkxMDDQajXTz8eHZcBvK8Ys6jI/dh4ISPfr5t8RHT/SE0obBhoiIGleT+uYxGAzw8PDAJ598gp49e2LUqFH497//jRUrVlT7mLlz50Kn00m39PT0Rqy4+TidkYsnPt2LvKIy9PZrgZXjQmBnay13WURE1AzJtqDYzc0N1tbWyMzMNNqemZkJT0/PKh/j5eUFW1tbWFvf+tIMCgpCRkYGSkpKoFQqKz1GpVJBpVKZtngykpKVjydW7UVOYSm6+bjg0/EhUCtlXatORETNmGwjN0qlEj179kR8fLy0zWAwID4+HmFhYVU+pl+/fkhJSYHBYJC2/fnnn/Dy8qoy2FDDu3C1AGNX7UF2fgk6ejljzYTecLK7u8XeREREpiDrtFRUVBRWrlyJNWvW4NSpU3juuedQUFCACRMmAADGjRuHuXPnSu2fe+45XLt2DdOnT8eff/6JLVu2YPHixZg6dapcXWjWLubcwJiVe5GZW4x7tI748ulQaNQMNkREJC9Z5w5GjRqFK1euYP78+cjIyEC3bt0QFxcnLTJOS0uDldWt/OXj44NffvkFL774Irp27YpWrVph+vTpmD17tlxdaLYyc4swZuUeXMy5gbZuDvjy6VC0cODoGRERyU/W89zIgee5qb/s/GKM+jgRqVcK0NrVHt88GwZvF3u5yyIiIgvWJM5zQ01TTmEJnli1F6lXCuClscPXk/ow2BARkVm5q3BTVlaGX3/9FR9//DHy8vIAAJcuXUJ+fr5JiyPzkltUiic/3YfTGXlwd1Jh7dOh8GmhlrssIiIiI3Vec3PhwgUMHjwYaWlpKC4uxkMPPQQnJycsXboUxcXFNZ5zhpquguIyTIjdj2MXdWjhoMTap0PRzt1R7rKIiIgqqfPIzfTp0xESEoLr16/D3v7WdMQjjzxidFg3WY4bJXpMXLMfBy9ch7OdDb6Y2Bv3aJ3kLouIiKhKdR65+eOPP7B79+5K55Xx8/PDxYsXTVYYmYfiMj2e+eIA9py9BkeVDT6fGIpO3hq5yyIiIqpWnUduDAZDlReq/Ouvv+DkxP+btySlegOmrj2MP85kw97WGrETeqGbj4vcZREREdWozuFm0KBBWLZsmXRfoVAgPz8f0dHRGDp0qClrIxmV6Q2YsS4Jv57KhNLGCqsiQ9DLr4XcZREREd1Rnc9zk56ejsGDB0MIgTNnziAkJARnzpyBm5sbfv/9d3h4eDRUrSbB89zcmcEgMPPbI9hw+CJsrRX4ZFwI7g807/eViIgsW12+v+u85sbHxwdHjhzB+vXrceTIEeTn52PixIkYO3as0QJjapqEEPj3xmPYcPgirK0UeH90DwYbIiJqUuo0clNaWooOHTpg8+bNCAoKasi6GoyU/C5dqjr5WVsDdna37hcUVP9kVlZAxUBXl7aFhUB1v3qFAlCr767tjRtAhQuLVuLgUG1bIQQW/pKC1fsvwkoBLHu8O/4Z7H1zZ1ERUMVaqyqf905t1eqbdQNAcTFQVmaatvb2N3/PAFBSApSWmqatnd3Nz0Vd25aW3mxfHZUKsLGpe9uyspu/i+oolYCtbd3b6vU337vq2NrebF/XtgbDzc+aKdra2Nz8XQA3/00UFpqmbV3+3TfjvxE1tuXfiLq35d+Imz/X8m9EnWZeRB15e3uLkydP1vVhZkOn0wkAQnfzT0Hl29Chxg9Qq6tuBwgxYIBxWze36tuGhBi39fWtvm3HjsZtO3asvq2vr3HbkJDq27q5GbcdMEDaZwDE4gHjhe/szcJ39mbxbY8hxm2HDq3+eW//GD32WM1t8/NvtY2MrLltVtattlOm1Nz23LlbbWfOrLnt8eO32kZH19x2375bbd98s+a227ffavvBBzW33bz5VtvY2JrbfvPNrbbffFNz29jYW203b6657Qcf3Gq7fXvNbd9881bbfftqbhsdfavt8eM1t50581bbc+dqbjtlyq22WVk1t42MvNU2P7/mto89JozU1LaZ/o2odFOrjdvyb8RN/BtxUwP8jZC+v3U6cSd1XlA8depULF26FGU1pWNqcpb1G4OP+zwGAHj9l+V47PQOmSsiIiK6O3VeUFx+sj5HR0d06dIFDhWHGgFs2LDBpAWaGqelKg85f7QrDUt/OwsAeHVQe0wM9anclkPOdW/LIeebP3Na6u7amtHfiFq15d+Iurfl34ibPzfAtFSdw82ECRNq3B8bG1uXp2t0PFrK2Gc7z+G1zScBALMiAjH1fn+ZKyIiIqqsQY+WMvfwQrX31d40KdhMe8CfwYaIiCxCncNNuStXriA5ORkAEBgYCHd3d5MVRQ3v+4N/4d8bjwEAnunfDi8+dI/MFREREZlGnRcUFxQU4KmnnoKXlxf69++P/v37w9vbGxMnTkRhTXPaZDZ+OnIJs747AiGAyDBfzB3SAYryOWsiIqImrs7hJioqCjt27MBPP/2EnJwc5OTk4Mcff8SOHTvw0ksvNUSNZEL/O5GBGeuTYBDA4718ED2sE4MNERFZlDovKHZzc8N3332HgQMHGm3fvn07Ro4ciStXrpiyPpNrzguKE5Kz8MznB1GiN2BEN2+8PbIbrK0YbIiIyPzV5fu7ziM3hYWF0Gq1lbZ7eHhwWsqM7U7JxrNf3Aw2Q7t44q3/C2awISIii1TncBMWFobo6GgUVTh+/caNG1i4cCHCwsJMWhyZxoHz1zBxzQEUlxkQHuSBZaO6w8a6zm89ERFRk1Dno6XeffddREREoHXr1ggODgYAHDlyBHZ2dvjll19MXiDVz5H0HIyP3Y8bpXrcF+CGD8b0gNKGwYaIiCxXncNN586dcebMGaxduxanT58GAIwePZpXBTdDJy/lYtxn+5BfXIbQti3wyZMhsLO1lrssIiKiBnVX57lRq9WYNGmSqWshEzqTmYcnPt0L3Y1S9Gjjgk/H94K9ksGGiIgsX53nJ2JiYvDZZ59V2v7ZZ59h6dKlJimK6udcdgHGrNqLawUl6NzKGbETesNRddfnayQiImpS6hxuPv74Y3To0KHS9k6dOmHFihUmKYruXvq1QoxduQdX8orRwdMJXzwVCo29rdxlERERNZo6h5uMjAx4eXlV2u7u7o7Lly+bpCi6O5d1NzBm1R5c0hWhvbsDvpgYClcHpdxlERERNao6hxsfHx/s2rWr0vZdu3bB29vbJEVR3WXlFWHsyr1Iv3YDbVqosfbpPnB3UsldFhERUaOr80KMSZMmYcaMGSgtLcUDDzwAAIiPj8fLL7/Myy/I5FpBCZ5YtRdnswvQysUeX00KhafGTu6yiIiIZFHncDNr1ixcvXoVU6ZMQUlJCQDAzs4Os2fPxty5c01eINWsVG9A5Gf78GdmPjycVFj7dChau6rlLouIiEg2db62VLn8/HycOnUK9vb2CAgIgErVNKZALO3aUjvPZOOJT/fC2c4GG6b0hb+Hk9wlERERmVyDXluqnKOjI3r16gUnJyekpqbCYDDc7VNRPSSezQYAhAdpGWyIiIhQh3Dz2Wef4Z133jHa9swzz6Bdu3bo0qULOnfujPT0dJMXSDXbnXoVABDWvqXMlRAREZmHWoebTz75BK6urtL9uLg4xMbG4vPPP8f+/fvh4uKChQsXNkiRVLW8olIc/UsHgOGGiIioXK0XFJ85cwYhISHS/R9//BHDhw/H2LFjAQCLFy/GhAkTTF8hVWv/+WvQGwR8W6q5iJiIiOhvtR65uXHjhtECnt27d6N///7S/Xbt2iEjI8O01VGNEv+ekurLURsiIiJJrcONr68vDh48CADIzs7GiRMn0K9fP2l/RkYGNBqN6SukapWvt+nTjuGGiIioXK2npSIjIzF16lScOHECv/32Gzp06ICePXtK+3fv3o3OnTs3SJFU2fWCEpy8nAuA622IiIgqqnW4efnll1FYWIgNGzbA09MT3377rdH+Xbt2YfTo0SYvkKq299xVCAEEeDjCw4lnIyYiIip31yfxa6os5SR+8388js8TLyAyzBcLh3PEjIiILFujnMSP5JUond/GTeZKiIiIzAvDTROUlVeEM1n5UCiAPu1ayF0OERGRWWG4aYLKR206ejnDRa2UuRoiIiLzwnDTBPH8NkRERNVjuGmCdkvhhuttiIiIbmeycJOeno6nnnrKVE9H1fjreiHSrhXC2kqBXm253oaIiOh2Jgs3165dw5o1a0z1dFSN8imprq01cFTV+jRFREREzUatvx03bdpU4/6zZ8/Wuxi6M663ISIiqlmtw82IESOgUChQ0zn/FAqFSYqiqgkhuN6GiIjoDmo9LeXl5YUNGzbAYDBUeTt06FBD1kkAzl8tREZuEZTWVujp6yp3OURERGap1uGmZ8+e0lXBq3KnUR2qv92p2QCA7m1cYGdrLXM1RERE5qnW01KzZs1CQUFBtfv9/f2xfft2kxRFVeOUFBER0Z3VOtzcd999Ne53cHDAgAED6l0QVc1gENhTHm78uZiYiIioOrWeljp79iynnWT0Z1YerhaUwN7WGsGtXeQuh4iIyGzVOtwEBATgypUr0v1Ro0YhMzOzQYqiysoPAQ/xc4XShieWJiIiqk6tvyVvH7XZunVrjWtwyLS43oaIiKh2OATQBOgNAnvO8uR9REREtVHrcKNQKCqdpI8n7WscJy7pkFdUBic7G3Tydpa7HCIiIrNW66OlhBAYP348VCoVAKCoqAiTJ0+Gg4ODUbsNGzaYtkKSpqRC27aEjTUH24iIiGpS63ATGRlpdP+JJ54weTFUtfLFxGGckiIiIrqjWoeb2NjYhqyDqlFSZsD+89cAcL0NERFRbXCOw8wd/SsHhSV6tHBQIlDrJHc5REREZo/hxsyVr7cJa9cSVlZcwE1ERHQnZhFuli9fDj8/P9jZ2SE0NBT79u2r1ePWrVsHhUKBESNGNGyBMiq/WGYfTkkRERHViuzhZv369YiKikJ0dDQOHTqE4OBgREREICsrq8bHnT9/HjNnzrzjNa+asqJSPQ6l5QDgehsiIqLakj3cvPPOO5g0aRImTJiAjh07YsWKFVCr1fjss8+qfYxer8fYsWOxcOFCtGvXrhGrbVyHLlxHSZkBWmcV2rk53PkBREREJG+4KSkpwcGDBxEeHi5ts7KyQnh4OBITE6t93GuvvQYPDw9MnDjxjq9RXFyM3Nxco1tTUfGSCzxhIhERUe3IGm6ys7Oh1+uh1WqNtmu1WmRkZFT5mJ07d+LTTz/FypUra/UaMTEx0Gg00s3Hx6fedTeW8vU2PL8NERFR7ck+LVUXeXl5ePLJJ7Fy5Uq4udXuApJz586FTqeTbunp6Q1cpWnkF5fh6F86ADePlCIiIqLaqfVJ/BqCm5sbrK2tkZmZabQ9MzMTnp6eldqnpqbi/PnzGDZsmLTNYDAAAGxsbJCcnIz27dsbPUalUkmXjGhK9p+/hjKDgE8Le/i0UMtdDhERUZMh68iNUqlEz549ER8fL20zGAyIj49HWFhYpfYdOnTAsWPHkJSUJN3++c9/4v7770dSUlKTmnK6k/JLLvRtV7sRKiIiIrpJ1pEbAIiKikJkZCRCQkLQu3dvLFu2DAUFBZgwYQIAYNy4cWjVqhViYmJgZ2eHzp07Gz3excUFACptb+rK19v09eeUFBERUV3IHm5GjRqFK1euYP78+cjIyEC3bt0QFxcnLTJOS0uDlVWTWhpUbzmFJThx6eZRXVxvQ0REVDcKIYSQu4jGlJubC41GA51OB2dnZ7nLqdIvJzLw7BcH0d7dAfEvDZS7HCIiItnV5fu7eQ2JNBGJFc5vQ0RERHXDcGOGpPU2PL8NERFRnTHcmJkrecX4MzMfANCH622IiIjqjOHGzCSevTklFeTlDFcHpczVEBERNT0MN2bm1nobjtoQERHdDYYbM5PI9TZERET1wnBjRi7m3MD5q4WwtlKgd9sWcpdDRETUJDHcmJHyKakurTRwsrOVuRoiIqKmieHGjJSHmzBOSREREd01hhszIYTgehsiIiITYLgxExeuFuKSrgi21gqE+HK9DRER0d1iuDETu/+ekurexhX2SmuZqyEiImq6GG7MRPklF3gVcCIiovphuDEDQgjsOcuT9xEREZkCw40ZOJOVj+z8EtjZWqFbGxe5yyEiImrSGG7MwO6Um1NSvfxaQGXD9TZERET1wXBjBnbz/DZEREQmw3AjM73h1nobLiYmIiKqP4YbmZ26nIvcojI4qmzQpZVG7nKIiIiaPIYbmZUfAh7atgVsrPl2EBER1Re/TWXG9TZERESmxXAjo1K9AfvOXQMA9G3vJnM1REREloHhRkZH/9KhsEQPV7UtOng6yV0OERGRRWC4kVH5VcD7tGsJKyuFzNUQERFZBoYbGZWvt+ElF4iIiEyH4UYmRaV6HLhwHQAQxvU2REREJsNwI5NDaddRUmaAu5MK7d0d5C6HiIjIYjDcyGRPhSkphYLrbYiIiEyF4UYmXG9DRETUMBhuZFBQXIak9BwAPL8NERGRqTHcyGD/+WsoMwi0drWHTwu13OUQERFZFIYbGSSm8irgREREDYXhRgaJZ/9eb+PPcENERGRqDDeNTFdYiuMXdQCAsHZcb0NERGRqDDeNbO+5qzAIoJ27Azw1dnKXQ0REZHEYbhrZbq63ISIialAMN41sT/l6Gx4CTkRE1CAYbhpRdn4xTmfkAQD6tGshczVERESWieGmEZWP2nTwdEJLR5XM1RAREVkmhptGdOuSC5ySIiIiaigMN41IOnkfrydFRETUYBhuGsll3Q2cyy6AlQLo3ZbrbYiIiBoKw00jKR+16dJKA429rczVEBERWS6Gm0Yind+G622IiIgaFMNNIxBCcL0NERFRI2G4aQRp1wpxMecGbKwU6OXnKnc5REREFo3hphGUj9p0b+MCtdJG5mqIiIgsG8NNI+B6GyIiosbDcNPAhBAVTt7H9TZEREQNjeGmgaVk5SM7vxgqGyt0b+MidzlEREQWj+GmgSX+fT2pED9XqGysZa6GiIjI8jHcNLDdKbyeFBERUWNiuGlABoOQRm54fhsiIqLGwXDTgE5ezoXuRikcVTbo2kojdzlERETNAsNNAyo/v00vP1fYWPNXTURE1Bj4jduAyqekuN6GiIio8TDcNJBSvQF7ud6GiIio0THcNJBjF3UoKNFDY2+Ljl7OcpdDRETUbDDcNJDy9TZ92rWAlZVC5mqIiIiaD4abBrI7NRsA19sQERE1NoabBlBcpseB89cB8HpSREREjY3hpgEcTstBcZkBbo4q+Hs4yl0OERFRs8Jw0wAqXgVcoeB6GyIiosZkFuFm+fLl8PPzg52dHUJDQ7Fv375q265cuRL33XcfXF1d4erqivDw8BrbyyHx7/U2PASciIio8ckebtavX4+oqChER0fj0KFDCA4ORkREBLKysqpsn5CQgNGjR2P79u1ITEyEj48PBg0ahIsXLzZy5VUrLClDUnoOAK63ISIikoNCCCHkLCA0NBS9evXCBx98AAAwGAzw8fHBCy+8gDlz5tzx8Xq9Hq6urvjggw8wbty4O7bPzc2FRqOBTqeDs7Ppzz/z+59XMO6zfWjlYo+ds+/ntBQREZEJ1OX7W9aRm5KSEhw8eBDh4eHSNisrK4SHhyMxMbFWz1FYWIjS0lK0aNGiyv3FxcXIzc01ujWk8vU2YVxvQ0REJAtZw012djb0ej20Wq3Rdq1Wi4yMjFo9x+zZs+Ht7W0UkCqKiYmBRqORbj4+PvWuuybSept2nJIiIiKSg+xrbupjyZIlWLduHX744QfY2dlV2Wbu3LnQ6XTSLT09vcHqyS0qxbGLOgBcTExERCQXGzlf3M3NDdbW1sjMzDTanpmZCU9Pzxof+9Zbb2HJkiX49ddf0bVr12rbqVQqqFQqk9R7J/vOXoNBAG3dHODtYt8or0lERETGZB25USqV6NmzJ+Lj46VtBoMB8fHxCAsLq/Zxb775JhYtWoS4uDiEhIQ0Rqm1UnG9DREREclD1pEbAIiKikJkZCRCQkLQu3dvLFu2DAUFBZgwYQIAYNy4cWjVqhViYmIAAEuXLsX8+fPx1Vdfwc/PT1qb4+joCEdHec8GfOt6Ugw3REREcpE93IwaNQpXrlzB/PnzkZGRgW7duiEuLk5aZJyWlgYrq1sDTB999BFKSkrw2GOPGT1PdHQ0FixY0JilG7maX4zTGXkAgD5cTExERCQb2c9z09ga6jw3W49dxpS1hxCodcIvL/Y32fMSERFR3b6/ZR+5sRQRnTzx0/P3IreoVO5SiIiImjWGGxOxtlKgS2uN3GUQERE1e036PDdEREREt2O4ISIiIovCcENEREQWheGGiIiILArDDREREVkUhhsiIiKyKAw3REREZFEYboiIiMiiMNwQERGRRWG4ISIiIovCcENEREQWhdeWIiJqRvR6PUpLeYFfMk9KpRJWVvUfd2G4ISJqBoQQyMjIQE5OjtylEFXLysoKbdu2hVKprNfzMNwQETUD5cHGw8MDarUaCoVC7pKIjBgMBly6dAmXL19GmzZt6vUZZbghIrJwer1eCjYtW7aUuxyiarm7u+PSpUsoKyuDra3tXT8PFxQTEVm48jU2arVa5kqIalY+HaXX6+v1PAw3RETNBKeiyNyZ6jPKcENEREQWheGGiIjIRBYsWIBu3bo12utt3LgR/v7+sLa2xowZMxrtdatTm/4PHDiwwWtluCEiIrOUkJAAhUJR7e3++++Xu8QGUd7v2hy2/+yzz+Kxxx5Deno6Fi1a1PDFmcCGDRsavFYeLUVERGapb9++uHz5cqXtmzZtwuTJkzFlypS7fu6SkpJ6n0tFbvn5+cjKykJERAS8vb2rbKPX66FQKExyYjxTadGiRYO/hvn0loiIGl9BQfW3oqLat71xo3Zt60CpVMLT09Podv36dcycORPz5s3D//3f/0ltjx8/jiFDhsDR0RFarRZPPvkksrOzpf0DBw7E888/jxkzZsDNzQ0REREAgB07dqB3795QqVTw8vLCnDlzUFZWVmNdCQkJ6N27NxwcHODi4oJ+/frhwoULRm2++OIL+Pn5QaPR4PHHH0deXp60r7i4GNOmTYOHhwfs7Oxw7733Yv/+/QCA8+fPSyNSrq6uUCgUGD9+fJU1ODk5AQAeeOABKBQKJCQkYPXq1XBxccGmTZvQsWNHqFQqpKWl4fr16xg3bhxcXV2hVqsxZMgQnDlzRnq+8sdt3rwZgYGBUKvVeOyxx1BYWIg1a9bAz88Prq6umDZtWq2OZKqp/5yWIiKihuXoWP3tX/8ybuvhUX3bIUOM2/r5Vd2uHnJycjB8+HAMHDjQaFojJycHDzzwALp3744DBw4gLi4OmZmZGDlypNHj16xZA6VSiV27dmHFihW4ePEihg4dil69euHIkSP46KOP8Omnn+L111+vtoaysjKMGDECAwYMwNGjR5GYmIhnnnnG6Cif1NRUbNy4EZs3b8bmzZuxY8cOLFmyRNr/8ssv4/vvv8eaNWtw6NAh+Pv7IyIiAteuXYOPjw++//57AEBycjIuX76Md999t1Idffv2RXJyMgDg+++/x+XLl9G3b18AQGFhIZYuXYpVq1bhxIkT8PDwwPjx43HgwAFs2rQJiYmJEEJg6NChRpfiKCwsxHvvvYd169YhLi4OCQkJeOSRR7B161Zs3boVX3zxBT7++GN89913Nb5Pd+p/oxDNjE6nEwCETqeTuxQiokZx48YNcfLkSXHjxo3KO4Hqb0OHGrdVq6tvO2CAcVs3t6rb3SW9Xi+GDBkigoKCRG5urtG+RYsWiUGDBhltS09PFwBEcnKyEEKIAQMGiO7duxu1mTdvnggMDBQGg0Hatnz5cuHo6Cj0en2VdVy9elUAEAkJCVXuj46OFmq12qjGWbNmidDQUCGEEPn5+cLW1lasXbtW2l9SUiK8vb3Fm2++KYQQYvv27QKAuH79ek2/EnH9+nUBQGzfvl3aFhsbKwCIpKQkaduff/4pAIhdu3ZJ27Kzs4W9vb345ptvjB6XkpIitXn22WeFWq0WeXl50raIiAjx7LPPVlvTnfovxM33Yvr06VU+vqbPal2+v7nmhoioOcvPr36ftbXx/ays6tvevqbj/Pm7Lqkq8+bNQ2JiIvbt2ydNx5Q7cuQItm/fDscqRoZSU1Nxzz33AAB69uxptO/UqVMICwszGnXp168f8vPz8ddffwEAOnbsaFTDvHnzMH78eEREROChhx5CeHg4Ro4cCS8vL6mdn5+fUY1eXl7I+vt3l5qaitLSUvTr10/ab2tri969e+PUqVN1/r1URalUomvXrkb9tLGxQWhoqLStZcuWCAwMNHpNtVqN9u3bS/e1Wi38/PyMfq9arVbqS3Vq6n9jYbghImrOHBzkb3sH69atw1tvvYUtW7YgICCg0v78/HwMGzYMS5curbSvYuhwqGNN3t7eSEpKku6XL4SNjY3FtGnTEBcXh/Xr1+OVV17Btm3b0KdPHwCodNkAhUIBg8FQp9euD3t7+7s6GV5Vdd9NX+TuP8A1N0REZMaSkpIwceJELFmyRFoEfLsePXrgxIkT8PPzg7+/v9GtpkATFBQkrT8pt2vXLjg5OaF169awsbExeq6KR/l0794dc+fOxe7du9G5c2d89dVXtepP+/btpXU/5UpLS7F//35plMhUlyAoFxQUhLKyMuzdu1fadvXqVSQnJxuNTFkShhsiIjJL2dnZGDFiBAYOHIgnnngCGRkZRrcrV64AAKZOnYpr165h9OjR2L9/P1JTU/HLL79gwoQJNQaEKVOmID09HS+88AJOnz6NH3/8EdHR0YiKiqr20Olz585h7ty5SExMxIULF/C///0PZ86cQVBQUK365ODggOeeew6zZs1CXFwcTp48iUmTJqGwsBATJ04EAPj6+kKhUGDz5s24cuUK8muaOqyFgIAADB8+HJMmTcLOnTtx5MgRPPHEE2jVqhWGDx9er+c2V5yWIiIis7RlyxZcuHABFy5cMJpeKufr64vz58/D29sbu3btwuzZszFo0CAUFxfD19cXgwcPrvH8Lq1atcLWrVsxa9YsBAcHo0WLFpg4cSJeeeWVah+jVqtx+vRprFmzBlevXoWXlxemTp2KZ599ttb9WrJkCQwGA5588knk5eUhJCQEv/zyC1xdXaW6Fi5ciDlz5mDChAkYN24cVq9eXevnr0psbCymT5+Of/zjHygpKUH//v2xdevWel1525wpRMXxuGYgNzcXGo0GOp0Ozs7OcpdDRNTgioqKcO7cObRt2xZ2dnZyl0NUrZo+q3X5/ua0FBEREVkUhhsiIiKyKAw3REREZFEYboiIiMiiMNwQERGRRWG4ISIiIovCcENEREQWheGGiIiILArDDREREVkUhhsiImqSBg4ciBkzZshdRqNLSEiAQqFATk6O3KWYLYYbIiIyW+PHj4dCoah0S0lJwYYNG7Bo0aIGr6G5hqjbrV69Gi4uLnKXUSu8cCYREZm1wYMHIzY21mibu7s7rK2tZaqIzB1HboiImiEhBApLymS51fV6zSqVCp6enkY3a2vrSiMqfn5+WLx4MZ566ik4OTmhTZs2+OSTT4yeKz09HSNHjoSLiwtatGiB4cOH4/z589W+9vjx47Fjxw68++670qjR+fPnqxzF2LhxIxQKhXR/wYIF6NatG7744gv4+flBo9Hg8ccfR15entTGYDAgJiYGbdu2hb29PYKDg/Hdd98ZPe/WrVtxzz33wN7eHvfff3+N9ZZLS0vD8OHD4ejoCGdnZ4wcORKZmZl1qq2ihIQETJgwATqdTvo9LFiwAADwxRdfICQkBE5OTvD09MSYMWOQlZV1xxobEkduiIiaoRulenSc/4ssr33ytQiolQ3z9fP2229j0aJFmDdvHr777js899xzGDBgAAIDA1FaWoqIiAiEhYXhjz/+gI2NDV5//XUMHjwYR48ehVKprPR87777Lv7880907twZr732GoCbo0a1lZqaio0bN2Lz5s24fv06Ro4ciSVLluCNN94AAMTExODLL7/EihUrEBAQgN9//x1PPPEE3N3dMWDAAKSnp+PRRx/F1KlT8cwzz+DAgQN46aWXanxNg8EgBZsdO3agrKwMU6dOxahRo5CQkFDr2irq27cvli1bhvnz5yM5ORkA4OjoCAAoLS3FokWLEBgYiKysLERFRWH8+PHYunVrrX9PpsZwQ0REZm3z5s3SFykADBkyBN9++22VbYcOHYopU6YAAGbPno3//ve/2L59OwIDA7F+/XoYDAasWrVKGmGJjY2Fi4sLEhISMGjQoErPp9FooFQqoVar4enpWefaDQYDVq9eDScnJwDAk08+ifj4eLzxxhsoLi7G4sWL8euvvyIsLAwA0K5dO+zcuRMff/wxBgwYgI8++gjt27fH22+/DQAIDAzEsWPHsHTp0mpfMz4+HseOHcO5c+fg4+MDAPj888/RqVMn7N+/H7169bpjbbdTKpXQaDRQKBSVfg9PPfWU9HO7du3w3nvvoVevXsjPzzd63xoTww0RUTNkb2uNk69FyPbadXH//ffjo48+ku47ODhU27Zr167Sz+VfxOVTJEeOHEFKSor0ZV6uqKgIqamp+OOPPzBkyBBp+8cff4yxY8fWqdbb+fn5Gb2el5eXVE9KSgoKCwvx0EMPGT2mpKQE3bt3BwCcOnUKoaGhRvvLg1B1Tp06BR8fHynYAEDHjh3h4uKCU6dOSeGmptrq4uDBg1iwYAGOHDmC69evw2AwALg5NdaxY8c6P58pMNwQETVDCoWiwaaGTM3BwQH+/v61amtra2t0X6FQSF+2+fn56NmzJ9auXVvpce7u7lAqlUhKSpK2abXaal/Hysqq0tqh0tLSOtcDAFu2bEGrVq2M2qlUqmpf21Rqqq22CgoKEBERgYiICKxduxbu7u5IS0tDREQESkpKTFlunTSNTzYREVE99ejRA+vXr4eHhwecnZ2rbFNViFIqldDr9Ubb3N3dkZeXh4KCAmkkqWIwqo2OHTtCpVIhLS0NAwYMqLJNUFAQNm3aZLRtz549NT5vUFAQ0tPTkZ6eLo3enDx5Ejk5OfUaSanq93D69GlcvXoVS5YskV7rwIEDd/0apsKjpYiIqFkYO3Ys3NzcMHz4cPzxxx84d+4cEhISMG3aNPz111/VPs7Pzw979+7F+fPnkZ2dDYPBgNDQUKjVasybNw+pqan46quvsHr16jrV4+TkhJkzZ+LFF1/EmjVrkJqaikOHDuH999/HmjVrAACTJ0/GmTNnMGvWLCQnJ9fqdcLDw9GlSxeMHTsWhw4dwr59+zBu3DgMGDAAISEhdaqxIj8/P+Tn5yM+Ph7Z2dkoLCxEmzZtoFQq8f777+Ps2bPYtGlTo5x76E4YboiIqFlQq9X4/fff0aZNGzz66KMICgrCxIkTUVRUVO1IDgDMnDkT1tbW6NixozTt0qJFC3z55ZfYunUrunTpgq+//lo6NLouFi1ahFdffRUxMTEICgrC4MGDsWXLFrRt2xYA0KZNG3z//ffYuHEjgoODsWLFCixevLjG51QoFPjxxx/h6uqK/v37Izw8HO3atcP69evrXF9Fffv2xeTJkzFq1Ci4u7vjzTffhLu7O1avXo1vv/0WHTt2xJIlS/DWW2/V63VMQSHqesKBJi43NxcajQY6na7GDzMRkaUoKirCuXPn0LZtW9jZ2cldDlG1avqs1uX7myM3REREZFEYboiIiMiiMNwQERGRRWG4ISIiIovCcENE1Ew0s+NHqAky1WeU4YaIyMKVn4m2sLBQ5kqIalZ+VmNr67pdouN2PEMxEZGFs7a2houLi3TdILVaLV04kshcGAwGXLlyBWq1GjY29YsnDDdERM1A+ZWc7+bCiESNxcrKCm3atKl3+Ga4ISJqBhQKBby8vODh4VHlBR6JzIFSqYSVVf1XzDDcEBE1I9bW1vVez0Bk7sxiQfHy5cvh5+cHOzs7hIaGYt++fTW2//bbb9GhQwfY2dmhS5cu2Lp1ayNVSkREROZO9nCzfv16REVFITo6GocOHUJwcDAiIiKqnRfevXs3Ro8ejYkTJ+Lw4cMYMWIERowYgePHjzdy5URERGSOZL9wZmhoKHr16oUPPvgAwM3V0j4+PnjhhRcwZ86cSu1HjRqFgoICbN68WdrWp08fdOvWDStWrLjj6/HCmURERE1PXb6/ZV1zU1JSgoMHD2Lu3LnSNisrK4SHhyMxMbHKxyQmJiIqKspoW0REBDZu3Fhl++LiYhQXF0v3dTodgJu/JCIiImoayr+3azMmI2u4yc7Ohl6vh1arNdqu1Wpx+vTpKh+TkZFRZfuMjIwq28fExGDhwoWVtvv4+Nxl1URERCSXvLw8aDSaGttY/NFSc+fONRrpMRgMuHbtGlq2bGkxJ7HKzc2Fj48P0tPTm8VUG/tr2dhfy9bc+gs0vz43VH+FEMjLy4O3t/cd28oabtzc3GBtbY3MzEyj7ZmZmdIJp27n6elZp/YqlQoqlcpom4uLy90XbcacnZ2bxT+ccuyvZWN/LVtz6y/Q/PrcEP2904hNOVmPllIqlejZsyfi4+OlbQaDAfHx8QgLC6vyMWFhYUbtAWDbtm3VticiIqLmRfZpqaioKERGRiIkJAS9e/fGsmXLUFBQgAkTJgAAxo0bh1atWiEmJgYAMH36dAwYMABvv/02Hn74Yaxbtw4HDhzAJ598Imc3iIiIyEzIHm5GjRqFK1euYP78+cjIyEC3bt0QFxcnLRpOS0szOhVz37598dVXX+GVV17BvHnzEBAQgI0bN6Jz585ydUF2KpUK0dHRlabfLBX7a9nYX8vW3PoLNL8+m0N/ZT/PDREREZEpyX6GYiIiIiJTYrghIiIii8JwQ0RERBaF4YaIiIgsCsONmVqwYAEUCoXRrUOHDtL+oqIiTJ06FS1btoSjoyP+9a9/VTq5YVpaGh5++GGo1Wp4eHhg1qxZKCsra+yuVOn333/HsGHD4O3tDYVCUenaYEIIzJ8/H15eXrC3t0d4eDjOnDlj1ObatWsYO3YsnJ2d4eLigokTJyI/P9+ozdGjR3HffffBzs4OPj4+ePPNNxu6a1W6U3/Hjx9f6f0ePHiwUZum1N+YmBj06tULTk5O8PDwwIgRI5CcnGzUxlSf4YSEBPTo0QMqlQr+/v5YvXp1Q3evktr0d+DAgZXe48mTJxu1aSr9/eijj9C1a1fpJG1hYWH4+eefpf2W9N4Cd+6vJb23VVmyZAkUCgVmzJghbTP791iQWYqOjhadOnUSly9flm5XrlyR9k+ePFn4+PiI+Ph4ceDAAdGnTx/Rt29faX9ZWZno3LmzCA8PF4cPHxZbt24Vbm5uYu7cuXJ0p5KtW7eKf//732LDhg0CgPjhhx+M9i9ZskRoNBqxceNGceTIEfHPf/5TtG3bVty4cUNqM3jwYBEcHCz27Nkj/vjjD+Hv7y9Gjx4t7dfpdEKr1YqxY8eK48ePi6+//lrY29uLjz/+uLG6KblTfyMjI8XgwYON3u9r164ZtWlK/Y2IiBCxsbHi+PHjIikpSQwdOlS0adNG5OfnS21M8Rk+e/asUKvVIioqSpw8eVK8//77wtraWsTFxZldfwcMGCAmTZpk9B7rdLom2d9NmzaJLVu2iD///FMkJyeLefPmCVtbW3H8+HEhhGW9t7XpryW9t7fbt2+f8PPzE127dhXTp0+Xtpv7e8xwY6aio6NFcHBwlftycnKEra2t+Pbbb6Vtp06dEgBEYmKiEOLml6mVlZXIyMiQ2nz00UfC2dlZFBcXN2jtdXX7l73BYBCenp7iP//5j7QtJydHqFQq8fXXXwshhDh58qQAIPbv3y+1+fnnn4VCoRAXL14UQgjx4YcfCldXV6P+zp49WwQGBjZwj2pWXbgZPnx4tY9pyv0VQoisrCwBQOzYsUMIYbrP8Msvvyw6depk9FqjRo0SERERDd2lGt3eXyFufgFW/HK4XVPurxBCuLq6ilWrVln8e1uuvL9CWO57m5eXJwICAsS2bduM+tgU3mNOS5mxM2fOwNvbG+3atcPYsWORlpYGADh48CBKS0sRHh4ute3QoQPatGmDxMREAEBiYiK6dOlidAX1iIgI5Obm4sSJE43bkTo6d+4cMjIyjPqn0WgQGhpq1D8XFxeEhIRIbcLDw2FlZYW9e/dKbfr37w+lUim1iYiIQHJyMq5fv95Ivam9hIQEeHh4IDAwEM899xyuXr0q7Wvq/dXpdACAFi1aADDdZzgxMdHoOcrblD+HXG7vb7m1a9fCzc0NnTt3xty5c1FYWCjta6r91ev1WLduHQoKChAWFmbx7+3t/S1nie/t1KlT8fDDD1eqqym8x7KfoZiqFhoaitWrVyMwMBCXL1/GwoULcd999+H48ePIyMiAUqmsdAFQrVaLjIwMAEBGRobRh6p8f/k+c1ZeX1X1V+yfh4eH0X4bGxu0aNHCqE3btm0rPUf5PldX1wap/24MHjwYjz76KNq2bYvU1FTMmzcPQ4YMQWJiIqytrZt0fw0GA2bMmIF+/fpJZxI31We4uja5ubm4ceMG7O3tG6JLNaqqvwAwZswY+Pr6wtvbG0ePHsXs2bORnJyMDRs2AGh6/T127BjCwsJQVFQER0dH/PDDD+jYsSOSkpIs8r2trr+A5b23ALBu3TocOnQI+/fvr7SvKfz7ZbgxU0OGDJF+7tq1K0JDQ+Hr64tvvvlGlj/Y1LAef/xx6ecuXbqga9euaN++PRISEvDggw/KWFn9TZ06FcePH8fOnTvlLqVRVNffZ555Rvq5S5cu8PLywoMPPojU1FS0b9++scust8DAQCQlJUGn0+G7775DZGQkduzYIXdZDaa6/nbs2NHi3tv09HRMnz4d27Ztg52dndzl3BVOSzURLi4uuOeee5CSkgJPT0+UlJQgJyfHqE1mZiY8PT0BAJ6enpVWrpffL29jrsrrq6r+iv3Lysoy2l9WVoZr165ZxO+gXbt2cHNzQ0pKCoCm29/nn38emzdvxvbt29G6dWtpu6k+w9W1cXZ2luV/Aqrrb1VCQ0MBwOg9bkr9VSqV8Pf3R8+ePRETE4Pg4GC8++67FvveVtffqjT19/bgwYPIyspCjx49YGNjAxsbG+zYsQPvvfcebGxsoNVqzf49ZrhpIvLz85GamgovLy/07NkTtra2iI+Pl/YnJycjLS1NmgMOCwvDsWPHjL4Qt23bBmdnZ2ko1Vy1bdsWnp6eRv3Lzc3F3r17jfqXk5ODgwcPSm1+++03GAwG6Q9LWFgYfv/9d5SWlkpttm3bhsDAQLOakqrKX3/9hatXr8LLywtA0+uvEALPP/88fvjhB/z222+VpstM9RkOCwszeo7yNhXXQjSGO/W3KklJSQBg9B43lf5WxWAwoLi42OLe2+qU97cqTf29ffDBB3Hs2DEkJSVJt5CQEIwdO1b62ezf43ovSaYG8dJLL4mEhARx7tw5sWvXLhEeHi7c3NxEVlaWEOLmYXht2rQRv/32mzhw4IAICwsTYWFh0uPLD8MbNGiQSEpKEnFxccLd3d1sDgXPy8sThw8fFocPHxYAxDvvvCMOHz4sLly4IIS4eSi4i4uL+PHHH8XRo0fF8OHDqzwUvHv37mLv3r1i586dIiAgwOjQ6JycHKHVasWTTz4pjh8/LtatWyfUarUsh0bX1N+8vDwxc+ZMkZiYKM6dOyd+/fVX0aNHDxEQECCKioqaZH+fe+45odFoREJCgtHhsYWFhVIbU3yGyw8lnTVrljh16pRYvny5LIfP3qm/KSkp4rXXXhMHDhwQ586dEz/++KNo166d6N+/f5Ps75w5c8SOHTvEuXPnxNGjR8WcOXOEQqEQ//vf/4QQlvXe3qm/lvbeVuf2I8LM/T1muDFTo0aNEl5eXkKpVIpWrVqJUaNGiZSUFGn/jRs3xJQpU4Srq6tQq9XikUceEZcvXzZ6jvPnz4shQ4YIe3t74ebmJl566SVRWlra2F2p0vbt2wWASrfIyEghxM3DwV999VWh1WqFSqUSDz74oEhOTjZ6jqtXr4rRo0cLR0dH4ezsLCZMmCDy8vKM2hw5ckTce++9QqVSiVatWoklS5Y0VheN1NTfwsJCMWjQIOHu7i5sbW2Fr6+vmDRpktEhlEI0rf5W1VcAIjY2Vmpjqs/w9u3bRbdu3YRSqRTt2rUzeo3Gcqf+pqWlif79+4sWLVoIlUol/P39xaxZs4zOhSJE0+nvU089JXx9fYVSqRTu7u7iwQcflIKNEJb13gpRc38t7b2tzu3hxtzfY4UQQtR//IeIiIjIPHDNDREREVkUhhsiIiKyKAw3REREZFEYboiIiMiiMNwQERGRRWG4ISIiIovCcENEREQWheGGyIKcP38eCoVCOv27OTh9+jT69OkDOzs7dOvWrdFe18/PD8uWLat1+4SEBCgUikrXy2luxo8fjxEjRshdBlG9MNwQmdD48eOhUCiwZMkSo+0bN26EQqGQqSp5RUdHw8HBAcnJyZWuIwMACoWixtuCBQvu6nX3799vdLXmO+nbty8uX74MjUZzV69XFytXrkRwcDAcHR3h4uKC7t27IyYmpsFfl6i5sJG7ACJLY2dnh6VLl+LZZ581+wt01lZJSQmUSuVdPTY1NRUPP/wwfH19q9x/+fJl6ef169dj/vz5SE5OlrY5OjpKPwshoNfrYWNz5z9d7u7udapTqVQ2ytXTP/vsM8yYMQPvvfceBgwYgOLiYhw9ehTHjx9v8Ncmai44ckNkYuHh4fD09Kzx/8QXLFhQaYpm2bJl8PPzk+6XTw8sXrwYWq0WLi4ueO2111BWVoZZs2ahRYsWaN26NWJjYys9/+nTp9G3b1/Y2dmhc+fO2LFjh9H+48ePY8iQIXB0dIRWq8WTTz6J7Oxsaf/AgQPx/PPPY8aMGXBzc0NERESV/TAYDHjttdfQunVrqFQqdOvWDXFxcdJ+hUKBgwcP4rXXXqt2FMbT01O6aTQaKBQK6f7p06fh5OSEn3/+GT179oRKpcLOnTuRmpqK4cOHQ6vVwtHREb169cKvv/5q9Ly3T0spFAqsWrUKjzzyCNRqNQICArBp0yZp/+3TUqtXr4aLiwt++eUXBAUFwdHREYMHDzYKY2VlZZg2bRpcXFzQsmVLzJ49G5GRkTVO62zatAkjR47ExIkT4e/vj06dOmH06NF44403pDb79+/HQw89BDc3N2g0GgwYMACHDh0yeh6FQoGPP/4Y//jHP6BWqxEUFITExESkpKRg4MCBcHBwQN++fZGamio9pvxz9/HHH8PHxwdqtRojR46ETqertl6DwYCYmBi0bdsW9vb2CA4OxnfffSftv379OsaOHQt3d3fY29sjICCgys8kUWNiuCEyMWtrayxevBjvv/8+/vrrr3o912+//YZLly7h999/xzvvvIPo6Gj84x//gKurK/bu3YvJkyfj2WefrfQ6s2bNwksvvYTDhw8jLCwMw4YNw9WrVwEAOTk5eOCBB9C9e3ccOHAAcXFxyMzMxMiRI42eY82aNVAqldi1axdWrFhRZX3vvvsu3n77bbz11ls4evQoIiIi8M9//hNnzpwBcHNUplOnTnjppZdw+fJlzJw5865+D3PmzMGSJUtw6tQpdO3aFfn5+Rg6dCji4+Nx+PBhDB48GMOGDUNaWlqNz7Nw4UKMHDkSR48exdChQzF27Fhcu3at2vaFhYV466238MUXX+D3339HWlqaUR+WLl2KtWvXIjY2Frt27UJubi42btxYYw2enp7Ys2cPLly4UG2bvLw8REZGYufOndizZw8CAgIwdOhQ5OXlGbVbtGgRxo0bh6SkJHTo0AFjxozBs88+i7lz5+LAgQMQQuD55583ekxKSgq++eYb/PTTT4iLi8Phw4cxZcqUamuJiYnB559/jhUrVuDEiRN48cUX8cQTT0iB+dVXX8XJkyfx888/49SpU/joo4/g5uZW4++AqMGZ5PKbRCSEECIyMlIMHz5cCCFEnz59xFNPPSWEEOKHH34QFf+5RUdHi+DgYKPH/ve//xW+vr5Gz+Xr6yv0er20LTAwUNx3333S/bKyMuHg4CC+/vprIYQQ586dEwCMrgZeWloqWrduLZYuXSqEEGLRokVi0KBBRq+dnp4uAEhXXh8wYIDo3r37Hfvr7e0t3njjDaNtvXr1ElOmTJHuBwcHi+jo6Ds+lxBCxMbGCo1GI90vv5r6xo0b7/jYTp06iffff1+67+vrK/773/9K9wGIV155Rbqfn58vAIiff/7Z6LWuX78u1QJApKSkSI9Zvny50Gq10n2tViv+85//SPfLyspEmzZtpM9AVS5duiT69OkjAIh77rlHREZGivXr1xu9z7fT6/XCyclJ/PTTT9X2JzExUQAQn376qbTt66+/FnZ2dtL96OhoYW1tLf766y9p288//yysrKykKzpX/AwXFRUJtVotdu/ebVTPxIkTxejRo4UQQgwbNkxMmDCh2tqJ5MCRG6IGsnTpUqxZswanTp266+fo1KkTrKxu/TPVarXo0qWLdN/a2hotW7ZEVlaW0ePCwsKkn21sbBASEiLVceTIEWzfvh2Ojo7SrUOHDgBgNIXRs2fPGmvLzc3FpUuX0K9fP6Pt/fr1q1efqxISEmJ0Pz8/HzNnzkRQUBBcXFzg6OiIU6dO3XHkpmvXrtLPDg4OcHZ2rvS7q0itVqN9+/bSfS8vL6m9TqdDZmYmevfuLe23tra+4+/Ny8sLiYmJOHbsGKZPn46ysjJERkZi8ODBMBgMAIDMzExMmjQJAQEB0Gg0cHZ2Rn5+fqX+VeyPVqsFAKPPh1arRVFREXJzc6Vtbdq0QatWraT7YWFhMBgMRuucyqWkpKCwsBAPPfSQ0efl888/lz4rzz33HNatW4du3brh5Zdfxu7du2vsP1Fj4IJiogbSv39/REREYO7cuRg/frzRPisrKwghjLaVlpZWeg5bW1uj+wqFospt5V+KtZGfn49hw4Zh6dKllfZ5eXlJPzs4ONT6ORva7bXMnDkT27Ztw1tvvQV/f3/Y29vjscceQ0lJSY3PU9ffXVXtb3/f7lbnzp3RuXNnTJkyBZMnT8Z9992HHTt24P7770dkZCSuXr2Kd999F76+vlCpVAgLC6vUv4r1lR+NV9W2unw+KsrPzwcAbNmyxSgQAYBKpQIADBkyBBcuXMDWrVuxbds2PPjgg5g6dSreeuutu3pNIlPgyA1RA1qyZAl++uknJCYmGm13d3dHRkaG0RelKc9Ns2fPHunnsrIyHDx4EEFBQQCAHj164MSJE/Dz84O/v7/RrS6BxtnZGd7e3ti1a5fR9l27dqFjx46m6Ug1du3ahfHjx+ORRx5Bly5d4OnpifPnzzfoa95Oo9FAq9Vi//790ja9Xl9p4W9tlP++CgoKANzs37Rp0zB06FB06tQJKpXKaMF3faSlpeHSpUvS/T179sDKygqBgYFV1qVSqZCWllbps+Lj4yO1c3d3R2RkJL788kssW7YMn3zyiUlqJbpbHLkhakBdunTB2LFj8d577xltHzhwIK5cuYI333wTjz32GOLi4vDzzz/D2dnZJK+7fPlyBAQEICgoCP/9739x/fp1PPXUUwCAqVOnYuXKlRg9ejRefvlltGjRAikpKVi3bh1WrVoFa2vrWr/OrFmzEB0djfbt26Nbt26IjY1FUlIS1q5da5J+VCcgIAAbNmzAsGHDoFAo8Oqrr9716ER9vPDCC4iJiYG/vz86dOiA999/H9evX6/xnEbPPfccvL298cADD6B169a4fPkyXn/9dbi7u0vTiQEBAfjiiy8QEhKC3NxczJo1C/b29iap2c7ODpGRkXjrrbeQm5uLadOmYeTIkVUeBu/k5ISZM2fixRdfhMFgwL333gudToddu3bB2dkZkZGRmD9/Pnr27IlOnTqhuLgYmzdvloI0kVw4ckPUwF577bVKX7xBQUH48MMPsXz5cgQHB2Pfvn13fSRRVZYsWYIlS5YgODgYO3fuxKZNm6QjWMpHW/R6PQYNGoQuXbpgxowZcHFxMVrfUxvTpk1DVFQUXnrpJXTp0gVxcXHYtGkTAgICTNaXqrzzzjtwdXVF3759MWzYMERERKBHjx4N+ppVmT17NkaPHo1x48YhLCwMjo6OiIiIgJ2dXbWPCQ8Px549e/B///d/uOeee/Cvf/0LdnZ2iI+PR8uWLQEAn376Ka5fv44ePXrgySefxLRp0+Dh4WGSmv39/fHoo49i6NChGDRoELp27YoPP/yw2vaLFi3Cq6++ipiYGAQFBWHw4MHYsmUL2rZtC+Dm+YHmzp2Lrl27on///rC2tsa6detMUivR3VIIU00gExE1cwaDAUFBQRg5ciQWLVokdzmVLFiwABs3bjSry3MQNQROSxER3aULFy7gf//7n3Sm4Q8++ADnzp3DmDFj5C6NqFnjtBQR0V2ysrLC6tWr0atXL/Tr1w/Hjh3Dr7/+yjUnRDLjtBQRERFZFI7cEBERkUVhuCEiIiKLwnBDREREFoXhhoiIiCwKww0RERFZFIYbIiIisigMN0RERGRRGG6IiIjIojDcEBERkUX5f6aWdA/tTUAYAAAAAElFTkSuQmCC", "text/plain": [ "

" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig, ax = plt.subplots()\n", "ax.axhline(f1_scores[\"hi\"][\"ta\"], ls=\"--\", color=\"r\")\n", "metrics_df.set_index(\"num_samples\").plot(ax=ax) \n", "plt.legend([\"Zero-shot from hi\", \"Fine-tuned on ta\"], loc=\"lower right\")\n", "plt.ylim((0, 1))\n", "plt.xlabel(\"Number of Training Samples\")\n", "plt.ylabel(\"F1 Score\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fine-Tuning on Multiple Languages at Once" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "tags": [] }, "outputs": [], "source": [ "from datasets import concatenate_datasets\n", "\n", "def concatenate_splits(corpora):\n", " multi_corpus = DatasetDict()\n", " for split in corpora[0].keys():\n", " multi_corpus[split] = concatenate_datasets([corpus[split] for corpus in corpora]).shuffle(seed=42)\n", " return multi_corpus " ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "tags": [] }, "outputs": [], "source": [ "panx_hi_ta_encoded = concatenate_splits([panx_hi_encoded, panx_ta_encoded])" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Cloning https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-hi-ta into local empty directory.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/ssd2/abhiroop/misc/nlp-with-transformers/env/lib/python3.8/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [1116/1116 02:00, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
10.4863000.2869280.764870
20.2297000.2283810.814884
30.1527000.2197330.841515

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f8b705fe0ca04ad5bc3b2f93cb96ebc3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload file pytorch_model.bin: 0%| | 1.00/1.03G [00:00 main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "To https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-hi-ta\n", " ee5064e..18f9a46 main -> main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "text/plain": [ "'https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-hi-ta/commit/ee5064e5c1507fc85048d794b3dbba4d2a5f0895'" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_args.logging_steps = len(panx_hi_ta_encoded[\"train\"]) // batch_size \n", "training_args.push_to_hub = True \n", "training_args.output_dir = \"xlm-roberta-base-finetuned-panx-hi-ta\"\n", "\n", "trainer = Trainer(model_init=model_init, args=training_args, \n", " data_collator=data_collator, compute_metrics=compute_metrics, \n", " tokenizer=xlmr_tokenizer, train_dataset=panx_hi_ta_encoded[\"train\"],\n", " eval_dataset=panx_hi_ta_encoded[\"validation\"])\n", "\n", "trainer.train()\n", "trainer.push_to_hub(commit_message=\"Training completed!\")" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "619f9075ad024d4384544673facf26f5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/2051 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "F1-score of [hi-fa] model on [hi] dataset: 0.823\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d7dda0a00a5040dda9437f82ff4c9340", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/55 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "F1-score of [hi-fa] model on [te] dataset: 0.543\n" ] }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "F1-score of [hi-fa] model on [ta] dataset: 0.820\n" ] }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "F1-score of [hi-fa] model on [en] dataset: 0.444\n" ] } ], "source": [ "for lang in langs:\n", " f1 = evaluate_lang_performance(lang, trainer)\n", " print(f\"F1-score of [hi-fa] model on [{lang}] dataset: {f1:.3f}\")" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Cloning https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-te into local empty directory.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/ssd2/abhiroop/misc/nlp-with-transformers/env/lib/python3.8/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [9/9 00:01, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
12.3928001.8361610.066667
21.3728001.0591160.000000
31.2081001.0058620.000000

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "baa19893e35f4eecab9cc5e0c7ccb852", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload file pytorch_model.bin: 0%| | 1.00/1.03G [00:00 main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "To https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-te\n", " cf8016b..99b3b19 main -> main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Cloning https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-ta into local empty directory.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/ssd2/abhiroop/misc/nlp-with-transformers/env/lib/python3.8/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [858/858 01:33, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
10.4843000.2531470.765018
20.2271000.2267590.786340
30.1524000.2320350.803556

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "530b8eb5e9c448f2b9b69b0e473b4d4a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload file pytorch_model.bin: 0%| | 1.00/1.03G [00:00 main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "To https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-ta\n", " e0ad97a..3315405 main -> main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Cloning https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-en into local empty directory.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/ssd2/abhiroop/misc/nlp-with-transformers/env/lib/python3.8/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [192/192 00:21, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
10.9675000.4574140.593826
20.4302000.4075170.662712
30.3090000.3812270.700170

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5d286f769fa2452286503c255d881d57", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload file pytorch_model.bin: 0%| | 1.00/1.03G [00:00 main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "To https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-en\n", " 773c984..7b5883b main -> main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "corpora = [panx_hi_encoded] \n", "\n", "# exclude hindi from iteration\n", "for lang in langs[1:]:\n", " training_args.output_dir = f\"xlm-roberta-base-finetuned-panx-{lang}\"\n", " # fine-tune on monolingual corpus \n", " ds_encoded = encode_panx_dataset(panx_ch[lang])\n", " metrics = train_on_subset(ds_encoded, ds_encoded[\"train\"].num_rows)\n", " # collect F1-scores in common dict \n", " f1_scores[lang][lang] = metrics[\"f1_score\"][0]\n", " # add monolingual corpus to list of corpora to concatenate\n", " corpora.append(ds_encoded)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "tags": [] }, "outputs": [], "source": [ "corpora_encoded = concatenate_splits(corpora)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Cloning https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-all into local empty directory.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/ssd2/abhiroop/misc/nlp-with-transformers/env/lib/python3.8/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [1314/1314 02:23, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossF1
10.4864000.3817090.698385
20.2419000.3050410.765427
30.1606000.3023100.794586

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "33a0d5910a584c2b8ad9f591ea95fe24", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload file pytorch_model.bin: 0%| | 1.00/1.03G [00:00 main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "To https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-all\n", " ab4985f..e6d5947 main -> main\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "text/plain": [ "'https://huggingface.co/the-neural-networker/xlm-roberta-base-finetuned-panx-all/commit/ab4985f7081e59d04b156e81f515f2ed40a854fd'" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_args.logging_steps = len(corpora_encoded[\"train\"]) // batch_size \n", "training_args.output_dir = \"xlm-roberta-base-finetuned-panx-all\"\n", "\n", "trainer = Trainer(model_init=model_init, args=training_args,\n", " data_collator=data_collator, compute_metrics=compute_metrics,\n", " tokenizer=xlmr_tokenizer, train_dataset=corpora_encoded[\"train\"],\n", " eval_dataset=corpora_encoded[\"validation\"])\n", "\n", "trainer.train()\n", "trainer.push_to_hub(commit_message=\"Training completed!\")" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Evaluated onhitetaen
Fine-tune on
hi0.79840.52480.55170.5289
each0.79840.00000.80690.7154
all0.81760.60430.80000.7467
\n", "
" ], "text/plain": [ "Evaluated on hi te ta en\n", "Fine-tune on \n", "hi 0.7984 0.5248 0.5517 0.5289\n", "each 0.7984 0.0000 0.8069 0.7154\n", "all 0.8176 0.6043 0.8000 0.7467" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for idx, lang in enumerate(langs):\n", " f1_scores[\"all\"][lang] = get_f1_score(trainer, corpora[idx][\"test\"])\n", "scores_data = {\"hi\": f1_scores[\"hi\"],\n", " \"each\": {lang: f1_scores[lang][lang] for lang in langs},\n", " \"all\": f1_scores[\"all\"]}\n", "f1_scores_df = pd.DataFrame(scores_data).T.round(4)\n", "f1_scores_df.rename_axis(index=\"Fine-tune on\", columns=\"Evaluated on\", \n", " inplace=True)\n", "f1_scores_df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 4 }