{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-xlnet-convert",
   "metadata": {},
   "source": [
    "# Convert the `xlnet-large` TensorFlow checkpoint to Hugging Face PyTorch format\n",
    "\n",
    "Steps performed by this notebook:\n",
    "\n",
    "1. Inspect the unpacked TensorFlow checkpoint in `xlnet-large/`.\n",
    "2. Build an `XLNetTokenizer` from the SentencePiece model `sp10m.cased.v9.model` and save it to `./`.\n",
    "3. Write the architecture hyperparameters to `config.json`.\n",
    "4. Convert the TensorFlow checkpoint with `transformers-cli convert`.\n",
    "5. Load the converted weights as an `XLNetModel`, reload the tokenizer, and save the model to `./`.\n",
    "\n",
    "The commented-out shell commands (`tar`, `wget`, `transformers-cli convert`) are one-time setup steps; uncomment and run them if the files they produce are missing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "58d45708",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "from transformers import XLNetTokenizer, XLNetModel, XLNetConfig, AutoTokenizer, AutoModelWithLMHead, pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e0314358",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "model.ckpt-320000.data-00000-of-00001 model.ckpt-320000.meta\r\n",
      "model.ckpt-320000.index\r\n"
     ]
    }
   ],
   "source": [
    "# One-time setup: unpack the checkpoint archive, then remove the archive.\n",
    "# !tar -zxf xlnet-large-2021-09-06.tar.gz\n",
    "# !rm xlnet-large-2021-09-06.tar.gz\n",
    "!ls xlnet-large"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "59d2c8b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-time setup: download the SentencePiece vocab/model used by the tokenizer cell below.\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/malaya/master/pretrained-model/xlnet/tokenizer/sp10m.cased.v9.vocab\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/malaya/master/pretrained-model/xlnet/tokenizer/sp10m.cased.v9.model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f35e09f4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('./tokenizer_config.json',\n",
       " './special_tokens_map.json',\n",
       " './spiece.model',\n",
       " './added_tokens.json')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the tokenizer straight from the SentencePiece model (cased, so no lower-casing)\n",
    "# and write the tokenizer files into the current directory.\n",
    "tokenizer = XLNetTokenizer('sp10m.cased.v9.model', do_lower_case = False)\n",
    "tokenizer.save_pretrained('./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4438ff5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Architecture hyperparameters, written to config.json so that the\n",
    "# checkpoint-conversion command below can read them via --config.\n",
    "config_dict = {\n",
    "    \"d_head\": 64,\n",
    "    \"d_inner\": 4096,\n",
    "    \"d_model\": 1024,\n",
    "    \"ff_activation\": \"gelu\",\n",
    "    \"n_head\": 16,\n",
    "    \"n_layer\": 20,\n",
    "    \"n_token\": 32000,\n",
    "    \"untie_r\": True\n",
    "}\n",
    "\n",
    "with open('config.json', 'w') as fopen:\n",
    "    json.dump(config_dict, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a265f23c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-time setup: convert the TensorFlow checkpoint and dump the PyTorch output into ./\n",
    "# !transformers-cli convert --model_type xlnet \\\n",
    "# --tf_checkpoint xlnet-large/model.ckpt-320000 \\\n",
    "# --config config.json \\\n",
    "# --pytorch_dump_output ./"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "22b94055",
   "metadata": {},
   "outputs": [],
   "source": [
    "# XLNetConfig's first positional parameter is vocab_size, so passing a path to the\n",
    "# constructor does not read the file; from_json_file actually parses config.json.\n",
    "config = XLNetConfig.from_json_file('./config.json')\n",
    "\n",
    "# Pin the key hyperparameters explicitly so they match the values written above.\n",
    "config.vocab_size = 32000\n",
    "config.d_inner = 4096\n",
    "config.d_model = 1024\n",
    "config.n_head = 16\n",
    "config.n_layer = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "17c6d447",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at ./ were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']\n",
      "- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "# Load the converted weights from ./ into the base XLNetModel.\n",
    "model = XLNetModel.from_pretrained('./', config = config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d0fc0138",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the tokenizer from the files saved earlier in ./\n",
    "tokenizer = XLNetTokenizer.from_pretrained('./',do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ec2c0661",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the loaded XLNetModel (and its config) into the current directory.\n",
    "model.save_pretrained('./')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}