diff --git a/Indic-BERT-v1-master/.gitignore b/Indic-BERT-v1-master/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..df9efad4590305cbe4a014ed0c7b179bb337800c --- /dev/null +++ b/Indic-BERT-v1-master/.gitignore @@ -0,0 +1,116 @@ +# Initially taken from Github's Python gitignore file + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/Indic-BERT-v1-master/LICENSE b/Indic-BERT-v1-master/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..cd3a743cdfef11a1548dfa1e4a581816f10a3bc3 --- /dev/null +++ b/Indic-BERT-v1-master/LICENSE @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) 2020-present AI4Bharat + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/Indic-BERT-v1-master/albert/CONTRIBUTING.md b/Indic-BERT-v1-master/albert/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..9a86ba025d83dfc39db62d84ee7cac0a9da1ac08 --- /dev/null +++ b/Indic-BERT-v1-master/albert/CONTRIBUTING.md @@ -0,0 +1,28 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. 
+ +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. + +## Community Guidelines + +This project follows +[Google's Open Source Community Guidelines](https://opensource.google/conduct/). diff --git a/Indic-BERT-v1-master/albert/LICENSE b/Indic-BERT-v1-master/albert/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7 --- /dev/null +++ b/Indic-BERT-v1-master/albert/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Indic-BERT-v1-master/albert/README.md b/Indic-BERT-v1-master/albert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b1ba8db045c9d6c7856ae6aa6dc35ccb4d8bd1dc --- /dev/null +++ b/Indic-BERT-v1-master/albert/README.md @@ -0,0 +1,324 @@ +ALBERT +====== + +*************** Changes from Original Implementation *************** + +1. Remove sentence order in `run_pretraining.py` +2. Modify `_is_start_piece_sp` function in `create_pretraining_data.py` to account for non-English languages. + +***************New March 28, 2020 *************** + +Add a colab [tutorial](https://github.com/google-research/albert/blob/master/albert_glue_fine_tuning_tutorial.ipynb) to run fine-tuning for GLUE datasets. + +***************New January 7, 2020 *************** + +v2 TF-Hub models should be working now with TF 1.15, as we removed the +native Einsum op from the graph. See updated TF-Hub links below. 
+ +***************New December 30, 2019 *************** + +Chinese models are released. We would like to thank [CLUE team ](https://github.com/CLUEbenchmark/CLUE) for providing the training data. + +- [Base](https://storage.googleapis.com/albert_models/albert_base_zh.tar.gz) +- [Large](https://storage.googleapis.com/albert_models/albert_large_zh.tar.gz) +- [Xlarge](https://storage.googleapis.com/albert_models/albert_xlarge_zh.tar.gz) +- [Xxlarge](https://storage.googleapis.com/albert_models/albert_xxlarge_zh.tar.gz) + +Version 2 of ALBERT models is released. + +- Base: [[Tar file](https://storage.googleapis.com/albert_models/albert_base_v2.tar.gz)] [[TF-Hub](https://tfhub.dev/google/albert_base/3)] +- Large: [[Tar file](https://storage.googleapis.com/albert_models/albert_large_v2.tar.gz)] [[TF-Hub](https://tfhub.dev/google/albert_large/3)] +- Xlarge: [[Tar file](https://storage.googleapis.com/albert_models/albert_xlarge_v2.tar.gz)] [[TF-Hub](https://tfhub.dev/google/albert_xlarge/3)] +- Xxlarge: [[Tar file](https://storage.googleapis.com/albert_models/albert_xxlarge_v2.tar.gz)] [[TF-Hub](https://tfhub.dev/google/albert_xxlarge/3)] + +In this version, we apply 'no dropout', 'additional training data' and 'long training time' strategies to all models. We train ALBERT-base for 10M steps and other models for 3M steps. + +The result comparison to the v1 models is as followings: + +| | Average | SQuAD1.1 | SQuAD2.0 | MNLI | SST-2 | RACE | +|----------------|----------|----------|----------|----------|----------|----------| +|V2 | +|ALBERT-base |82.3 |90.2/83.2 |82.1/79.3 |84.6 |92.9 |66.8 | +|ALBERT-large |85.7 |91.8/85.2 |84.9/81.8 |86.5 |94.9 |75.2 | +|ALBERT-xlarge |87.9 |92.9/86.4 |87.9/84.1 |87.9 |95.4 |80.7 | +|ALBERT-xxlarge |90.9 |94.6/89.1 |89.8/86.9 |90.6 |96.8 |86.8 | +|V1 | +|ALBERT-base |80.1 |89.3/82.3 | 80.0/77.1|81.6 |90.3 | 64.0 | +|ALBERT-large |82.4 |90.6/83.9 | 82.3/79.4|83.5 |91.7 | 68.5 | +|ALBERT-xlarge |85.5 |92.5/86.1 | 86.1/83.1|86.4 |92.4 | 74.8 | +|ALBERT-xxlarge |91.0 |94.8/89.3 | 90.2/87.4|90.8 |96.9 | 86.5 | + +The comparison shows that for ALBERT-base, ALBERT-large, and ALBERT-xlarge, v2 is much better than v1, indicating the importance of applying the above three strategies. On average, ALBERT-xxlarge is slightly worse than the v1, because of the following two reasons: 1) Training additional 1.5 M steps (the only difference between these two models is training for 1.5M steps and 3M steps) did not lead to significant performance improvement. 2) For v1, we did a little bit hyperparameter search among the parameters sets given by BERT, Roberta, and XLnet. For v2, we simply adopt the parameters from v1 except for RACE, where we use a learning rate of 1e-5 and 0 [ALBERT DR](https://arxiv.org/pdf/1909.11942.pdf) (dropout rate for ALBERT in finetuning). The original (v1) RACE hyperparameter will cause model divergence for v2 models. Given that the downstream tasks are sensitive to the fine-tuning hyperparameters, we should be careful about so called slight improvements. + +ALBERT is "A Lite" version of BERT, a popular unsupervised language +representation learning algorithm. ALBERT uses parameter-reduction techniques +that allow for large-scale configurations, overcome previous memory limitations, +and achieve better behavior with respect to model degradation. 
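To make the parameter-reduction idea concrete, here is a rough, illustrative sketch (not taken from this repository) of the factorized embedding parameterization described in the paper, assuming the 30k SentencePiece vocabulary and the xxlarge hidden size; cross-layer parameter sharing further shrinks the model by reusing one set of transformer-layer weights across all layers.

```
# Illustrative parameter-count comparison for the embedding table only.
# Numbers assume a 30k vocabulary, the xxlarge hidden size (4096) and the
# factorized embedding size (128) reported in the ALBERT paper.
vocab_size = 30000
hidden_size = 4096
embedding_size = 128

untied = vocab_size * hidden_size                                        # V x H (BERT-style)
factorized = vocab_size * embedding_size + embedding_size * hidden_size  # V x E + E x H (ALBERT)

print("untied embedding params:     %d" % untied)      # 122,880,000
print("factorized embedding params: %d" % factorized)  # 4,364,288
```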
+ +For a technical description of the algorithm, see our paper: + +[ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) + +Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut + +Release Notes +============= + +- Initial release: 10/9/2019 + +Results +======= + +Performance of ALBERT on GLUE benchmark results using a single-model setup on +dev: + +| Models | MNLI | QNLI | QQP | RTE | SST | MRPC | CoLA | STS | +|-------------------|----------|----------|----------|----------|----------|----------|----------|----------| +| BERT-large | 86.6 | 92.3 | 91.3 | 70.4 | 93.2 | 88.0 | 60.6 | 90.0 | +| XLNet-large | 89.8 | 93.9 | 91.8 | 83.8 | 95.6 | 89.2 | 63.6 | 91.8 | +| RoBERTa-large | 90.2 | 94.7 | **92.2** | 86.6 | 96.4 | **90.9** | 68.0 | 92.4 | +| ALBERT (1M) | 90.4 | 95.2 | 92.0 | 88.1 | 96.8 | 90.2 | 68.7 | 92.7 | +| ALBERT (1.5M) | **90.8** | **95.3** | **92.2** | **89.2** | **96.9** | **90.9** | **71.4** | **93.0** | + +Performance of ALBERT-xxl on SQuaD and RACE benchmarks using a single-model +setup: + +|Models | SQuAD1.1 dev | SQuAD2.0 dev | SQuAD2.0 test | RACE test (Middle/High) | +|--------------------------|---------------|---------------|---------------|-------------------------| +|BERT-large | 90.9/84.1 | 81.8/79.0 | 89.1/86.3 | 72.0 (76.6/70.1) | +|XLNet | 94.5/89.0 | 88.8/86.1 | 89.1/86.3 | 81.8 (85.5/80.2) | +|RoBERTa | 94.6/88.9 | 89.4/86.5 | 89.8/86.8 | 83.2 (86.5/81.3) | +|UPM | - | - | 89.9/87.2 | - | +|XLNet + SG-Net Verifier++ | - | - | 90.1/87.2 | - | +|ALBERT (1M) | 94.8/89.2 | 89.9/87.2 | - | 86.0 (88.2/85.1) | +|ALBERT (1.5M) | **94.8/89.3** | **90.2/87.4** | **90.9/88.1** | **86.5 (89.0/85.5)** | + + +Pre-trained Models +================== +TF-Hub modules are available: + +- Base: [[Tar file](https://storage.googleapis.com/albert_models/albert_base_v1.tar.gz)] [[TF-Hub](https://tfhub.dev/google/albert_base/1)] +- Large: [[Tar file](https://storage.googleapis.com/albert_models/albert_large_v1.tar.gz)] [[TF-Hub](https://tfhub.dev/google/albert_large/1)] +- Xlarge: [[Tar file](https://storage.googleapis.com/albert_models/albert_xlarge_v1.tar.gz)] [[TF-Hub](https://tfhub.dev/google/albert_xlarge/1)] +- Xxlarge: [[Tar file](https://storage.googleapis.com/albert_models/albert_xxlarge_v1.tar.gz)] [[TF-Hub](https://tfhub.dev/google/albert_xxlarge/1)] + +Example usage of the TF-Hub module in code: + +``` +tags = set() +if is_training: + tags.add("train") +albert_module = hub.Module("https://tfhub.dev/google/albert_base/1", tags=tags, + trainable=True) +albert_inputs = dict( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids) +albert_outputs = albert_module( + inputs=albert_inputs, + signature="tokens", + as_dict=True) + +# If you want to use the token-level output, use +# albert_outputs["sequence_output"] instead. +output_layer = albert_outputs["pooled_output"] +``` + +Most of the fine-tuning scripts in this repository support TF-hub modules +via the `--albert_hub_module_handle` flag. + +Pre-training Instructions +========================= +To pretrain ALBERT, use `run_pretraining.py`: + +``` +pip install -r albert/requirements.txt +python -m albert.run_pretraining \ + --input_file=... \ + --output_dir=... \ + --init_checkpoint=... \ + --albert_config_file=... 
\ + --do_train \ + --do_eval \ + --train_batch_size=4096 \ + --eval_batch_size=64 \ + --max_seq_length=512 \ + --max_predictions_per_seq=20 \ + --optimizer='lamb' \ + --learning_rate=.00176 \ + --num_train_steps=125000 \ + --num_warmup_steps=3125 \ + --save_checkpoints_steps=5000 +``` + +Fine-tuning on GLUE +=================== +To fine-tune and evaluate a pretrained ALBERT on GLUE, please see the +convenience script `run_glue.sh`. + +Lower-level use cases may want to use the `run_classifier.py` script directly. +The `run_classifier.py` script is used both for fine-tuning and evaluation of +ALBERT on individual GLUE benchmark tasks, such as MNLI: + +``` +pip install -r albert/requirements.txt +python -m albert.run_classifier \ + --data_dir=... \ + --output_dir=... \ + --init_checkpoint=... \ + --albert_config_file=... \ + --spm_model_file=... \ + --do_train \ + --do_eval \ + --do_predict \ + --do_lower_case \ + --max_seq_length=128 \ + --optimizer=adamw \ + --task_name=MNLI \ + --warmup_step=1000 \ + --learning_rate=3e-5 \ + --train_step=10000 \ + --save_checkpoints_steps=100 \ + --train_batch_size=128 +``` + +Good default flag values for each GLUE task can be found in `run_glue.sh`. + +You can fine-tune the model starting from TF-Hub modules instead of raw +checkpoints by setting e.g. +`--albert_hub_module_handle=https://tfhub.dev/google/albert_base/1` instead +of `--init_checkpoint`. + +You can find the spm_model_file in the tar files or under the assets folder of +the tf-hub module. The name of the model file is "30k-clean.model". + +After evaluation, the script should report some output like this: + +``` +***** Eval results ***** + global_step = ... + loss = ... + masked_lm_accuracy = ... + masked_lm_loss = ... + sentence_order_accuracy = ... + sentence_order_loss = ... +``` + +Fine-tuning on SQuAD +==================== +To fine-tune and evaluate a pretrained model on SQuAD v1, use the +`run_squad_v1.py` script: + +``` +pip install -r albert/requirements.txt +python -m albert.run_squad_v1 \ + --albert_config_file=... \ + --output_dir=... \ + --train_file=... \ + --predict_file=... \ + --train_feature_file=... \ + --predict_feature_file=... \ + --predict_feature_left_file=... \ + --init_checkpoint=... \ + --spm_model_file=... \ + --do_lower_case \ + --max_seq_length=384 \ + --doc_stride=128 \ + --max_query_length=64 \ + --do_train=true \ + --do_predict=true \ + --train_batch_size=48 \ + --predict_batch_size=8 \ + --learning_rate=5e-5 \ + --num_train_epochs=2.0 \ + --warmup_proportion=.1 \ + --save_checkpoints_steps=5000 \ + --n_best_size=20 \ + --max_answer_length=30 +``` + +You can fine-tune the model starting from TF-Hub modules instead of raw +checkpoints by setting e.g. +`--albert_hub_module_handle=https://tfhub.dev/google/albert_base/1` instead +of `--init_checkpoint`. + +For SQuAD v2, use the `run_squad_v2.py` script: + +``` +pip install -r albert/requirements.txt +python -m albert.run_squad_v2 \ + --albert_config_file=... \ + --output_dir=... \ + --train_file=... \ + --predict_file=... \ + --train_feature_file=... \ + --predict_feature_file=... \ + --predict_feature_left_file=... \ + --init_checkpoint=... \ + --spm_model_file=... 
\ + --do_lower_case \ + --max_seq_length=384 \ + --doc_stride=128 \ + --max_query_length=64 \ + --do_train \ + --do_predict \ + --train_batch_size=48 \ + --predict_batch_size=8 \ + --learning_rate=5e-5 \ + --num_train_epochs=2.0 \ + --warmup_proportion=.1 \ + --save_checkpoints_steps=5000 \ + --n_best_size=20 \ + --max_answer_length=30 +``` + +You can fine-tune the model starting from TF-Hub modules instead of raw +checkpoints by setting e.g. +`--albert_hub_module_handle=https://tfhub.dev/google/albert_base/1` instead +of `--init_checkpoint`. + +Fine-tuning on RACE +=================== +For RACE, use the `run_race.py` script: + +``` +pip install -r albert/requirements.txt +python -m albert.run_race \ + --albert_config_file=... \ + --output_dir=... \ + --train_file=... \ + --eval_file=... \ + --data_dir=...\ + --init_checkpoint=... \ + --spm_model_file=... \ + --max_seq_length=512 \ + --max_qa_length=128 \ + --do_train \ + --do_eval \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --learning_rate=1e-5 \ + --train_step=12000 \ + --warmup_step=1000 \ + --save_checkpoints_steps=100 +``` + +You can fine-tune the model starting from TF-Hub modules instead of raw +checkpoints by setting e.g. +`--albert_hub_module_handle=https://tfhub.dev/google/albert_base/1` instead +of `--init_checkpoint`. + +SentencePiece +============= +Command for generating the sentence piece vocabulary: + +``` +spm_train \ +--input all.txt --model_prefix=30k-clean --vocab_size=30000 --logtostderr +--pad_id=0 --unk_id=1 --eos_id=-1 --bos_id=-1 +--control_symbols=[CLS],[SEP],[MASK] +--user_defined_symbols="(,),\",-,.,–,£,€" +--shuffle_input_sentence=true --input_sentence_size=10000000 +--character_coverage=0.99995 --model_type=unigram +``` diff --git a/Indic-BERT-v1-master/albert/__init__.py b/Indic-BERT-v1-master/albert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4c98c2e3d2a6d3b90e78fc67f889daff18310444 --- /dev/null +++ b/Indic-BERT-v1-master/albert/__init__.py @@ -0,0 +1,14 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
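As a quick, illustrative sanity check (not part of the original repository) for the vocabulary produced by the `spm_train` command in the README above, the `sentencepiece` Python package can load the resulting model; the `30k-clean.model` path below is assumed to sit in the working directory.

```
import sentencepiece as spm

# Load the model written by spm_train (path assumed for illustration).
sp = spm.SentencePieceProcessor()
sp.Load("30k-clean.model")

print(sp.GetPieceSize())                    # vocabulary size, expected ~30000
print(sp.EncodeAsPieces("this is a test"))  # SentencePiece subword pieces
print(sp.PieceToId("[CLS]"), sp.PieceToId("[SEP]"), sp.PieceToId("[MASK]"))
```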
diff --git a/Indic-BERT-v1-master/albert/albert_glue_fine_tuning_tutorial.ipynb b/Indic-BERT-v1-master/albert/albert_glue_fine_tuning_tutorial.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..70c566a1e6d1059c28fb9e26fb899f705baff299 --- /dev/null +++ b/Indic-BERT-v1-master/albert/albert_glue_fine_tuning_tutorial.ipynb @@ -0,0 +1,303 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "albert_glue_fine_tuning_tutorial", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "y8SJfpgTccDB", + "colab_type": "text" + }, + "source": [ + "\n", + "\"Open" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "wHQH4OCHZ9bq", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "# @title Copyright 2020 The ALBERT Authors. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rkTLZ3I4_7c_", + "colab_type": "text" + }, + "source": [ + "# ALBERT End to End (Fine-tuning + Predicting) with Cloud TPU" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1wtjs1QDb3DX", + "colab_type": "text" + }, + "source": [ + "## Overview\n", + "\n", + "ALBERT is \"A Lite\" version of BERT, a popular unsupervised language representation learning algorithm. ALBERT uses parameter-reduction techniques that allow for large-scale configurations, overcome previous memory limitations, and achieve better behavior with respect to model degradation.\n", + "\n", + "For a technical description of the algorithm, see our paper:\n", + "\n", + "https://arxiv.org/abs/1909.11942\n", + "\n", + "Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut\n", + "\n", + "This Colab demonstates using a free Colab Cloud TPU to fine-tune GLUE tasks built on top of pretrained ALBERT models and \n", + "run predictions on tuned model. The colab demonsrates loading pretrained ALBERT models from both [TF Hub](https://www.tensorflow.org/hub) and checkpoints.\n", + "\n", + "**Note:** You will need a GCP (Google Compute Engine) account and a GCS (Google Cloud \n", + "Storage) bucket for this Colab to run.\n", + "\n", + "Please follow the [Google Cloud TPU quickstart](https://cloud.google.com/tpu/docs/quickstart) for how to create GCP account and GCS bucket. You have [$300 free credit](https://cloud.google.com/free/) to get started with any GCP product. You can learn more about Cloud TPU at https://cloud.google.com/tpu/docs.\n", + "\n", + "This notebook is hosted on GitHub. 
To view it in its original repository, after opening the notebook, select **File > View on GitHub**." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ld-JXlueIuPH", + "colab_type": "text" + }, + "source": [ + "## Instructions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "POkof5uHaQ_c", + "colab_type": "text" + }, + "source": [ + "

### Train on TPU
\n", + "\n", + " 1. Create a Cloud Storage bucket for your TensorBoard logs at http://console.cloud.google.com/storage and fill in the BUCKET parameter in the \"Parameters\" section below.\n", + " \n", + " 1. On the main menu, click Runtime and select **Change runtime type**. Set \"TPU\" as the hardware accelerator.\n", + " 1. Click Runtime again and select **Runtime > Run All** (Watch out: the \"Colab-only auth for this notebook and the TPU\" cell requires user input). You can also run the cells manually with Shift-ENTER." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UdMmwCJFaT8F", + "colab_type": "text" + }, + "source": [ + "### Set up your TPU environment\n", + "\n", + "In this section, you perform the following tasks:\n", + "\n", + "* Set up a Colab TPU running environment\n", + "* Verify that you are connected to a TPU device\n", + "* Upload your credentials to TPU to access your GCS bucket." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "191zq3ZErihP", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# TODO(lanzhzh): Add support for 2.x.\n", + "%tensorflow_version 1.x\n", + "import os\n", + "import pprint\n", + "import json\n", + "import tensorflow as tf\n", + "\n", + "assert \"COLAB_TPU_ADDR\" in os.environ, \"ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!\"\n", + "TPU_ADDRESS = \"grpc://\" + os.environ[\"COLAB_TPU_ADDR\"] \n", + "TPU_TOPOLOGY = \"2x2\"\n", + "print(\"TPU address is\", TPU_ADDRESS)\n", + "\n", + "from google.colab import auth\n", + "auth.authenticate_user()\n", + "with tf.Session(TPU_ADDRESS) as session:\n", + " print('TPU devices:')\n", + " pprint.pprint(session.list_devices())\n", + "\n", + " # Upload credentials to TPU.\n", + " with open('/content/adc.json', 'r') as f:\n", + " auth_info = json.load(f)\n", + " tf.contrib.cloud.configure_gcs(session, credentials=auth_info)\n", + " # Now credentials are set for all future sessions on this TPU." + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HUBP35oCDmbF", + "colab_type": "text" + }, + "source": [ + "### Prepare and import ALBERT modules\n", + "​\n", + "With your environment configured, you can now prepare and import the ALBERT modules. The following step clones the source code from GitHub." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "7wzwke0sxS6W", + "colab_type": "code", + "colab": {}, + "cellView": "code" + }, + "source": [ + "#TODO(lanzhzh): Add pip support\n", + "import sys\n", + "\n", + "!test -d albert || git clone https://github.com/google-research/albert albert\n", + "if not 'albert' in sys.path:\n", + " sys.path += ['albert']\n", + " \n", + "!pip install sentencepiece\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RRu1aKO1D7-Z", + "colab_type": "text" + }, + "source": [ + "### Prepare for training\n", + "\n", + "This next section of code performs the following tasks:\n", + "\n", + "* Specify GS bucket, create output directory for model checkpoints and eval results.\n", + "* Specify task and download training data.\n", + "* Specify ALBERT pretrained model\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "tYkaAlJNfhul", + "colab_type": "code", + "colab": {}, + "cellView": "form" + }, + "source": [ + "# Please find the full list of tasks and their fintuning hyperparameters\n", + "# here https://github.com/google-research/albert/blob/master/run_glue.sh\n", + "\n", + "BUCKET = \"albert_tutorial_glue\" #@param { type: \"string\" }\n", + "TASK = 'MRPC' #@param {type:\"string\"}\n", + "# Available pretrained model checkpoints:\n", + "# base, large, xlarge, xxlarge\n", + "ALBERT_MODEL = 'base' #@param {type:\"string\"}\n", + "\n", + "TASK_DATA_DIR = 'glue_data'\n", + "\n", + "BASE_DIR = \"gs://\" + BUCKET\n", + "if not BASE_DIR or BASE_DIR == \"gs://\":\n", + " raise ValueError(\"You must enter a BUCKET.\")\n", + "DATA_DIR = os.path.join(BASE_DIR, \"data\")\n", + "MODELS_DIR = os.path.join(BASE_DIR, \"models\")\n", + "OUTPUT_DIR = 'gs://{}/albert-tfhub/models/{}'.format(BUCKET, TASK)\n", + "tf.gfile.MakeDirs(OUTPUT_DIR)\n", + "print('***** Model output directory: {} *****'.format(OUTPUT_DIR))\n", + "\n", + "# Download glue data.\n", + "! test -d download_glue_repo || git clone https://gist.github.com/60c2bdb54d156a41194446737ce03e2e.git download_glue_repo\n", + "!python download_glue_repo/download_glue_data.py --data_dir=$TASK_DATA_DIR --tasks=$TASK\n", + "print('***** Task data directory: {} *****'.format(TASK_DATA_DIR))\n", + "\n", + "ALBERT_MODEL_HUB = 'https://tfhub.dev/google/albert_' + ALBERT_MODEL + '/3'" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Hcpfl4N2EdOk", + "colab_type": "text" + }, + "source": [ + "Now let's run the fine-tuning scripts. If you use the default MRPC task, this should be finished in around 10 mintues and you will get an accuracy of around 86.5." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "o8qXPxv8-kBO", + "colab_type": "code", + "colab": {} + }, + "source": [ + "os.environ['TFHUB_CACHE_DIR'] = OUTPUT_DIR\n", + "!python -m albert.run_classifier \\\n", + " --data_dir=\"glue_data/\" \\\n", + " --output_dir=$OUTPUT_DIR \\\n", + " --albert_hub_module_handle=$ALBERT_MODEL_HUB \\\n", + " --spm_model_file=\"from_tf_hub\" \\\n", + " --do_train=True \\\n", + " --do_eval=True \\\n", + " --do_predict=False \\\n", + " --max_seq_length=512 \\\n", + " --optimizer=adamw \\\n", + " --task_name=$TASK \\\n", + " --warmup_step=200 \\\n", + " --learning_rate=2e-5 \\\n", + " --train_step=800 \\\n", + " --save_checkpoints_steps=100 \\\n", + " --train_batch_size=32 \\\n", + " --tpu_name=$TPU_ADDRESS \\\n", + " --use_tpu=True" + ], + "execution_count": 0, + "outputs": [] + } + ] +} diff --git a/Indic-BERT-v1-master/albert/classifier_utils.py b/Indic-BERT-v1-master/albert/classifier_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9157018f36ed81c8358ad82639c24b09a06576df --- /dev/null +++ b/Indic-BERT-v1-master/albert/classifier_utils.py @@ -0,0 +1,1037 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility functions for GLUE classification tasks.""" + +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function +import collections +import csv +import os +from albert import fine_tuning_utils +from albert import modeling +from albert import optimization +from albert import tokenization +import tensorflow.compat.v1 as tf +from tensorflow.contrib import data as contrib_data +from tensorflow.contrib import metrics as contrib_metrics +from tensorflow.contrib import tpu as contrib_tpu + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the num input examples is a multiple of the batch size. + + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. 
+ + We use this class instead of `None` because treating `None` as padding + battches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + guid=None, + example_id=None, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.example_id = example_id + self.guid = guid + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def __init__(self, use_spm, do_lower_case): + super(DataProcessor, self).__init__() + self.use_spm = use_spm + self.do_lower_case = do_lower_case + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + def process_text(self, text): + if self.use_spm: + return tokenization.preprocess_text(text, lower=self.do_lower_case) + else: + return tokenization.convert_to_unicode(text) + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "MNLI", "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "MNLI", "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "MNLI", "test_matched.tsv")), + "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + # Note(mingdachen): We will rely on this guid for GLUE submission. 
+ guid = self.process_text(line[0]) + text_a = self.process_text(line[8]) + text_b = self.process_text(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = self.process_text(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MisMnliProcessor(MnliProcessor): + """Processor for the Mismatched MultiNLI data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "MNLI", "dev_mismatched.tsv")), + "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "MNLI", "test_mismatched.tsv")), + "test") + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "MRPC", "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "MRPC", "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "MRPC", "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = self.process_text(line[3]) + text_b = self.process_text(line[4]) + if set_type == "test": + guid = line[0] + label = "0" + else: + label = self.process_text(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "CoLA", "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "CoLA", "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "CoLA", "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + guid = line[0] + text_a = self.process_text(line[1]) + label = "0" + else: + text_a = self.process_text(line[3]) + label = self.process_text(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "SST-2", "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return 
self._create_examples( + self._read_tsv(os.path.join(data_dir, "SST-2", "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "SST-2", "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + if set_type != "test": + guid = "%s-%s" % (set_type, i) + text_a = self.process_text(line[0]) + label = self.process_text(line[1]) + else: + guid = self.process_text(line[0]) + # guid = "%s-%s" % (set_type, line[0]) + text_a = self.process_text(line[1]) + label = "0" + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "STS-B", "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "STS-B", "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "STS-B", "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = self.process_text(line[0]) + # guid = "%s-%s" % (set_type, line[0]) + text_a = self.process_text(line[7]) + text_b = self.process_text(line[8]) + if set_type != "test": + label = float(line[-1]) + else: + label = 0 + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "QQP", "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "QQP", "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "QQP", "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = line[0] + # guid = "%s-%s" % (set_type, line[0]) + if set_type != "test": + try: + text_a = self.process_text(line[3]) + text_b = self.process_text(line[4]) + label = self.process_text(line[5]) + except IndexError: + continue + else: + text_a = self.process_text(line[1]) + text_b = self.process_text(line[2]) + label = "0" + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return 
self._create_examples( + self._read_tsv(os.path.join(data_dir, "QNLI", "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "QNLI", "dev.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "QNLI", "test.tsv")), + "test_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = self.process_text(line[0]) + # guid = "%s-%s" % (set_type, line[0]) + text_a = self.process_text(line[1]) + text_b = self.process_text(line[2]) + if set_type == "test_matched": + label = "entailment" + else: + label = self.process_text(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "RTE", "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "RTE", "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "RTE", "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = self.process_text(line[0]) + # guid = "%s-%s" % (set_type, line[0]) + text_a = self.process_text(line[1]) + text_b = self.process_text(line[2]) + if set_type == "test": + label = "entailment" + else: + label = self.process_text(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "WNLI", "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "WNLI", "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "WNLI", "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = self.process_text(line[0]) + # guid = "%s-%s" % (set_type, line[0]) + text_a = self.process_text(line[1]) + text_b = self.process_text(line[2]) + if set_type != "test": + label = self.process_text(line[-1]) + else: + label = "0" + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class AXProcessor(DataProcessor): + 
"""Processor for the AX data set (GLUE version).""" + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "diagnostic", "diagnostic.tsv")), + "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + # Note(mingdachen): We will rely on this guid for GLUE submission. + guid = self.process_text(line[0]) + text_a = self.process_text(line[1]) + text_b = self.process_text(line[2]) + if set_type == "test": + label = "contradiction" + else: + label = self.process_text(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer, task_name): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + if task_name != "sts-b": + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in ALBERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if task_name != "sts-b": + label_id = label_map[example.label] + else: + label_id = example.label + + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file, task_name): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer, task_name) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + def create_float_feature(values): + f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_float_feature([feature.label_id])\ + if task_name == "sts-b" else create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder, task_name, use_tpu, bsz, + multiple=1): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + labeltype = tf.float32 if task_name == "sts-b" else tf.int64 + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length * multiple], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length * multiple], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length * multiple], tf.int64), + "label_ids": tf.FixedLenFeature([], labeltype), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
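As a rough standalone illustration of the record layout written above and parsed back by the input_fn, the snippet below round-trips one toy example through tf.train.Example and tf.parse_single_example; the sequence length of 8 and the label value are arbitrary stand-ins.

import collections
import tensorflow.compat.v1 as tf

seq_length = 8  # toy value; real runs use a much larger length such as 128 or 512

def int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

features = collections.OrderedDict()
features["input_ids"] = int_feature(range(seq_length))
features["input_mask"] = int_feature([1] * seq_length)
features["segment_ids"] = int_feature([0] * seq_length)
features["label_ids"] = int_feature([1])          # int64 for classification tasks
features["is_real_example"] = int_feature([1])
serialized = tf.train.Example(
    features=tf.train.Features(feature=features)).SerializeToString()

name_to_features = {
    "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
    "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
    "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
    "label_ids": tf.FixedLenFeature([], tf.int64),
    "is_real_example": tf.FixedLenFeature([], tf.int64),
}
parsed = tf.parse_single_example(serialized, name_to_features)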
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + if use_tpu: + batch_size = params["batch_size"] + else: + batch_size = bsz + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + contrib_data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(albert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings, task_name, + hub_module): + """Creates a classification model.""" + (output_layer, _) = fine_tuning_utils.create_albert( + albert_config=albert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + use_einsum=True, + hub_module=hub_module) + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + if task_name != "sts-b": + probabilities = tf.nn.softmax(logits, axis=-1) + predictions = tf.argmax(probabilities, axis=-1, output_type=tf.int32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + else: + probabilities = logits + logits = tf.squeeze(logits, [-1]) + predictions = logits + per_example_loss = tf.square(logits - labels) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, probabilities, logits, predictions) + + +def model_fn_builder(albert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings, task_name, hub_module=None, + optimizer="adamw"): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] 
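A quick usage sketch for the _truncate_seq_pair helper defined above, with made-up token lists; it trims the longer side one token at a time until the pair fits.

tokens_a = ["a1", "a2", "a3", "a4", "a5", "a6"]
tokens_b = ["b1", "b2", "b3"]
_truncate_seq_pair(tokens_a, tokens_b, max_length=7)
# tokens_a -> ["a1", "a2", "a3", "a4"]   (two tokens popped from the longer side)
# tokens_b -> ["b1", "b2", "b3"]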
+ input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, probabilities, logits, predictions) = \ + create_model(albert_config, is_training, input_ids, input_mask, + segment_ids, label_ids, num_labels, use_one_hot_embeddings, + task_name, hub_module) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, + use_tpu, optimizer) + + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + if task_name not in ["sts-b", "cola"]: + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, + weights=is_real_example) + loss = tf.metrics.mean( + values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + elif task_name == "sts-b": + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + """Compute Pearson correlations for STS-B.""" + # Display labels and predictions + concat1 = contrib_metrics.streaming_concat(logits) + concat2 = contrib_metrics.streaming_concat(label_ids) + + # Compute Pearson correlation + pearson = contrib_metrics.streaming_pearson_correlation( + logits, label_ids, weights=is_real_example) + + # Compute MSE + # mse = tf.metrics.mean(per_example_loss) + mse = tf.metrics.mean_squared_error( + label_ids, logits, weights=is_real_example) + + loss = tf.metrics.mean( + values=per_example_loss, + weights=is_real_example) + + return {"pred": concat1, "label_ids": concat2, "pearson": pearson, + "MSE": mse, "eval_loss": loss,} + elif task_name == "cola": + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + """Compute Matthew's correlations for STS-B.""" + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + # https://en.wikipedia.org/wiki/Matthews_correlation_coefficient + tp, tp_op = tf.metrics.true_positives( + predictions, label_ids, weights=is_real_example) + tn, tn_op = tf.metrics.true_negatives( + predictions, label_ids, weights=is_real_example) + fp, fp_op = tf.metrics.false_positives( + predictions, label_ids, weights=is_real_example) + fn, fn_op = tf.metrics.false_negatives( + predictions, label_ids, weights=is_real_example) + + # 
Compute Matthew's correlation + mcc = tf.div_no_nan( + tp * tn - fp * fn, + tf.pow((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn), 0.5)) + + # Compute accuracy + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, + weights=is_real_example) + + loss = tf.metrics.mean( + values=per_example_loss, + weights=is_real_example) + + return {"matthew_corr": (mcc, tf.group(tp_op, tn_op, fp_op, fn_op)), + "eval_accuracy": accuracy, "eval_loss": loss,} + + eval_metrics = (metric_fn, + [per_example_loss, label_ids, logits, is_real_example]) + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, + predictions={ + "probabilities": probabilities, + "predictions": predictions + }, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, task_name): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer, task_name) + + features.append(feature) + return features diff --git a/Indic-BERT-v1-master/albert/create_pretraining_data.py b/Indic-BERT-v1-master/albert/create_pretraining_data.py new file mode 100644 index 0000000000000000000000000000000000000000..c813d87092eb55100d21b044243bf790aa603130 --- /dev/null +++ b/Indic-BERT-v1-master/albert/create_pretraining_data.py @@ -0,0 +1,654 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
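The streaming metric above implements the standard Matthews correlation coefficient; a plain-Python version of the same formula, with matthews_corr as an illustrative helper and made-up confusion counts, looks like this.

import math

def matthews_corr(tp, tn, fp, fn):
    denom = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return 0.0 if denom == 0 else (tp * tn - fp * fn) / denom

print(matthews_corr(tp=40, tn=30, fp=10, fn=20))  # ~0.408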
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python2, python3 +# coding=utf-8 +"""Create masked LM/next sentence masked_lm TF examples for ALBERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import collections +import random +from albert import tokenization +import numpy as np +import six +from six.moves import range +from six.moves import zip +import tensorflow.compat.v1 as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string( + "vocab_file", None, + "The vocabulary file that the ALBERT model was trained on.") + +flags.DEFINE_string("spm_model_file", None, + "The model file for sentence piece tokenization.") + +flags.DEFINE_string("input_file_mode", "r", + "The data format of the input file.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", True, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_bool( + "do_permutation", False, + "Whether to do the permutation training.") + +flags.DEFINE_bool( + "favor_shorter_ngram", True, + "Whether to set higher probabilities for sampling shorter ngrams.") + +flags.DEFINE_bool( + "random_next_sentence", False, + "Whether to use the sentence that's right before the current sentence " + "as the negative sample for next sentence prection, rather than using " + "sentences from other random documents.") + +flags.DEFINE_integer("max_seq_length", 512, "Maximum sequence length.") + +flags.DEFINE_integer("ngram", 3, "Maximum number of ngrams to mask.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 40, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next, token_boundary): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.token_boundary = token_boundary + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "token_boundary: %s\n" % (" ".join( + [str(x) for x in self.token_boundary])) + s += "is_random_next: 
%s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + token_boundary = list(instance.token_boundary) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + token_boundary.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + multiplier = 1 + int(FLAGS.do_permutation) + while len(masked_lm_positions) < max_predictions_per_seq * multiplier: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + sentence_order_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["token_boundary"] = create_int_feature(token_boundary) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + # Note: We keep this feature name `next_sentence_labels` to be compatible + # with the original data created by lanzhzh@. However, in the ALBERT case + # it does contain sentence_order_label. 
+ features["next_sentence_labels"] = create_int_feature( + [sentence_order_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + for input_file in input_files: + with tf.gfile.GFile(input_file, FLAGS.input_file_mode) as reader: + while True: + line = reader.readline() + if not FLAGS.spm_model_file: + line = tokenization.convert_to_unicode(line) + if not line: + break + if FLAGS.spm_model_file: + line = tokenization.preprocess_text(line, lower=FLAGS.do_lower_case) + else: + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. 
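A small sketch of the expected input format (one sentence per line, blank line between documents) and how it is split into documents before tokenization; the whitespace split stands in for tokenizer.tokenize.

raw = ("First sentence of document one.\n"
       "Second sentence of document one.\n"
       "\n"
       "Only sentence of document two.\n")
documents, current = [], []
for line in raw.splitlines():
    line = line.strip()
    if not line:                  # blank line marks a document boundary
        if current:
            documents.append(current)
            current = []
        continue
    current.append(line.split())  # stand-in for tokenizer.tokenize(line)
if current:
    documents.append(current)
# len(documents) == 2; documents[0] holds two tokenized sentences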
+ # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or \ + (FLAGS.random_next_sentence and rng.random() < 0.5): + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 
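Sketch of the target-length sampling described above, with the default 512 maximum length and a short_seq_prob of 0.1, so roughly one instance in ten gets a shorter random target.

import random

rng = random.Random(12345)
max_num_tokens = 512 - 3            # leave room for [CLS], [SEP], [SEP]
target_seq_length = max_num_tokens
if rng.random() < 0.1:              # short_seq_prob
    target_seq_length = rng.randint(2, max_num_tokens)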
+ num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + elif not FLAGS.random_next_sentence and rng.random() < 0.5: + is_random_next = True + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + # Note(mingdachen): in this case, we just swap tokens_a and tokens_b + tokens_a, tokens_b = tokens_b, tokens_a + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels, token_boundary) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + token_boundary=token_boundary, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def _is_start_piece_sp(piece): + """Check if the current word piece is the starting piece (sentence piece).""" + special_pieces = set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~₹')) + special_pieces.add(u"€".encode("utf-8")) + special_pieces.add(u"£".encode("utf-8")) + # Note(mingdachen): + # For foreign characters, we always treat them as a whole piece. + english_chars = set(list("abcdefghijklmnopqrstuvwxyz")) + if (six.ensure_str(piece).startswith("▁") or + six.ensure_str(piece).startswith("<") or piece in special_pieces): + return True + else: + return False + + +def _is_start_piece_bert(piece): + """Check if the current word piece is the starting piece (BERT).""" + # When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + return not six.ensure_str(piece).startswith("##") + + +def is_start_piece(piece): + if FLAGS.spm_model_file: + return _is_start_piece_sp(piece) + else: + return _is_start_piece_bert(piece) + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + # Note(mingdachen): We create a list for recording if the piece is + # the starting piece of current token, where 1 means true, so that + # on-the-fly whole word masking is possible. + token_boundary = [0] * len(tokens) + + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + token_boundary[i] = 1 + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. 
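To illustrate the sentence-order-prediction negative built just above: the two segments are simply swapped and the instance is labelled as out of order (toy token lists).

tokens_a = ["the", "dog", "barked", "."]
tokens_b = ["then", "it", "ran", "away", "."]
is_random_next = True                                # SOP negative: swap the segments
tokens_a, tokens_b = tokens_b, tokens_a
sentence_order_label = 1 if is_random_next else 0    # 1 = swapped order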
+ if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and + not is_start_piece(token)): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + if is_start_piece(token): + token_boundary[i] = 1 + + output_tokens = list(tokens) + + masked_lm_positions = [] + masked_lm_labels = [] + + if masked_lm_prob == 0: + return (output_tokens, masked_lm_positions, + masked_lm_labels, token_boundary) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + # Note(mingdachen): + # By default, we set the probilities to favor shorter ngram sequences. + ngrams = np.arange(1, FLAGS.ngram + 1, dtype=np.int64) + pvals = 1. / np.arange(1, FLAGS.ngram + 1) + pvals /= pvals.sum(keepdims=True) + + if not FLAGS.favor_shorter_ngram: + pvals = pvals[::-1] + + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx+n]) + ngram_indexes.append(ngram_index) + + rng.shuffle(ngram_indexes) + + masked_lms = [] + covered_indexes = set() + for cand_index_set in ngram_indexes: + if len(masked_lms) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes: + continue + + n = np.random.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # Note(mingdachen): + # Repeatedly looking for a candidate that does not exceed the + # maximum number of predictions by trying shorter ngrams. + while len(masked_lms) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + + rng.shuffle(ngram_indexes) + + select_indexes = set() + if FLAGS.do_permutation: + for cand_index_set in ngram_indexes: + if len(select_indexes) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. 
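The n-gram sampling weights above can be checked numerically: with the default ngram of 3 and favor_shorter_ngram enabled, unigrams are picked about three times as often as trigrams.

import numpy as np

ngram = 3
pvals = 1.0 / np.arange(1, ngram + 1)   # [1, 1/2, 1/3]
pvals /= pvals.sum(keepdims=True)
print(pvals)                            # ~[0.545, 0.273, 0.182], i.e. 6/11, 3/11, 2/11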
+ for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes or index in select_indexes: + continue + + n = np.random.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + + while len(select_indexes) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(select_indexes) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes or index in select_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + select_indexes.add(index) + assert len(select_indexes) <= num_to_predict + + select_indexes = sorted(select_indexes) + permute_indexes = list(select_indexes) + rng.shuffle(permute_indexes) + orig_token = list(output_tokens) + + for src_i, tgt_i in zip(select_indexes, permute_indexes): + output_tokens[src_i] = orig_token[tgt_i] + masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i])) + + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. 
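A small sketch of the permutation step above (only active with --do_permutation): the selected positions receive a shuffled copy of their own original tokens.

import random

rng = random.Random(0)
output_tokens = ["t0", "t1", "t2", "t3", "t4", "t5"]
select_indexes = [1, 3, 5]
permute_indexes = list(select_indexes)
rng.shuffle(permute_indexes)
orig_token = list(output_tokens)
for src_i, tgt_i in zip(select_indexes, permute_indexes):
    output_tokens[src_i] = orig_token[tgt_i]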
+ if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, + spm_model_file=FLAGS.spm_model_file) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + tf.logging.info("number of instances: %i", len(instances)) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() diff --git a/Indic-BERT-v1-master/albert/evaluate.py b/Indic-BERT-v1-master/albert/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Indic-BERT-v1-master/albert/export_checkpoints.py b/Indic-BERT-v1-master/albert/export_checkpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..5f42f44e1eeb13a4392e9d77afdc98b1e54118fc --- /dev/null +++ b/Indic-BERT-v1-master/albert/export_checkpoints.py @@ -0,0 +1,162 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +r"""Exports a minimal module for ALBERT models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +from absl import app +from absl import flags +from albert import modeling +import tensorflow.compat.v1 as tf + +flags.DEFINE_string( + "albert_directory", None, + "The config json file corresponding to the pre-trained ALBERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "checkpoint_name", "model.ckpt-best", + "Name of the checkpoint under albert_directory to be exported.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_string("export_path", None, "Path to the output module.") + +FLAGS = flags.FLAGS + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights): + """From run_pretraining.py.""" + input_tensor = gather_indexes(input_tensor, mlm_positions) + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=albert_config.embedding_size, + activation=modeling.get_activation(albert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[albert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul( + input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + return logits + + +def get_sentence_order_logits(input_tensor, albert_config): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. 
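The offset arithmetic in gather_indexes can be mirrored with NumPy to see what it does: positions within each example are shifted by example_index * seq_length so a single flat gather picks the right rows (toy shapes below).

import numpy as np

batch_size, seq_length, width = 2, 4, 3
sequence = np.arange(batch_size * seq_length * width).reshape(batch_size, seq_length, width)
positions = np.array([[1, 3], [0, 2]])                    # per-example positions to gather
flat_offsets = (np.arange(batch_size) * seq_length).reshape(-1, 1)
flat_positions = (positions + flat_offsets).reshape(-1)   # [1, 3, 4, 6]
flat_sequence = sequence.reshape(batch_size * seq_length, width)
gathered = flat_sequence[flat_positions]                  # shape (4, 3)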
+ with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, albert_config.hidden_size], + initializer=modeling.create_initializer( + albert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + return logits + + +def build_model(sess): + """Module function.""" + input_ids = tf.placeholder(tf.int32, [None, None], "input_ids") + input_mask = tf.placeholder(tf.int32, [None, None], "input_mask") + segment_ids = tf.placeholder(tf.int32, [None, None], "segment_ids") + mlm_positions = tf.placeholder(tf.int32, [None, None], "mlm_positions") + + albert_config_path = os.path.join( + FLAGS.albert_directory, "albert_config.json") + albert_config = modeling.AlbertConfig.from_json_file(albert_config_path) + model = modeling.AlbertModel( + config=albert_config, + is_training=False, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=False) + + get_mlm_logits(model.get_sequence_output(), albert_config, + mlm_positions, model.get_embedding_table()) + get_sentence_order_logits(model.get_pooled_output(), albert_config) + + checkpoint_path = os.path.join(FLAGS.albert_directory, FLAGS.checkpoint_name) + tvars = tf.trainable_variables() + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, checkpoint_path) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + tf.train.init_from_checkpoint(checkpoint_path, assignment_map) + init = tf.global_variables_initializer() + sess.run(init) + return sess + + +def main(_): + sess = tf.Session() + tf.train.get_or_create_global_step() + sess = build_model(sess) + my_vars = [] + for var in tf.global_variables(): + if "lamb_v" not in var.name and "lamb_m" not in var.name: + my_vars.append(var) + saver = tf.train.Saver(my_vars) + saver.save(sess, FLAGS.export_path) + + +if __name__ == "__main__": + flags.mark_flag_as_required("albert_directory") + flags.mark_flag_as_required("export_path") + app.run(main) diff --git a/Indic-BERT-v1-master/albert/export_to_tfhub.py b/Indic-BERT-v1-master/albert/export_to_tfhub.py new file mode 100644 index 0000000000000000000000000000000000000000..ca9e973851cc75d36aebc6b1ea4937fe13d9e7fd --- /dev/null +++ b/Indic-BERT-v1-master/albert/export_to_tfhub.py @@ -0,0 +1,177 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+r"""Exports a minimal TF-Hub module for ALBERT models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +from absl import app +from absl import flags +from albert import modeling +import tensorflow.compat.v1 as tf +import tensorflow_hub as hub + +flags.DEFINE_string( + "albert_directory", None, + "The config json file corresponding to the pre-trained ALBERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "checkpoint_name", "model.ckpt-best", + "Name of the checkpoint under albert_directory to be exported.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "use_einsum", True, + "Whether to use tf.einsum or tf.reshape+tf.matmul for dense layers. Must " + "be set to False for TFLite compatibility.") + +flags.DEFINE_string("export_path", None, "Path to the output TF-Hub module.") + +FLAGS = flags.FLAGS + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def get_mlm_logits(model, albert_config, mlm_positions): + """From run_pretraining.py.""" + input_tensor = gather_indexes(model.get_sequence_output(), mlm_positions) + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=albert_config.embedding_size, + activation=modeling.get_activation(albert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ output_bias = tf.get_variable( + "output_bias", + shape=[albert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul( + input_tensor, model.get_embedding_table(), transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + return logits + + +def module_fn(is_training): + """Module function.""" + input_ids = tf.placeholder(tf.int32, [None, None], "input_ids") + input_mask = tf.placeholder(tf.int32, [None, None], "input_mask") + segment_ids = tf.placeholder(tf.int32, [None, None], "segment_ids") + mlm_positions = tf.placeholder(tf.int32, [None, None], "mlm_positions") + + albert_config_path = os.path.join( + FLAGS.albert_directory, "albert_config.json") + albert_config = modeling.AlbertConfig.from_json_file(albert_config_path) + model = modeling.AlbertModel( + config=albert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=False, + use_einsum=FLAGS.use_einsum) + + mlm_logits = get_mlm_logits(model, albert_config, mlm_positions) + + vocab_model_path = os.path.join(FLAGS.albert_directory, "30k-clean.model") + vocab_file_path = os.path.join(FLAGS.albert_directory, "30k-clean.vocab") + + config_file = tf.constant( + value=albert_config_path, dtype=tf.string, name="config_file") + vocab_model = tf.constant( + value=vocab_model_path, dtype=tf.string, name="vocab_model") + # This is only for visualization purpose. + vocab_file = tf.constant( + value=vocab_file_path, dtype=tf.string, name="vocab_file") + + # By adding `config_file, vocab_model and vocab_file` + # to the ASSET_FILEPATHS collection, TF-Hub will + # rewrite this tensor so that this asset is portable. + tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file) + tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_model) + tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file) + + hub.add_signature( + name="tokens", + inputs=dict( + input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids), + outputs=dict( + sequence_output=model.get_sequence_output(), + pooled_output=model.get_pooled_output())) + + hub.add_signature( + name="mlm", + inputs=dict( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + mlm_positions=mlm_positions), + outputs=dict( + sequence_output=model.get_sequence_output(), + pooled_output=model.get_pooled_output(), + mlm_logits=mlm_logits)) + + hub.add_signature( + name="tokenization_info", + inputs={}, + outputs=dict( + vocab_file=vocab_model, + do_lower_case=tf.constant(FLAGS.do_lower_case))) + + +def main(_): + tags_and_args = [] + for is_training in (True, False): + tags = set() + if is_training: + tags.add("train") + tags_and_args.append((tags, dict(is_training=is_training))) + spec = hub.create_module_spec(module_fn, tags_and_args=tags_and_args) + checkpoint_path = os.path.join(FLAGS.albert_directory, FLAGS.checkpoint_name) + tf.logging.info("Using checkpoint {}".format(checkpoint_path)) + spec.export(FLAGS.export_path, checkpoint_path=checkpoint_path) + + +if __name__ == "__main__": + flags.mark_flag_as_required("albert_directory") + flags.mark_flag_as_required("export_path") + app.run(main) diff --git a/Indic-BERT-v1-master/albert/fine_tuning_utils.py b/Indic-BERT-v1-master/albert/fine_tuning_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..15a984357d97ebc054c6baf26f24ab34e75081f2 --- /dev/null +++ b/Indic-BERT-v1-master/albert/fine_tuning_utils.py @@ -0,0 +1,85 @@ +# coding=utf-8 +# Copyright 2018 The Google 
AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python3 +"""Helper library for ALBERT fine-tuning. + +This library can be used to construct ALBERT models for fine-tuning, either from +json config files or from TF-Hub modules. +""" + +from albert import modeling +from albert import tokenization +import tensorflow.compat.v1 as tf +import tensorflow_hub as hub + + +def _create_model_from_hub(hub_module, is_training, input_ids, input_mask, + segment_ids): + """Creates an ALBERT model from TF-Hub.""" + tags = set() + if is_training: + tags.add("train") + albert_module = hub.Module(hub_module, tags=tags, trainable=True) + albert_inputs = dict( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids) + albert_outputs = albert_module( + inputs=albert_inputs, + signature="tokens", + as_dict=True) + return (albert_outputs["pooled_output"], albert_outputs["sequence_output"]) + + +def _create_model_from_scratch(albert_config, is_training, input_ids, + input_mask, segment_ids, use_one_hot_embeddings, + use_einsum): + """Creates an ALBERT model from scratch/config.""" + model = modeling.AlbertModel( + config=albert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + use_einsum=use_einsum) + return (model.get_pooled_output(), model.get_sequence_output()) + + +def create_albert(albert_config, is_training, input_ids, input_mask, + segment_ids, use_one_hot_embeddings, use_einsum, hub_module): + """Creates an ALBERT, either from TF-Hub or from scratch.""" + if hub_module: + tf.logging.info("creating model from hub_module: %s", hub_module) + return _create_model_from_hub(hub_module, is_training, input_ids, + input_mask, segment_ids) + else: + tf.logging.info("creating model from albert_config") + return _create_model_from_scratch(albert_config, is_training, input_ids, + input_mask, segment_ids, + use_one_hot_embeddings, use_einsum) + + +def create_vocab(vocab_file, do_lower_case, spm_model_file, hub_module): + """Creates a vocab, either from vocab file or from a TF-Hub module.""" + if hub_module: + use_spm = True if spm_model_file else False + return tokenization.FullTokenizer.from_hub_module( + hub_module=hub_module, use_spm=use_spm) + else: + return tokenization.FullTokenizer.from_scratch( + vocab_file=vocab_file, do_lower_case=do_lower_case, + spm_model_file=spm_model_file) + diff --git a/Indic-BERT-v1-master/albert/lamb_optimizer.py b/Indic-BERT-v1-master/albert/lamb_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..a7222d2cb63dc402efcd75f8aa0a1a15e51b0c5d --- /dev/null +++ b/Indic-BERT-v1-master/albert/lamb_optimizer.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python2, python3 +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import six +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops +# pylint: enable=g-direct-tensorflow-import + + +class LAMBOptimizer(tf.train.Optimizer): + """LAMB (Layer-wise Adaptive Moments optimizer for Batch training).""" + # A new optimizer that includes correct L2 weight decay, adaptive + # element-wise updating, and layer-wise justification. The LAMB optimizer + # was proposed by Yang You, Jing Li, Jonathan Hseu, Xiaodan Song, + # James Demmel, and Cho-Jui Hsieh in a paper titled as Reducing BERT + # Pre-Training Time from 3 Days to 76 Minutes (arxiv.org/abs/1904.00962) + + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + exclude_from_layer_adaptation=None, + name="LAMBOptimizer"): + """Constructs a LAMBOptimizer.""" + super(LAMBOptimizer, self).__init__(False, name) + + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + # exclude_from_layer_adaptation is set to exclude_from_weight_decay if the + # arg is None. + # TODO(jingli): validate if exclude_from_layer_adaptation is necessary. + if exclude_from_layer_adaptation: + self.exclude_from_layer_adaptation = exclude_from_layer_adaptation + else: + self.exclude_from_layer_adaptation = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=six.ensure_str(param_name) + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=six.ensure_str(param_name) + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. 
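The update rule implemented in apply_gradients can be summarised in a few lines of NumPy; lamb_step is an illustrative helper for a single dense parameter, ignoring the exclude_from_weight_decay and exclude_from_layer_adaptation lists.

import numpy as np

def lamb_step(param, grad, m, v, lr=1e-3, beta_1=0.9, beta_2=0.999,
              eps=1e-6, weight_decay=0.01):
    next_m = beta_1 * m + (1.0 - beta_1) * grad
    next_v = beta_2 * v + (1.0 - beta_2) * grad ** 2
    update = next_m / (np.sqrt(next_v) + eps)
    update += weight_decay * param                        # decoupled weight decay
    w_norm, g_norm = np.linalg.norm(param), np.linalg.norm(update)
    ratio = w_norm / g_norm if w_norm > 0 and g_norm > 0 else 1.0
    return param - ratio * lr * update, next_m, next_v

param, m, v = lamb_step(np.ones(4), np.array([0.1, -0.2, 0.3, -0.4]),
                        m=np.zeros(4), v=np.zeros(4))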
This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + ratio = 1.0 + if self._do_layer_adaptation(param_name): + w_norm = linalg_ops.norm(param, ord=2) + g_norm = linalg_ops.norm(update, ord=2) + ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where( + math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0) + + update_with_lr = ratio * self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _do_layer_adaptation(self, param_name): + """Whether to do layer-wise learning rate adaptation for `param_name`.""" + if self.exclude_from_layer_adaptation: + for r in self.exclude_from_layer_adaptation: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", six.ensure_str(param_name)) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/Indic-BERT-v1-master/albert/modeling.py b/Indic-BERT-v1-master/albert/modeling.py new file mode 100644 index 0000000000000000000000000000000000000000..4d4171f83928807c9edf8625baa701b790613c54 --- /dev/null +++ b/Indic-BERT-v1-master/albert/modeling.py @@ -0,0 +1,1209 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python2, python3 +"""The main ALBERT model and related functions. + +For a description of the algorithm, see https://arxiv.org/abs/1909.11942. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import numpy as np +import six +from six.moves import range +import tensorflow.compat.v1 as tf +from tensorflow.contrib import layers as contrib_layers + + +class AlbertConfig(object): + """Configuration for `AlbertModel`. + + The default settings match the configuration of model `albert_xxlarge`. + """ + + def __init__(self, + vocab_size, + embedding_size=128, + hidden_size=4096, + num_hidden_layers=12, + num_hidden_groups=1, + num_attention_heads=64, + intermediate_size=16384, + inner_group_num=1, + down_scale_factor=1, + hidden_act="gelu", + hidden_dropout_prob=0, + attention_probs_dropout_prob=0, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02): + """Constructs AlbertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`. 
+ embedding_size: size of voc embeddings. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_hidden_groups: Number of group for the hidden layers, parameters in + the same group are shared. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + inner_group_num: int, number of inner repetition of attention and ffn. + down_scale_factor: float, the scale to apply + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. + hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `AlbertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_hidden_groups = num_hidden_groups + self.num_attention_heads = num_attention_heads + self.inner_group_num = inner_group_num + self.down_scale_factor = down_scale_factor + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `AlbertConfig` from a Python dictionary of parameters.""" + config = AlbertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `AlbertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class AlbertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). + + Example usage: + + ```python + # Already been converted from strings into ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.AlbertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.AlbertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... 
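+  # Token-level features are also exposed; an illustrative continuation of
+  # the example above (both accessors are defined later in this class):
+  sequence_output = model.get_sequence_output()  # [batch, seq_length, hidden_size]
+  embedding_table = model.get_embedding_table()  # [vocab_size, embedding_size]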
+ ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + use_einsum=True, + scope=None): + """Constructor for AlbertModel. + + Args: + config: `AlbertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + use_einsum: (optional) bool. Whether to use einsum or reshape+matmul for + dense layers + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. + """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. + (self.word_embedding_output, + self.output_embedding_table) = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.embedding_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.word_embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings) + + with tf.variable_scope("encoder"): + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=input_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_hidden_groups=config.num_hidden_groups, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + inner_group_num=config.inner_group_num, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True, + use_einsum=use_einsum) + + self.sequence_output = self.all_encoder_layers[-1] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. 
This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + """Gets final hidden layer of encoder. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. + """ + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_word_embedding_output(self): + """Get output of the word(piece) embedding lookup. + + This is BEFORE positional embeddings and token type embeddings have been + added. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size] + corresponding to the output of the word(piece) embedding layer. + """ + return self.word_embedding_output + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size] + corresponding to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. + """ + return self.embedding_output + + def get_embedding_table(self): + return self.output_embedding_table + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. 
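+  # Illustrative mapping implemented by the branches below:
+  #   "gelu" -> gelu, "relu" -> tf.nn.relu, "tanh" -> tf.tanh,
+  #   "linear" / "" / None -> None, anything else -> ValueError.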
+ if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint, num_of_group=0): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + init_vars = tf.train.list_variables(init_checkpoint) + init_vars_name = [name for (name, _) in init_vars] + + if num_of_group > 0: + assignment_map = [] + for gid in range(num_of_group): + assignment_map.append(collections.OrderedDict()) + else: + assignment_map = collections.OrderedDict() + + for name in name_to_variable: + if name in init_vars_name: + tvar_name = name + elif (re.sub(r"/group_\d+/", "/group_0/", + six.ensure_str(name)) in init_vars_name and + num_of_group > 1): + tvar_name = re.sub(r"/group_\d+/", "/group_0/", six.ensure_str(name)) + elif (re.sub(r"/ffn_\d+/", "/ffn_1/", six.ensure_str(name)) + in init_vars_name and num_of_group > 1): + tvar_name = re.sub(r"/ffn_\d+/", "/ffn_1/", six.ensure_str(name)) + elif (re.sub(r"/attention_\d+/", "/attention_1/", six.ensure_str(name)) + in init_vars_name and num_of_group > 1): + tvar_name = re.sub(r"/attention_\d+/", "/attention_1/", + six.ensure_str(name)) + else: + tf.logging.info("name %s does not get matched", name) + continue + tf.logging.info("name %s match to %s", name, tvar_name) + if num_of_group > 0: + group_matched = False + for gid in range(1, num_of_group): + if (("/group_" + str(gid) + "/" in name) or + ("/ffn_" + str(gid) + "/" in name) or + ("/attention_" + str(gid) + "/" in name)): + group_matched = True + tf.logging.info("%s belongs to %dth", name, gid) + assignment_map[gid][tvar_name] = name + if not group_matched: + assignment_map[0][tvar_name] = name + else: + assignment_map[tvar_name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[six.ensure_str(name) + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
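+    For example, `dropout_prob=0.1` zeroes roughly 10% of the elements and
+    rescales the surviving ones by 1/0.9 at training time.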
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, rate=dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return contrib_layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def get_timing_signal_1d_given_position(channels, + position, + min_timescale=1.0, + max_timescale=1.0e4): + """Get sinusoids of diff frequencies, with timing position given. + + Adapted from add_timing_signal_1d_given_position in + //third_party/py/tensor2tensor/layers/common_attention.py + + Args: + channels: scalar, size of timing embeddings to create. The number of + different timescales is equal to channels / 2. + position: a Tensor with shape [batch, seq_len] + min_timescale: a float + max_timescale: a float + + Returns: + a Tensor of timing signals [batch, seq_len, channels] + """ + num_timescales = channels // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (tf.to_float(num_timescales) - 1)) + inv_timescales = min_timescale * tf.exp( + tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) + scaled_time = ( + tf.expand_dims(tf.to_float(position), 2) * tf.expand_dims( + tf.expand_dims(inv_timescales, 0), 0)) + signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=2) + signal = tf.pad(signal, [[0, 0], [0, 0], [0, tf.mod(channels, 2)]]) + return signal + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.nn.embedding_lookup()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. 
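+  # Illustrative shape walk-through: [batch_size, seq_length] ids become
+  # [batch_size, seq_length, 1], the lookup yields
+  # [batch_size, seq_length, 1, embedding_size], and the final reshape below
+  # returns [batch_size, seq_length, embedding_size].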
+ if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) + + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + if use_one_hot_embeddings: + flat_input_ids = tf.reshape(input_ids, [-1]) + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) + output = tf.matmul(one_hot_input_ids, embedding_table) + else: + output = tf.nn.embedding_lookup(embedding_table, input_ids) + + input_shape = get_shape_list(input_ids) + + output = tf.reshape(output, + input_shape[0:-1] + [input_shape[-1] * embedding_size]) + return (output, embedding_table) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1, + use_one_hot_embeddings=True): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. + initializer_range: float. Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.nn.embedding_lookup()`. + + Returns: + float tensor with same shape as `input_tensor`. + + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if" + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary, unless converting to tflite model. 
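+    # One-hot branch, illustrative shapes: token_type_ids [B, S] is flattened
+    # to [B*S], one-hot encoded to [B*S, token_type_vocab_size], multiplied by
+    # the [token_type_vocab_size, width] table, and reshaped back to
+    # [B, S, width] before being added to `output`.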
+ if use_one_hot_embeddings: + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + else: + token_type_embeddings = tf.nn.embedding_lookup(token_type_table, + token_type_ids) + output += token_type_embeddings + + if use_position_embeddings: + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) + num_dims = len(output.shape.as_list()) + + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings + + output = layer_norm_and_dropout(output, dropout_prob) + return output + + +def einsum_via_matmul(input_tensor, w, num_inner_dims): + """Implements einsum via matmul and reshape ops. + + Args: + input_tensor: float Tensor of shape [, ]. + w: float Tensor of shape [, ]. + num_inner_dims: int. number of dimensions to use for inner products. + + Returns: + float Tensor of shape [, ]. + """ + input_shape = get_shape_list(input_tensor) + w_shape = get_shape_list(w) + batch_dims = input_shape[: -num_inner_dims] + inner_dims = input_shape[-num_inner_dims:] + outer_dims = w_shape[num_inner_dims:] + inner_dim = np.prod(inner_dims) + outer_dim = np.prod(outer_dims) + if num_inner_dims > 1: + input_tensor = tf.reshape(input_tensor, batch_dims + [inner_dim]) + if len(w_shape) > 2: + w = tf.reshape(w, [inner_dim, outer_dim]) + ret = tf.matmul(input_tensor, w) + if len(outer_dims) > 1: + ret = tf.reshape(ret, batch_dims + outer_dims) + return ret + + +def dense_layer_3d(input_tensor, + num_attention_heads, + head_size, + initializer, + activation, + use_einsum, + name=None): + """A dense layer with 3D kernel. + + Args: + input_tensor: float Tensor of shape [batch, seq_length, hidden_size]. + num_attention_heads: Number of attention heads. + head_size: The size per attention head. + initializer: Kernel initializer. + activation: Actication function. + use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers. + name: The name scope of this layer. + + Returns: + float logits Tensor. 
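+    The output has shape [batch, seq_length, num_attention_heads, head_size],
+    the "BFND" layout consumed by the attention code below.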
+ """ + + input_shape = get_shape_list(input_tensor) + hidden_size = input_shape[2] + + with tf.variable_scope(name): + w = tf.get_variable( + name="kernel", + shape=[hidden_size, num_attention_heads * head_size], + initializer=initializer) + w = tf.reshape(w, [hidden_size, num_attention_heads, head_size]) + b = tf.get_variable( + name="bias", + shape=[num_attention_heads * head_size], + initializer=tf.zeros_initializer) + b = tf.reshape(b, [num_attention_heads, head_size]) + if use_einsum: + ret = tf.einsum("BFH,HND->BFND", input_tensor, w) + else: + ret = einsum_via_matmul(input_tensor, w, 1) + ret += b + if activation is not None: + return activation(ret) + else: + return ret + + +def dense_layer_3d_proj(input_tensor, + hidden_size, + head_size, + initializer, + activation, + use_einsum, + name=None): + """A dense layer with 3D kernel for projection. + + Args: + input_tensor: float Tensor of shape [batch,from_seq_length, + num_attention_heads, size_per_head]. + hidden_size: The size of hidden layer. + head_size: The size of head. + initializer: Kernel initializer. + activation: Actication function. + use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers. + name: The name scope of this layer. + + Returns: + float logits Tensor. + """ + input_shape = get_shape_list(input_tensor) + num_attention_heads = input_shape[2] + with tf.variable_scope(name): + w = tf.get_variable( + name="kernel", + shape=[num_attention_heads * head_size, hidden_size], + initializer=initializer) + w = tf.reshape(w, [num_attention_heads, head_size, hidden_size]) + b = tf.get_variable( + name="bias", shape=[hidden_size], initializer=tf.zeros_initializer) + if use_einsum: + ret = tf.einsum("BFND,NDH->BFH", input_tensor, w) + else: + ret = einsum_via_matmul(input_tensor, w, 2) + ret += b + if activation is not None: + return activation(ret) + else: + return ret + + +def dense_layer_2d(input_tensor, + output_size, + initializer, + activation, + use_einsum, + num_attention_heads=1, + name=None): + """A dense layer with 2D kernel. + + Args: + input_tensor: Float tensor with rank 3. + output_size: The size of output dimension. + initializer: Kernel initializer. + activation: Activation function. + use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers. + num_attention_heads: number of attention head in attention layer. + name: The name scope of this layer. + + Returns: + float logits Tensor. + """ + del num_attention_heads # unused + input_shape = get_shape_list(input_tensor) + hidden_size = input_shape[2] + with tf.variable_scope(name): + w = tf.get_variable( + name="kernel", + shape=[hidden_size, output_size], + initializer=initializer) + b = tf.get_variable( + name="bias", shape=[output_size], initializer=tf.zeros_initializer) + if use_einsum: + ret = tf.einsum("BFH,HO->BFO", input_tensor, w) + else: + ret = tf.matmul(input_tensor, w) + ret += b + if activation is not None: + return activation(ret) + else: + return ret + + +def dot_product_attention(q, k, v, bias, dropout_rate=0.0): + """Dot-product attention. + + Args: + q: Tensor with shape [..., length_q, depth_k]. + k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must + match with q. + v: Tensor with shape [..., length_kv, depth_v] Leading dimensions must + match with q. + bias: bias Tensor (see attention_bias()) + dropout_rate: a float. + + Returns: + Tensor with shape [..., length_q, depth_v]. 
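+    The attention logits are scaled by 1/sqrt(depth_k) before the softmax,
+    following "Attention Is All You Need".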
+ """ + logits = tf.matmul(q, k, transpose_b=True) # [..., length_q, length_kv] + logits = tf.multiply(logits, 1.0 / math.sqrt(float(get_shape_list(q)[-1]))) + if bias is not None: + # `attention_mask` = [B, T] + from_shape = get_shape_list(q) + if len(from_shape) == 4: + broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], 1], tf.float32) + elif len(from_shape) == 5: + # from_shape = [B, N, Block_num, block_size, depth]# + broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], from_shape[3], + 1], tf.float32) + + bias = tf.matmul(broadcast_ones, + tf.cast(bias, tf.float32), transpose_b=True) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - bias) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + logits += adder + else: + adder = 0.0 + + attention_probs = tf.nn.softmax(logits, name="attention_probs") + attention_probs = dropout(attention_probs, dropout_rate) + return tf.matmul(attention_probs, v) + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + batch_size=None, + from_seq_length=None, + to_seq_length=None, + use_einsum=True): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. + to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers + + Returns: + float Tensor of shape [batch_size, from_seq_length, num_attention_heads, + size_per_head]. + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. 
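+    Note: the per-head size is derived internally as the last dimension of
+    `from_tensor` divided by `num_attention_heads` (truncated to an int), so
+    there is no separate `size_per_head` argument.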
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + size_per_head = int(from_shape[2]/num_attention_heads) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + # `query_layer` = [B, F, N, H] + q = dense_layer_3d(from_tensor, num_attention_heads, size_per_head, + create_initializer(initializer_range), query_act, + use_einsum, "query") + + # `key_layer` = [B, T, N, H] + k = dense_layer_3d(to_tensor, num_attention_heads, size_per_head, + create_initializer(initializer_range), key_act, + use_einsum, "key") + # `value_layer` = [B, T, N, H] + v = dense_layer_3d(to_tensor, num_attention_heads, size_per_head, + create_initializer(initializer_range), value_act, + use_einsum, "value") + q = tf.transpose(q, [0, 2, 1, 3]) + k = tf.transpose(k, [0, 2, 1, 3]) + v = tf.transpose(v, [0, 2, 1, 3]) + if attention_mask is not None: + attention_mask = tf.reshape( + attention_mask, [batch_size, 1, to_seq_length, 1]) + # 'new_embeddings = [B, N, F, H]' + new_embeddings = dot_product_attention(q, k, v, attention_mask, + attention_probs_dropout_prob) + + return tf.transpose(new_embeddings, [0, 2, 1, 3]) + + +def attention_ffn_block(layer_input, + hidden_size=768, + attention_mask=None, + num_attention_heads=1, + attention_head_size=64, + attention_probs_dropout_prob=0.0, + intermediate_size=3072, + intermediate_act_fn=None, + initializer_range=0.02, + hidden_dropout_prob=0.0, + use_einsum=True): + """A network with attention-ffn as sub-block. + + Args: + layer_input: float Tensor of shape [batch_size, from_seq_length, + from_width]. + hidden_size: (optional) int, size of hidden layer. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + attention_head_size: int. Size of attention head. + attention_probs_dropout_prob: float. dropout probability for attention_layer + intermediate_size: int. Size of intermediate hidden layer. + intermediate_act_fn: (optional) Activation function for the intermediate + layer. + initializer_range: float. Range of the weight initializer. + hidden_dropout_prob: (optional) float. Dropout probability of the hidden + layer. + use_einsum: bool. 
Whether to use einsum or reshape+matmul for dense layers + + Returns: + layer output + """ + + with tf.variable_scope("attention_1"): + with tf.variable_scope("self"): + attention_output = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + use_einsum=use_einsum) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = dense_layer_3d_proj( + attention_output, + hidden_size, + attention_head_size, + create_initializer(initializer_range), + None, + use_einsum=use_einsum, + name="dense") + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + with tf.variable_scope("ffn_1"): + with tf.variable_scope("intermediate"): + intermediate_output = dense_layer_2d( + attention_output, + intermediate_size, + create_initializer(initializer_range), + intermediate_act_fn, + use_einsum=use_einsum, + num_attention_heads=num_attention_heads, + name="dense") + with tf.variable_scope("output"): + ffn_output = dense_layer_2d( + intermediate_output, + hidden_size, + create_initializer(initializer_range), + None, + use_einsum=use_einsum, + num_attention_heads=num_attention_heads, + name="dense") + ffn_output = dropout(ffn_output, hidden_dropout_prob) + ffn_output = layer_norm(ffn_output + attention_output) + return ffn_output + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_hidden_groups=12, + num_attention_heads=12, + intermediate_size=3072, + inner_group_num=1, + intermediate_act_fn="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False, + use_einsum=True): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_hidden_groups: int. Number of group for the hidden layers, parameters + in the same group are shared. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + inner_group_num: int, number of inner repetition of attention and ffn. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + use_einsum: bool. 
Whether to use einsum or reshape+matmul for dense layers + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. + """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = hidden_size // num_attention_heads + input_shape = get_shape_list(input_tensor, expected_rank=3) + input_width = input_shape[2] + + all_layer_outputs = [] + if input_width != hidden_size: + prev_output = dense_layer_2d( + input_tensor, hidden_size, create_initializer(initializer_range), + None, use_einsum=use_einsum, name="embedding_hidden_mapping_in") + else: + prev_output = input_tensor + with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE): + for layer_idx in range(num_hidden_layers): + group_idx = int(layer_idx / num_hidden_layers * num_hidden_groups) + with tf.variable_scope("group_%d" % group_idx): + with tf.name_scope("layer_%d" % layer_idx): + layer_output = prev_output + for inner_group_idx in range(inner_group_num): + with tf.variable_scope("inner_group_%d" % inner_group_idx): + layer_output = attention_ffn_block( + layer_input=layer_output, + hidden_size=hidden_size, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + attention_head_size=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + intermediate_size=intermediate_size, + intermediate_act_fn=intermediate_act_fn, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + use_einsum=use_einsum) + prev_output = layer_output + all_layer_outputs.append(layer_output) + if do_return_all_layers: + return all_layer_outputs + else: + return all_layer_outputs[-1] + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def reshape_to_matrix(input_tensor): + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. 
Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor + + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. + """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) diff --git a/Indic-BERT-v1-master/albert/modeling_test.py b/Indic-BERT-v1-master/albert/modeling_test.py new file mode 100644 index 0000000000000000000000000000000000000000..73ea81c33553761ecfe04ee4e84ce1ed9918e381 --- /dev/null +++ b/Indic-BERT-v1-master/albert/modeling_test.py @@ -0,0 +1,309 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Lint as: python2, python3 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import random +import re + +from albert import modeling +import numpy as np +import six +from six.moves import range +import tensorflow.compat.v1 as tf + + +class AlbertModelTest(tf.test.TestCase): + + class AlbertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + vocab_size=99, + embedding_size=32, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + scope=None): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.scope = scope + + def create_model(self): + input_ids = AlbertModelTest.ids_tensor([self.batch_size, self.seq_length], + self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = AlbertModelTest.ids_tensor( + [self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = AlbertModelTest.ids_tensor( + [self.batch_size, self.seq_length], self.type_vocab_size) + + config = modeling.AlbertConfig( + vocab_size=self.vocab_size, + embedding_size=self.embedding_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + model = modeling.AlbertModel( + config=config, + is_training=self.is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=token_type_ids, + scope=self.scope) + + outputs = { + "embedding_output": model.get_embedding_output(), + "sequence_output": model.get_sequence_output(), + "pooled_output": model.get_pooled_output(), + "all_encoder_layers": model.get_all_encoder_layers(), + } + return outputs + + def check_output(self, result): + self.parent.assertAllEqual( + result["embedding_output"].shape, + [self.batch_size, self.seq_length, self.embedding_size]) + + self.parent.assertAllEqual( + result["sequence_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual(result["pooled_output"].shape, + [self.batch_size, self.hidden_size]) + + def test_default(self): + self.run_tester(AlbertModelTest.AlbertModelTester(self)) + + def test_config_to_json_string(self): + config = 
modeling.AlbertConfig(vocab_size=99, hidden_size=37) + obj = json.loads(config.to_json_string()) + self.assertEqual(obj["vocab_size"], 99) + self.assertEqual(obj["hidden_size"], 37) + + def test_einsum_via_matmul(self): + batch_size = 8 + seq_length = 12 + num_attention_heads = 3 + head_size = 6 + hidden_size = 10 + + input_tensor = np.random.uniform(0, 1, + [batch_size, seq_length, hidden_size]) + input_tensor = tf.constant(input_tensor, dtype=tf.float32) + w = np.random.uniform(0, 1, [hidden_size, num_attention_heads, head_size]) + w = tf.constant(w, dtype=tf.float32) + ret1 = tf.einsum("BFH,HND->BFND", input_tensor, w) + ret2 = modeling.einsum_via_matmul(input_tensor, w, 1) + self.assertAllClose(ret1, ret2) + + input_tensor = np.random.uniform(0, 1, + [batch_size, seq_length, + num_attention_heads, head_size]) + input_tensor = tf.constant(input_tensor, dtype=tf.float32) + w = np.random.uniform(0, 1, [num_attention_heads, head_size, hidden_size]) + w = tf.constant(w, dtype=tf.float32) + ret1 = tf.einsum("BFND,NDH->BFH", input_tensor, w) + ret2 = modeling.einsum_via_matmul(input_tensor, w, 2) + self.assertAllClose(ret1, ret2) + + def run_tester(self, tester): + with self.test_session() as sess: + ops = tester.create_model() + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + output_result = sess.run(ops) + tester.check_output(output_result) + + self.assert_all_tensors_reachable(sess, [init_op, ops]) + + @classmethod + def ids_tensor(cls, shape, vocab_size, rng=None, name=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) + + def assert_all_tensors_reachable(self, sess, outputs): + """Checks that all the tensors in the graph are reachable from outputs.""" + graph = sess.graph + + ignore_strings = [ + "^.*/assert_less_equal/.*$", + "^.*/dilation_rate$", + "^.*/Tensordot/concat$", + "^.*/Tensordot/concat/axis$", + "^testing/.*$", + ] + + ignore_regexes = [re.compile(x) for x in ignore_strings] + + unreachable = self.get_unreachable_ops(graph, outputs) + filtered_unreachable = [] + for x in unreachable: + do_ignore = False + for r in ignore_regexes: + m = r.match(six.ensure_str(x.name)) + if m is not None: + do_ignore = True + if do_ignore: + continue + filtered_unreachable.append(x) + unreachable = filtered_unreachable + + self.assertEqual( + len(unreachable), 0, "The following ops are unreachable: %s" % + (" ".join([x.name for x in unreachable]))) + + @classmethod + def get_unreachable_ops(cls, graph, outputs): + """Finds all of the tensors in graph that are unreachable from outputs.""" + outputs = cls.flatten_recursive(outputs) + output_to_op = collections.defaultdict(list) + op_to_all = collections.defaultdict(list) + assign_out_to_in = collections.defaultdict(list) + + for op in graph.get_operations(): + for x in op.inputs: + op_to_all[op.name].append(x.name) + for y in op.outputs: + output_to_op[y.name].append(op.name) + op_to_all[op.name].append(y.name) + if str(op.type) == "Assign": + for y in op.outputs: + for x in op.inputs: + assign_out_to_in[y.name].append(x.name) + + assign_groups = collections.defaultdict(list) + for out_name in assign_out_to_in.keys(): + name_group = assign_out_to_in[out_name] + for n1 in name_group: + 
assign_groups[n1].append(out_name) + for n2 in name_group: + if n1 != n2: + assign_groups[n1].append(n2) + + seen_tensors = {} + stack = [x.name for x in outputs] + while stack: + name = stack.pop() + if name in seen_tensors: + continue + seen_tensors[name] = True + + if name in output_to_op: + for op_name in output_to_op[name]: + if op_name in op_to_all: + for input_name in op_to_all[op_name]: + if input_name not in stack: + stack.append(input_name) + + expanded_names = [] + if name in assign_groups: + for assign_name in assign_groups[name]: + expanded_names.append(assign_name) + + for expanded_name in expanded_names: + if expanded_name not in stack: + stack.append(expanded_name) + + unreachable_ops = [] + for op in graph.get_operations(): + is_unreachable = False + all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] + for name in all_names: + if name not in seen_tensors: + is_unreachable = True + if is_unreachable: + unreachable_ops.append(op) + return unreachable_ops + + @classmethod + def flatten_recursive(cls, item): + """Flattens (potentially nested) a tuple/dictionary/list to a list.""" + output = [] + if isinstance(item, list): + output.extend(item) + elif isinstance(item, tuple): + output.extend(list(item)) + elif isinstance(item, dict): + for (_, v) in six.iteritems(item): + output.append(v) + else: + return [item] + + flat_output = [] + for x in output: + flat_output.extend(cls.flatten_recursive(x)) + return flat_output + + +if __name__ == "__main__": + tf.test.main() diff --git a/Indic-BERT-v1-master/albert/optimization.py b/Indic-BERT-v1-master/albert/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..63b2adc7266fcfd5ff9399f550ef9cc5721909f3 --- /dev/null +++ b/Indic-BERT-v1-master/albert/optimization.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python2, python3 +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import re +from albert import lamb_optimizer +import six +from six.moves import zip +import tensorflow.compat.v1 as tf +from tensorflow.contrib import tpu as contrib_tpu + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, + optimizer="adamw", poly_power=1.0, start_warmup_step=0, + colocate_gradients_with_ops=False): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=poly_power, + cycle=False) + + # Implements linear warmup. 
I.e., if global_step - start_warmup_step < + # num_warmup_steps, the learning rate will be + # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`. + if num_warmup_steps: + tf.logging.info("++++++ warmup starts at step " + str(start_warmup_step) + + ", for " + str(num_warmup_steps) + " steps ++++++") + global_steps_int = tf.cast(global_step, tf.int32) + start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32) + global_steps_int = global_steps_int - start_warm_int + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + # It is OK that you use this optimizer for finetuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) + # It is OK to use AdamW in the finetuning even the model is trained by LAMB. + # As report in the Bert pulic github, the learning rate for SQuAD 1.1 finetune + # is 3e-5, 4e-5 or 5e-5. For LAMB, the users can use 3e-4, 4e-4,or 5e-4 for a + # batch size of 64 in the finetune. + if optimizer == "adamw": + tf.logging.info("using adamw") + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + elif optimizer == "lamb": + tf.logging.info("using lamb") + optimizer = lamb_optimizer.LAMBOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + else: + raise ValueError("Not supported optimizer: ", optimizer) + + if use_tpu: + optimizer = contrib_tpu.CrossShardOptimizer(optimizer) + + tvars = tf.trainable_variables() + grads = tf.gradients( + loss, tvars, colocate_gradients_with_ops=colocate_gradients_with_ops) + + # This is how the model was pre-trained. + (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) + + train_op = optimizer.apply_gradients( + list(zip(grads, tvars)), global_step=global_step) + + # Normally the global step update is done inside of `apply_gradients`. + # However, neither `AdamWeightDecayOptimizer` nor `LAMBOptimizer` do this. + # But if you use a different optimizer, you should probably take this line + # out. 
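+  # The manual increment below stands in for the global-step update that
+  # `apply_gradients` would otherwise perform, keeping the polynomial decay
+  # and warmup schedules above in sync with the number of training steps.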
+ new_global_step = global_step + 1 + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + return train_op + + +class AdamWeightDecayOptimizer(tf.train.Optimizer): + """A basic Adam optimizer that includes "correct" L2 weight decay.""" + + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="AdamWeightDecayOptimizer"): + """Constructs a AdamWeightDecayOptimizer.""" + super(AdamWeightDecayOptimizer, self).__init__(False, name) + + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=six.ensure_str(param_name) + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=six.ensure_str(param_name) + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", six.ensure_str(param_name)) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/Indic-BERT-v1-master/albert/optimization_test.py b/Indic-BERT-v1-master/albert/optimization_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b9c4d52a55977ad6f0193a9e88c91d016ece6212 --- /dev/null +++ b/Indic-BERT-v1-master/albert/optimization_test.py @@ -0,0 +1,50 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python2, python3 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from albert import optimization +from six.moves import range +from six.moves import zip +import tensorflow.compat.v1 as tf + + +class OptimizationTest(tf.test.TestCase): + + def test_adam(self): + with self.test_session() as sess: + w = tf.get_variable( + "w", + shape=[3], + initializer=tf.constant_initializer([0.1, -0.2, -0.1])) + x = tf.constant([0.4, 0.2, -0.5]) + loss = tf.reduce_mean(tf.square(x - w)) + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + global_step = tf.train.get_or_create_global_step() + optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) + train_op = optimizer.apply_gradients(list(zip(grads, tvars)), global_step) + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + for _ in range(100): + sess.run(train_op) + w_np = sess.run(w) + self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/Indic-BERT-v1-master/albert/race_utils.py b/Indic-BERT-v1-master/albert/race_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..42ef4a4dbfc5e83f0fee8952b0ad89389a6aade1 --- /dev/null +++ b/Indic-BERT-v1-master/albert/race_utils.py @@ -0,0 +1,432 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
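+# [Editor's note: the layout below is inferred from RaceProcessor further
+# down in this file and is only an illustration, not part of the original
+# module.] The processor expects RACE data on disk as
+#   <data_dir>/RACE/{train,dev,test}/{middle,high}/all.txt
+# where every line of all.txt is one JSON object with at least the keys
+#   {"id": ..., "article": ..., "questions": [...],
+#    "options": [[...four strings...], ...], "answers": ["A"|"B"|"C"|"D", ...]}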
+"""Utility functions for RACE dataset.""" + +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +import collections +import json +import os +from albert import classifier_utils +from albert import fine_tuning_utils +from albert import modeling +from albert import optimization +from albert import tokenization +import tensorflow.compat.v1 as tf +from tensorflow.contrib import tpu as contrib_tpu + + +class InputExample(object): + """A single training/test example for the RACE dataset.""" + + def __init__(self, + example_id, + context_sentence, + start_ending, + endings, + label=None): + self.example_id = example_id + self.context_sentence = context_sentence + self.start_ending = start_ending + self.endings = endings + self.label = label + + def __str__(self): + return self.__repr__() + + def __repr__(self): + l = [ + "id: {}".format(self.example_id), + "context_sentence: {}".format(self.context_sentence), + "start_ending: {}".format(self.start_ending), + "ending_0: {}".format(self.endings[0]), + "ending_1: {}".format(self.endings[1]), + "ending_2: {}".format(self.endings[2]), + "ending_3: {}".format(self.endings[3]), + ] + + if self.label is not None: + l.append("label: {}".format(self.label)) + + return ", ".join(l) + + +class RaceProcessor(object): + """Processor for the RACE data set.""" + + def __init__(self, use_spm, do_lower_case, high_only, middle_only): + super(RaceProcessor, self).__init__() + self.use_spm = use_spm + self.do_lower_case = do_lower_case + self.high_only = high_only + self.middle_only = middle_only + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + return self.read_examples( + os.path.join(data_dir, "RACE", "train")) + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + return self.read_examples( + os.path.join(data_dir, "RACE", "dev")) + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + return self.read_examples( + os.path.join(data_dir, "RACE", "test")) + + def get_labels(self): + """Gets the list of labels for this data set.""" + return ["A", "B", "C", "D"] + + def process_text(self, text): + if self.use_spm: + return tokenization.preprocess_text(text, lower=self.do_lower_case) + else: + return tokenization.convert_to_unicode(text) + + def read_examples(self, data_dir): + """Read examples from RACE json files.""" + examples = [] + for level in ["middle", "high"]: + if level == "middle" and self.high_only: continue + if level == "high" and self.middle_only: continue + cur_dir = os.path.join(data_dir, level) + + cur_path = os.path.join(cur_dir, "all.txt") + with tf.gfile.Open(cur_path) as f: + for line in f: + cur_data = json.loads(line.strip()) + + answers = cur_data["answers"] + options = cur_data["options"] + questions = cur_data["questions"] + context = self.process_text(cur_data["article"]) + + for i in range(len(answers)): + label = ord(answers[i]) - ord("A") + qa_list = [] + + question = self.process_text(questions[i]) + for j in range(4): + option = self.process_text(options[i][j]) + + if "_" in question: + qa_cat = question.replace("_", option) + else: + qa_cat = " ".join([question, option]) + + qa_list.append(qa_cat) + + examples.append( + InputExample( + example_id=cur_data["id"], + context_sentence=context, + start_ending=None, + endings=[qa_list[0], qa_list[1], qa_list[2], 
qa_list[3]], + label=label + ) + ) + + return examples + + +def convert_single_example(example_index, example, label_size, max_seq_length, + tokenizer, max_qa_length): + """Loads a data file into a list of `InputBatch`s.""" + + # RACE is a multiple choice task. To perform this task using AlBERT, + # we will use the formatting proposed in "Improving Language + # Understanding by Generative Pre-Training" and suggested by + # @jacobdevlin-google in this issue + # https://github.com/google-research/bert/issues/38. + # + # Each choice will correspond to a sample on which we run the + # inference. For a given RACE example, we will create the 4 + # following inputs: + # - [CLS] context [SEP] choice_1 [SEP] + # - [CLS] context [SEP] choice_2 [SEP] + # - [CLS] context [SEP] choice_3 [SEP] + # - [CLS] context [SEP] choice_4 [SEP] + # The model will output a single value for each input. To get the + # final decision of the model, we will run a softmax over these 4 + # outputs. + if isinstance(example, classifier_utils.PaddingInputExample): + return classifier_utils.InputFeatures( + example_id=0, + input_ids=[[0] * max_seq_length] * label_size, + input_mask=[[0] * max_seq_length] * label_size, + segment_ids=[[0] * max_seq_length] * label_size, + label_id=0, + is_real_example=False) + else: + context_tokens = tokenizer.tokenize(example.context_sentence) + if example.start_ending is not None: + start_ending_tokens = tokenizer.tokenize(example.start_ending) + + all_input_tokens = [] + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + for ending in example.endings: + # We create a copy of the context tokens in order to be + # able to shrink it according to ending_tokens + context_tokens_choice = context_tokens[:] + if example.start_ending is not None: + ending_tokens = start_ending_tokens + tokenizer.tokenize(ending) + else: + ending_tokens = tokenizer.tokenize(ending) + # Modifies `context_tokens_choice` and `ending_tokens` in + # place so that the total length is less than the + # specified length. Account for [CLS], [SEP], [SEP] with + # "- 3" + ending_tokens = ending_tokens[- max_qa_length:] + + if len(context_tokens_choice) + len(ending_tokens) > max_seq_length - 3: + context_tokens_choice = context_tokens_choice[: ( + max_seq_length - 3 - len(ending_tokens))] + tokens = ["[CLS]"] + context_tokens_choice + ( + ["[SEP]"] + ending_tokens + ["[SEP]"]) + segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * ( + len(ending_tokens) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
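+      # [Editor's note: small illustration, not part of the original source.]
+      # With max_seq_length=8, three context tokens [c1, c2, c3] and two
+      # ending tokens [e1, e2], the lists built above are
+      #   tokens      = [CLS] c1 c2 c3 [SEP] e1 e2 [SEP]
+      #   segment_ids =   0   0  0  0    0   1  1    1
+      #   input_mask  =   1   1  1  1    1   1  1    1
+      # A shorter choice is zero-padded below so that every choice ends up
+      # with exactly max_seq_length positions in all three lists.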
+ padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + all_input_tokens.append(tokens) + all_input_ids.append(input_ids) + all_input_mask.append(input_mask) + all_segment_ids.append(segment_ids) + + label = example.label + if example_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("id: {}".format(example.example_id)) + for choice_idx, (tokens, input_ids, input_mask, segment_ids) in \ + enumerate(zip(all_input_tokens, all_input_ids, all_input_mask, all_segment_ids)): + tf.logging.info("choice: {}".format(choice_idx)) + tf.logging.info("tokens: {}".format(" ".join(tokens))) + tf.logging.info( + "input_ids: {}".format(" ".join(map(str, input_ids)))) + tf.logging.info( + "input_mask: {}".format(" ".join(map(str, input_mask)))) + tf.logging.info( + "segment_ids: {}".format(" ".join(map(str, segment_ids)))) + tf.logging.info("label: {}".format(label)) + + return classifier_utils.InputFeatures( + example_id=example.example_id, + input_ids=all_input_ids, + input_mask=all_input_mask, + segment_ids=all_segment_ids, + label_id=label + ) + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, + output_file, max_qa_length): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, len(label_list), + max_seq_length, tokenizer, max_qa_length) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(sum(feature.input_ids, [])) + features["input_mask"] = create_int_feature(sum(feature.input_mask, [])) + features["segment_ids"] = create_int_feature(sum(feature.segment_ids, [])) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def create_model(albert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings, max_seq_length, + dropout_prob, hub_module): + """Creates a classification model.""" + bsz_per_core = tf.shape(input_ids)[0] + + input_ids = tf.reshape(input_ids, [bsz_per_core * num_labels, max_seq_length]) + input_mask = tf.reshape(input_mask, + [bsz_per_core * num_labels, max_seq_length]) + token_type_ids = tf.reshape(segment_ids, + [bsz_per_core * num_labels, max_seq_length]) + + (output_layer, _) = fine_tuning_utils.create_albert( + albert_config=albert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=token_type_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + use_einsum=True, + hub_module=hub_module) + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [1, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [1], + initializer=tf.zeros_initializer()) + + with 
tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout( + output_layer, keep_prob=1 - dropout_prob) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + logits = tf.reshape(logits, [bsz_per_core, num_labels]) + probabilities = tf.nn.softmax(logits, axis=-1) + predictions = tf.argmax(probabilities, axis=-1, output_type=tf.int32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot( + labels, depth=tf.cast(num_labels, dtype=tf.int32), dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, probabilities, logits, predictions) + + +def model_fn_builder(albert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings, max_seq_length, dropout_prob, + hub_module): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, probabilities, logits, predictions) = \ + create_model(albert_config, is_training, input_ids, input_mask, + segment_ids, label_ids, num_labels, + use_one_hot_embeddings, max_seq_length, dropout_prob, + hub_module) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, + weights=is_real_example) + loss = tf.metrics.mean( + values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, label_ids, logits, is_real_example]) + output_spec = 
contrib_tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities, + "predictions": predictions}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + diff --git a/Indic-BERT-v1-master/albert/requirements.txt b/Indic-BERT-v1-master/albert/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..65c919762baebdbd04137cb8ec9b092e8b0c359b --- /dev/null +++ b/Indic-BERT-v1-master/albert/requirements.txt @@ -0,0 +1,5 @@ +# Run pip install --upgrade pip if tensorflow 1.15 cannot be found +tensorflow==1.15.2 # CPU Version of TensorFlow +tensorflow_hub==0.7 +# tensorflow-gpu==1.15 # GPU version of TensorFlow +sentencepiece diff --git a/Indic-BERT-v1-master/albert/run_classifier.py b/Indic-BERT-v1-master/albert/run_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..e9591f20abc51e89117b2cd5ee9df0efa5148ded --- /dev/null +++ b/Indic-BERT-v1-master/albert/run_classifier.py @@ -0,0 +1,488 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning on classification tasks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +from albert import classifier_utils +from albert import fine_tuning_utils +from albert import modeling +import tensorflow.compat.v1 as tf +from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver +from tensorflow.contrib import tpu as contrib_tpu + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "albert_config_file", None, + "The config json file corresponding to the pre-trained ALBERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string( + "vocab_file", None, + "The vocabulary file that the ALBERT model was trained on.") + +flags.DEFINE_string("spm_model_file", None, + "The model file for sentence piece tokenization.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +flags.DEFINE_string("cached_dir", None, + "Path to cached training and dev tfrecord file. " + "The file will be generated if not exist.") + +## Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_string( + "albert_hub_module_handle", None, + "If set, the ALBERT hub module to use.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 512, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("train_step", 1000, + "Total number of training steps to perform.") + +flags.DEFINE_integer( + "warmup_step", 0, + "number of steps to perform linear learning rate warmup for.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("keep_checkpoint_max", 5, + "How many checkpoints to keep.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_string("optimizer", "adamw", "Optimizer to use") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": classifier_utils.ColaProcessor, + "mnli": classifier_utils.MnliProcessor, + "mismnli": classifier_utils.MisMnliProcessor, + "mrpc": classifier_utils.MrpcProcessor, + "rte": classifier_utils.RteProcessor, + "sst-2": classifier_utils.Sst2Processor, + "sts-b": classifier_utils.StsbProcessor, + "qqp": classifier_utils.QqpProcessor, + "qnli": classifier_utils.QnliProcessor, + "wnli": classifier_utils.WnliProcessor, + } + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + if not FLAGS.albert_config_file and not FLAGS.albert_hub_module_handle: + raise ValueError("At least one of `--albert_config_file` and " + "`--albert_hub_module_handle` must be set") + + if FLAGS.albert_config_file: + albert_config = modeling.AlbertConfig.from_json_file( + FLAGS.albert_config_file) + if FLAGS.max_seq_length > albert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the ALBERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, albert_config.max_position_embeddings)) + else: + albert_config = None # Get the config from TF-Hub. + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]( + use_spm=True if FLAGS.spm_model_file else False, + do_lower_case=FLAGS.do_lower_case) + + label_list = processor.get_labels() + + tokenizer = fine_tuning_utils.create_vocab( + vocab_file=FLAGS.vocab_file, + do_lower_case=FLAGS.do_lower_case, + spm_model_file=FLAGS.spm_model_file, + hub_module=FLAGS.albert_hub_module_handle) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 + if FLAGS.do_train: + iterations_per_loop = int(min(FLAGS.iterations_per_loop, + FLAGS.save_checkpoints_steps)) + else: + iterations_per_loop = FLAGS.iterations_per_loop + run_config = contrib_tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=int(FLAGS.save_checkpoints_steps), + keep_checkpoint_max=0, + tpu_config=contrib_tpu.TPUConfig( + iterations_per_loop=iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + model_fn = classifier_utils.model_fn_builder( + albert_config=albert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.train_step, + num_warmup_steps=FLAGS.warmup_step, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu, + task_name=task_name, + hub_module=FLAGS.albert_hub_module_handle, + optimizer=FLAGS.optimizer) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
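+  # [Editor's note: a hypothetical invocation, for illustration only; the
+  # paths are placeholders, and the flag values mirror the SST-2 settings in
+  # run_glue.sh.]
+  #   python3 -m run_classifier \
+  #     --task_name=sst-2 \
+  #     --data_dir=/path/to/glue \
+  #     --albert_config_file=/path/to/albert_config.json \
+  #     --spm_model_file=/path/to/30k-clean.model \
+  #     --output_dir=/tmp/sst2_output \
+  #     --do_train --do_eval \
+  #     --max_seq_length=512 --train_batch_size=32 \
+  #     --learning_rate=1e-5 --warmup_step=1256 --train_step=20935
+  # Either --albert_config_file or --albert_hub_module_handle must be given.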
+ estimator = contrib_tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + cached_dir = FLAGS.cached_dir + if not cached_dir: + cached_dir = FLAGS.output_dir + train_file = os.path.join(cached_dir, task_name + "_train.tf_record") + if not tf.gfile.Exists(train_file): + classifier_utils.file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, + train_file, task_name) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", FLAGS.train_step) + train_input_fn = classifier_utils.file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True, + task_name=task_name, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.train_batch_size) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step) + + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(classifier_utils.PaddingInputExample()) + + cached_dir = FLAGS.cached_dir + if not cached_dir: + cached_dir = FLAGS.output_dir + eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record") + if not tf.gfile.Exists(eval_file): + classifier_utils.file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, + eval_file, task_name) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
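+    # [Editor's note: worked example, not in the original source.] With, say,
+    # 407 dev examples and eval_batch_size=8 on TPU, one PaddingInputExample
+    # is appended above (making 408) and eval_steps = 408 // 8 = 51; the
+    # padding example carries weight 0.0 in every metric, so it does not
+    # affect the reported accuracy.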
+ if FLAGS.use_tpu: + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = classifier_utils.file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder, + task_name=task_name, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.eval_batch_size) + + best_trial_info_file = os.path.join(FLAGS.output_dir, "best_trial.txt") + + def _best_trial_info(): + """Returns information about which checkpoints have been evaled so far.""" + if tf.gfile.Exists(best_trial_info_file): + with tf.gfile.GFile(best_trial_info_file, "r") as best_info: + global_step, best_metric_global_step, metric_value = ( + best_info.read().split(":")) + global_step = int(global_step) + best_metric_global_step = int(best_metric_global_step) + metric_value = float(metric_value) + else: + metric_value = -1 + best_metric_global_step = -1 + global_step = -1 + tf.logging.info( + "Best trial info: Step: %s, Best Value Step: %s, " + "Best Value: %s", global_step, best_metric_global_step, metric_value) + return global_step, best_metric_global_step, metric_value + + def _remove_checkpoint(checkpoint_path): + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + + def _find_valid_cands(curr_step): + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + candidates = [] + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + idx = ckpt_name.split("-")[-1] + if int(idx) > curr_step: + candidates.append(filename) + return candidates + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + + if task_name == "sts-b": + key_name = "pearson" + elif task_name == "cola": + key_name = "matthew_corr" + else: + key_name = "eval_accuracy" + + global_step, best_perf_global_step, best_perf = _best_trial_info() + writer = tf.gfile.GFile(output_eval_file, "w") + while global_step < FLAGS.train_step: + steps_and_files = {} + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) + if cur_filename.split("-")[-1] == "best": + continue + gstep = int(cur_filename.split("-")[-1]) + if gstep not in steps_and_files: + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files[gstep] = cur_filename + tf.logging.info("found {} files.".format(len(steps_and_files))) + if not steps_and_files: + tf.logging.info("found 0 file, global step: {}. Sleeping." 
+ .format(global_step)) + time.sleep(60) + else: + for checkpoint in sorted(steps_and_files.items()): + step, checkpoint_path = checkpoint + if global_step >= step: + if (best_perf_global_step != step and + len(_find_valid_cands(step)) > 1): + _remove_checkpoint(checkpoint_path) + continue + result = estimator.evaluate( + input_fn=eval_input_fn, + steps=eval_steps, + checkpoint_path=checkpoint_path) + global_step = result["global_step"] + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + writer.write("best = {}\n".format(best_perf)) + if result[key_name] > best_perf: + best_perf = result[key_name] + best_perf_global_step = global_step + elif len(_find_valid_cands(global_step)) > 1: + _remove_checkpoint(checkpoint_path) + writer.write("=" * 50 + "\n") + writer.flush() + with tf.gfile.GFile(best_trial_info_file, "w") as best_info: + best_info.write("{}:{}:{}".format( + global_step, best_perf_global_step, best_perf)) + writer.close() + + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = "model.ckpt-{}.{}".format(best_perf_global_step, ext) + tgt_ckpt = "model.ckpt-best.{}".format(ext) + tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) + tf.io.gfile.rename( + os.path.join(FLAGS.output_dir, src_ckpt), + os.path.join(FLAGS.output_dir, tgt_ckpt), + overwrite=True) + + if FLAGS.do_predict: + predict_examples = processor.get_test_examples(FLAGS.data_dir) + num_actual_predict_examples = len(predict_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. 
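+      # [Editor's note: illustration only.] With 1821 test examples and
+      # predict_batch_size=8, three PaddingInputExamples are appended here
+      # (1824 total); the writer further below stops after the 1821 real
+      # rows, because it breaks once i >= num_actual_predict_examples.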
+ while len(predict_examples) % FLAGS.predict_batch_size != 0: + predict_examples.append(classifier_utils.PaddingInputExample()) + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + classifier_utils.file_based_convert_examples_to_features( + predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file, task_name) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(predict_examples), num_actual_predict_examples, + len(predict_examples) - num_actual_predict_examples) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = classifier_utils.file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder, + task_name=task_name, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.predict_batch_size) + + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + result = estimator.predict( + input_fn=predict_input_fn, + checkpoint_path=checkpoint_path) + + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv") + with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\ + tf.gfile.GFile(output_submit_file, "w") as sub_writer: + sub_writer.write("index" + "\t" + "prediction\n") + num_written_lines = 0 + tf.logging.info("***** Predict results *****") + for (i, (example, prediction)) in\ + enumerate(zip(predict_examples, result)): + probabilities = prediction["probabilities"] + if i >= num_actual_predict_examples: + break + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + pred_writer.write(output_line) + + if task_name != "sts-b": + actual_label = label_list[int(prediction["predictions"])] + else: + actual_label = str(prediction["predictions"]) + sub_writer.write(example.guid + "\t" + actual_label + "\n") + num_written_lines += 1 + assert num_written_lines == num_actual_predict_examples + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("spm_model_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/Indic-BERT-v1-master/albert/run_glue.sh b/Indic-BERT-v1-master/albert/run_glue.sh new file mode 100644 index 0000000000000000000000000000000000000000..97f7a1587935a5e5b794430a58d59013c18777e4 --- /dev/null +++ b/Indic-BERT-v1-master/albert/run_glue.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# This is a convenience script for evaluating ALBERT on the GLUE benchmark. +# +# By default, this script uses a pretrained ALBERT v1 BASE model, but you may +# use a custom checkpoint or any compatible TF-Hub checkpoint with minimal +# edits to environment variables (see ALBERT_HUB_MODULE_HANDLE below). +# +# This script does fine-tuning and evaluation on 8 tasks, so it may take a +# while to complete if you do not have a hardware accelerator. + +set -ex + +python3 -m venv $HOME/albertenv +. $HOME/albertenv/bin/activate + +OUTPUT_DIR_BASE="$(mktemp -d)" +OUTPUT_DIR="${OUTPUT_DIR_BASE}/output" + +# To start from a custom pretrained checkpoint, set ALBERT_HUB_MODULE_HANDLE +# below to an empty string and set INIT_CHECKPOINT to your checkpoint path. 
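+# For example (editor's illustration; the checkpoint path is a placeholder):
+#   ALBERT_HUB_MODULE_HANDLE=""
+#   INIT_CHECKPOINT="${HOME}/albert_base/model.ckpt-best"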
+ALBERT_HUB_MODULE_HANDLE="https://tfhub.dev/google/albert_base/1" +INIT_CHECKPOINT="" + +pip3 install --upgrade pip +pip3 install numpy +pip3 install -r requirements.txt + +function run_task() { + COMMON_ARGS="--output_dir="${OUTPUT_DIR}/$1" --data_dir="${ALBERT_ROOT}/glue" --vocab_file="${ALBERT_ROOT}/vocab.txt" --spm_model_file="${ALBERT_ROOT}/30k-clean.model" --do_lower_case --max_seq_length=512 --optimizer=adamw --task_name=$1 --warmup_step=$2 --learning_rate=$3 --train_step=$4 --save_checkpoints_steps=$5 --train_batch_size=$6" + python3 -m run_classifier \ + ${COMMON_ARGS} \ + --do_train \ + --nodo_eval \ + --nodo_predict \ + --albert_hub_module_handle="${ALBERT_HUB_MODULE_HANDLE}" \ + --init_checkpoint="${INIT_CHECKPOINT}" + python3 -m run_classifier \ + ${COMMON_ARGS} \ + --nodo_train \ + --do_eval \ + --do_predict \ + --albert_hub_module_handle="${ALBERT_HUB_MODULE_HANDLE}" +} + +run_task SST-2 1256 1e-5 20935 100 32 +run_task MNLI 1000 3e-5 10000 100 128 +run_task CoLA 320 1e-5 5336 100 16 +run_task QNLI 1986 1e-5 33112 200 32 +run_task QQP 1000 5e-5 14000 100 128 +run_task RTE 200 3e-5 800 100 32 +run_task STS-B 214 2e-5 3598 100 16 +run_task MRPC 200 2e-5 800 100 32 diff --git a/Indic-BERT-v1-master/albert/run_pretraining.py b/Indic-BERT-v1-master/albert/run_pretraining.py new file mode 100644 index 0000000000000000000000000000000000000000..f2bac316632ca6a69ab47220ab784a36cbd2c3a0 --- /dev/null +++ b/Indic-BERT-v1-master/albert/run_pretraining.py @@ -0,0 +1,577 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python2, python3 +"""Run masked LM/next sentence masked_lm pre-training for ALBERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import time +from albert import modeling +from albert import optimization +from six.moves import range +import tensorflow.compat.v1 as tf +from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver +from tensorflow.contrib import data as contrib_data +from tensorflow.contrib import tpu as contrib_tpu + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "albert_config_file", None, + "The config json file corresponding to the pre-trained ALBERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained ALBERT model).") + +flags.DEFINE_integer( + "max_seq_length", 512, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. 
Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. " + "Must match data generation.") + +flags.DEFINE_bool("do_train", True, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 4096, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 64, "Total batch size for eval.") + +flags.DEFINE_enum("optimizer", "lamb", ["adamw", "lamb"], + "The optimizer for training.") + +flags.DEFINE_float("learning_rate", 0.00176, "The initial learning rate.") + +flags.DEFINE_float("poly_power", 1.0, "The power of poly decay.") + +flags.DEFINE_integer("num_train_steps", 125000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 3125, "Number of warmup steps.") + +flags.DEFINE_integer("start_warmup_step", 0, "The starting step of warmup.") + +flags.DEFINE_integer("save_checkpoints_steps", 5000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("keep_checkpoint_max", 5, + "How many checkpoints to keep.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_bool("init_from_group0", False, "Whether to initialize" + "parameters of other groups from group 0") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_float( + "masked_lm_budget", 0, + "If >0, the ratio of masked ngrams to unmasked ngrams. Default 0," + "for offline masking") + + +def model_fn_builder(albert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings, optimizer, poly_power, + start_warmup_step): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + # Note: We keep this feature name `next_sentence_labels` to be compatible + # with the original data created by lanzhzh@. However, in the ALBERT case + # it does represent sentence_order_labels. 
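+    # [Editor's note: shape summary inferred from `input_fn_builder` below,
+    # added for illustration.] For a batch of size B the decoded features are
+    #   input_ids / input_mask / segment_ids : [B, max_seq_length] (int32)
+    #   masked_lm_positions / _ids / _weights: [B, max_predictions_per_seq]
+    #   next_sentence_labels                 : [B, 1]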
+ sentence_order_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.AlbertModel( + config=albert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, masked_lm_example_loss, + masked_lm_log_probs) = get_masked_lm_output(albert_config, + model.get_sequence_output(), + model.get_embedding_table(), + masked_lm_positions, + masked_lm_ids, + masked_lm_weights) + + # (sentence_order_loss, sentence_order_example_loss, + # sentence_order_log_probs) = get_sentence_order_output( + # albert_config, model.get_pooled_output(), sentence_order_labels) + + total_loss = masked_lm_loss # + sentence_order_loss + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + tf.logging.info("number of hidden group %d to initialize", + albert_config.num_hidden_groups) + num_of_initialize_group = 1 + if FLAGS.init_from_group0: + num_of_initialize_group = albert_config.num_hidden_groups + if albert_config.net_structure_type > 0: + num_of_initialize_group = albert_config.num_hidden_layers + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint( + tvars, init_checkpoint, num_of_initialize_group) + if use_tpu: + + def tpu_scaffold(): + for gid in range(num_of_initialize_group): + tf.logging.info("initialize the %dth layer", gid) + tf.logging.info(assignment_map[gid]) + tf.train.init_from_checkpoint(init_checkpoint, assignment_map[gid]) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + for gid in range(num_of_initialize_group): + tf.logging.info("initialize the %dth layer", gid) + tf.logging.info(assignment_map[gid]) + tf.train.init_from_checkpoint(init_checkpoint, assignment_map[gid]) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, + use_tpu, optimizer, poly_power, start_warmup_step) + + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(*args): + """Computes the loss and accuracy of the model.""" + (masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, sentence_order_example_loss, + sentence_order_log_probs, sentence_order_labels) = args[:7] + + + masked_lm_log_probs = tf.reshape(masked_lm_log_probs, + [-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax( + masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + metrics = { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + } + + sentence_order_log_probs = tf.reshape( + 
sentence_order_log_probs, [-1, sentence_order_log_probs.shape[-1]]) + sentence_order_predictions = tf.argmax( + sentence_order_log_probs, axis=-1, output_type=tf.int32) + sentence_order_labels = tf.reshape(sentence_order_labels, [-1]) + sentence_order_accuracy = tf.metrics.accuracy( + labels=sentence_order_labels, + predictions=sentence_order_predictions) + sentence_order_mean_loss = tf.metrics.mean( + values=sentence_order_example_loss) + metrics.update({ + "sentence_order_accuracy": sentence_order_accuracy, + "sentence_order_loss": sentence_order_mean_loss + }) + return metrics + + metric_values = [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, sentence_order_example_loss, + sentence_order_log_probs, sentence_order_labels + ] + + eval_metrics = (metric_fn, metric_values) + + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(albert_config, input_tensor, output_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=albert_config.embedding_size, + activation=modeling.get_activation(albert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[albert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot( + label_ids, depth=albert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_sentence_order_output(albert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. 
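+  # [Editor's note: illustration of the head below, not an original comment.]
+  # For a pooled representation of shape [batch, hidden_size]:
+  #   logits    = input_tensor @ output_weights.T + output_bias   # [batch, 2]
+  #   log_probs = log_softmax(logits, axis=-1)
+  #   loss      = mean_over_batch( -log_probs[label] )
+  # In ALBERT the binary label is a sentence-order label rather than BERT's
+  # next-sentence label (see the note on `next_sentence_labels` in model_fn).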
+ with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, albert_config.hidden_size], + initializer=modeling.create_initializer( + albert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64), + # Note: We keep this feature name `next_sentence_labels` to be + # compatible with the original data created by lanzhzh@. However, in + # the ALBERT case it does represent sentence_order_labels. + "next_sentence_labels": tf.FixedLenFeature([1], tf.int64), + } + + if FLAGS.masked_lm_budget: + name_to_features.update({ + "token_boundary": + tf.FixedLenFeature([max_seq_length], tf.int64)}) + else: + name_to_features.update({ + "masked_lm_positions": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_ids": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_weights": + tf.FixedLenFeature([max_predictions_per_seq], tf.float32)}) + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files)) + + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_cpu_threads, len(input_files)) + + # `sloppy` mode means that the interleaving is not exact. This adds + # even more randomness to the training pipeline. + d = d.apply( + contrib_data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=is_training, + cycle_length=cycle_length)) + d = d.shuffle(buffer_size=100) + else: + d = tf.data.TFRecordDataset(input_files) + # Since we evaluate for a fixed number of steps we don't want to encounter + # out-of-range exceptions. + d = d.repeat() + + # We must `drop_remainder` on training because the TPU requires fixed + # size dimensions. 
For eval, we assume we are evaluating on the CPU or GPU + # and we *don't* want to drop the remainder, otherwise we wont cover + # every sample. + d = d.apply( + tf.data.experimental.map_and_batch_with_legacy_function( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + num_parallel_batches=num_cpu_threads, + drop_remainder=True)) + tf.logging.info(d) + return d + + return input_fn + + +def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Input Files ***") + for input_file in input_files: + tf.logging.info(" %s" % input_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 + run_config = contrib_tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + keep_checkpoint_max=FLAGS.keep_checkpoint_max, + tpu_config=contrib_tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + model_fn = model_fn_builder( + albert_config=albert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.num_train_steps, + num_warmup_steps=FLAGS.num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu, + optimizer=FLAGS.optimizer, + poly_power=FLAGS.poly_power, + start_warmup_step=FLAGS.start_warmup_step) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
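+  # [Editor's note: a hypothetical invocation, shown only to illustrate the
+  # flags declared earlier in this file; all paths are placeholders and all
+  # values are simply the flag defaults.]
+  #   python3 -m run_pretraining \
+  #     --input_file=/path/to/pretrain/*.tfrecord \
+  #     --albert_config_file=/path/to/albert_config.json \
+  #     --output_dir=/tmp/albert_pretrain \
+  #     --do_train --do_eval \
+  #     --optimizer=lamb --learning_rate=0.00176 \
+  #     --train_batch_size=4096 --num_train_steps=125000 \
+  #     --num_warmup_steps=3125 \
+  #     --max_seq_length=512 --max_predictions_per_seq=20
+  # --input_file accepts a glob or a comma-separated list of patterns.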
+ estimator = contrib_tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + global_step = -1 + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + writer = tf.gfile.GFile(output_eval_file, "w") + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) + best_perf = 0 + key_name = "masked_lm_accuracy" + while global_step < FLAGS.num_train_steps: + if estimator.latest_checkpoint() is None: + tf.logging.info("No checkpoint found yet. Sleeping.") + time.sleep(1) + else: + result = estimator.evaluate( + input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + global_step = result["global_step"] + tf.logging.info("***** Eval results *****") + checkpoint_path = estimator.latest_checkpoint() + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + if result[key_name] > best_perf: + best_perf = result[key_name] + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tgt_ckpt = checkpoint_path.rsplit( + "-", 1)[0] + "-best.{}".format(ext) + tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) + tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) + writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt)) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("albert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/Indic-BERT-v1-master/albert/run_pretraining_test.py b/Indic-BERT-v1-master/albert/run_pretraining_test.py new file mode 100644 index 0000000000000000000000000000000000000000..889661a75fa92f456f3bf14a18ba512064bd3636 --- /dev/null +++ b/Indic-BERT-v1-master/albert/run_pretraining_test.py @@ -0,0 +1,133 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Lint as: python2, python3 +"""Tests for run_pretraining.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random +import tempfile +from absl.testing import flagsaver +from albert import modeling +from albert import run_pretraining +import tensorflow.compat.v1 as tf + +FLAGS = tf.app.flags.FLAGS + + +def _create_config_file(filename, max_seq_length, vocab_size): + """Creates an AlbertConfig and saves it to file.""" + albert_config = modeling.AlbertConfig( + vocab_size, + embedding_size=5, + hidden_size=14, + num_hidden_layers=3, + num_hidden_groups=1, + num_attention_heads=2, + intermediate_size=19, + inner_group_num=1, + down_scale_factor=1, + hidden_act="gelu", + hidden_dropout_prob=0, + attention_probs_dropout_prob=0, + max_position_embeddings=max_seq_length, + type_vocab_size=2, + initializer_range=0.02) + with tf.gfile.Open(filename, "w") as outfile: + outfile.write(albert_config.to_json_string()) + + +def _create_record(max_predictions_per_seq, max_seq_length, vocab_size): + """Returns a tf.train.Example containing random data.""" + example = tf.train.Example() + example.features.feature["input_ids"].int64_list.value.extend( + [random.randint(0, vocab_size - 1) for _ in range(max_seq_length)]) + example.features.feature["input_mask"].int64_list.value.extend( + [random.randint(0, 1) for _ in range(max_seq_length)]) + example.features.feature["masked_lm_positions"].int64_list.value.extend([ + random.randint(0, max_seq_length - 1) + for _ in range(max_predictions_per_seq) + ]) + example.features.feature["masked_lm_ids"].int64_list.value.extend([ + random.randint(0, vocab_size - 1) for _ in range(max_predictions_per_seq) + ]) + example.features.feature["masked_lm_weights"].float_list.value.extend( + [1. for _ in range(max_predictions_per_seq)]) + example.features.feature["segment_ids"].int64_list.value.extend( + [0 for _ in range(max_seq_length)]) + example.features.feature["next_sentence_labels"].int64_list.value.append( + random.randint(0, 1)) + return example + + +def _create_input_file(filename, + max_predictions_per_seq, + max_seq_length, + vocab_size, + size=1000): + """Creates an input TFRecord file of specified size.""" + with tf.io.TFRecordWriter(filename) as writer: + for _ in range(size): + ex = _create_record(max_predictions_per_seq, max_seq_length, vocab_size) + writer.write(ex.SerializeToString()) + + +class RunPretrainingTest(tf.test.TestCase): + + def _verify_output_file(self, basename): + self.assertTrue(tf.gfile.Exists(os.path.join(FLAGS.output_dir, basename))) + + def _verify_checkpoint_files(self, name): + self._verify_output_file(name + ".meta") + self._verify_output_file(name + ".index") + self._verify_output_file(name + ".data-00000-of-00001") + + @flagsaver.flagsaver + def test_pretraining(self): + # Set up required flags. + vocab_size = 97 + FLAGS.max_predictions_per_seq = 7 + FLAGS.max_seq_length = 13 + FLAGS.output_dir = tempfile.mkdtemp("output_dir") + FLAGS.albert_config_file = os.path.join( + tempfile.mkdtemp("config_dir"), "albert_config.json") + FLAGS.input_file = os.path.join( + tempfile.mkdtemp("input_dir"), "input_data.tfrecord") + FLAGS.do_train = True + FLAGS.do_eval = True + FLAGS.num_train_steps = 1 + FLAGS.save_checkpoints_steps = 1 + + # Construct requisite input files. 
+ _create_config_file(FLAGS.albert_config_file, FLAGS.max_seq_length, + vocab_size) + _create_input_file(FLAGS.input_file, FLAGS.max_predictions_per_seq, + FLAGS.max_seq_length, vocab_size) + + # Run the pretraining. + run_pretraining.main(None) + + # Verify output. + self._verify_checkpoint_files("model.ckpt-best") + self._verify_checkpoint_files("model.ckpt-1") + self._verify_output_file("eval_results.txt") + self._verify_output_file("checkpoint") + + +if __name__ == "__main__": + tf.test.main() diff --git a/Indic-BERT-v1-master/albert/run_race.py b/Indic-BERT-v1-master/albert/run_race.py new file mode 100644 index 0000000000000000000000000000000000000000..e5bb72b96c1ab77774cae03ea48c5f4adf4062fd --- /dev/null +++ b/Indic-BERT-v1-master/albert/run_race.py @@ -0,0 +1,458 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ALBERT finetuning runner with sentence piece tokenization.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +from albert import classifier_utils +from albert import fine_tuning_utils +from albert import modeling +from albert import race_utils +import tensorflow.compat.v1 as tf +from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver +from tensorflow.contrib import tpu as contrib_tpu + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "albert_config_file", None, + "The config json file corresponding to the pre-trained ALBERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", "race", "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the ALBERT model was trained on.") + +flags.DEFINE_string("train_file", None, + "path to preprocessed tfrecord file. " + "The file will be generated if not exst.") + +flags.DEFINE_string("eval_file", None, + "path to preprocessed tfrecord file. " + "The file will be generated if not exst.") + +flags.DEFINE_string("predict_file", None, + "path to preprocessed tfrecord file. " + "The file will be generated if not exst.") + +flags.DEFINE_string("spm_model_file", None, + "The model file for sentence piece tokenization.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained ALBERT model).") + +flags.DEFINE_string( + "albert_hub_module_handle", None, + "If set, the ALBERT hub module to use.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_float("dropout_prob", 0.1, "dropout probability.") + +flags.DEFINE_integer( + "max_seq_length", 512, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "max_qa_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "num_keep_checkpoint", 5, + "maximum number of keep checkpoints") + + +flags.DEFINE_bool( + "high_only", False, + "Whether to only run the model on the high school set.") + +flags.DEFINE_bool( + "middle_only", False, + "Whether to only run the model on the middle school set.") + +flags.DEFINE_bool("do_train", True, "Whether to run training.") + +flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 1e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("train_step", 12000, + "Total number of training epochs to perform.") + +flags.DEFINE_integer( + "warmup_step", 1000, + "number of steps to perform linear learning rate warmup for.") + +flags.DEFINE_integer("save_checkpoints_steps", 100, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "race": race_utils.RaceProcessor + } + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) + + if FLAGS.max_seq_length > albert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the ALBERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, albert_config.max_position_embeddings)) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]( + use_spm=True if FLAGS.spm_model_file else False, + do_lower_case=FLAGS.do_lower_case, + high_only=FLAGS.high_only, + middle_only=FLAGS.middle_only) + + label_list = processor.get_labels() + + tokenizer = fine_tuning_utils.create_vocab( + vocab_file=FLAGS.vocab_file, + do_lower_case=FLAGS.do_lower_case, + spm_model_file=FLAGS.spm_model_file, + hub_module=FLAGS.albert_hub_module_handle) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 + if FLAGS.do_train: + iterations_per_loop = int(min(FLAGS.iterations_per_loop, + FLAGS.save_checkpoints_steps)) + else: + iterations_per_loop = FLAGS.iterations_per_loop + run_config = contrib_tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=int(FLAGS.save_checkpoints_steps), + keep_checkpoint_max=0, + tpu_config=contrib_tpu.TPUConfig( + iterations_per_loop=iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + + model_fn = race_utils.model_fn_builder( + albert_config=albert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.train_step, + num_warmup_steps=FLAGS.warmup_step, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu, + max_seq_length=FLAGS.max_seq_length, + dropout_prob=FLAGS.dropout_prob, + hub_module=FLAGS.albert_hub_module_handle) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
+ estimator = contrib_tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + if not tf.gfile.Exists(FLAGS.train_file): + race_utils.file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, + FLAGS.train_file, FLAGS.max_qa_length) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", FLAGS.train_step) + train_input_fn = classifier_utils.file_based_input_fn_builder( + input_file=FLAGS.train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True, + task_name=task_name, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.train_batch_size, + multiple=len(label_list)) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step) + + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(classifier_utils.PaddingInputExample()) + + if not tf.gfile.Exists(FLAGS.eval_file): + race_utils.file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, + FLAGS.eval_file, FLAGS.max_qa_length) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
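# NOTE: minimal illustrative sketch, not from the ALBERT sources above.
# It shows why the eval set is padded on TPU and how the step count follows:
# every batch must be exactly eval_batch_size, so fake PaddingInputExamples
# are appended until the count divides evenly (they carry weight 0 in the
# metrics), and eval_steps is the padded count divided by the batch size.
def padded_eval_size(num_examples, batch_size):
  remainder = num_examples % batch_size
  return num_examples if remainder == 0 else num_examples + batch_size - remainder

assert padded_eval_size(1042, 8) == 1048       # 6 fake examples appended
assert padded_eval_size(1042, 8) // 8 == 131   # eval_steps used on the TPU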
+ if FLAGS.use_tpu: + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = classifier_utils.file_based_input_fn_builder( + input_file=FLAGS.eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder, + task_name=task_name, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.eval_batch_size, + multiple=len(label_list)) + + def _find_valid_cands(curr_step): + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + candidates = [] + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + idx = ckpt_name.split("-")[-1] + if idx != "best" and int(idx) > curr_step: + candidates.append(filename) + return candidates + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + key_name = "eval_accuracy" + if tf.gfile.Exists(checkpoint_path + ".index"): + result = estimator.evaluate( + input_fn=eval_input_fn, + steps=eval_steps, + checkpoint_path=checkpoint_path) + best_perf = result[key_name] + global_step = result["global_step"] + else: + global_step = -1 + best_perf = -1 + checkpoint_path = None + writer = tf.gfile.GFile(output_eval_file, "w") + while global_step < FLAGS.train_step: + steps_and_files = {} + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) + if cur_filename.split("-")[-1] == "best": + continue + gstep = int(cur_filename.split("-")[-1]) + if gstep not in steps_and_files: + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files[gstep] = cur_filename + tf.logging.info("found {} files.".format(len(steps_and_files))) + # steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) + if not steps_and_files: + tf.logging.info("found 0 file, global step: {}. Sleeping." 
+ .format(global_step)) + time.sleep(1) + else: + for ele in sorted(steps_and_files.items()): + step, checkpoint_path = ele + if global_step >= step: + if len(_find_valid_cands(step)) > 1: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + continue + result = estimator.evaluate( + input_fn=eval_input_fn, + steps=eval_steps, + checkpoint_path=checkpoint_path) + global_step = result["global_step"] + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + writer.write("best = {}\n".format(best_perf)) + if result[key_name] > best_perf: + best_perf = result[key_name] + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tgt_ckpt = checkpoint_path.rsplit("-", 1)[0] + "-best.{}".format(ext) + tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) + tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) + writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt)) + + if len(_find_valid_cands(global_step)) > 1: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + writer.write("=" * 50 + "\n") + writer.close() + if FLAGS.do_predict: + predict_examples = processor.get_test_examples(FLAGS.data_dir) + num_actual_predict_examples = len(predict_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. 
+ while len(predict_examples) % FLAGS.predict_batch_size != 0: + predict_examples.append(classifier_utils.PaddingInputExample()) + assert len(predict_examples) % FLAGS.predict_batch_size == 0 + predict_steps = int(len(predict_examples) // FLAGS.predict_batch_size) + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + race_utils.file_based_convert_examples_to_features( + predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file, FLAGS.max_qa_length) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(predict_examples), num_actual_predict_examples, + len(predict_examples) - num_actual_predict_examples) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = classifier_utils.file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder, + task_name=task_name, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.predict_batch_size, + multiple=len(label_list)) + + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + result = estimator.evaluate( + input_fn=predict_input_fn, + steps=predict_steps, + checkpoint_path=checkpoint_path) + + output_predict_file = os.path.join(FLAGS.output_dir, "predict_results.txt") + with tf.gfile.GFile(output_predict_file, "w") as pred_writer: + # num_written_lines = 0 + tf.logging.info("***** Predict results *****") + pred_writer.write("***** Predict results *****\n") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + pred_writer.write("%s = %s\n" % (key, str(result[key]))) + pred_writer.write("best = {}\n".format(best_perf)) + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("spm_model_file") + flags.mark_flag_as_required("albert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/Indic-BERT-v1-master/albert/run_squad_v1.py b/Indic-BERT-v1-master/albert/run_squad_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..8cfbb6b852838602941a29b2c7a28c965cf35a88 --- /dev/null +++ b/Indic-BERT-v1-master/albert/run_squad_v1.py @@ -0,0 +1,547 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Lint as: python2, python3 +"""Run ALBERT on SQuAD v1.1 using sentence piece tokenization.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import json +import os +import random +import time +from albert import fine_tuning_utils +from albert import modeling +from albert import squad_utils +import six +import tensorflow.compat.v1 as tf + +from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver +from tensorflow.contrib import tpu as contrib_tpu + + +# pylint: disable=g-import-not-at-top +if six.PY2: + import six.moves.cPickle as pickle +else: + import pickle +# pylint: enable=g-import-not-at-top + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "albert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string("spm_model_file", None, + "The model file for sentence piece tokenization.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string("train_file", None, + "SQuAD json for training. E.g., train-v1.1.json") + +flags.DEFINE_string( + "predict_file", None, + "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + +flags.DEFINE_string("train_feature_file", None, + "training feature file.") + +flags.DEFINE_string( + "predict_feature_file", None, + "Location of predict features. If it doesn't exist, it will be written. " + "If it does exist, it will be read.") + +flags.DEFINE_string( + "predict_feature_left_file", None, + "Location of predict features not passed to TPU. If it doesn't exist, it " + "will be written. If it does exist, it will be read.") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_string( + "albert_hub_module_handle", None, + "If set, the ALBERT hub module to use.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 384, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("predict_batch_size", 8, + "Total batch size for predictions.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer( + "n_best_size", 20, + "The total number of n-best predictions to generate in the " + "nbest_predictions.json output file.") + +flags.DEFINE_integer( + "max_answer_length", 30, + "The maximum length of an answer that can be generated. This is needed " + "because the start and end predictions are not conditioned on one another.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "use_einsum", True, + "Whether to use tf.einsum or tf.reshape+tf.matmul for dense layers. Must " + "be set to False for TFLite compatibility.") + +flags.DEFINE_string( + "export_dir", + default=None, + help=("The directory where the exported SavedModel will be stored.")) + + +def validate_flags_or_throw(albert_config): + """Validate the input FLAGS or throw an exception.""" + + if not FLAGS.do_train and not FLAGS.do_predict and not FLAGS.export_dir: + err_msg = "At least one of `do_train` or `do_predict` or `export_dir`" + "must be True." 
+ raise ValueError(err_msg) + + if FLAGS.do_train: + if not FLAGS.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if FLAGS.do_predict: + if not FLAGS.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") + if not FLAGS.predict_feature_file: + raise ValueError( + "If `do_predict` is True, then `predict_feature_file` must be " + "specified.") + if not FLAGS.predict_feature_left_file: + raise ValueError( + "If `do_predict` is True, then `predict_feature_left_file` must be " + "specified.") + + if FLAGS.max_seq_length > albert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the ALBERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, albert_config.max_position_embeddings)) + + if FLAGS.max_seq_length <= FLAGS.max_query_length + 3: + raise ValueError( + "The max_seq_length (%d) must be greater than max_query_length " + "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length)) + + +def build_squad_serving_input_fn(seq_length): + """Builds a serving input fn for raw input.""" + + def _seq_serving_input_fn(): + """Serving input fn for raw images.""" + input_ids = tf.placeholder( + shape=[1, seq_length], name="input_ids", dtype=tf.int32) + input_mask = tf.placeholder( + shape=[1, seq_length], name="input_mask", dtype=tf.int32) + segment_ids = tf.placeholder( + shape=[1, seq_length], name="segment_ids", dtype=tf.int32) + + inputs = { + "input_ids": input_ids, + "input_mask": input_mask, + "segment_ids": segment_ids + } + return tf.estimator.export.ServingInputReceiver(features=inputs, + receiver_tensors=inputs) + + return _seq_serving_input_fn + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) + + validate_flags_or_throw(albert_config) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + tokenizer = fine_tuning_utils.create_vocab( + vocab_file=FLAGS.vocab_file, + do_lower_case=FLAGS.do_lower_case, + spm_model_file=FLAGS.spm_model_file, + hub_module=FLAGS.albert_hub_module_handle) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 + if FLAGS.do_train: + iterations_per_loop = int(min(FLAGS.iterations_per_loop, + FLAGS.save_checkpoints_steps)) + else: + iterations_per_loop = FLAGS.iterations_per_loop + run_config = contrib_tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + keep_checkpoint_max=0, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=contrib_tpu.TPUConfig( + iterations_per_loop=iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = squad_utils.read_squad_examples( + input_file=FLAGS.train_file, is_training=True) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + # Pre-shuffle the input to avoid having to make a very large shuffle + # buffer in in the `input_fn`. 
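# NOTE: minimal illustrative sketch, not from the ALBERT sources above.
# The lines that follow shuffle the Python list of SQuAD examples once, with a
# fixed seed, so the order is reproducible and the tf.data pipeline does not
# need a shuffle buffer on the order of the whole dataset.
import random

examples = list(range(100000))           # stand-in for the SQuAD train examples
random.Random(12345).shuffle(examples)   # same order on every run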
+ rng = random.Random(12345) + rng.shuffle(train_examples) + + model_fn = squad_utils.v1_model_fn_builder( + albert_config=albert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu, + use_einsum=FLAGS.use_einsum, + hub_module=FLAGS.albert_hub_module_handle) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = contrib_tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + # We write to a temporary file to avoid storing very large constant tensors + # in memory. + + if not tf.gfile.Exists(FLAGS.train_feature_file): + train_writer = squad_utils.FeatureWriter( + filename=os.path.join(FLAGS.train_feature_file), is_training=True) + squad_utils.convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=True, + output_fn=train_writer.process_feature, + do_lower_case=FLAGS.do_lower_case) + train_writer.close() + + tf.logging.info("***** Running training *****") + tf.logging.info(" Num orig examples = %d", len(train_examples)) + # tf.logging.info(" Num split examples = %d", train_writer.num_features) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + del train_examples + + train_input_fn = squad_utils.input_fn_builder( + input_file=FLAGS.train_feature_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.train_batch_size, + is_v2=False) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_predict: + with tf.gfile.Open(FLAGS.predict_file) as predict_file: + prediction_json = json.load(predict_file)["data"] + + eval_examples = squad_utils.read_squad_examples( + input_file=FLAGS.predict_file, is_training=False) + + if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists( + FLAGS.predict_feature_left_file)): + tf.logging.info("Loading eval features from {}".format( + FLAGS.predict_feature_left_file)) + with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin: + eval_features = pickle.load(fin) + else: + eval_writer = squad_utils.FeatureWriter( + filename=FLAGS.predict_feature_file, is_training=False) + eval_features = [] + + def append_feature(feature): + eval_features.append(feature) + eval_writer.process_feature(feature) + + squad_utils.convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=False, + output_fn=append_feature, + do_lower_case=FLAGS.do_lower_case) + eval_writer.close() + + with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout: + pickle.dump(eval_features, fout) + + tf.logging.info("***** Running predictions *****") + tf.logging.info(" Num orig examples = %d", len(eval_examples)) + tf.logging.info(" Num split examples = %d", len(eval_features)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_input_fn = squad_utils.input_fn_builder( + input_file=FLAGS.predict_feature_file, + seq_length=FLAGS.max_seq_length, + 
is_training=False, + drop_remainder=False, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.predict_batch_size, + is_v2=False) + + def get_result(checkpoint): + """Evaluate the checkpoint on SQuAD 1.0.""" + # If running eval on the TPU, you will need to specify the number of + # steps. + reader = tf.train.NewCheckpointReader(checkpoint) + global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP) + all_results = [] + for result in estimator.predict( + predict_input_fn, yield_single_examples=True, + checkpoint_path=checkpoint): + if len(all_results) % 1000 == 0: + tf.logging.info("Processing example: %d" % (len(all_results))) + unique_id = int(result["unique_ids"]) + start_log_prob = [float(x) for x in result["start_log_prob"].flat] + end_log_prob = [float(x) for x in result["end_log_prob"].flat] + all_results.append( + squad_utils.RawResult( + unique_id=unique_id, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob)) + + output_prediction_file = os.path.join( + FLAGS.output_dir, "predictions.json") + output_nbest_file = os.path.join( + FLAGS.output_dir, "nbest_predictions.json") + + result_dict = {} + squad_utils.accumulate_predictions_v1( + result_dict, eval_examples, eval_features, + all_results, FLAGS.n_best_size, FLAGS.max_answer_length) + predictions = squad_utils.write_predictions_v1( + result_dict, eval_examples, eval_features, all_results, + FLAGS.n_best_size, FLAGS.max_answer_length, + output_prediction_file, output_nbest_file) + + return squad_utils.evaluate_v1( + prediction_json, predictions), int(global_step) + + def _find_valid_cands(curr_step): + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + candidates = [] + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + idx = ckpt_name.split("-")[-1] + if idx != "best" and int(idx) > curr_step: + candidates.append(filename) + return candidates + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + key_name = "f1" + writer = tf.gfile.GFile(output_eval_file, "w") + if tf.gfile.Exists(checkpoint_path + ".index"): + result = get_result(checkpoint_path) + best_perf = result[0][key_name] + global_step = result[1] + else: + global_step = -1 + best_perf = -1 + checkpoint_path = None + while global_step < num_train_steps: + steps_and_files = {} + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) + if cur_filename.split("-")[-1] == "best": + continue + gstep = int(cur_filename.split("-")[-1]) + if gstep not in steps_and_files: + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files[gstep] = cur_filename + tf.logging.info("found {} files.".format(len(steps_and_files))) + if not steps_and_files: + tf.logging.info("found 0 file, global step: {}. Sleeping." 
+ .format(global_step)) + time.sleep(60) + else: + for ele in sorted(steps_and_files.items()): + step, checkpoint_path = ele + if global_step >= step: + if len(_find_valid_cands(step)) > 1: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + continue + result, global_step = get_result(checkpoint_path) + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + if result[key_name] > best_perf: + best_perf = result[key_name] + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tgt_ckpt = checkpoint_path.rsplit( + "-", 1)[0] + "-best.{}".format(ext) + tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) + tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) + writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt)) + writer.write("best {} = {}\n".format(key_name, best_perf)) + tf.logging.info(" best {} = {}\n".format(key_name, best_perf)) + + if len(_find_valid_cands(global_step)) > 2: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + writer.write("=" * 50 + "\n") + + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + result, global_step = get_result(checkpoint_path) + tf.logging.info("***** Final Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + writer.write("best perf happened at step: {}".format(global_step)) + + if FLAGS.export_dir: + tf.gfile.MakeDirs(FLAGS.export_dir) + squad_serving_input_fn = ( + build_squad_serving_input_fn(FLAGS.max_seq_length)) + tf.logging.info("Starting to export model.") + subfolder = estimator.export_saved_model( + export_dir_base=os.path.join(FLAGS.export_dir, "saved_model"), + serving_input_receiver_fn=squad_serving_input_fn) + + tf.logging.info("Starting to export TFLite.") + converter = tf.lite.TFLiteConverter.from_saved_model( + subfolder, + input_arrays=["input_ids", "input_mask", "segment_ids"], + output_arrays=["start_logits", "end_logits"]) + float_model = converter.convert() + tflite_file = os.path.join(FLAGS.export_dir, "albert_model.tflite") + with tf.gfile.GFile(tflite_file, "wb") as f: + f.write(float_model) + + +if __name__ == "__main__": + flags.mark_flag_as_required("spm_model_file") + flags.mark_flag_as_required("albert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/Indic-BERT-v1-master/albert/run_squad_v2.py b/Indic-BERT-v1-master/albert/run_squad_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..a4fd38a7348b43b68f46c874cc449da71d78f6af --- /dev/null +++ b/Indic-BERT-v1-master/albert/run_squad_v2.py @@ -0,0 +1,516 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python2, python3 +"""Run ALBERT on SQuAD v2.0 using sentence piece tokenization.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import json +import os +import random +import time + +from albert import fine_tuning_utils +from albert import modeling +from albert import squad_utils +import six +import tensorflow.compat.v1 as tf + +from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver +from tensorflow.contrib import tpu as contrib_tpu + + +# pylint: disable=g-import-not-at-top +if six.PY2: + import six.moves.cPickle as pickle +else: + import pickle +# pylint: enable=g-import-not-at-top + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "albert_config_file", None, + "The config json file corresponding to the pre-trained ALBERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the ALBERT model was trained on.") + +flags.DEFINE_string("spm_model_file", None, + "The model file for sentence piece tokenization.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string("train_file", None, + "SQuAD json for training. E.g., train-v1.1.json") + +flags.DEFINE_string( + "predict_file", None, + "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + +flags.DEFINE_string("train_feature_file", None, + "training feature file.") + +flags.DEFINE_string( + "predict_feature_file", None, + "Location of predict features. If it doesn't exist, it will be written. " + "If it does exist, it will be read.") + +flags.DEFINE_string( + "predict_feature_left_file", None, + "Location of predict features not passed to TPU. If it doesn't exist, it " + "will be written. If it does exist, it will be read.") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_string( + "albert_hub_module_handle", None, + "If set, the ALBERT hub module to use.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 384, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. 
Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("predict_batch_size", 8, + "Total batch size for predictions.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer( + "n_best_size", 20, + "The total number of n-best predictions to generate in the " + "nbest_predictions.json output file.") + +flags.DEFINE_integer( + "max_answer_length", 30, + "The maximum length of an answer that can be generated. This is needed " + "because the start and end predictions are not conditioned on one another.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") + + +flags.DEFINE_integer("start_n_top", 5, "beam size for the start positions.") + +flags.DEFINE_integer("end_n_top", 5, "beam size for the end positions.") + +flags.DEFINE_float("dropout_prob", 0.1, "dropout probability.") + + +def validate_flags_or_throw(albert_config): + """Validate the input FLAGS or throw an exception.""" + + if not FLAGS.do_train and not FLAGS.do_predict: + raise ValueError("At least one of `do_train` or `do_predict` must be True.") + + if FLAGS.do_train: + if not FLAGS.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if FLAGS.do_predict: + if not FLAGS.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") + if not FLAGS.predict_feature_file: + raise ValueError( + "If `do_predict` is True, then `predict_feature_file` must be " + "specified.") + if not FLAGS.predict_feature_left_file: + raise ValueError( + "If `do_predict` is True, then `predict_feature_left_file` must be " + "specified.") + + if FLAGS.max_seq_length > albert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the ALBERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, albert_config.max_position_embeddings)) + + if FLAGS.max_seq_length <= FLAGS.max_query_length + 3: + raise ValueError( + "The max_seq_length (%d) must be greater than max_query_length " + "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length)) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) + + validate_flags_or_throw(albert_config) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + tokenizer = fine_tuning_utils.create_vocab( + vocab_file=FLAGS.vocab_file, + do_lower_case=FLAGS.do_lower_case, + spm_model_file=FLAGS.spm_model_file, + hub_module=FLAGS.albert_hub_module_handle) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 + if FLAGS.do_train: + iterations_per_loop = int(min(FLAGS.iterations_per_loop, + FLAGS.save_checkpoints_steps)) + else: + iterations_per_loop = FLAGS.iterations_per_loop + run_config = contrib_tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + keep_checkpoint_max=0, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=contrib_tpu.TPUConfig( + iterations_per_loop=iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + train_examples = squad_utils.read_squad_examples( + input_file=FLAGS.train_file, is_training=True) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + if FLAGS.do_train: + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + # Pre-shuffle the input to avoid having to make a very large shuffle + # buffer in in the `input_fn`. 
+ rng = random.Random(12345) + rng.shuffle(train_examples) + + model_fn = squad_utils.v2_model_fn_builder( + albert_config=albert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu, + max_seq_length=FLAGS.max_seq_length, + start_n_top=FLAGS.start_n_top, + end_n_top=FLAGS.end_n_top, + dropout_prob=FLAGS.dropout_prob, + hub_module=FLAGS.albert_hub_module_handle) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = contrib_tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + # We write to a temporary file to avoid storing very large constant tensors + # in memory. + + if not tf.gfile.Exists(FLAGS.train_feature_file): + train_writer = squad_utils.FeatureWriter( + filename=os.path.join(FLAGS.train_feature_file), is_training=True) + squad_utils.convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=True, + output_fn=train_writer.process_feature, + do_lower_case=FLAGS.do_lower_case) + train_writer.close() + + tf.logging.info("***** Running training *****") + tf.logging.info(" Num orig examples = %d", len(train_examples)) + # tf.logging.info(" Num split examples = %d", train_writer.num_features) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + del train_examples + + train_input_fn = squad_utils.input_fn_builder( + input_file=FLAGS.train_feature_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.train_batch_size, + is_v2=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_predict: + with tf.gfile.Open(FLAGS.predict_file) as predict_file: + prediction_json = json.load(predict_file)["data"] + eval_examples = squad_utils.read_squad_examples( + input_file=FLAGS.predict_file, is_training=False) + + if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists( + FLAGS.predict_feature_left_file)): + tf.logging.info("Loading eval features from {}".format( + FLAGS.predict_feature_left_file)) + with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin: + eval_features = pickle.load(fin) + else: + eval_writer = squad_utils.FeatureWriter( + filename=FLAGS.predict_feature_file, is_training=False) + eval_features = [] + + def append_feature(feature): + eval_features.append(feature) + eval_writer.process_feature(feature) + + squad_utils.convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=False, + output_fn=append_feature, + do_lower_case=FLAGS.do_lower_case) + eval_writer.close() + + with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout: + pickle.dump(eval_features, fout) + + tf.logging.info("***** Running predictions *****") + tf.logging.info(" Num orig examples = %d", len(eval_examples)) + tf.logging.info(" Num split examples = %d", len(eval_features)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_input_fn = 
squad_utils.input_fn_builder( + input_file=FLAGS.predict_feature_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=False, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.predict_batch_size, + is_v2=True) + + def get_result(checkpoint): + """Evaluate the checkpoint on SQuAD v2.0.""" + # If running eval on the TPU, you will need to specify the number of + # steps. + reader = tf.train.NewCheckpointReader(checkpoint) + global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP) + all_results = [] + for result in estimator.predict( + predict_input_fn, yield_single_examples=True, + checkpoint_path=checkpoint): + if len(all_results) % 1000 == 0: + tf.logging.info("Processing example: %d" % (len(all_results))) + unique_id = int(result["unique_ids"]) + start_top_log_probs = ( + [float(x) for x in result["start_top_log_probs"].flat]) + start_top_index = [int(x) for x in result["start_top_index"].flat] + end_top_log_probs = ( + [float(x) for x in result["end_top_log_probs"].flat]) + end_top_index = [int(x) for x in result["end_top_index"].flat] + + cls_logits = float(result["cls_logits"].flat[0]) + all_results.append( + squad_utils.RawResultV2( + unique_id=unique_id, + start_top_log_probs=start_top_log_probs, + start_top_index=start_top_index, + end_top_log_probs=end_top_log_probs, + end_top_index=end_top_index, + cls_logits=cls_logits)) + + output_prediction_file = os.path.join( + FLAGS.output_dir, "predictions.json") + output_nbest_file = os.path.join( + FLAGS.output_dir, "nbest_predictions.json") + output_null_log_odds_file = os.path.join( + FLAGS.output_dir, "null_odds.json") + + result_dict = {} + cls_dict = {} + squad_utils.accumulate_predictions_v2( + result_dict, cls_dict, eval_examples, eval_features, + all_results, FLAGS.n_best_size, FLAGS.max_answer_length, + FLAGS.start_n_top, FLAGS.end_n_top) + + return squad_utils.evaluate_v2( + result_dict, cls_dict, prediction_json, eval_examples, + eval_features, all_results, FLAGS.n_best_size, + FLAGS.max_answer_length, output_prediction_file, output_nbest_file, + output_null_log_odds_file), int(global_step) + + def _find_valid_cands(curr_step): + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + candidates = [] + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + idx = ckpt_name.split("-")[-1] + if idx != "best" and int(idx) > curr_step: + candidates.append(filename) + return candidates + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + key_name = "f1" + writer = tf.gfile.GFile(output_eval_file, "w") + if tf.gfile.Exists(checkpoint_path + ".index"): + result = get_result(checkpoint_path) + best_perf = result[0][key_name] + global_step = result[1] + else: + global_step = -1 + best_perf = -1 + checkpoint_path = None + while global_step < num_train_steps: + steps_and_files = {} + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) + if cur_filename.split("-")[-1] == "best": + continue + gstep = int(cur_filename.split("-")[-1]) + if gstep not in steps_and_files: + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files[gstep] = cur_filename + tf.logging.info("found {} files.".format(len(steps_and_files))) + if not steps_and_files: + tf.logging.info("found 0 file, global step: {}. Sleeping." 
+ .format(global_step)) + time.sleep(60) + else: + for ele in sorted(steps_and_files.items()): + step, checkpoint_path = ele + if global_step >= step: + if len(_find_valid_cands(step)) > 1: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + continue + result, global_step = get_result(checkpoint_path) + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + if result[key_name] > best_perf: + best_perf = result[key_name] + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tgt_ckpt = checkpoint_path.rsplit( + "-", 1)[0] + "-best.{}".format(ext) + tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) + tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) + writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt)) + writer.write("best {} = {}\n".format(key_name, best_perf)) + tf.logging.info(" best {} = {}\n".format(key_name, best_perf)) + + if len(_find_valid_cands(global_step)) > 2: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + writer.write("=" * 50 + "\n") + + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + result, global_step = get_result(checkpoint_path) + tf.logging.info("***** Final Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + writer.write("best perf happened at step: {}".format(global_step)) + + +if __name__ == "__main__": + flags.mark_flag_as_required("spm_model_file") + flags.mark_flag_as_required("albert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/Indic-BERT-v1-master/albert/run_trivial_model_test.sh b/Indic-BERT-v1-master/albert/run_trivial_model_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..ffcffb2dc4aebba1e5d422789be65379b5882443 --- /dev/null +++ b/Indic-BERT-v1-master/albert/run_trivial_model_test.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Small integration test script. +# The values in this file are **not** meant for reproducing actual results. + +set -e +set -x + +virtualenv -p python3 . +source ./bin/activate + +OUTPUT_DIR_BASE="$(mktemp -d)" +OUTPUT_DIR="${OUTPUT_DIR_BASE}/output" + +pip install numpy +pip install -r requirements.txt +python -m run_pretraining_test \ + --output_dir="${OUTPUT_DIR}" \ + --do_train \ + --do_eval \ + --nouse_tpu \ + --train_batch_size=2 \ + --eval_batch_size=1 \ + --max_seq_length=4 \ + --num_train_steps=2 \ + --max_eval_steps=3 + + diff --git a/Indic-BERT-v1-master/albert/squad_utils.py b/Indic-BERT-v1-master/albert/squad_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f7bf05262cabb0099850dcdeb10359693993503b --- /dev/null +++ b/Indic-BERT-v1-master/albert/squad_utils.py @@ -0,0 +1,1735 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python2, python3 +"""Utility functions for SQuAD v1.1/v2.0 datasets.""" + +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function +import collections +import json +import math +import re +import string +import sys +from albert import fine_tuning_utils +from albert import modeling +from albert import optimization +from albert import tokenization +import numpy as np +import six +from six.moves import map +from six.moves import range +import tensorflow.compat.v1 as tf +from tensorflow.contrib import data as contrib_data +from tensorflow.contrib import layers as contrib_layers +from tensorflow.contrib import tpu as contrib_tpu + +_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", + "start_log_prob", "end_log_prob"]) + +_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + +RawResult = collections.namedtuple("RawResult", + ["unique_id", + "start_log_prob", + "end_log_prob"]) + +RawResultV2 = collections.namedtuple( + "RawResultV2", + ["unique_id", "start_top_log_probs", "start_top_index", + "end_top_log_probs", "end_top_index", "cls_logits"]) + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. 
+ """ + + def __init__(self, + qas_id, + question_text, + paragraph_text, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.paragraph_text = paragraph_text + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", paragraph_text: [%s]" % (" ".join(self.paragraph_text)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tok_start_to_orig_index, + tok_end_to_orig_index, + token_is_max_context, + tokens, + input_ids, + input_mask, + segment_ids, + paragraph_len, + p_mask=None, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tok_start_to_orig_index = tok_start_to_orig_index + self.tok_end_to_orig_index = tok_end_to_orig_index + self.token_is_max_context = token_is_max_context + self.tokens = tokens + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.paragraph_len = paragraph_len + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + self.p_mask = p_mask + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with tf.gfile.Open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + orig_answer_text = None + is_impossible = False + + if is_training: + is_impossible = qa.get("is_impossible", False) + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + start_position = answer["answer_start"] + else: + start_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + paragraph_text=paragraph_text, + orig_answer_text=orig_answer_text, + start_position=start_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def _convert_index(index, pos, m=None, is_start=True): + """Converts index.""" + if index[pos] is not None: + return index[pos] + n = len(index) + rear = pos + while rear < n - 1 and index[rear] is None: + rear += 1 + front = pos + while front > 0 and index[front] is None: + front -= 1 + assert index[front] is not None or index[rear] is not None + if index[front] is None: + if index[rear] >= 1: + if is_start: + return 0 + else: + return index[rear] - 1 + return index[rear] + if index[rear] is None: + if m is not None 
and index[front] < m - 1: + if is_start: + return index[front] + 1 + else: + return m - 1 + return index[front] + if is_start: + if index[rear] > index[front] + 1: + return index[front] + 1 + else: + return index[rear] + else: + if index[rear] > index[front] + 1: + return index[rear] - 1 + else: + return index[front] + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + output_fn, do_lower_case): + """Loads a data file into a list of `InputBatch`s.""" + + cnt_pos, cnt_neg = 0, 0 + unique_id = 1000000000 + max_n, max_m = 1024, 1024 + f = np.zeros((max_n, max_m), dtype=np.float32) + + for (example_index, example) in enumerate(examples): + + if example_index % 100 == 0: + tf.logging.info("Converting {}/{} pos {} neg {}".format( + example_index, len(examples), cnt_pos, cnt_neg)) + + query_tokens = tokenization.encode_ids( + tokenizer.sp_model, + tokenization.preprocess_text( + example.question_text, lower=do_lower_case)) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + paragraph_text = example.paragraph_text + para_tokens = tokenization.encode_pieces( + tokenizer.sp_model, + tokenization.preprocess_text( + example.paragraph_text, lower=do_lower_case), + return_unicode=False) + + chartok_to_tok_index = [] + tok_start_to_chartok_index = [] + tok_end_to_chartok_index = [] + char_cnt = 0 + para_tokens = [six.ensure_text(token, "utf-8") for token in para_tokens] + for i, token in enumerate(para_tokens): + new_token = six.ensure_text(token).replace( + tokenization.SPIECE_UNDERLINE.decode("utf-8"), " ") + chartok_to_tok_index.extend([i] * len(new_token)) + tok_start_to_chartok_index.append(char_cnt) + char_cnt += len(new_token) + tok_end_to_chartok_index.append(char_cnt - 1) + + tok_cat_text = "".join(para_tokens).replace( + tokenization.SPIECE_UNDERLINE.decode("utf-8"), " ") + n, m = len(paragraph_text), len(tok_cat_text) + + if n > max_n or m > max_m: + max_n = max(n, max_n) + max_m = max(m, max_m) + f = np.zeros((max_n, max_m), dtype=np.float32) + + g = {} + + def _lcs_match(max_dist, n=n, m=m): + """Longest-common-substring algorithm.""" + f.fill(0) + g.clear() + + ### longest common sub sequence + # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j)) + for i in range(n): + + # note(zhiliny): + # unlike standard LCS, this is specifically optimized for the setting + # because the mismatch between sentence pieces and original text will + # be small + for j in range(i - max_dist, i + max_dist): + if j >= m or j < 0: continue + + if i > 0: + g[(i, j)] = 0 + f[i, j] = f[i - 1, j] + + if j > 0 and f[i, j - 1] > f[i, j]: + g[(i, j)] = 1 + f[i, j] = f[i, j - 1] + + f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0 + if (tokenization.preprocess_text( + paragraph_text[i], lower=do_lower_case, + remove_space=False) == tok_cat_text[j] + and f_prev + 1 > f[i, j]): + g[(i, j)] = 2 + f[i, j] = f_prev + 1 + + max_dist = abs(n - m) + 5 + for _ in range(2): + _lcs_match(max_dist) + if f[n - 1, m - 1] > 0.8 * n: break + max_dist *= 2 + + orig_to_chartok_index = [None] * n + chartok_to_orig_index = [None] * m + i, j = n - 1, m - 1 + while i >= 0 and j >= 0: + if (i, j) not in g: break + if g[(i, j)] == 2: + orig_to_chartok_index[i] = j + chartok_to_orig_index[j] = i + i, j = i - 1, j - 1 + elif g[(i, j)] == 1: + j = j - 1 + else: + i = i - 1 + + if (all(v is None for v in orig_to_chartok_index) or + f[n - 1, m - 1] < 0.8 * n): + tf.logging.info("MISMATCH DETECTED!") + continue + + 
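+    # Sketch of the alignment built above (illustrative, not exhaustive):
+    # chartok_to_orig_index maps each character of the concatenated
+    # sentence-piece text back to a character offset in paragraph_text (None
+    # where the LCS match failed), and orig_to_chartok_index is the inverse.
+    # The loop below collapses this character-level map into per-token
+    # start/end offsets into the original paragraph.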
tok_start_to_orig_index = [] + tok_end_to_orig_index = [] + for i in range(len(para_tokens)): + start_chartok_pos = tok_start_to_chartok_index[i] + end_chartok_pos = tok_end_to_chartok_index[i] + start_orig_pos = _convert_index(chartok_to_orig_index, start_chartok_pos, + n, is_start=True) + end_orig_pos = _convert_index(chartok_to_orig_index, end_chartok_pos, + n, is_start=False) + + tok_start_to_orig_index.append(start_orig_pos) + tok_end_to_orig_index.append(end_orig_pos) + + if not is_training: + tok_start_position = tok_end_position = None + + if is_training and example.is_impossible: + tok_start_position = 0 + tok_end_position = 0 + + if is_training and not example.is_impossible: + start_position = example.start_position + end_position = start_position + len(example.orig_answer_text) - 1 + + start_chartok_pos = _convert_index(orig_to_chartok_index, start_position, + is_start=True) + tok_start_position = chartok_to_tok_index[start_chartok_pos] + + end_chartok_pos = _convert_index(orig_to_chartok_index, end_position, + is_start=False) + tok_end_position = chartok_to_tok_index[end_chartok_pos] + assert tok_start_position <= tok_end_position + + def _piece_to_id(x): + if six.PY2 and isinstance(x, six.text_type): + x = six.ensure_binary(x, "utf-8") + return tokenizer.sp_model.PieceToId(x) + + all_doc_tokens = list(map(_piece_to_id, para_tokens)) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_is_max_context = {} + segment_ids = [] + p_mask = [] + + cur_tok_start_to_orig_index = [] + cur_tok_end_to_orig_index = [] + + tokens.append(tokenizer.sp_model.PieceToId("[CLS]")) + segment_ids.append(0) + p_mask.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + p_mask.append(1) + tokens.append(tokenizer.sp_model.PieceToId("[SEP]")) + segment_ids.append(0) + p_mask.append(1) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + + cur_tok_start_to_orig_index.append( + tok_start_to_orig_index[split_token_index]) + cur_tok_end_to_orig_index.append( + tok_end_to_orig_index[split_token_index]) + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + p_mask.append(0) + tokens.append(tokenizer.sp_model.PieceToId("[SEP]")) + segment_ids.append(1) + p_mask.append(1) + + paragraph_len = len(tokens) + input_ids = tokens + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
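+      # Illustration with hypothetical values: for max_seq_length=8 and six
+      # real tokens, the loop below extends input_ids with 0s, input_mask
+      # becomes [1, 1, 1, 1, 1, 1, 0, 0], segment_ids is padded with 0s, and
+      # p_mask is padded with 1s (padding is never a valid answer position).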
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + p_mask.append(1) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + span_is_impossible = example.is_impossible + start_position = None + end_position = None + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + # continue + start_position = 0 + end_position = 0 + span_is_impossible = True + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and span_is_impossible: + start_position = 0 + end_position = 0 + + if example_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("unique_id: %s" % (unique_id)) + tf.logging.info("example_index: %s" % (example_index)) + tf.logging.info("doc_span_index: %s" % (doc_span_index)) + tf.logging.info("tok_start_to_orig_index: %s" % " ".join( + [str(x) for x in cur_tok_start_to_orig_index])) + tf.logging.info("tok_end_to_orig_index: %s" % " ".join( + [str(x) for x in cur_tok_end_to_orig_index])) + tf.logging.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) + ])) + tf.logging.info("input_pieces: %s" % " ".join( + [tokenizer.sp_model.IdToPiece(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + + if is_training and span_is_impossible: + tf.logging.info("impossible example span") + + if is_training and not span_is_impossible: + pieces = [tokenizer.sp_model.IdToPiece(token) for token in + tokens[start_position: (end_position + 1)]] + answer_text = tokenizer.sp_model.DecodePieces(pieces) + tf.logging.info("start_position: %d" % (start_position)) + tf.logging.info("end_position: %d" % (end_position)) + tf.logging.info( + "answer: %s" % (tokenization.printable_text(answer_text))) + + # note(zhiliny): With multi processing, + # the example_index is actually the index within the current process + # therefore we use example_index=None to avoid being used in the future. + # The current code does not use example_index of training data. 
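+      # Put differently: only eval features need to be traced back to their
+      # SquadExample when writing predictions, so the example index is kept
+      # for eval data and dropped for training data.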
+ if is_training: + feat_example_index = None + else: + feat_example_index = example_index + + feature = InputFeatures( + unique_id=unique_id, + example_index=feat_example_index, + doc_span_index=doc_span_index, + tok_start_to_orig_index=cur_tok_start_to_orig_index, + tok_end_to_orig_index=cur_tok_end_to_orig_index, + token_is_max_context=token_is_max_context, + tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens], + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + paragraph_len=paragraph_len, + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible, + p_mask=p_mask) + + # Run callback + output_fn(feature) + + unique_id += 1 + if span_is_impossible: + cnt_neg += 1 + else: + cnt_pos += 1 + + tf.logging.info("Total number of instances: {} = pos {} neg {}".format( + cnt_pos + cnt_neg, cnt_pos, cnt_neg)) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. 
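+  # The loop below implements that rule: each span containing the token is
+  # scored as min(left context, right context) + 0.01 * span length (the
+  # small length term only breaks ties), and the token is attributed to the
+  # highest-scoring span. In the example above, 'bought' scores min(4, 0) = 0
+  # in span B and min(1, 3) = 1 in span C, so span C wins.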
+ best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +class FeatureWriter(object): + """Writes InputFeature to TF example file.""" + + def __init__(self, filename, is_training): + self.filename = filename + self.is_training = is_training + self.num_features = 0 + self._writer = tf.python_io.TFRecordWriter(filename) + + def process_feature(self, feature): + """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" + self.num_features += 1 + + def create_int_feature(values): + feature = tf.train.Feature( + int64_list=tf.train.Int64List(value=list(values))) + return feature + + features = collections.OrderedDict() + features["unique_ids"] = create_int_feature([feature.unique_id]) + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["p_mask"] = create_int_feature(feature.p_mask) + + if self.is_training: + features["start_positions"] = create_int_feature([feature.start_position]) + features["end_positions"] = create_int_feature([feature.end_position]) + impossible = 0 + if feature.is_impossible: + impossible = 1 + features["is_impossible"] = create_int_feature([impossible]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + self._writer.write(tf_example.SerializeToString()) + + def close(self): + self._writer.close() + + +def input_fn_builder(input_file, seq_length, is_training, + drop_remainder, use_tpu, bsz, is_v2): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "unique_ids": tf.FixedLenFeature([], tf.int64), + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + } + # p_mask is not required for SQuAD v1.1 + if is_v2: + name_to_features["p_mask"] = tf.FixedLenFeature([seq_length], tf.int64) + + if is_training: + name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64) + name_to_features["end_positions"] = tf.FixedLenFeature([], tf.int64) + name_to_features["is_impossible"] = tf.FixedLenFeature([], tf.int64) + + def _decode_record(record, name_to_features): + """Decodes a 
record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + if use_tpu: + batch_size = params["batch_size"] + else: + batch_size = bsz + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + contrib_data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def create_v1_model(albert_config, is_training, input_ids, input_mask, + segment_ids, use_one_hot_embeddings, use_einsum, + hub_module): + """Creates a classification model.""" + (_, final_hidden) = fine_tuning_utils.create_albert( + albert_config=albert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + use_einsum=use_einsum, + hub_module=hub_module) + + final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + batch_size = final_hidden_shape[0] + seq_length = final_hidden_shape[1] + hidden_size = final_hidden_shape[2] + + output_weights = tf.get_variable( + "cls/squad/output_weights", [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + + final_hidden_matrix = tf.reshape(final_hidden, + [batch_size * seq_length, hidden_size]) + logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + + logits = tf.reshape(logits, [batch_size, seq_length, 2]) + logits = tf.transpose(logits, [2, 0, 1]) + + unstacked_logits = tf.unstack(logits, axis=0) + + (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + + return (start_logits, end_logits) + + +def v1_model_fn_builder(albert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings, use_einsum, hub_module): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + if "unique_ids" in features: + unique_ids = features["unique_ids"] + else: + unique_ids = None + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (start_logits, end_logits) = create_v1_model( + albert_config=albert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + use_einsum=use_einsum, + hub_module=hub_module) + + # Assign names to the logits so that we can refer to them as output tensors. 
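+    # tf.identity leaves the values unchanged; it is used here only to give
+    # the tensors stable names ("start_logits", "end_logits") so they can be
+    # fetched by name, e.g. from an exported inference graph.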
+ start_logits = tf.identity(start_logits, name="start_logits") + end_logits = tf.identity(end_logits, name="end_logits") + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + seq_length = modeling.get_shape_list(input_ids)[1] + + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = features["start_positions"] + end_positions = features["end_positions"] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) + + total_loss = (start_loss + end_loss) / 2.0 + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.PREDICT: + predictions = { + "start_log_prob": start_logits, + "end_log_prob": end_logits, + } + if unique_ids is not None: + predictions["unique_ids"] = unique_ids + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + else: + raise ValueError( + "Only TRAIN and PREDICT modes are supported: %s" % (mode)) + return output_spec + + return model_fn + + +def accumulate_predictions_v1(result_dict, all_examples, all_features, + all_results, n_best_size, max_answer_length): + """accumulate predictions for each positions in a dictionary.""" + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + if example_index not in result_dict: + result_dict[example_index] = {} + features = example_index_to_features[example_index] + + prelim_predictions = [] + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + if feature.unique_id not in result_dict[example_index]: + result_dict[example_index][feature.unique_id] = {} + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_log_prob, n_best_size) + end_indexes = _get_best_indexes(result.end_log_prob, 
n_best_size) + for start_index in start_indexes: + for end_index in end_indexes: + doc_offset = feature.tokens.index("[SEP]") + 1 + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index - doc_offset >= len(feature.tok_start_to_orig_index): + continue + if end_index - doc_offset >= len(feature.tok_end_to_orig_index): + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + start_log_prob = result.start_log_prob[start_index] + end_log_prob = result.end_log_prob[end_index] + start_idx = start_index - doc_offset + end_idx = end_index - doc_offset + if (start_idx, end_idx) not in result_dict[example_index][feature.unique_id]: + result_dict[example_index][feature.unique_id][(start_idx, end_idx)] = [] + result_dict[example_index][feature.unique_id][(start_idx, end_idx)].append((start_log_prob, end_log_prob)) + + +def write_predictions_v1(result_dict, all_examples, all_features, + all_results, n_best_size, max_answer_length, + output_prediction_file, output_nbest_file): + """Write final predictions to the json file and log-odds of null if needed.""" + tf.logging.info("Writing predictions to: %s" % (output_prediction_file)) + tf.logging.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + for ((start_idx, end_idx), logprobs) in \ + result_dict[example_index][feature.unique_id].items(): + start_log_prob = 0 + end_log_prob = 0 + for logprob in logprobs: + start_log_prob += logprob[0] + end_log_prob += logprob[1] + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_idx, + end_index=end_idx, + start_log_prob=start_log_prob / len(logprobs), + end_log_prob=end_log_prob / len(logprobs))) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_log_prob + x.end_log_prob), + reverse=True) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index >= 0: # this is a non-null prediction + tok_start_to_orig_index = feature.tok_start_to_orig_index + tok_end_to_orig_index = feature.tok_end_to_orig_index + start_orig_pos = tok_start_to_orig_index[pred.start_index] + end_orig_pos = tok_end_to_orig_index[pred.end_index] + + paragraph_text = example.paragraph_text + final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + if 
final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_log_prob=pred.start_log_prob, + end_log_prob=pred.end_log_prob)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_log_prob=0.0, end_log_prob=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + all_predictions[example.qas_id] = nbest_json[0]["text"] + all_nbest_json[example.qas_id] = nbest_json + + with tf.gfile.GFile(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with tf.gfile.GFile(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + return all_predictions + + +####### following are from official SQuAD v1.1 evaluation scripts +def normalize_answer_v1(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer_v1(prediction).split() + ground_truth_tokens = normalize_answer_v1(ground_truth).split() + common = ( + collections.Counter(prediction_tokens) + & collections.Counter(ground_truth_tokens)) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return (normalize_answer_v1(prediction) == normalize_answer_v1(ground_truth)) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate_v1(dataset, predictions): + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article["paragraphs"]: + for qa in paragraph["qas"]: + total += 1 + if qa["id"] not in predictions: + message = ("Unanswered question " + six.ensure_str(qa["id"]) + + " will receive score 0.") + print(message, file=sys.stderr) + continue + ground_truths = [x["text"] for x in qa["answers"]] + # ground_truths = list(map(lambda x: x["text"], qa["answers"])) + prediction = predictions[qa["id"]] + exact_match += metric_max_over_ground_truths(exact_match_score, + prediction, 
ground_truths) + f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + + return {"exact_match": exact_match, "f1": f1} + +####### above are from official SQuAD v1.1 evaluation scripts +####### following are from official SQuAD v2.0 evaluation scripts +def make_qid_to_has_ans(dataset): + qid_to_has_ans = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid_to_has_ans[qa['id']] = bool(qa['answers']) + return qid_to_has_ans + +def normalize_answer_v2(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) + def white_space_fix(text): + return ' '.join(text.split()) + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + def lower(text): + return text.lower() + return white_space_fix(remove_articles(remove_punc(lower(s)))) + +def get_tokens(s): + if not s: return [] + return normalize_answer_v2(s).split() + +def compute_exact(a_gold, a_pred): + return int(normalize_answer_v2(a_gold) == normalize_answer_v2(a_pred)) + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + +def get_raw_scores(dataset, preds): + exact_scores = {} + f1_scores = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid = qa['id'] + gold_answers = [a['text'] for a in qa['answers'] + if normalize_answer_v2(a['text'])] + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [''] + if qid not in preds: + print('Missing prediction for %s' % qid) + continue + a_pred = preds[qid] + # Take max over all gold answers + exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) + f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) + return exact_scores, f1_scores + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + +def make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores.values()) / total), + ('f1', 100.0 * sum(f1_scores.values()) / total), + ('total', total), + ]) + else: + total = len(qid_list) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ('total', total), + ]) + + +def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for 
i, qid in enumerate(qid_list): + if qid not in scores: continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + + +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval['best_exact'] = best_exact + main_eval['best_exact_thresh'] = exact_thresh + main_eval['best_f1'] = best_f1 + main_eval['best_f1_thresh'] = f1_thresh + + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval['%s_%s' % (prefix, k)] = new_eval[k] + +####### above are from official SQuAD v2.0 evaluation scripts + +def accumulate_predictions_v2(result_dict, cls_dict, all_examples, + all_features, all_results, n_best_size, + max_answer_length, start_n_top, end_n_top): + """accumulate predictions for each positions in a dictionary.""" + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + if example_index not in result_dict: + result_dict[example_index] = {} + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + + for (feature_index, feature) in enumerate(features): + if feature.unique_id not in result_dict[example_index]: + result_dict[example_index][feature.unique_id] = {} + result = unique_id_to_result[feature.unique_id] + cur_null_score = result.cls_logits + + # if we could have irrelevant answers, get the min score of irrelevant + score_null = min(score_null, cur_null_score) + + doc_offset = feature.tokens.index("[SEP]") + 1 + for i in range(start_n_top): + for j in range(end_n_top): + start_log_prob = result.start_top_log_probs[i] + start_index = result.start_top_index[i] + + j_index = i * end_n_top + j + + end_log_prob = result.end_top_log_probs[j_index] + end_index = result.end_top_index[j_index] + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
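+          # The filters below mirror the v1 path: the span must start inside
+          # the document part of the window (after the first [SEP]), the
+          # start token must have "max context" in this window, the end must
+          # not precede the start, and the span must be at most
+          # max_answer_length tokens long.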
+ if start_index - doc_offset >= len(feature.tok_start_to_orig_index): + continue + if start_index - doc_offset < 0: + continue + if end_index - doc_offset >= len(feature.tok_end_to_orig_index): + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + start_idx = start_index - doc_offset + end_idx = end_index - doc_offset + if (start_idx, end_idx) not in result_dict[example_index][feature.unique_id]: + result_dict[example_index][feature.unique_id][(start_idx, end_idx)] = [] + result_dict[example_index][feature.unique_id][(start_idx, end_idx)].append((start_log_prob, end_log_prob)) + if example_index not in cls_dict: + cls_dict[example_index] = [] + cls_dict[example_index].append(score_null) + + +def write_predictions_v2(result_dict, cls_dict, all_examples, all_features, + all_results, n_best_size, max_answer_length, + output_prediction_file, + output_nbest_file, output_null_log_odds_file, + null_score_diff_threshold): + """Write final predictions to the json file and log-odds of null if needed.""" + tf.logging.info("Writing predictions to: %s" % (output_prediction_file)) + tf.logging.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + # score_null = 1000000 # large and positive + + for (feature_index, feature) in enumerate(features): + for ((start_idx, end_idx), logprobs) in \ + result_dict[example_index][feature.unique_id].items(): + start_log_prob = 0 + end_log_prob = 0 + for logprob in logprobs: + start_log_prob += logprob[0] + end_log_prob += logprob[1] + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_idx, + end_index=end_idx, + start_log_prob=start_log_prob / len(logprobs), + end_log_prob=end_log_prob / len(logprobs))) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_log_prob + x.end_log_prob), + reverse=True) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + tok_start_to_orig_index = feature.tok_start_to_orig_index + tok_end_to_orig_index = feature.tok_end_to_orig_index + start_orig_pos = tok_start_to_orig_index[pred.start_index] + end_orig_pos = tok_end_to_orig_index[pred.end_index] + + paragraph_text = example.paragraph_text + final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_log_prob=pred.start_log_prob, + end_log_prob=pred.end_log_prob)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
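+    # Note the difference from the v1 writer: the fallback here is an empty
+    # string with very low log-probs, so an example with no surviving span is
+    # effectively predicted as unanswerable rather than given placeholder
+    # text.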
+ if not nbest: + nbest.append( + _NbestPrediction( + text="", + start_log_prob=-1e6, + end_log_prob=-1e6)) + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + assert len(nbest_json) >= 1 + assert best_non_null_entry is not None + + score_diff = sum(cls_dict[example_index]) / len(cls_dict[example_index]) + scores_diff_json[example.qas_id] = score_diff + # predict null answers when null threshold is provided + if null_score_diff_threshold is None or score_diff < null_score_diff_threshold: + all_predictions[example.qas_id] = best_non_null_entry.text + else: + all_predictions[example.qas_id] = "" + + all_nbest_json[example.qas_id] = nbest_json + assert len(nbest_json) >= 1 + + with tf.gfile.GFile(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with tf.gfile.GFile(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + with tf.gfile.GFile(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + return all_predictions, scores_diff_json + + +def create_v2_model(albert_config, is_training, input_ids, input_mask, + segment_ids, use_one_hot_embeddings, features, + max_seq_length, start_n_top, end_n_top, dropout_prob, + hub_module): + """Creates a classification model.""" + (_, output) = fine_tuning_utils.create_albert( + albert_config=albert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + use_einsum=True, + hub_module=hub_module) + + bsz = tf.shape(output)[0] + return_dict = {} + output = tf.transpose(output, [1, 0, 2]) + + # invalid position mask such as query and special symbols (PAD, SEP, CLS) + p_mask = tf.cast(features["p_mask"], dtype=tf.float32) + + # logit of the start position + with tf.variable_scope("start_logits"): + start_logits = tf.layers.dense( + output, + 1, + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range)) + start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0]) + start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask + start_log_probs = tf.nn.log_softmax(start_logits_masked, -1) + + # logit of the end position + with tf.variable_scope("end_logits"): + if is_training: + # during training, compute the end logits based on the + # ground truth of the start position + start_positions = tf.reshape(features["start_positions"], [-1]) + start_index = tf.one_hot(start_positions, depth=max_seq_length, axis=-1, + dtype=tf.float32) + start_features = tf.einsum("lbh,bl->bh", output, start_index) + start_features = tf.tile(start_features[None], [max_seq_length, 1, 1]) + end_logits = tf.layers.dense( + tf.concat([output, start_features], axis=-1), + albert_config.hidden_size, + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range), + activation=tf.tanh, + name="dense_0") + end_logits = contrib_layers.layer_norm(end_logits, begin_norm_axis=-1) + + end_logits = tf.layers.dense( + 
end_logits, + 1, + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range), + name="dense_1") + end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0]) + end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask + end_log_probs = tf.nn.log_softmax(end_logits_masked, -1) + else: + # during inference, compute the end logits based on beam search + + start_top_log_probs, start_top_index = tf.nn.top_k( + start_log_probs, k=start_n_top) + start_index = tf.one_hot(start_top_index, + depth=max_seq_length, axis=-1, dtype=tf.float32) + start_features = tf.einsum("lbh,bkl->bkh", output, start_index) + end_input = tf.tile(output[:, :, None], + [1, 1, start_n_top, 1]) + start_features = tf.tile(start_features[None], + [max_seq_length, 1, 1, 1]) + end_input = tf.concat([end_input, start_features], axis=-1) + end_logits = tf.layers.dense( + end_input, + albert_config.hidden_size, + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range), + activation=tf.tanh, + name="dense_0") + end_logits = contrib_layers.layer_norm(end_logits, begin_norm_axis=-1) + end_logits = tf.layers.dense( + end_logits, + 1, + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range), + name="dense_1") + end_logits = tf.reshape(end_logits, [max_seq_length, -1, start_n_top]) + end_logits = tf.transpose(end_logits, [1, 2, 0]) + end_logits_masked = end_logits * ( + 1 - p_mask[:, None]) - 1e30 * p_mask[:, None] + end_log_probs = tf.nn.log_softmax(end_logits_masked, -1) + end_top_log_probs, end_top_index = tf.nn.top_k( + end_log_probs, k=end_n_top) + end_top_log_probs = tf.reshape( + end_top_log_probs, + [-1, start_n_top * end_n_top]) + end_top_index = tf.reshape( + end_top_index, + [-1, start_n_top * end_n_top]) + + if is_training: + return_dict["start_log_probs"] = start_log_probs + return_dict["end_log_probs"] = end_log_probs + else: + return_dict["start_top_log_probs"] = start_top_log_probs + return_dict["start_top_index"] = start_top_index + return_dict["end_top_log_probs"] = end_top_log_probs + return_dict["end_top_index"] = end_top_index + + # an additional layer to predict answerability + with tf.variable_scope("answer_class"): + # get the representation of CLS + cls_index = tf.one_hot(tf.zeros([bsz], dtype=tf.int32), + max_seq_length, + axis=-1, dtype=tf.float32) + cls_feature = tf.einsum("lbh,bl->bh", output, cls_index) + + # get the representation of START + start_p = tf.nn.softmax(start_logits_masked, axis=-1, + name="softmax_start") + start_feature = tf.einsum("lbh,bl->bh", output, start_p) + + # note(zhiliny): no dependency on end_feature so that we can obtain + # one single `cls_logits` for each sample + ans_feature = tf.concat([start_feature, cls_feature], -1) + ans_feature = tf.layers.dense( + ans_feature, + albert_config.hidden_size, + activation=tf.tanh, + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range), + name="dense_0") + ans_feature = tf.layers.dropout(ans_feature, dropout_prob, + training=is_training) + cls_logits = tf.layers.dense( + ans_feature, + 1, + kernel_initializer=modeling.create_initializer( + albert_config.initializer_range), + name="dense_1", + use_bias=False) + cls_logits = tf.squeeze(cls_logits, -1) + + return_dict["cls_logits"] = cls_logits + + return return_dict + + +def v2_model_fn_builder(albert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings, max_seq_length, start_n_top, + end_n_top, dropout_prob, hub_module): 
+ """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + # unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + outputs = create_v2_model( + albert_config=albert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + features=features, + max_seq_length=max_seq_length, + start_n_top=start_n_top, + end_n_top=end_n_top, + dropout_prob=dropout_prob, + hub_module=hub_module) + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + seq_length = modeling.get_shape_list(input_ids)[1] + + def compute_loss(log_probs, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + + loss = - tf.reduce_sum(one_hot_positions * log_probs, axis=-1) + loss = tf.reduce_mean(loss) + return loss + + start_loss = compute_loss( + outputs["start_log_probs"], features["start_positions"]) + end_loss = compute_loss( + outputs["end_log_probs"], features["end_positions"]) + + total_loss = (start_loss + end_loss) * 0.5 + + cls_logits = outputs["cls_logits"] + is_impossible = tf.reshape(features["is_impossible"], [-1]) + regression_loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=tf.cast(is_impossible, dtype=tf.float32), logits=cls_logits) + regression_loss = tf.reduce_mean(regression_loss) + + # note(zhiliny): by default multiply the loss by 0.5 so that the scale is + # comparable to start_loss and end_loss + total_loss += regression_loss * 0.5 + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.PREDICT: + predictions = { + "unique_ids": features["unique_ids"], + "start_top_index": outputs["start_top_index"], + "start_top_log_probs": outputs["start_top_log_probs"], + "end_top_index": outputs["end_top_index"], + "end_top_log_probs": outputs["end_top_log_probs"], + "cls_logits": outputs["cls_logits"] + } + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + else: + raise ValueError( + "Only TRAIN and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def evaluate_v2(result_dict, cls_dict, 
prediction_json, eval_examples, + eval_features, all_results, n_best_size, max_answer_length, + output_prediction_file, output_nbest_file, + output_null_log_odds_file): + null_score_diff_threshold = None + predictions, na_probs = write_predictions_v2( + result_dict, cls_dict, eval_examples, eval_features, + all_results, n_best_size, max_answer_length, + output_prediction_file, output_nbest_file, + output_null_log_odds_file, null_score_diff_threshold) + + na_prob_thresh = 1.0 # default value taken from the eval script + qid_to_has_ans = make_qid_to_has_ans(prediction_json) # maps qid to True/False + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] + exact_raw, f1_raw = get_raw_scores(prediction_json, predictions) + exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, + na_prob_thresh) + f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, + na_prob_thresh) + out_eval = make_eval_dict(exact_thresh, f1_thresh) + find_all_best_thresh(out_eval, predictions, exact_raw, f1_raw, na_probs, qid_to_has_ans) + null_score_diff_threshold = out_eval["best_f1_thresh"] + + predictions, na_probs = write_predictions_v2( + result_dict, cls_dict,eval_examples, eval_features, + all_results, n_best_size, max_answer_length, + output_prediction_file, output_nbest_file, + output_null_log_odds_file, null_score_diff_threshold) + + qid_to_has_ans = make_qid_to_has_ans(prediction_json) # maps qid to True/False + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] + exact_raw, f1_raw = get_raw_scores(prediction_json, predictions) + exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, + na_prob_thresh) + f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, + na_prob_thresh) + out_eval = make_eval_dict(exact_thresh, f1_thresh) + out_eval["null_score_diff_threshold"] = null_score_diff_threshold + return out_eval diff --git a/Indic-BERT-v1-master/albert/tokenization.py b/Indic-BERT-v1-master/albert/tokenization.py new file mode 100644 index 0000000000000000000000000000000000000000..ba5bff10eddce14d1ec62f45e699a26c824f84d5 --- /dev/null +++ b/Indic-BERT-v1-master/albert/tokenization.py @@ -0,0 +1,465 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Lint as: python2, python3 +# coding=utf-8 +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import unicodedata +import six +from six.moves import range +import tensorflow.compat.v1 as tf +import tensorflow_hub as hub +import sentencepiece as spm + +SPIECE_UNDERLINE = u"▁".encode("utf-8") + + +def preprocess_text(inputs, remove_space=True, lower=False): + """preprocess data by removing extra space and normalize data.""" + outputs = inputs + if remove_space: + outputs = " ".join(inputs.strip().split()) + + if six.PY2 and isinstance(outputs, str): + try: + outputs = six.ensure_text(outputs, "utf-8") + except UnicodeDecodeError: + outputs = six.ensure_text(outputs, "latin-1") + + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if lower: + outputs = outputs.lower() + + return outputs + + +def encode_pieces(sp_model, text, return_unicode=True, sample=False): + """turn sentences into word pieces.""" + + if six.PY2 and isinstance(text, six.text_type): + text = six.ensure_binary(text, "utf-8") + + if not sample: + pieces = sp_model.EncodeAsPieces(text) + else: + pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1) + new_pieces = [] + for piece in pieces: + piece = printable_text(piece) + if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit(): + cur_pieces = sp_model.EncodeAsPieces( + six.ensure_binary(piece[:-1]).replace(SPIECE_UNDERLINE, b"")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + # note(zhiliny): convert back to unicode for py2 + if six.PY2 and return_unicode: + ret_pieces = [] + for piece in new_pieces: + if isinstance(piece, str): + piece = six.ensure_text(piece, "utf-8") + ret_pieces.append(piece) + new_pieces = ret_pieces + + return new_pieces + + +def encode_ids(sp_model, text, sample=False): + pieces = encode_pieces(sp_model, text, return_unicode=False, sample=sample) + ids = [sp_model.PieceToId(piece) for piece in pieces] + return ids + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return six.ensure_text(text, "utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return six.ensure_text(text, "utf-8", "ignore") + elif isinstance(text, six.text_type): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
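+  # Unlike convert_to_unicode above, under Python 2 this returns a utf-8
+  # encoded byte string rather than unicode, since that is the form `print`
+  # and `tf.logging` handle there.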
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return six.ensure_text(text, "utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, six.text_type): + return six.ensure_binary(text, "utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with tf.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip().split()[0] if token.strip() else " " + if token not in vocab: + vocab[token] = len(vocab) + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None): + self.vocab = None + self.sp_model = None + if spm_model_file: + self.sp_model = spm.SentencePieceProcessor() + tf.logging.info("loading sentence piece model") + # Handle cases where SP can't load the file, but gfile can. + sp_model_ = tf.gfile.GFile(spm_model_file, "rb").read() + self.sp_model.LoadFromSerializedProto(sp_model_) + # Note(mingdachen): For the purpose of consisent API, we are + # generating a vocabulary for the sentence piece tokenizer. 
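+      # The resulting dict maps every SentencePiece piece to its integer id;
+      # its size equals sp_model.GetPieceSize() and its exact contents depend
+      # on the spm model file that was loaded.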
+ self.vocab = {self.sp_model.IdToPiece(i): i for i + in range(self.sp_model.GetPieceSize())} + else: + self.vocab = load_vocab(vocab_file) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + + @classmethod + def from_scratch(cls, vocab_file, do_lower_case, spm_model_file): + return FullTokenizer(vocab_file, do_lower_case, spm_model_file) + + @classmethod + def from_hub_module(cls, hub_module, use_spm=True): + """Get the vocab file and casing info from the Hub module.""" + with tf.Graph().as_default(): + albert_module = hub.Module(hub_module) + tokenization_info = albert_module(signature="tokenization_info", + as_dict=True) + with tf.Session() as sess: + vocab_file, do_lower_case = sess.run( + [tokenization_info["vocab_file"], + tokenization_info["do_lower_case"]]) + if use_spm: + spm_model_file = vocab_file + vocab_file = None + return FullTokenizer( + vocab_file=vocab_file, do_lower_case=do_lower_case, + spm_model_file=spm_model_file) + + def tokenize(self, text): + if self.sp_model: + split_tokens = encode_pieces(self.sp_model, text, return_unicode=False) + else: + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + if self.sp_model: + tf.logging.info("using sentence piece tokenzier.") + return [self.sp_model.PieceToId( + printable_text(token)) for token in tokens] + else: + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + if self.sp_model: + tf.logging.info("using sentence piece tokenzier.") + return [self.sp_model.IdToPiece(id_) for id_ in ids] + else: + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
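+    # For example, u"ah\u535A\u63A8zz" becomes u"ah \u535A \u63A8 zz" here, so
+    # the whitespace split below yields [u"ah", u"\u535A", u"\u63A8", u"zz"]
+    # (see test_chinese in tokenization_test.py).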
+ text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. 
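+      Words that cannot be decomposed into in-vocabulary pieces are returned
+      as `unk_token`.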
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + six.ensure_str(substr) + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/Indic-BERT-v1-master/albert/tokenization_test.py b/Indic-BERT-v1-master/albert/tokenization_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6dbd626a2bdbb39a5ea02102cd84d648e936b05b --- /dev/null +++ b/Indic-BERT-v1-master/albert/tokenization_test.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Lint as: python2, python3 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import tempfile +from albert import tokenization +import six +import tensorflow.compat.v1 as tf + + +class TokenizationTest(tf.test.TestCase): + + def test_full_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing", "," + ] + with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: + if six.PY2: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + else: + contents = "".join([six.ensure_str(x) + "\n" for x in vocab_tokens]) + vocab_writer.write(six.ensure_binary(contents, "utf-8")) + + vocab_file = vocab_writer.name + + tokenizer = tokenization.FullTokenizer(vocab_file) + os.unlink(vocab_file) + + tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_chinese(self): + tokenizer = tokenization.BasicTokenizer() + + self.assertAllEqual( + tokenizer.tokenize(u"ah\u535A\u63A8zz"), + [u"ah", u"\u535A", u"\u63A8", u"zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=True) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=False) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) + + self.assertAllEqual(tokenizer.tokenize(""), []) + + self.assertAllEqual( + tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_convert_tokens_to_ids(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + + self.assertAllEqual( + tokenization.convert_tokens_to_ids( + vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) + + def test_is_whitespace(self): + self.assertTrue(tokenization._is_whitespace(u" ")) + self.assertTrue(tokenization._is_whitespace(u"\t")) + self.assertTrue(tokenization._is_whitespace(u"\r")) + self.assertTrue(tokenization._is_whitespace(u"\n")) + self.assertTrue(tokenization._is_whitespace(u"\u00A0")) + + self.assertFalse(tokenization._is_whitespace(u"A")) + self.assertFalse(tokenization._is_whitespace(u"-")) + + def test_is_control(self): + self.assertTrue(tokenization._is_control(u"\u0005")) + + self.assertFalse(tokenization._is_control(u"A")) + self.assertFalse(tokenization._is_control(u" ")) + self.assertFalse(tokenization._is_control(u"\t")) + self.assertFalse(tokenization._is_control(u"\r")) + self.assertFalse(tokenization._is_control(u"\U0001F4A9")) + + def test_is_punctuation(self): + self.assertTrue(tokenization._is_punctuation(u"-")) + 
self.assertTrue(tokenization._is_punctuation(u"$")) + self.assertTrue(tokenization._is_punctuation(u"`")) + self.assertTrue(tokenization._is_punctuation(u".")) + + self.assertFalse(tokenization._is_punctuation(u"A")) + self.assertFalse(tokenization._is_punctuation(u" ")) + + +if __name__ == "__main__": + tf.test.main() diff --git a/Indic-BERT-v1-master/albert/train.py b/Indic-BERT-v1-master/albert/train.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Indic-BERT-v1-master/configs/albert_base_config.json b/Indic-BERT-v1-master/configs/albert_base_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f036a4688c914e5b994d18e5dcb788bd03d020a0 --- /dev/null +++ b/Indic-BERT-v1-master/configs/albert_base_config.json @@ -0,0 +1,21 @@ +{ + "model_type": "albert", + "attention_probs_dropout_prob": 0, + "hidden_act": "gelu", + "hidden_dropout_prob": 0, + "embedding_size": 128, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "num_hidden_groups": 1, + "net_structure_type": 0, + "gap_size": 0, + "num_memory_blocks": 0, + "inner_group_num": 1, + "down_scale_factor": 1, + "type_vocab_size": 2, + "vocab_size": 200000 +} diff --git a/Indic-BERT-v1-master/configs/albert_large_config.json b/Indic-BERT-v1-master/configs/albert_large_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3c13c27e5928b57702a0759a28d7bc3e74a4b32a --- /dev/null +++ b/Indic-BERT-v1-master/configs/albert_large_config.json @@ -0,0 +1,21 @@ +{ + "model_type": "albert", + "attention_probs_dropout_prob": 0, + "hidden_act": "gelu", + "hidden_dropout_prob": 0, + "embedding_size": 128, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "num_hidden_groups": 1, + "net_structure_type": 0, + "gap_size": 0, + "num_memory_blocks": 0, + "inner_group_num": 1, + "down_scale_factor": 1, + "type_vocab_size": 2, + "vocab_size": 200000 +} diff --git a/Indic-BERT-v1-master/docs/advanced-usage.md b/Indic-BERT-v1-master/docs/advanced-usage.md new file mode 100644 index 0000000000000000000000000000000000000000..548ab5eab59bd1c30f407a6d8b159a153839c040 --- /dev/null +++ b/Indic-BERT-v1-master/docs/advanced-usage.md @@ -0,0 +1,45 @@ +## Advanced Usage + +Note that the following sections describe how to use the fine-tuning CLI for advanced purposes. To do this on Colab, simply use the arguments mentioned here in the `argvec` list in our [Colab notebook](https://colab.research.google.com/github/ai4bharat/indic-bert/blob/master/notebooks/finetuning.ipynb) + +#### Using any Huggingface Model + +```python +python3 -m fine_tune.cli --model --dataset --lang --iglue_dir --output_dir +``` + +where HF name refers to the Huggingface shortcut name for the model. For the list of all shortcut names, refer the official docs [https://huggingface.co/transformers/pretrained_models.html](https://huggingface.co/transformers/pretrained_models.html) + + + +#### Loading Model from Local File + +All models in the code are loaded through HF transformers library. For any model, you need the following three files: + +* `config.json`: config file in HF format; check config files used by transformers, for example [here](https://github.com/huggingface/transformers/blob/master/src/transformers/configuration_bert.py). 
+* `tok.model`: the tokenizer (spm, wordpiece etc.) model file. +* `pytorch_model.bin`: pytorch binary of the transformer model which stores parameters. + +If you have tensorflow checkpoints instead of pytorch binary, then use the following command to first generate the pytorch binary file: + +```bash +MODEL_DIR=$1 + +# modify model_type and filenames accordingly +transformers-cli convert --model_type albert \ + --tf_checkpoint $MODEL_DIR/tf_model \ + --config $MODEL_DIR/config.json \ + --pytorch_dump_output $MODEL_DIR/pytorch_model.bin +``` + +Finally, run the evaluation using the following command: + +```bash +python3 -m fine_tune.cli --model --tokenizer_name --config_name --dataset --lang --iglue_dir --output_dir +``` + + + +#### Running Cross-lingual Experiments + +_Add later_ \ No newline at end of file diff --git a/Indic-BERT-v1-master/docs/arxiv2020_indicnlp_corpus.pdf b/Indic-BERT-v1-master/docs/arxiv2020_indicnlp_corpus.pdf new file mode 100644 index 0000000000000000000000000000000000000000..77dc52adb9ea70095fb729007e7f8973c842eb94 Binary files /dev/null and b/Indic-BERT-v1-master/docs/arxiv2020_indicnlp_corpus.pdf differ diff --git a/Indic-BERT-v1-master/fine_tune/__init__.py b/Indic-BERT-v1-master/fine_tune/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Indic-BERT-v1-master/fine_tune/cli.py b/Indic-BERT-v1-master/fine_tune/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..cf984c61e413aba28fac626010fd847655e8bd88 --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/cli.py @@ -0,0 +1,196 @@ +import argparse +import os +import sys + +from .modules import get_modules + + +# For every dataset, add an entry: +# [, ] +ALL_DATASETS = { + 'indicnlp-articles': ['text_classification', True], + 'wikiann-ner': ['token_classification', True], + 'wiki-cloze': ['masked_lm', False], + 'wiki-section-titles': ['multiple_choice', True], + 'indicnlp-articles-headlines': ['multiple_choice', True], + 'cvit-mkb': ['xsent_retrieval', False], + 'bbc-articles': ['text_classification', True], + 'iitp-movie-reviews': ['text_classification', True], + 'iitp-product-reviews': ['text_classification', True], + 'soham-articles': ['text_classification', True], + 'inltk-headlines': ['text_classification', True], + 'actsa': ['text_classification', True], + 'midas-discourse': ['text_classification', True], + 'wnli-translated': ['text_classification', True], + 'copa-translated': ['multiple_choice', True], + 'amrita-paraphrase-exact': ['text_classification', True], + 'amrita-paraphrase-fuzzy': ['text_classification', True], +} + + +def add_generic_args(parser, root_dir): + # task-specific args START + parser.add_argument( + '--dataset', + type=str, + required=True, + help='The evaluation dataset to use' + ) + + parser.add_argument( + '--lang', + default=None, + type=str, + required=True, + help='ISO code of test language', + ) + parser.add_argument( + '--train_lang', + default=None, + type=str, + help='ISO code of train language. 
If not specified, it is assumed to be the same as the test langauges', + ) + # task-specific args END + + # model structural parameters START + parser.add_argument( + '--model', + default=None, + type=str, + required=True, + help='Path to pretrained model or model identifier from huggingface.co/models', + ) + + parser.add_argument( + '--config_name', default='', type=str, help='Pretrained config name or path if not the same as model_name' + ) + + parser.add_argument( + '--tokenizer_name', + default='', + type=str, + help='Pretrained tokenizer name or path if not the same as model_name', + ) + + parser.add_argument( + '--max_seq_length', + default=128, + type=int, + help='The maximum total input sequence length after tokenization. Sequences longer ' + 'than this will be truncated, sequences shorter will be padded.', + ) + # model structural parameters END + + # data I/O args START + parser.add_argument( + '--iglue_dir', + default=None, + type=str, + required=True, + help='The input data dir', + ) + + parser.add_argument( + '--overwrite_cache', action='store_true', help='Overwrite the cached training and evaluation sets' + ) + + parser.add_argument( + '--output_dir', + default=None, + type=str, + required=True, + help='The output directory where the model predictions and checkpoints will be written.', + ) + + parser.add_argument( + '--cache_dir', + default=None, + type=str, + help='Where do you want to store the pre-trained models downloaded from s3', + ) + # data I/O args END + + # model training and inference parameters START + parser.add_argument( + '--fp16', + action='store_true', + help='Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit', + ) + + parser.add_argument( + '--fp16_opt_level', + type=str, + default='O1', + help='For fp16: Apex AMP optimization level selected in ["O0", "O1", "O2", and "O3"].' + 'See details at https://nvidia.github.io/apex/amp.html', + ) + + parser.add_argument('--n_gpu', type=int, default=1) + parser.add_argument('--n_tpu_cores', type=int, default=0) + parser.add_argument('--max_grad_norm', default=1.0, type=float, help='Max gradient norm.') + parser.add_argument('--do_train', action='store_true', help='Whether to run training.') + parser.add_argument('--do_predict', action='store_true', help='Whether to run predictions on the test set.') + parser.add_argument( + '--gradient_accumulation_steps', + type=int, + default=1, + help='Number of updates steps to accumulate before performing a backward/update pass.', + ) + + parser.add_argument('--seed', type=int, default=2, help='random seed for initialization') + parser.add_argument('--learning_rate', default=2e-5, type=float, help='The initial learning rate for Adam.') + parser.add_argument('--weight_decay', default=0.0, type=float, help='Weight decay if we apply some.') + parser.add_argument('--adam_epsilon', default=1e-8, type=float, help='Epsilon for Adam optimizer.') + parser.add_argument('--warmup_steps', default=0, type=int, help='Linear warmup over warmup_steps.') + parser.add_argument( + '--num_train_epochs', default=3, type=int, help='Total number of training epochs to perform.' 
+ ) + parser.add_argument('--train_batch_size', default=32, type=int) + parser.add_argument('--eval_batch_size', default=32, type=int) + # model training and inference parameters END + + +def main(argvec=None): + parser = argparse.ArgumentParser() + add_generic_args(parser, os.getcwd()) + for module in get_modules(): + module.add_model_specific_args(parser, os.getcwd()) + args = parser.parse_args(argvec) + hparams = vars(args) + + # high-level command line parameters + dataset = hparams['dataset'] + train_lang = hparams.get('train_lang', hparams['lang']) + test_lang = hparams['lang'] + model = hparams['model'] + iglue_dir = hparams['iglue_dir'] + + data_dir = os.path.join(iglue_dir, dataset) + output_dir = os.path.join(hparams['output_dir'], dataset, + 'train-{}'.format(train_lang), + 'model-{}'.format(model.replace('/', '-'))) + + hparams['model_name_or_path'] = hparams['model'] + hparams['train_lang'] = train_lang + hparams['test_lang'] = test_lang + hparams['data_dir'] = data_dir + hparams['output_dir'] = output_dir + hparams['do_train'] = ALL_DATASETS[dataset][1] + hparams['do_predict'] = True + + if dataset not in ALL_DATASETS: + print('Unrecognized dataset') + sys.exit() + + os.makedirs(output_dir, exist_ok=True) + + module_name = ALL_DATASETS[dataset][0] + module_class = get_modules(module_name) + module = module_class(hparams) + module.run_module() + + return module + + +if __name__ == '__main__': + main() diff --git a/Indic-BERT-v1-master/fine_tune/data/__init__.py b/Indic-BERT-v1-master/fine_tune/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..380aa8056751e6c68b8012be6fb1b75fd98acbf0 --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/data/__init__.py @@ -0,0 +1,27 @@ + +from .processors import * + + +PROCESSORS_TABLE = { + 'indicnlp-articles-headlines': IndicNLPHeadlines, + 'wiki-cloze': WikiCloze, + 'indicnlp-articles': IndicNLPGenre, + 'wikiann-ner': WikiNER, + 'wiki-section-titles': WikiSectionTitles, + 'cvit-mkb': ManKiBaat, + 'actsa': ACTSA, + 'bbc-articles': BBCNews, + 'iitp-movie-reviews': IITPMovies, + 'iitp-product-reviews': IITProducts, + 'inltk-headlines': INLTKHeadlines, + 'soham-articles': SohamArticles, + 'midas-discourse': MidasDiscourse, + 'wnli-translated': WNLI, + 'copa-translated': COPA, + 'amrita-paraphrase-exact': AmritaParaphraseExact, + 'amrita-paraphrase-fuzzy': AmritaParaphraseFuzzy +} + + +def load_dataset(dataset_name, data_dir): + return PROCESSORS_TABLE[dataset_name](data_dir) diff --git a/Indic-BERT-v1-master/fine_tune/data/examples.py b/Indic-BERT-v1-master/fine_tune/data/examples.py new file mode 100644 index 0000000000000000000000000000000000000000..379cf91d73d5e60864fe4636ccbc9dd306c7e449 --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/data/examples.py @@ -0,0 +1,327 @@ +import tqdm +import logging + +from dataclasses import dataclass +from typing import Optional, List, Any, Union +from transformers import PreTrainedTokenizer + + +logger = logging.getLogger(__name__) + + +@dataclass +class TextExample: + """ + A single training/test example for simple sequence classification. + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + guid: str + text_a: str + text_b: Optional[str] = None + label: Optional[str] = None + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(dataclasses.asdict(self), indent=2) + "\n" + + +@dataclass(frozen=True) +class MultipleChoiceExample: + """ + A single training/test example for multiple choice + + Args: + example_id: Unique id for the example. + question: string. The untokenized text of the second sequence + (question). + contexts: list of str. The untokenized text of the first sequence + (context of corresponding question). + endings: list of str. multiple choice's options. Its length must be + equal to contexts' length. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + example_id: str + question: str + contexts: List[str] + endings: List[str] + label: Optional[str] + + +@dataclass +class TokensExample: + """ + A single training/test example for token classification. + + Args: + guid: Unique id for the example. + words: list. The words of the sequence. + labels: (Optional) list. The labels for each word of the sequence. This + should be specified for train and dev examples, but not for test + examples. + """ + guid: str + words: List[str] + labels: Optional[List[str]] + + +@dataclass +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + """ + input_ids: Any + attention_mask: Any + token_type_ids: Any = None + label: Any = None + candidates: Any = None + example_id: str = None + + +def convert_multiple_choice_examples_to_features( + examples: List[MultipleChoiceExample], + tokenizer: PreTrainedTokenizer, + max_length: int, + label_list: List[str], + pad_token_segment_id=0, + pad_on_left=False, + pad_token=0, + mask_padding_with_zero=True, +) -> List[InputFeatures]: + """ + Loads a data file into a list of `InputFeatures` + """ + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + choices_inputs = [] + for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)): + text_a = context + if example.question.find("_") != -1: + # this is for cloze question + text_b = example.question.replace("_", ending) + else: + text_b = example.question + " " + ending + + inputs = tokenizer( + text_a, + text_b, + add_special_tokens=True, + max_length=max_length, + truncation='longest_first', + pad_to_max_length=True, + ) + if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0: + logger.info( + "Attention! you are cropping tokens (swag task is ok). " + "If you are training ARC and RACE and you are poping question + options," + "you need to try to use a bigger max seq length!" 
+ ) + + choices_inputs.append(inputs) + + label = label_map[example.label] + + input_ids = [x["input_ids"] for x in choices_inputs] + attention_mask = ( + [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None + ) + token_type_ids = ( + [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None + ) + + features.append( + InputFeatures( + example_id=example.example_id, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + ) + ) + + for f in features[:2]: + logger.info("*** Example ***") + logger.info("feature: %s" % f) + + return features + + +def convert_tokens_examples_to_features( + examples: List[TokensExample], + label_list: List[str], + max_seq_length: int, + tokenizer: PreTrainedTokenizer, + cls_token_at_end=False, + cls_token='[CLS]', + cls_token_segment_id=1, + sep_token='[SEP]', + sep_token_extra=False, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + pad_token_label_id=-100, + sequence_a_segment_id=0, + mask_padding_with_zero=True, +) -> List[InputFeatures]: + """ Loads a data file into a list of `InputFeatures` + `cls_token_at_end` define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) + """ + # TODO clean up all this to leverage built-in features of tokenizers + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10_000 == 0: + logger.info("Writing example %d of %d", ex_index, len(examples)) + + tokens = [] + label_ids = [] + for word, label in zip(example.words, example.labels): + word_tokens = tokenizer.tokenize(word) + + # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space. + if len(word_tokens) > 0: + tokens.extend(word_tokens) + # Use the real label id for the first token of the word, and padding ids for the remaining tokens + label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) + + # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. + special_tokens_count = tokenizer.num_special_tokens_to_add() + if len(tokens) > max_seq_length - special_tokens_count: + tokens = tokens[: (max_seq_length - special_tokens_count)] + label_ids = label_ids[: (max_seq_length - special_tokens_count)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. 
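+        # Append [SEP] (plus an extra one for RoBERTa-style models) and label the
+        # special tokens with pad_token_label_id (-100 by default), which
+        # cross-entropy losses typically ignore.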
+ tokens += [sep_token] + label_ids += [pad_token_label_id] + if sep_token_extra: + # roberta uses an extra separator b/w pairs of sentences + tokens += [sep_token] + label_ids += [pad_token_label_id] + segment_ids = [sequence_a_segment_id] * len(tokens) + + if cls_token_at_end: + tokens += [cls_token] + label_ids += [pad_token_label_id] + segment_ids += [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + label_ids = [pad_token_label_id] + label_ids + segment_ids = [cls_token_segment_id] + segment_ids + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + padding_length = max_seq_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + label_ids = ([pad_token_label_id] * padding_length) + label_ids + else: + input_ids += [pad_token] * padding_length + input_mask += [0 if mask_padding_with_zero else 1] * padding_length + segment_ids += [pad_token_segment_id] * padding_length + label_ids += [pad_token_label_id] * padding_length + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(label_ids) == max_seq_length + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s", example.guid) + logger.info("tokens: %s", " ".join([str(x) for x in tokens])) + logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) + + if "token_type_ids" not in tokenizer.model_input_names: + segment_ids = None + + features.append( + InputFeatures( + input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label=label_ids + ) + ) + return features + + +def convert_text_examples_to_features( + examples: List[TextExample], + tokenizer: PreTrainedTokenizer, + max_length: Optional[int] = None, + label_list=None, + output_mode=None, +): + if max_length is None: + max_length = tokenizer.model_max_length + + label_map = {label: i for i, label in enumerate(label_list)} + + def label_from_example(example: TextExample) -> Union[int, float, None]: + if example.label is None: + return None + if output_mode == "classification": + return label_map[example.label] + elif output_mode == "regression": + return float(example.label) + raise KeyError(output_mode) + + labels = [label_from_example(example) for example in examples] + + batch_encoding = tokenizer( + [example.text_a if example.text_b is None else (example.text_a, example.text_b) for example in examples], + max_length=max_length, + padding="max_length", + truncation=True, + ) + + features = [] + for i in range(len(examples)): + inputs = {k: batch_encoding[k][i] for k in batch_encoding} + + feature = InputFeatures(**inputs, label=labels[i]) + features.append(feature) + + return features \ No newline at end of file diff --git a/Indic-BERT-v1-master/fine_tune/data/processors.py b/Indic-BERT-v1-master/fine_tune/data/processors.py new file mode 100644 index 
0000000000000000000000000000000000000000..53a52b9437992fbaeafdaea25d88e7243a184c32 --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/data/processors.py @@ -0,0 +1,521 @@ + +import csv +import json +import os + +from .examples import MultipleChoiceExample, TextExample, TokensExample + + +class DataProcessor: + """Base class for data converters for sequence classification data sets.""" + + def __init__(self, data_dir): + self.data_dir = data_dir + + def get_examples(self, lang, mode): + if mode == 'train': + return self.get_train_examples(lang) + elif mode == 'dev': + return self.get_dev_examples(lang) + elif mode == 'test': + return self.get_test_examples(lang) + + def modes(self): + return ['train', 'dev', 'test'] + + def get_train_examples(self, lang): + """Gets a collection of :class:`InputExample` for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, lang): + """Gets a collection of :class:`InputExample` for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, lang): + """Gets a collection of :class:`InputExample` for the test set.""" + raise NotImplementedError() + + def get_labels(self, lang): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def read_csv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, encoding='utf-8') as fp: + return list(csv.reader(fp, delimiter=',')) + + @classmethod + def read_json(cls, input_file): + """Reads a json file file.""" + with open(input_file, encoding='utf-8') as fp: + return json.load(fp) + + @classmethod + def readlines(cls, filepath): + with open(filepath, encoding='utf-8') as fp: + return fp.readlines() + + @classmethod + def read_jsonl(cls, filepath): + with open(filepath, 'r', encoding='utf-8') as fp: + data = fp.readlines() + data = list(map(lambda l: json.loads(l), data)) + return data + + +class IndicNLPHeadlines(DataProcessor): + """Processor for the Headline Predction dataset""" + + def __init__(self, data_dir): + self.data_dir = data_dir + + def get_train_examples(self, lang): + """See base class.""" + fname = '{}/{}-train.json'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_json(fpath), 'train') + + def get_dev_examples(self, lang): + '''See base class.''' + fname = '{}/{}-valid.json'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_json(fpath), 'dev') + + def get_test_examples(self, lang): + '''See base class.''' + fname = '{}/{}-test.json'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_json(fpath), 'test') + + def get_labels(self, lang): + """See base class.""" + return ['A', 'B', 'C', 'D'] + + def _create_examples(self, items, set_type): + """Creates examples for the training and dev sets.""" + examples = [ + MultipleChoiceExample( + example_id=idx, + question='', + contexts=[item['content'], item['content'], item['content'], + item['content']], + endings=[item['optionA'], item['optionB'], item['optionC'], + item['optionD']], + label=item['correctOption'], + ) + for idx, item in enumerate(items) + ] + return examples + + +class WikiCloze(DataProcessor): + """Processor for Wiki Cloze QA dataset""" + + def __init__(self, data_dir): + self.data_dir = data_dir + + def modes(self): + return ['test'] + + def get_test_examples(self, lang): + """See base class.""" + fname = '{}.json'.format(lang, lang) + fpath = 
os.path.join(self.data_dir, fname) + return self._create_examples(self.read_json(fpath)['cloze_data'], 'test') + + def get_labels(self, lang): + """See base class.""" + return list(range(4)) + + def _create_examples(self, items, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, item) in enumerate(items): + if '' in [option.strip() for option in item['options']]: + continue + example = MultipleChoiceExample( + example_id=i, + question=item['question'].replace('', '[MASK]'), + contexts=[], + endings=item['options'], + label=item['options'].index(item['answer']) + ) + examples.append(example) + return examples + + +class IndicNLPGenre(DataProcessor): + """Processor for the Article Genre Classification data set""" + + def __init__(self, data_dir): + self.data_dir = data_dir + + def get_train_examples(self, lang): + """See base class.""" + fname = '{}/{}-train.csv'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_csv(fpath), 'train') + + def get_dev_examples(self, lang): + """See base class.""" + fname = '{}/{}-valid.csv'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_csv(fpath), 'dev') + + def get_test_examples(self, lang): + fname = '{}/{}-test.csv'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_csv(fpath), 'test') + + def get_labels(self, lang): + """See base class.""" + filename = '{}/{}-train.csv'.format(lang, lang) + lines = self.read_csv(os.path.join(self.data_dir, filename)) + labels = map(lambda l: l[0], lines) + labels = list(set(labels)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + example = TextExample( + guid=('%s-%s' % (set_type, i)), + text_a=line[1], + label=line[0] + ) + examples.append(example) + return examples + + +class WikiNER(DataProcessor): + + def __init__(self, data_dir): + self.data_dir = data_dir + + def get_examples(self, lang, mode): + mode = 'valid' if mode == 'dev' else mode + file_path = os.path.join(self.data_dir, lang, f'{mode}.txt') + guid_index = 1 + examples = [] + with open(file_path, encoding='utf-8') as f: + words = [] + labels = [] + for line in f: + if line.startswith('-DOCSTART-') or line == '' or line == '\n': + if words: + example = TokensExample( + guid=f'{mode}-{guid_index}', + words=words, + labels=labels + ) + examples.append(example) + guid_index += 1 + words = [] + labels = [] + else: + splits = line.split(' ') + words.append(splits[0]) + if len(splits) > 1: + labels.append(splits[-1].replace('\n', '')) + else: + # Examples could have no label for mode = 'test' + labels.append('O') + if words: + example = TokensExample( + guid=f'{mode}-{guid_index}', + words=words, + labels=labels + ) + examples.append(example) + return examples + + def get_labels(self, lang): + path = os.path.join(self.data_dir, lang, 'labels.txt') + with open(path, 'r') as f: + labels = f.read().splitlines() + if 'O' not in labels: + labels = ['O'] + labels + return labels + + +class WikiSectionTitles(DataProcessor): + """Processor for the Wikipedia Section Title Prediction dataset""" + + def __init__(self, data_dir): + self.data_dir = data_dir + + def get_train_examples(self, lang): + """See base class.""" + fname = '{}/{}-train.json'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return 
self._create_examples(self.read_json(fpath), 'train') + + def get_dev_examples(self, lang): + """See base class.""" + fname = '{}/{}-valid.json'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_json(fpath), 'dev') + + def get_test_examples(self, lang): + """See base class.""" + fname = '{}/{}-test.json'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_json(fpath), 'test') + + def get_labels(self, lang): + """See base class.""" + return ['titleA', 'titleB', 'titleC', 'titleD'] + + def _create_examples(self, items, set_type): + """Creates examples for the training and dev sets.""" + examples = [ + MultipleChoiceExample( + example_id=idx, + question='', + contexts=[item['sectionText'], item['sectionText'], + item['sectionText'], item['sectionText']], + endings=[item['titleA'], item['titleB'], item['titleC'], + item['titleD']], + label=item['correctTitle'], + ) + for idx, item in enumerate(items) + ] + return examples + + +class ManKiBaat(DataProcessor): + """Processor for Man ki Baat dataset""" + + def __init__(self, data_dir): + self.data_dir = data_dir + + def modes(self): + return ['en', 'in'] + + def get_examples(self, lang, mode): + if mode == 'en': + return self.get_examples_en(lang) + elif mode == 'in': + return self.get_examples_in(lang) + + def get_examples_en(self, lang): + """Get examples of English language""" + fname = 'en-{}/mkb.en'.format(lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.readlines(fpath), 'en') + + def get_examples_in(self, lang): + """Get examples of the Indian language""" + fname = 'en-{}/mkb.{}'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.readlines(fpath), 'in') + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + example = TextExample( + guid=('%s-%s' % (set_type, i)), + text_a=line, + label=i + ) + examples.append(example) + return examples + + def get_labels(self, lang): + # return dummy value greater than number of examples + return list(range(10000)) + + +class ACTSA(IndicNLPGenre): + pass + + +class BBCNews(IndicNLPGenre): + + def get_dev_examples(self, lang): + """See base class.""" + fname = '{}/{}-test.csv'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_csv(fpath), 'dev') + + +class INLTKHeadlines(IndicNLPGenre): + pass + + +class SohamArticles(IndicNLPGenre): + pass + + +class IITPMovies(IndicNLPGenre): + pass + + +class IITProducts(IndicNLPGenre): + pass + + +class AmritaParaphraseExact(IndicNLPGenre): + + def get_dev_examples(self, lang): + """See base class.""" + fname = '{}/{}-test.csv'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_csv(fpath), 'dev') + + def get_labels(self, lang): + """See base class.""" + filename = '{}/{}-train.csv'.format(lang, lang) + lines = self.read_csv(os.path.join(self.data_dir, filename)) + labels = map(lambda l: l[2], lines) + labels = list(set(labels)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + example = TextExample( + guid=('%s-%s' % (set_type, i)), + text_a=line[0], + text_b=line[1], + label=line[2] + ) + examples.append(example) + return examples + + +class 
AmritaParaphraseFuzzy(AmritaParaphraseExact): + pass + + +class MidasDiscourse(DataProcessor): + """Processor for the Article Genre Classification data set""" + + def __init__(self, data_dir): + self.data_dir = data_dir + + def get_train_examples(self, lang): + """See base class.""" + fname = '{}/train.json'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_json(fpath), 'train') + + def get_dev_examples(self, lang): + """See base class.""" + fname = '{}/val.json'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_json(fpath), 'dev') + + def get_test_examples(self, lang): + fname = '{}/test.json'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_json(fpath), 'test') + + def get_labels(self, lang): + """See base class.""" + filename = '{}/train.json'.format(lang, lang) + lines = self.read_json(os.path.join(self.data_dir, filename)) + labels = map(lambda l: l['Discourse Mode'], lines) + labels = list(set(labels)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + example = TextExample( + guid=('%s-%s' % (set_type, i)), + text_a=line['Sentence'], + label=line['Discourse Mode'] + ) + examples.append(example) + return examples + + +class WNLI(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def __init__(self, data_dir): + self.data_dir = data_dir + + def get_train_examples(self, lang): + """See base class.""" + fname = '{}/train.csv'.format(lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_csv(fpath), 'train') + + def get_dev_examples(self, lang): + """See base class.""" + fname = '{}/dev.csv'.format(lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_csv(fpath), 'dev') + + def get_test_examples(self, lang): + """See base class.""" + fname = '{}/dev.csv'.format(lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_csv(fpath), 'test') + + def get_labels(self, lang): + """See base class.""" + return ['0', '1'] + + def _create_examples(self, lines, set_type): + """Creates examples for the training, dev and test sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(TextExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class COPA(DataProcessor): + """Processor for the Wikipedia Section Title Prediction dataset""" + + def __init__(self, data_dir): + self.data_dir = data_dir + + def get_train_examples(self, lang): + """See base class.""" + fname = '{}/train.jsonl'.format(lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_jsonl(fpath), 'train') + + def get_dev_examples(self, lang): + """See base class.""" + fname = '{}/val.jsonl'.format(lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_jsonl(fpath), 'dev') + + def get_test_examples(self, lang): + """See base class.""" + fname = '{}/val.jsonl'.format(lang, lang) + fpath = os.path.join(self.data_dir, fname) + return self._create_examples(self.read_jsonl(fpath), 'test') + + def get_labels(self, lang): + """See base class.""" + return [0, 1] + + def 
_create_examples(self, items, set_type): + """Creates examples for the training and dev sets.""" + examples = [ + MultipleChoiceExample( + example_id=idx, + question='', + contexts=[item['premise'], item['premise']], + endings=[item['choice1'], item['choice2']], + label=item['label'], + ) + for idx, item in enumerate(items) + ] + return examples diff --git a/Indic-BERT-v1-master/fine_tune/modules/__init__.py b/Indic-BERT-v1-master/fine_tune/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e330f491ed6fb36e37f6185d25c6d5ad2e2d01 --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/modules/__init__.py @@ -0,0 +1,22 @@ + + +from .masked_lm import MaskedLM +from .multiple_choice import MultipleChoice +from .text_classification import TextClassification +from .token_classification import TokenClassification +from .xsent_retrieval import XSentRetrieval + + +modules = { + 'masked_lm': MaskedLM, + 'multiple_choice': MultipleChoice, + 'text_classification': TextClassification, + 'token_classification': TokenClassification, + 'xsent_retrieval': XSentRetrieval +} + + +def get_modules(name=None): + if name: + return modules[name] + return modules.values() diff --git a/Indic-BERT-v1-master/fine_tune/modules/base.py b/Indic-BERT-v1-master/fine_tune/modules/base.py new file mode 100644 index 0000000000000000000000000000000000000000..4607c2ba04742a7f60eaf30632760993676b5668 --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/modules/base.py @@ -0,0 +1,397 @@ + +import argparse +import logging +import os +import glob +import random +import copy +import numpy as np +import pytorch_lightning as pl +import torch +import torch.nn as nn + +from torch.nn import CrossEntropyLoss, MSELoss +from torch.utils.data import DataLoader, TensorDataset +from transformers import ( + AdamW, + AutoConfig, + AutoModel, + AutoModelForPreTraining, + AutoModelForQuestionAnswering, + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoModelWithLMHead, + AutoModelForMultipleChoice, + AutoTokenizer, + get_linear_schedule_with_warmup, +) + +from ..data import load_dataset +from ..data.examples import * + + +logger = logging.getLogger(__name__) + + +MODEL_MODES = { + 'base': AutoModel, + 'sequence-classification': AutoModelForSequenceClassification, + 'question-answering': AutoModelForQuestionAnswering, + 'pretraining': AutoModelForPreTraining, + 'token-classification': AutoModelForTokenClassification, + 'language-modeling': AutoModelWithLMHead, + 'multiple-choice': AutoModelForMultipleChoice, +} + + +def get_model_class(model_type, mode): + return MODEL_MODES[mode] + + +def set_seed(hparams): + random.seed(hparams['seed']) + np.random.seed(hparams['seed']) + torch.manual_seed(hparams['seed']) + if hparams['n_gpu'] > 0: + torch.cuda.manual_seed_all(hparams['seed']) + + +class BaseModule(pl.LightningModule): + """ + The base module has 4 components: config, tokenizer, transformer model, + and dataset + + Loading of a dataset: + 1. Load instances of a dataset in the form of `Examples` + 2. Convert all examples into features - may require tokenizer + 3. 
Create a tensor dataset and loader given all the converted features + + """ + + def __init__(self, hparams): + super().__init__() + + hparams['mode'] = self.mode + hparams['output_mode'] = self.output_mode + hparams['example_type'] = self.example_type + hparams['dev_lang'] = hparams['train_lang'] + self.hparams = hparams # must come after super + self.dataset = load_dataset(hparams['dataset'], hparams['data_dir']) + if self.output_mode == 'classification': + self.labels = self.dataset.get_labels(hparams['train_lang']) + + # setup config object + config_name = hparams['config_name'] or hparams['model_name_or_path'] + args = {} + if self.output_mode == 'classification': + hparams['num_labels'] = len(self.dataset.get_labels(hparams['train_lang'])) + args = {'num_labels': hparams['num_labels']} + + self.config = AutoConfig.from_pretrained( + config_name, + **args, + cache_dir=hparams['cache_dir'] + ) + + # setup tokenizer object + tok_name = hparams['tokenizer_name'] or hparams['model_name_or_path'] + self.tokenizer = AutoTokenizer.from_pretrained( + tok_name, + config=self.config, + cache_dir=hparams['cache_dir'], + ) + + # setup transformer model + model_class = get_model_class(self.config.model_type, hparams['mode']) + self.model = model_class.from_pretrained( + hparams['model_name_or_path'], + config=self.config, + cache_dir=hparams['cache_dir'], + ) + + def forward(self, **inputs): + return self.model(**inputs) + + def prepare_data(self): + """Cache feature files on disk for every mode at the onset""" + modes = self.dataset.modes() + for mode in modes: + cached_features_file = self._feature_file(mode) + if not os.path.exists(cached_features_file)\ + or self.hparams['overwrite_cache']: + self.load_features(mode) + + def load_features(self, mode): + """Load examples and convert them into features""" + if mode in ('train', 'dev', 'test'): + lang = self.hparams['{}_lang'.format(mode)] + else: + lang = self.hparams['test_lang'] + examples = self.dataset.get_examples(lang, mode) + + cached_features_file = self._feature_file(mode) + if os.path.exists(cached_features_file)\ + and not self.hparams['overwrite_cache']: + features = torch.load(cached_features_file) + else: + features = self.convert_examples_to_features(examples) + torch.save(features, cached_features_file) + + return features + + def convert_examples_to_features(self, examples): + if self.hparams['example_type'] == 'multiple-choice': + features = convert_multiple_choice_examples_to_features( + examples, + self.tokenizer, + max_length=self.hparams['max_seq_length'], + label_list=self.labels + ) + elif self.hparams['example_type'] == 'text': + features = convert_text_examples_to_features( + examples, + self.tokenizer, + max_length=self.hparams['max_seq_length'], + label_list=self.labels, + output_mode=self.output_mode, + ) + elif self.hparams['example_type'] == 'tokens': + features = convert_tokens_examples_to_features( + examples, + self.labels, + self.hparams['max_seq_length'], + self.tokenizer, + cls_token_at_end=bool(self.config.model_type in ["xlnet"]), + cls_token=self.tokenizer.cls_token, + cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0, + sep_token=self.tokenizer.sep_token, + sep_token_extra=bool(self.config.model_type in ["roberta"]), + pad_on_left=bool(self.config.model_type in ["xlnet"]), + pad_token=self.tokenizer.pad_token_id, + pad_token_segment_id=self.tokenizer.pad_token_type_id, + pad_token_label_id=self.pad_token_label_id, + ) + return features + + def make_loader(self, features, batch_size): + 
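+        """Wrap the converted features into tensors and return a DataLoader over a TensorDataset."""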
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids or 0 for f in features], dtype=torch.long) + # all_candidates = torch.tensor([f.candidates for f in features], dtype=torch.long) + if self.hparams['output_mode'] == 'classification': + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif self.hparams['output_mode'] == 'regression': + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + + return DataLoader( + TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels), + # TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_candidates), + batch_size=batch_size, + ) + + def train_dataloader(self): + train_batch_size = self.hparams['train_batch_size'] + train_features = self.load_features('train') + dataloader = self.make_loader(train_features, train_batch_size) + + t_total = ( + (len(dataloader.dataset) // (train_batch_size * max(1, self.hparams['n_gpu']))) + // self.hparams['gradient_accumulation_steps'] + * float(self.hparams['num_train_epochs']) + ) + scheduler = get_linear_schedule_with_warmup( + self.opt, num_warmup_steps=self.hparams['warmup_steps'], num_training_steps=t_total + ) + self.lr_scheduler = scheduler + return dataloader + + def val_dataloader(self): + dev_features = self.load_features('dev') + dataloader = self.make_loader(dev_features, self.hparams['eval_batch_size']) + return dataloader + + def test_dataloader(self): + test_features = self.load_features('test') + dataloader = self.make_loader(test_features, self.hparams['eval_batch_size']) + return dataloader + + def training_step(self, batch, batch_idx): + inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]} + if self.config.model_type != 'distilbert': + inputs['token_type_ids'] = ( + batch[2] if self.config.model_type in ['bert', 'xlnet', 'albert'] else None + ) # XLM and RoBERTa don't use token_type_ids + + outputs = self(**inputs) + loss = outputs[0] + + tensorboard_logs = {'loss': loss, 'rate': self.lr_scheduler.get_last_lr()[-1]} + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_nb): + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'labels': batch[3]} + + # XLM and RoBERTa don't use token_type_ids + inputs['token_type_ids'] = None + if self.config.model_type in ['bert', 'xlnet', 'albert']: + inputs['token_type_ids'] = batch[2] + + outputs = self(**inputs) + tmp_eval_loss, logits = outputs[:2] + preds = logits.detach().cpu().numpy() + out_label_ids = inputs['labels'].detach().cpu().numpy() + + return {'val_loss': tmp_eval_loss.detach().cpu(), + 'pred': preds, + 'target': out_label_ids} + + def test_step(self, batch, batch_nb): + return self.validation_step(batch, batch_nb) + + def _feature_file(self, mode): + if mode in ('train', 'dev', 'test'): + lang = self.hparams['{}_lang'.format(mode)] + else: + lang = self.hparams['test_lang'] + return os.path.join( + self.hparams['data_dir'], + 'cached_{}_{}_{}_{}'.format( + lang, + mode, + list(filter(None, self.hparams['model_name_or_path'].split('/'))).pop(), + str(self.hparams['max_seq_length']), + ), + ) + + def is_logger(self): + return self.trainer.global_rank <= 0 + + def configure_optimizers(self): + """Prepare optimizer and schedule (linear warmup and decay)""" + + model = self.model + no_decay = ['bias', 
'LayerNorm.weight'] + optimizer_grouped_parameters = [ + { + 'params': [p for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay)], + 'weight_decay': self.hparams['weight_decay'], + }, + { + 'params': [p for n, p in model.named_parameters() + if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, + lr=self.hparams['learning_rate'], + eps=self.hparams['adam_epsilon']) + self.opt = optimizer + return [optimizer] + + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, + second_order_closure=None): + if self.trainer.use_tpu: + import torch_xla.core.xla_model as xm + xm.optimizer_step(optimizer) + else: + optimizer.step() + optimizer.zero_grad() + self.lr_scheduler.step() + + def get_tqdm_dict(self): + avg_loss = getattr(self.trainer, 'avg_loss', 0.0) + tqdm_dict = {'loss': '{:.3f}'.format(avg_loss), 'lr': self.lr_scheduler.get_last_lr()[-1]} + return tqdm_dict + + def run_module(self): + trainer = create_trainer(self, self.hparams) + hparams_copy = copy.deepcopy(self.hparams) + + if self.hparams['do_train']: + checkpoints = list(sorted(glob.glob(os.path.join(self.hparams['output_dir'], 'checkpointepoch=*.ckpt'), recursive=True))) + if len(checkpoints) == 0: + trainer.fit(self) + checkpoints = list(sorted(glob.glob(os.path.join(self.hparams['output_dir'], 'checkpointepoch=*.ckpt'), recursive=True))) + self.trained_model = self.load_from_checkpoint(checkpoints[-1]) + self.trained_model.hparams = hparams_copy + + # Optionally, predict on dev set and write to output_dir + if self.hparams['do_predict']: + trainer.test(self.trained_model) + + +# Fixes __temp_weight_ddp_end.ckpt bug +# See https://github.com/PyTorchLightning/pytorch-lightning/issues/1142 +class MonkeyPatchedTrainer(pl.Trainer): + def load_spawn_weights(self, original_model): + pass + + +pl.Trainer = MonkeyPatchedTrainer + + +class LoggingCallback(pl.Callback): + def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): + logger.info("***** Validation results *****") + if pl_module.is_logger(): + metrics = trainer.callback_metrics + # Log results + for key in sorted(metrics): + if key not in ["log", "progress_bar"]: + logger.info("{} = {}\n".format(key, str(metrics[key]))) + + def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): + logger.info("***** Test results *****") + print(trainer.callback_metrics) + + if pl_module.is_logger(): + metrics = trainer.callback_metrics + + # Log and save results to file + output_dir = pl_module.hparams['output_dir'] + test_lang = pl_module.hparams['test_lang'] + output_test_results_file = os.path.join(output_dir, 'test_results_{}.txt'.format(test_lang)) + with open(output_test_results_file, "w") as writer: + for key in sorted(metrics): + if key not in ["log", "progress_bar"]: + logger.info("{} = {}\n".format(key, str(metrics[key]))) + writer.write("{} = {}\n".format(key, str(metrics[key]))) + + +def create_trainer(model, hparams): + # init model + set_seed(hparams) + + # if os.path.exists(hparams['output_dir']) and os.listdir(hparams['output_dir']) and hparams['do_train']: + # raise ValueError('Output directory ({}) already exists and is not empty.'.format(hparams['output_dir'])) + + checkpoint_callback = pl.callbacks.ModelCheckpoint( + filepath=hparams['output_dir'], prefix='checkpoint', monitor='val_loss', mode='min', save_top_k=5 + ) + + train_params = dict( + accumulate_grad_batches=hparams['gradient_accumulation_steps'], + gpus=hparams['n_gpu'], + 
max_epochs=hparams['num_train_epochs'], + early_stop_callback=False, + gradient_clip_val=hparams['max_grad_norm'], + checkpoint_callback=checkpoint_callback, + callbacks=[LoggingCallback()], + ) + + if hparams['fp16']: + train_params['use_amp'] = hparams['fp16'] + train_params['amp_level'] = hparams['fp16_opt_level'] + + if hparams['n_tpu_cores'] > 0: + train_params['tpu_cores'] = hparams['n_tpu_cores'] + train_params['gpus'] = 0 + + if hparams['n_gpu'] > 1: + train_params['distributed_backend'] = 'ddp' + + trainer = pl.Trainer(**train_params) + return trainer diff --git a/Indic-BERT-v1-master/fine_tune/modules/masked_lm.py b/Indic-BERT-v1-master/fine_tune/modules/masked_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..9234bbde4a965e1b0078fd82f253d475e6426bf8 --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/modules/masked_lm.py @@ -0,0 +1,155 @@ +""" +Based on https://github.com/huggingface/transformers/issues/80 + +""" + +import json +import argparse +import glob +import sys +import logging +import os +import time +import string +from filelock import FileLock + +import numpy as np +import pickle +import torch +from torch.utils.data import DataLoader, TensorDataset + +from .base import BaseModule, create_trainer +from ..data.examples import InputFeatures +from collections import ChainMap +from torch.utils.data import DataLoader, TensorDataset + + +logger = logging.getLogger(__name__) + + +class MaskedLM(BaseModule): + + mode = 'language-modeling' + output_mode = 'classification' + example_type = 'multiple-choice' + + def __init__(self, hparams): + super().__init__(hparams) + + self.mask_id = self.tokenizer.convert_tokens_to_ids('[MASK]') + self.test_results_fpath = 'test_results' + if os.path.exists(self.test_results_fpath): + os.remove(self.test_results_fpath) + + def convert_examples_to_features(self, examples): + + batch_encoding = self.tokenizer( + [example.question for example in examples], + max_length=self.hparams['max_seq_length'], + padding='max_length', + truncation=True, + ) + + features = [] + for i in range(len(examples)): + inputs = {k: batch_encoding[k][i] for k in batch_encoding} + candidates = examples[i].endings + tokens = [self.tokenizer.tokenize(cand) for cand in candidates] + token_candidates = [] + + for toks in tokens: + if len(toks) == 0: + token_candidates.append(self.tokenizer.unk_token) + else: + token_candidates.append(max(toks, key=lambda t: len(t.strip(string.punctuation)))) + candidate_ids = self.tokenizer.convert_tokens_to_ids(token_candidates) + + feature = InputFeatures(**inputs, candidates=candidate_ids, label=examples[i].label) + features.append(feature) + + return features + + def test_dataloader(self): + mode = 'test' + cached_features_file = self._feature_file(mode) + if os.path.exists(cached_features_file) and not self.hparams['overwrite_cache']: + features = torch.load(cached_features_file) + else: + features = self.load_features(mode) + torch.save(features, cached_features_file) + + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids or 0 for f in features], dtype=torch.long) + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + all_cands = torch.tensor([f.candidates for f in features], dtype=torch.long) + all_answers = torch.tensor([f.label for f in features], dtype=torch.long) + + return DataLoader( + 
TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_cands, all_answers), + batch_size=self.hparams['eval_batch_size'], + ) + + def test_step(self, batch, batch_idx): + inputs = {'input_ids': batch[0], 'token_type_ids': batch[2], + 'attention_mask': batch[1]} + + answers = batch[3].detach().cpu().numpy() + candidates = batch[4].detach().cpu().numpy() + + # get first mask location + input_ids = batch[0].detach().cpu().numpy() + mask_ids = (input_ids == self.mask_id).argmax(axis=1) + mask_ids = torch.from_numpy(mask_ids) + + predictions = self(**inputs)[0] + + i = torch.arange(0, predictions.shape[0], dtype=torch.int64) + predictions = predictions[i, mask_ids] + predictions = predictions.detach().cpu().numpy() + + right, wrong = 0, 0 + + for i, pred in enumerate(predictions): + prob = pred[candidates[i]] + pred_answer = int(np.argmax(prob)) + if answers[i] == pred_answer: + right += 1 + else: + wrong += 1 + + return {"right": right, "wrong": wrong} + + def test_epoch_end(self, outputs): + right = sum(output['right'] for output in outputs) + wrong = sum(output['wrong'] for output in outputs) + merged = {'right': right, 'wrong': wrong} + + with FileLock(self.test_results_fpath + '.lock'): + if os.path.exists(self.test_results_fpath): + with open(self.test_results_fpath, 'rb') as fp: + data = pickle.load(fp) + data = {'right': data['right'] + merged['right'], 'wrong': data['wrong'] + merged['wrong']} + else: + data = merged + with open(self.test_results_fpath, 'wb') as fp: + pickle.dump(data, fp) + + return data + + @staticmethod + def add_model_specific_args(parser, root_dir): + return parser + + def run_module(self): + self.eval() + self.freeze() + torch.no_grad() + + trainer = create_trainer(self, self.hparams) + + trainer.test(self) + preds = pickle.load(open(self.test_results_fpath, 'rb')) + correct, wrong = preds['right'], preds['wrong'] + with open(os.path.join(self.hparams['output_dir'], 'test_results.txt'), 'w') as fp: + json.dump({'test_acc': correct/(correct + wrong)}, fp) diff --git a/Indic-BERT-v1-master/fine_tune/modules/multiple_choice.py b/Indic-BERT-v1-master/fine_tune/modules/multiple_choice.py new file mode 100644 index 0000000000000000000000000000000000000000..ccf2fe057b33e4cb0e9d85c6a4fd35ef8371c7df --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/modules/multiple_choice.py @@ -0,0 +1,51 @@ +import torch +import numpy as np + +from .base import BaseModule +from .utils import mean_accuracy + + +class MultipleChoice(BaseModule): + + mode = 'multiple-choice' + output_mode = 'classification' + example_type = 'multiple-choice' + + def __init__(self, hparams): + super().__init__(hparams) + + def _eval_end(self, outputs): + val_loss_mean = torch.stack([x['val_loss'] for x in outputs])\ + .mean().detach().cpu().item() + preds = np.concatenate([x['pred'] for x in outputs], axis=0) + preds = np.argmax(preds, axis=1) + + out_label_ids = np.concatenate([x['target'] for x in outputs], axis=0) + out_label_list = [[] for _ in range(out_label_ids.shape[0])] + preds_list = [[] for _ in range(out_label_ids.shape[0])] + + results = {**{'val_loss': val_loss_mean}, + **mean_accuracy(preds, out_label_ids)} + + ret = {k: v for k, v in results.items()} + ret['log'] = results + return ret, preds_list, out_label_list + + def validation_epoch_end(self, outputs: list) -> dict: + ret, preds, targets = self._eval_end(outputs) + logs = ret['log'] + return {'val_loss': logs['val_loss'], 'log': logs, 'progress_bar': logs} + + def test_epoch_end(self, outputs): + ret, 
predictions, targets = self._eval_end(outputs) + + # Converting to the dic required by pl + logs = ret['log'] + # `val_loss` is the key returned by `self._eval_end()` + # but actually refers to `test_loss` + return {'avg_test_loss': logs['val_loss'], + 'log': logs, 'progress_bar': logs} + + @staticmethod + def add_model_specific_args(parser, root_dir): + return parser \ No newline at end of file diff --git a/Indic-BERT-v1-master/fine_tune/modules/question_answering.py b/Indic-BERT-v1-master/fine_tune/modules/question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Indic-BERT-v1-master/fine_tune/modules/text_classification.py b/Indic-BERT-v1-master/fine_tune/modules/text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..856c2dfd86879d4ecd8c3b444f667aa0eaf54ecd --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/modules/text_classification.py @@ -0,0 +1,70 @@ +""" +Code inspired from the Huggingface's transformer library: +File path: transformers/examples/text-classification/run_pl_glue.py + +To handle large documents, we use head-truncation. Check the following +paper for a detailed analysis of text classification techniques using +bert-like models: https://arxiv.org/pdf/1905.05583.pdf +""" + +import argparse +import glob +import logging +import os +import time + +import numpy as np +import torch +from torch.utils.data import DataLoader, TensorDataset + +from .base import BaseModule, create_trainer +from .utils import mean_accuracy + + +logger = logging.getLogger(__name__) + + +class TextClassification(BaseModule): + + mode = 'sequence-classification' + output_mode = 'classification' + example_type = 'text' + + def __init__(self, hparams): + super().__init__(hparams) + + def _eval_end(self, outputs): + val_loss_mean = torch.stack([x['val_loss'] for x in outputs])\ + .mean().detach().cpu().item() + preds = np.concatenate([x['pred'] for x in outputs], axis=0) + preds = np.argmax(preds, axis=1) + + out_label_ids = np.concatenate([x['target'] for x in outputs], axis=0) + out_label_list = [[] for _ in range(out_label_ids.shape[0])] + preds_list = [[] for _ in range(out_label_ids.shape[0])] + + results = {**{'val_loss': val_loss_mean}, + **mean_accuracy(preds, out_label_ids)} + + ret = {k: v for k, v in results.items()} + ret['log'] = results + return ret, preds_list, out_label_list + + def validation_epoch_end(self, outputs: list) -> dict: + ret, preds, targets = self._eval_end(outputs) + logs = ret['log'] + return {'val_loss': logs['val_loss'], 'log': logs, 'progress_bar': logs} + + def test_epoch_end(self, outputs): + ret, predictions, targets = self._eval_end(outputs) + + # Converting to the dic required by pl + logs = ret['log'] + # `val_loss` is the key returned by `self._eval_end()` + # but actually refers to `test_loss` + return {'avg_test_loss': logs['val_loss'], + 'log': logs, 'progress_bar': logs} + + @staticmethod + def add_model_specific_args(parser, root_dir): + return parser diff --git a/Indic-BERT-v1-master/fine_tune/modules/token_classification.py b/Indic-BERT-v1-master/fine_tune/modules/token_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..62f37de6f2423ee9df7c7649e8966834a5d82799 --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/modules/token_classification.py @@ -0,0 +1,87 @@ +import argparse +import glob +import logging +import os +import subprocess + +import numpy as np +import torch +from seqeval.metrics import 
f1_score, precision_score, recall_score +from torch.nn import CrossEntropyLoss + +from .base import BaseModule + + +logger = logging.getLogger(__name__) + + +class TokenClassification(BaseModule): + + mode = 'token-classification' + output_mode = 'classification' + example_type = 'tokens' + + def __init__(self, hyparams): + self.pad_token_label_id = CrossEntropyLoss().ignore_index + + script_path = os.path.join(os.path.dirname(__file__), '../..', 'scripts/ner_preprocess.sh') + cmd = f"bash {script_path} {hyparams['data_dir']} {hyparams['train_lang']} "\ + f"{hyparams['test_lang']} {hyparams['model_name_or_path']} {hyparams['max_seq_length']}" + subprocess.call(cmd, shell=True) + + super().__init__(hyparams) + + def _eval_end(self, outputs): + """Evaluation called for both Val and Test""" + val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() + preds = np.concatenate([x['pred'] for x in outputs], axis=0) + preds = np.argmax(preds, axis=2) + out_label_ids = np.concatenate([x['target'] for x in outputs], axis=0) + + label_map = {i: label for i, label in enumerate(self.labels)} + out_label_list = [[] for _ in range(out_label_ids.shape[0])] + preds_list = [[] for _ in range(out_label_ids.shape[0])] + + for i in range(out_label_ids.shape[0]): + for j in range(out_label_ids.shape[1]): + if out_label_ids[i, j] != self.pad_token_label_id: + out_label_list[i].append(label_map[out_label_ids[i][j]]) + preds_list[i].append(label_map[preds[i][j]]) + + results = { + 'val_loss': val_loss_mean, + 'precision': precision_score(out_label_list, preds_list), + 'recall': recall_score(out_label_list, preds_list), + 'f1': f1_score(out_label_list, preds_list), + } + + ret = {k: v for k, v in results.items()} + ret['log'] = results + return ret, preds_list, out_label_list + + def validation_epoch_end(self, outputs): + # when stable + ret, preds, targets = self._eval_end(outputs) + logs = ret['log'] + return {'val_loss': logs['val_loss'], 'log': logs, 'progress_bar': logs} + + def test_epoch_end(self, outputs): + # updating to test_epoch_end instead of deprecated test_end + ret, predictions, targets = self._eval_end(outputs) + + # Converting to the dict required by pl + # https://github.com/PyTorchLightning/pytorch-lightning/blob/master/\ + # pytorch_lightning/trainer/logging.py#L139 + logs = ret['log'] + # `val_loss` is the key returned by `self._eval_end()` but actually refers to `test_loss` + return {'avg_test_loss': logs['val_loss'], 'log': logs, 'progress_bar': logs} + + @staticmethod + def add_model_specific_args(parser, root_dir): + parser.add_argument( + '--labels', + default='', + type=str, + help='Path to a file containing all labels. 
If not specified, CoNLL-2003 labels are used.', + ) + return parser \ No newline at end of file diff --git a/Indic-BERT-v1-master/fine_tune/modules/utils.py b/Indic-BERT-v1-master/fine_tune/modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba8e210733ee420c1da0717e2f3720350ff7dbfc --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/modules/utils.py @@ -0,0 +1,4 @@ + + +def mean_accuracy(preds, labels): + return {'acc': (preds == labels).mean()} diff --git a/Indic-BERT-v1-master/fine_tune/modules/xsent_retrieval.py b/Indic-BERT-v1-master/fine_tune/modules/xsent_retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..6dea0e1d9577655d7610adcfe5c5e7abc31b5c16 --- /dev/null +++ b/Indic-BERT-v1-master/fine_tune/modules/xsent_retrieval.py @@ -0,0 +1,111 @@ +""" +""" +import logging +import json +import os +import pickle +import scipy.spatial as sp +from filelock import FileLock + +import numpy as np +import torch + +from .base import BaseModule, create_trainer + + +logger = logging.getLogger(__name__) + + +class XSentRetrieval(BaseModule): + + mode = 'base' + output_mode = 'classification' + example_type = 'text' + + def __init__(self, hparams): + self.test_results_fpath = 'test_results' + if os.path.exists(self.test_results_fpath): + os.remove(self.test_results_fpath) + + super().__init__(hparams) + + def forward(self, **inputs): + outputs = self.model(**inputs) + last_hidden = outputs[0] + mean_pooled = torch.mean(last_hidden, 1) + return mean_pooled + + def test_dataloader_en(self): + test_features = self.load_features('en') + dataloader = self.make_loader(test_features, self.hparams['eval_batch_size']) + return dataloader + + def test_dataloader_in(self): + test_features = self.load_features('in') + dataloader = self.make_loader(test_features, self.hparams['eval_batch_size']) + return dataloader + + def test_step(self, batch, batch_idx): + inputs = {'input_ids': batch[0], 'token_type_ids': batch[2], + 'attention_mask': batch[1]} + labels = batch[3].detach().cpu().numpy() + sentvecs = self(**inputs) + sentvecs = sentvecs.detach().cpu().numpy() + sentvecs = np.hstack([labels[:, None], sentvecs]) + + return {'sentvecs': sentvecs} + + def test_epoch_end(self, outputs): + all_sentvecs = np.vstack([x['sentvecs'] for x in outputs]) + + with FileLock(self.test_results_fpath + '.lock'): + if os.path.exists(self.test_results_fpath): + with open(self.test_results_fpath, 'rb') as fp: + data = pickle.load(fp) + data = np.vstack([data, all_sentvecs]) + else: + data = all_sentvecs + with open(self.test_results_fpath, 'wb') as fp: + pickle.dump(data, fp) + + return {'sentvecs': all_sentvecs} + + @staticmethod + def add_model_specific_args(parser, root_dir): + return parser + + def run_module(self): + self.eval() + self.freeze() + + trainer = create_trainer(self, self.hparams) + + trainer.test(self, self.test_dataloader_en()) + sentvecs1 = pickle.load(open(self.test_results_fpath, 'rb')) + os.remove(self.test_results_fpath) + + trainer.test(self, self.test_dataloader_in()) + sentvecs2 = pickle.load(open(self.test_results_fpath, 'rb')) + os.remove(self.test_results_fpath) + + sentvecs1 = sentvecs1[sentvecs1[:, 0].argsort()][:, 1:] + sentvecs2 = sentvecs2[sentvecs2[:, 0].argsort()][:, 1:] + + result_path = os.path.join(self.hparams['output_dir'], 'test_results.txt') + with open(result_path, 'w') as fp: + metrics = {'test_acc': precision_at_10(sentvecs1, sentvecs2)} + json.dump(metrics, fp) + + +def precision_at_10(sentvecs1, sentvecs2): + n = 
sentvecs1.shape[0] + + # mean centering + sentvecs1 = sentvecs1 - np.mean(sentvecs1, axis=0) + sentvecs2 = sentvecs2 - np.mean(sentvecs2, axis=0) + + sim = sp.distance.cdist(sentvecs1, sentvecs2, 'cosine') + actual = np.array(range(n)) + preds = sim.argsort(axis=1)[:, :10] + matches = np.any(preds == actual[:, None], axis=1) + return matches.mean() diff --git a/Indic-BERT-v1-master/notebooks/finetuning.ipynb b/Indic-BERT-v1-master/notebooks/finetuning.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b17d69041654827ab68cf9dea27329b50a4b2d5d --- /dev/null +++ b/Indic-BERT-v1-master/notebooks/finetuning.ipynb @@ -0,0 +1,145 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "finetuning.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "duPhpC7UYvOb", + "colab_type": "text" + }, + "source": [ + "#**Setup**\n", + "\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "08KZUbQnhKwE", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\n", + "!git clone https://github.com/ai4bharat/indic-bert\n", + "%cd indic-bert\n", + "!pip3 install -r requirements.txt\n", + "%cd ..\n", + "!mkdir indic-glue outputs" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u4TbQgpAYrSL", + "colab_type": "text" + }, + "source": [ + "#**Download Datasets**\n", + "---\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "B8Te43TtV9OV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\n", + "% cd indic-glue\n", + "# Download the dataset -- insert link obtained from https://indicnlp.ai4bharat.org/indic-glue/#downloads\n", + "!wget https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/wiki-cloze.tar.gz\n", + "!tar -xaf wiki-cloze.tar.gz\n", + "% cd ..\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cAs6r-QSUosR", + "colab_type": "text" + }, + "source": [ + "#**Fine-tune the Model**\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vUCIjREpQFhv", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\n", + "%cd indic-bert\n", + "\n", + "import os\n", + "\n", + "from fine_tune.cli import main as finetune_main\n", + "\n", + "argvec = ['--lang', 'gu',\n", + " '--dataset', 'wiki-cloze', # use the right dataset key, check https://github.com/AI4Bharat/indic-bert/blob/master/fine_tune/cli.py#L10\n", + " '--model', 'ai4bharat/indic-bert',\n", + " '--iglue_dir', '../indic-glue',\n", + " '--output_dir', '../outputs',\n", + " '--max_seq_length', '128',\n", + " '--learning_rate', '2e-5',\n", + " '--num_train_epochs', '3',\n", + " '--train_batch_size', '32'\n", + "]\n", + "\n", + "finetune_main(argvec)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zUh3Vw7SUwMW", + "colab_type": "text" + }, + "source": [ + "#**Check the Results**\n", + "---" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Ic0Qpfl-U0Xw", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!cat /content/outputs/wiki-cloze/gu-gu/model-ai4bharat-indic-bert/test_results.txt" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/Indic-BERT-v1-master/readme.md 
b/Indic-BERT-v1-master/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..65adeee1a3aa32ec60ceac5c940ce2bd11d81d26 --- /dev/null +++ b/Indic-BERT-v1-master/readme.md @@ -0,0 +1,329 @@ +### As of May 2023, we recommend using [IndicBERT](https://github.com/AI4Bharat/IndicBERT) Repository: +[IndicBERT](https://github.com/AI4Bharat/IndicBERT) is the new and improved implementation of BERT supporting fine-tuning with HuggingFace. +All the download links for IndicCorpv2, IndicXTREME and various IndicBERTv2 models are available [here](https://github.com/AI4Bharat/IndicBERT). + +
+
+# IndicBERT
+
+Website | Downloads | Paper | Doc
+
+ + +Indic bert is a multilingual ALBERT model that exclusively covers 12 major Indian languages. It is pre-trained on our novel corpus of around 9 billion tokens and evaluated on a set of diverse tasks. Indic-bert has around 10x fewer parameters than other popular publicly available multilingual models while it also achieves a performance on-par or better than these models. + +We also introduce IndicGLUE - a set of standard evaluation tasks that can be used to measure the NLU performance of monolingual and multilingual models on Indian languages. Along with IndicGLUE, we also compile a list of additional evaluation tasks. This repository contains code for running all these evaluation tasks on indic-bert and other bert-like models. + + + +### Table of Contents + +* [Introduction](#introduction) +* [Setting up the Code](#setting-up-the-code) +* [Running Experiments](#running-experiments) +* [Pretraining Corpus](#pretraining-corpus) +* [IndicGLUE](#iglue) +* [Additional Evaluation Tasks](#additional-evaluation-tasks) +* [Evaluation Results](#evaluation-results) +* [Downloads](#downloads) +* [Citing](#citing) +* [License](#license) +* [Contributors](#contributors) +* [Contact](#contact) + + + +### Introduction + +The Indic BERT model is based on the ALBERT model, a recent derivative of BERT. It is pre-trained on 12 Indian languages: Assamese, Bengali, English, Gujarati, Hindi, Kannada, Malayalam, Marathi, Oriya, Punjabi, Tamil, Telugu. + +The easiest way to use Indic BERT is through the Huggingface transformers library. It can be simply loaded like this: + +```python +# pip3 install transformers +# pip3 install sentencepiece + +from transformers import AutoModel, AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert') +model = AutoModel.from_pretrained('ai4bharat/indic-bert') +``` +Note: To preserve accents (vowel matras / diacritics) while tokenization (Read this issue for more details [#26](../../issues/26) ), use this: +```python +tokenizer = transformers.AutoTokenizer.from_pretrained('ai4bharat/indic-bert', keep_accents=True) +``` + +### Setting up the Code + +The code can be run on GPU, TPU or on Google's Colab platform. If you want to run it on Colab, you can simply use our fine-tuning notebook [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ai4bharat/indic-bert/blob/master/notebooks/finetuning.ipynb). For running it in your own VM, start with running the following commands: + +```bash +git clone https://github.com/AI4Bharat/indic-bert +cd indic-bert +sudo pip3 install -r requirements.txt +``` + +By default, the installation will use GPU. For TPU support, first update your `.bashrc` with the following variables: + +```bash +export PYTHONPATH="${PYTHONPATH}:/usr/share/tpu/models: --dataset --lang --iglue_dir --output_dir +``` + +For more advanced usage of the fine-tuning code, refer [this document](https://github.com/AI4Bharat/indic-bert/blob/master/docs/advanced-usage.md). + +### Pretraining Corpus + +We pre-trained indic-bert on AI4Bharat's monolingual corpus. The corpus has the following distribution of languages: + + +| Language | as | bn | en | gu | hi | kn | | +| ----------------- | ------ | ------ | ------ | ------ | ------ | ------ | ------- | +| **No. of Tokens** | 36.9M | 815M | 1.34B | 724M | 1.84B | 712M | | +| **Language** | **ml** | **mr** | **or** | **pa** | **ta** | **te** | **all** | +| **No. 
of Tokens** | 767M | 560M | 104M | 814M | 549M | 671M | 8.9B | + + + +### IndicGLUE + +IGLUE is a natural language understanding benchmark for Indian languages that we propose. While building this benchmark, our objective was also to cover most of the 11 Indian languages for each task. It consists of the following tasks: + +##### News Category Classification + +Predict the genre of a given news article. The dataset contains around 125k news articles across 9 Indian languages. Example: + +*Article Snippet*: + +``` +கர்நாடக சட்டப் பேரவையில் வெற்றி பெற்ற எம்எல்ஏக்கள் இன்று பதவியேற்றுக் கொண்ட நிலையில் , காங்கிரஸ் எம்எல்ஏ ஆனந்த் சிங் க்கள் ஆப்சென்ட் ஆகி அதிர்ச்சியை ஏற்படுத்தியுள்ளார் . உச்சநீதிமன்ற உத்தரவுப்படி இன்று மாலை முதலமைச்சர் எடியூரப்பா இன்று நம்பிக்கை வாக்கெடுப்பு நடத்தி பெரும்பான்மையை நிரூபிக்க உச்சநீதிமன்றம் உத்தரவிட்டது . +``` + +*Category*: Politics + + + +##### Named Entity Recognition + +Recognize entities and their coarse types in a sequence of words. The dataset contains around 787k examples across 11 Indian languages. + +*Example*: + + +| | | | | | | | | | | +|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| +| **Token** | चाणक्य | पुरी | को | यहाँ | देखने | हेतु | यहाँ | क्लिक | करें | +| **Type** | B-LOC | I-LOC | O | O | O | O | O | O | O | + + + +##### Headline Prediction + + Predict the correct headline for a news article from a given list of four candidate headlines. The dataset contains around 880k examples across 11 Indian languages. Example: + +*News Article:* + + ರಾಷ್ಟ್ರೀಯ\nಪುಣೆ: 23 ವರ್ಷದ ಇನ್ಫೋಸಿಸ್ ಮಹಿಳಾ ಟೆಕ್ಕಿಯೊಬ್ಬರನ್ನು ನಡು ರಸ್ತೆಯಲ್ಲಿಯೇ ಮಾರಾಕಾಸ್ತ್ರಗಳಿಂದ ಬರ್ಬರವಾಗಿ ಹತ್ಯೆ ಮಾಡಿರುವ ಘಟನೆ ಪುಣೆಯಲ್ಲಿ ಶನಿವಾರ ರಾತ್ರಿ ನಡೆದಿದೆ.\nಅಂತರ ದಾಸ್ ಕೊಲೆಯಾದ ಮಹಿಳಾ ಟೆಕ್ಕಿಯಾಗಿದ್ದಾರೆ. ಅಂತರಾ ಅವರು ಪಶ್ಚಿಮ ಬಂಗಾಳದ ಮೂಲದವರಾಗಿದ್ದಾರೆ. ಕಳೆದ ರಾತ್ರಿ 8.00 ಗಂಟೆ ಸುಮಾರಿಗೆ ಕೆಲಸ ಮುಗಿಸಿ ಮನೆಗೆ ತೆರಳುತ್ತಿದ್ದ ಸಂದರ್ಭದಲ್ಲಿ ಅಂತರಾ ಅವರ ಮೇಲೆ ದಾಳಿ ಮಾಡಿರುವ ದುಷ್ಕರ್ಮಿಗಳು ಮಾರಾಕಾಸ್ತ್ರಗಳಿಂದ ಹಲ್ಲೆ ನಡೆಸಿದ್ದಾರೆಂದು ಪೊಲೀಸರು ಹೇಳಿದ್ದಾರೆ.\nದಾಳಿ ನಡೆಸಿದ ನಂತರ ರಕ್ತದ ಮಡುವಿನಲ್ಲಿ ಬಿದ್ದು ಒದ್ದಾಡುತ್ತಿದ್ದ ಅಂತರಾ ಅವರನ್ನು ಸ್ಥಳೀಯರು ಆಸ್ಪತ್ರೆಗೆ ದಾಳಸಿದ್ದಾರೆ. ಆದರೆ, ಆಸ್ಪತ್ರೆಗೆ ದಾಖಲಿಸುವಷ್ಟರಲ್ಲಿ ಅಂತರಾ ಅವರು ಸಾವನ್ನಪ್ಪಿದ್ದಾರೆಂದು ಅವರು ಹೇಳಿದ್ದಾರೆ.\nಪ್ರಕರಣ ದಾಖಲಿಸಿಕೊಂಡಿರುವ ಪೊಲೀಸರು ತನಿಖೆ ಆರಂಭಿಸಿದ್ದಾರೆ", +*Candidate 1*: ಇನ್ಫೋಸಿಸ್ ಮಹಿಳಾ ಟೆಕ್ಕಿಯ ಬರ್ಬರ ಹತ್ಯೆ *[correct answer]* +*Candidate 2:* ಮಾನಸಿಕ ಅಸ್ವಸ್ಥೆ ಮೇಲೆ ಮಕ್ಕಳ ಕಳ್ಳಿ ಎಂದು ಭೀಕರ ಹಲ್ಲೆ +*Candidate 3:* ಕಸಬ ಬೆಂಗ್ರೆಯಲ್ಲಿ ಮುಸುಕುಧಾರಿಗಳ ತಂಡದಿಂದ ಮೂವರು ಯುವಕರ ಮೇಲೆ ಹಲ್ಲೆ : ಓರ್ವ ಗಂಭೀರ +*Candidate 4:* ಕಣಿವೆ ರಾಜ್ಯದಲ್ಲಿ mobile ಬಂದ್, ಪ್ರಿಂಟಿಂಗ್ ಪ್ರೆಸ್ ಮೇಲೆ ದಾಳಿ + + + +##### Wikipedia Section Title Prediction + +Predict the correct title for a Wikipedia section from a given list of four candidate titles. The dataset has 400k examples across 11 Indian languages. + +*Section Text*: + +``` +2005માં, જેકમેન નિર્માણ કંપની, સીડ પ્રોડકશન્સ ઊભી કરવા તેના લાંબાસમયના મદદનીશ જહોન પાલેર્મો સાથે જોડાયા, જેમનો પ્રથમ પ્રોજેકટ 2007માં વિવા લાફલિન હતો. જેકમેનની અભિનેત્રી પત્ની ડેબોરા-લી ફર્નેસ પણ કંપનીમાં જોડાઈ, અને પાલેર્મોએ પોતાના, ફર્નેસ અને જેકમેન માટે “ યુનિટી ” અર્થવાળા લખાણની આ ત્રણ વીંટીઓ બનાવી.[૨૭] ત્રણેયના સહયોગ અંગે જેકમેને જણાવ્યું કે “ મારી જિંદગીમાં જેમની સાથે મેં કામ કર્યું તે ભાગીદારો અંગે ડેબ અને જહોન પાલેર્મો અંગે હું ખૂબ નસીબદાર છું. ખરેખર તેથી કામ થયું. અમારી પાસે જુદું જુદું સાર્મથ્ય હતું. હું તે પસંદ કરતો હતો. I love it. તે ખૂબ ઉત્તેજક છે. ”[૨૮]ફોકસ આધારિત સીડ લેબલ, આમન્ડા સ્કિવેઈટઝર, કેથરિન ટેમ્બલિન, એલન મંડેલબમ અને જોય મરિનો તેમજ સાથે સિડની આધારિત નિર્માણ કચેરીનું સંચાલન કરનાર અલાના ફ્રીનો સમાવેશ થતાં કદમાં વિસ્તૃત બની. 
આ કંપીનોનો ઉદ્દેશ જેકમેનના વતનના દેશની સ્થાનિક પ્રતિભાને કામે લેવા મધ્યમ બજેટવાળી ફિલ્મો બનાવવાનો છે. +``` + +*Candidate 1:* એકસ-મેન + +*Candidate 2:* કારકીર્દિ + +*Candidate 3:* નિર્માણ કંપન [*correct answer*] + +*Candidate 4:* ઓસ્ટ્રેલિય + + + +##### Cloze-style Question Answering (WCQA) + +Given a text with an entity randomly masked, the task is to predict that masked entity from a list of 4 candidate entities. The dataset contains around 239k examples across 11 languages. Example: + +*Text* + +```markdown +ਹੋਮੀ ਭਾਬਾ ਦਾ ਜਨਮ 1949 ਈ ਨੂਂ ਮੁੰਬਈ ਵਿੱਚ ਪਾਰਸੀ ਪਰਿਵਾਰ ਵਿੱਚ ਹੋਇਆ । ਸੇਂਟ ਮੇਰੀ ਤੋਂ ਮੁਢਲੀ ਸਿਖਿਆ ਪ੍ਰਾਪਤ ਕਰਕੇ ਉਹ ਬੰਬੇ ਯੂਨੀਵਰਸਿਟੀ ਗ੍ਰੈਜੁਏਸ਼ਨ ਲਈ ਚਲਾ ਗਿਆ । ਇਸ ਤੋਂ ਬਾਅਦ ਉਹ ਉਚੇਰੀ ਸਿਖਿਆ ਲਈ ਚਲਾ ਗਿਆ । ਉਸਨੇ ਓਥੇ ਆਕਸਫੋਰਡ ਯੂਨੀਵਰਸਿਟੀ ਤੋਂ ਐਮ.ਏ ਅਤੇ ਐਮ ਫਿਲ ਦੀਆਂ ਡਿਗਰੀਆਂ ਪ੍ਰਾਪਤ ਕੀਤੀਆਂ । ਤਕਰੀਬਨ ਦਸ ਸਾਲ ਤਕ ਉਸਨੇ ਸੁਸੈਕਸ ਯੂਨੀਵਰਸਿਟੀ ਦੇ ਅੰਗਰੇਜ਼ੀ ਵਿਭਾਗ ਵਿੱਚ ਬਤੌਰ ਲੈਕਚਰਾਰ ਕਾਰਜ ਨਿਭਾਇਆ । ਇਸਤੋਂ ਇਲਾਵਾ ਹੋਮੀ ਭਾਬਾ ਪੈਨਸੁਲਵੇਨਿਆ , ਸ਼ਿਕਾਗੋ ਅਤੇ ਅਮਰੀਕਾ ਦੀ ਹਾਰਵਰਡ ਯੂਨੀਵਰਸਿਟੀ ਵਿੱਚ ਵੀ ਪ੍ਰੋਫ਼ੇਸਰ ਦੇ ਆਹੁਦੇ ਤੇ ਰਿਹਾ । +``` + +*Candidate 1*: ਬਰਤਾਨੀਆ *[correct answer]* +*Candidate 2*: ਭਾਰਤ +*Candidate 3*: ਸ਼ਿਕਾਗੋ +*Candidate 4*: ਪਾਕਿਸਤਾਨ + + + +##### Cross-lingual Sentence Retrieval (XSR) + +Given a sentence in language $L_1$ the task is to retrieve its translation from a set of candidate sentences in language $L_2$. The dataset contains around 39k parallel sentence pairs across 8 Indian languages. Example: + +*Input Sentence* + +``` +In the health sector the nation has now moved ahead from the conventional approach. +``` + +*Retrieve the following translation from a set of 4886 sentences:* + +``` +ആരോഗ്യമേഖലയില് ഇന്ന് രാജ്യം പരമ്പരാഗത രീതികളില് നിന്ന് മുന്നേറിക്കഴിഞ്ഞു. +``` + + + +### Additional Evaluation Tasks + +##### Natural Language Inference + +- Winnograd Natural Language Inference (WNLI) +- Choice of Plausible Alternatives (COPA) + +##### Sentiment Analysis + +- IITP Movie Reviews Sentiment +- IITP Product Reviews +- ACTSA Sentiment Classifcation + +##### Genre Classification + +- Soham Articles Genre Classification +- iNLTK Headlines Genre Classifcation +- BBC News Articles + +##### Discourse Analysis + +* MIDAS Discourse + + + +### Evaluation Results + + +##### IndicGLUE + +Task | mBERT | XLM-R | IndicBERT +-----| ----- | ----- | ------ +News Article Headline Prediction | 89.58 | 95.52 | **95.87** +Wikipedia Section Title Prediction| **73.66** | 66.33 | 73.31 +Cloze-style multiple-choice QA | 39.16 | 27.98 | **41.87** +Article Genre Classification | 90.63 | 97.03 | **97.34** +Named Entity Recognition (F1-score) | **73.24** | 65.93 | 64.47 +Cross-Lingual Sentence Retrieval Task | 21.46 | 13.74 | **27.12** +Average | 64.62 | 61.09 | **66.66** + +##### Additional Tasks + + +Task | Task Type | mBERT | XLM-R | IndicBERT +-----| ----- | ----- | ------ | ----- +BBC News Classification | Genre Classification | 60.55 | **75.52** | 74.60 +IIT Product Reviews | Sentiment Analysis | 74.57 | **78.97** | 71.32 +IITP Movie Reviews | Sentiment Analaysis | 56.77 | **61.61** | 59.03 +Soham News Article | Genre Classification | 80.23 | **87.6** | 78.45 +Midas Discourse | Discourse Analysis | 71.20 | **79.94** | 78.44 +iNLTK Headlines Classification | Genre Classification | 87.95 | 93.38 | **94.52** +ACTSA Sentiment Analysis | Sentiment Analysis | 48.53 | 59.33 | **61.18** +Winograd NLI | Natural Language Inference | 56.34 | 55.87 | **56.34** +Choice of Plausible Alternative (COPA) | Natural Language Inference | 54.92 | 51.13 | **58.33** +Amrita Exact Paraphrase | Paraphrase Detection | **93.81** | 93.02 | 93.75 +Amrita Rough Paraphrase | Paraphrase 
Detection | 83.38 | 82.20 | **84.33** +Average | | 69.84 | **74.42** | 73.66 + + +\* Note: all models have been restricted to a max_seq_length of 128. + + + +### Downloads + +The model can be downloaded [here](https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/models/indic-bert-v1.tar.gz). Both tf checkpoints and pytorch binaries are included in the archive. Alternatively, you can also download it from [Huggingface](https://huggingface.co/ai4bharat/indic-bert). + + + +### Citing + +If you are using any of the resources, please cite the following article: + +``` +@inproceedings{kakwani2020indicnlpsuite, + title={{IndicNLPSuite: Monolingual Corpora, Evaluation Benchmarks and Pre-trained Multilingual Language Models for Indian Languages}}, + author={Divyanshu Kakwani and Anoop Kunchukuttan and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, + year={2020}, + booktitle={Findings of EMNLP}, +} +``` + +We would like to hear from you if: + +- You are using our resources. Please let us know how you are putting these resources to use. +- You have any feedback on these resources. + + + +### License + +The IndicBERT code (and models) are released under the MIT License. + +### Contributors + +- Divyanshu Kakwani +- Anoop Kunchukuttan +- Gokul NC +- Satish Golla +- Avik Bhattacharyya +- Mitesh Khapra +- Pratyush Kumar + +This work is the outcome of a volunteer effort as part of [AI4Bharat initiative](https://ai4bharat.org). + + + +### Contact + +- Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](mailto:anoop.kunchukuttan@gmail.com)) +- Mitesh Khapra ([miteshk@cse.iitm.ac.in](mailto:miteshk@cse.iitm.ac.in)) +- Pratyush Kumar ([pratyush@cse.iitm.ac.in](mailto:pratyush@cse.iitm.ac.in)) diff --git a/Indic-BERT-v1-master/requirements.txt b/Indic-BERT-v1-master/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e41fae97f3de831cb10b483529b33a949136d5c --- /dev/null +++ b/Indic-BERT-v1-master/requirements.txt @@ -0,0 +1,59 @@ +# Run pip install --upgrade pip if tensorflow 1.15 cannot be found +tensorflow==1.15.2 # CPU Version of TensorFlow +# tensorflow-gpu==1.15 # GPU version of TensorFlow +tensorflow_hub==0.7 +torch==1.6.0 +absl-py==0.10.0 +cachetools==4.1.1 +certifi==2020.6.20 +chardet==3.0.4 +click==7.1.2 +cloud-tpu-client==0.10 +dataclasses +filelock==3.0.12 +future==0.18.2 +google-api-core==1.22.1 +google-api-python-client==1.8.0 +google-auth==1.21.0 +google-auth-httplib2==0.0.4 +google-auth-oauthlib==0.4.1 +googleapis-common-protos==1.52.0 +grpcio==1.31.0 +h5py==2.10.0 +httplib2==0.18.1 +idna==2.10 +importlib-metadata==1.7.0 +joblib==0.16.0 +Keras==2.4.3 +Markdown==3.2.2 +numpy==1.19.1 +oauth2client==4.1.3 +oauthlib==3.1.0 +packaging==20.4 +Pillow==8.1.1 +protobuf==3.13.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pyparsing==2.4.7 +pytorch-lightning==0.8.1 +pytz==2020.1 +PyYAML==5.3.1 +regex==2020.7.14 +requests==2.24.0 +requests-oauthlib==1.3.0 +rsa==4.6 +sacremoses==0.0.43 +scipy==1.5.2 +sentencepiece==0.1.91 +seqeval==0.0.12 +six==1.15.0 +tensorboard +tensorboard-plugin-wit==1.7.0 +tokenizers==0.10.2 +tqdm==4.48.2 +transformers==4.5.0 +typing-extensions==3.7.4.3 +uritemplate==3.0.1 +urllib3==1.25.10 +Werkzeug==1.0.1 +zipp==3.1.0 diff --git a/Indic-BERT-v1-master/requirements_colab.txt b/Indic-BERT-v1-master/requirements_colab.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bde11217944e49b184fe384c8d8ac82e039ef80 --- /dev/null +++ b/Indic-BERT-v1-master/requirements_colab.txt @@ -0,0 
+1,59 @@ +# Run pip install --upgrade pip if tensorflow 1.15 cannot be found +tensorflow==1.15.2 # CPU Version of TensorFlow +# tensorflow-gpu==1.15 # GPU version of TensorFlow +tensorflow_hub==0.7 +# torch==1.6.0 +absl-py==0.10.0 +cachetools==4.1.1 +certifi==2020.6.20 +chardet==3.0.4 +click==7.1.2 +cloud-tpu-client==0.10 +dataclasses +filelock==3.0.12 +future==0.18.2 +google-api-core==1.22.1 +google-api-python-client==1.8.0 +google-auth==1.21.0 +google-auth-httplib2==0.0.4 +google-auth-oauthlib==0.4.1 +googleapis-common-protos==1.52.0 +grpcio==1.31.0 +h5py==2.10.0 +httplib2==0.18.1 +idna==2.10 +importlib-metadata==1.7.0 +joblib==0.16.0 +Keras==2.4.3 +Markdown==3.2.2 +numpy==1.19.1 +oauth2client==4.1.3 +oauthlib==3.1.0 +packaging==20.4 +Pillow==8.1.1 +protobuf==3.13.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pyparsing==2.4.7 +pytorch-lightning==0.8.1 +pytz==2020.1 +PyYAML==5.3.1 +regex==2020.7.14 +requests==2.24.0 +requests-oauthlib==1.3.0 +rsa==4.6 +sacremoses==0.0.43 +scipy==1.5.2 +sentencepiece==0.1.91 +seqeval==0.0.12 +six==1.15.0 +tensorboard==2.2.0 +tensorboard-plugin-wit==1.7.0 +tokenizers==0.8.1rc1 +tqdm==4.48.2 +transformers==3.0.2 +typing-extensions==3.7.4.3 +uritemplate==3.0.1 +urllib3==1.25.10 +Werkzeug==1.0.1 +zipp==3.1.0 diff --git a/Indic-BERT-v1-master/scripts/convert_to_pt.sh b/Indic-BERT-v1-master/scripts/convert_to_pt.sh new file mode 100644 index 0000000000000000000000000000000000000000..16aebafaa1e4ef58d625c9fe79088b84b0cee7f7 --- /dev/null +++ b/Indic-BERT-v1-master/scripts/convert_to_pt.sh @@ -0,0 +1,7 @@ + +export ALBERT_BASE_DIR=$1 + +transformers-cli convert --model_type albert \ + --tf_checkpoint $ALBERT_BASE_DIR/tf_model \ + --config $ALBERT_BASE_DIR/config.json \ + --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin diff --git a/Indic-BERT-v1-master/scripts/create_masked_data.sh b/Indic-BERT-v1-master/scripts/create_masked_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..32f5c0c609ce690e01ec68611d881fb493d286d3 --- /dev/null +++ b/Indic-BERT-v1-master/scripts/create_masked_data.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +if [ $# != 2 ]; then + echo "USAGE: ./create_masked_data.sh "; + exit +fi + +TRAIN_FILE="$1/train.txt" +OUTPUT_DIR="$2" +shards_dir="$OUTPUT_DIR/shards" +data_dir="$OUTPUT_DIR/pretrain" + +# create shards +mkdir "$shards_dir" +split --lines=500000 "$TRAIN_FILE" "$shards_dir" + +mkdir "$data_dir" + +ls "$shards_dir"| xargs -I {} python3 albert/create_pretraining_data.py \ + --input_file="$shards_dir/{}" \ + --output_file="$data_dir/{}.tf_record" \ + --spm_model_file="$OUTPUT_DIR/spm.unigram.model" \ + --vocab_file="$OUTPUT_DIR/spm.unigram.vocab" \ + --max_seq_length=128 \ + --max_predictions_per_seq=20 \ + --masked_lm_prob=0.15 \ + --random_seed=12345 \ + --dupe_factor=3 diff --git a/Indic-BERT-v1-master/scripts/evaluate.py b/Indic-BERT-v1-master/scripts/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..868d6b8dff6626adace7d4174b38ccb7799cc2b9 --- /dev/null +++ b/Indic-BERT-v1-master/scripts/evaluate.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +""" +A script to run IGLUE tasks. 
Also supports cross-lingual tasks +""" + + +import sys +import os + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) + +from fine_tune.cli import main as finetune_main + + + +argvec = [ + '--train_lang', 'pa', + '--test_lang', 'pa', + '--task', 'paraphrase-fuzzy', + '--model_name_or_path', 'ai4bharat/indic-bert', + '--config_name', '', + '--tokenizer_name', '', + '--data_dir', '../iglue', + '--output_dir', '../outputs', + '--max_seq_length', '128', + '--learning_rate', '2e-5', + '--num_train_epochs', '3', + '--train_batch_size', '32', + '--seed', '2', + '--n_gpu', '1' +] + +finetune_main(argvec) + diff --git a/Indic-BERT-v1-master/scripts/gen_mtxt.sh b/Indic-BERT-v1-master/scripts/gen_mtxt.sh new file mode 100644 index 0000000000000000000000000000000000000000..f53a0b8546e123e8160330b130fcdf95d698a9b7 --- /dev/null +++ b/Indic-BERT-v1-master/scripts/gen_mtxt.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +if [ $# != 2 ]; then + echo "USAGE: ./gen_mtxt.sh "; + exit +fi + +declare -a langs=("as" "or" "kn" "ml" "ta" "te" "gu" "mr" "en" "hi" "pa" "bn") +DATA_DIR="$1" + +# Generate train small file + +OUTPUT="$DATA_DIR/train_small.txt" + +if [ -f "$OUTPUT" ]; then + echo "Output file already exists. Please remove it first" + exit +fi + +for lang in ${langs[@]}; do + echo "Processing $lang" + lines=$(wc -l "$DATA_DIR/$lang.txt" | cut -d' ' -f1) + smtlines=$(echo "e(l($lines*100)*0.7)/1" | bc -l) + smtlines=${smtlines%.*} + echo "Sampling $smtlines from $lines lines"; + cat "$DATA_DIR/$lang.txt" "$DATA_DIR/$lang.txt" "$DATA_DIR/$lang.txt"\ + "$DATA_DIR/$lang.txt" "$DATA_DIR/$lang.txt" | head -n "$smtlines" >> "$OUTPUT" +done + + +# Generate train file +OUTPUT="$DATA_DIR/train.txt" + +if [ -f "$OUTPUT" ]; then + echo "Output file already exists. 
Please remove it first" + exit +fi + +for lang in ${langs[@]}; do + echo "Processing $lang" + lines=$(wc -l "$DATA_DIR/$lang.txt" | cut -d' ' -f1) + smtlines=$(echo "e(l($lines*2100)*0.7)/1" | bc -l) + smtlines=${smtlines%.*} + echo "Sampling $smtlines from $lines lines"; + cat "$DATA_DIR/$lang.txt" "$DATA_DIR/$lang.txt" "$DATA_DIR/$lang.txt"\ + "$DATA_DIR/$lang.txt" "$DATA_DIR/$lang.txt" | head -n "$smtlines" >> "$OUTPUT" +done + diff --git a/Indic-BERT-v1-master/scripts/ner_preprocess.sh b/Indic-BERT-v1-master/scripts/ner_preprocess.sh new file mode 100644 index 0000000000000000000000000000000000000000..f88291d7353a8eca9e0e5c941884b9d07c19a030 --- /dev/null +++ b/Indic-BERT-v1-master/scripts/ner_preprocess.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +export DATA_DIR=$1 +export TRAIN_LANG=$2 +export TEST_LANG=$3 +export BERT_MODEL=$4 +export MAX_LENGTH=$5 +export SCRIPT="$(dirname $0)/preprocess.py" + +cat "$DATA_DIR/$TRAIN_LANG/$TRAIN_LANG-train.txt" | awk -F" " '{if($NF>0) {print $1, $(NF)} else {print $0;}}' > "$DATA_DIR/$TRAIN_LANG/train.txt.tmp" +cat "$DATA_DIR/$TRAIN_LANG/$TRAIN_LANG-valid.txt" | awk -F" " '{if($NF>0) {print $1, $(NF)} else {print $0;}}' > "$DATA_DIR/$TRAIN_LANG/valid.txt.tmp" +cat "$DATA_DIR/$TEST_LANG/$TEST_LANG-test.txt" | awk -F" " '{if($NF>0) {print $1, $(NF)} else {print $0;}}' > "$DATA_DIR/$TEST_LANG/test.txt.tmp" + +python3 scripts/preprocess.py "$DATA_DIR/$TRAIN_LANG/train.txt.tmp" $BERT_MODEL $MAX_LENGTH > "$DATA_DIR/$TRAIN_LANG/train.txt" +python3 scripts/preprocess.py "$DATA_DIR/$TRAIN_LANG/valid.txt.tmp" $BERT_MODEL $MAX_LENGTH > "$DATA_DIR/$TRAIN_LANG/valid.txt" +python3 scripts/preprocess.py "$DATA_DIR/$TEST_LANG/test.txt.tmp" $BERT_MODEL $MAX_LENGTH > "$DATA_DIR/$TEST_LANG/test.txt" + +cat "$DATA_DIR/$TRAIN_LANG/train.txt" "$DATA_DIR/$TRAIN_LANG/valid.txt" | cut -d " " -f 2 | grep -v "^$"| sort | uniq > "$DATA_DIR/$TRAIN_LANG/labels.txt" diff --git a/Indic-BERT-v1-master/scripts/preprocess.py b/Indic-BERT-v1-master/scripts/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..f061b95cef4651c1cf40df6b66980211814a601b --- /dev/null +++ b/Indic-BERT-v1-master/scripts/preprocess.py @@ -0,0 +1,40 @@ +import sys + +from transformers import AutoTokenizer + +dataset = sys.argv[1] +model_name_or_path = sys.argv[2] +max_len = int(sys.argv[3]) + +subword_len_counter = 0 + +tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) +max_len -= tokenizer.num_special_tokens_to_add() + +with open(dataset, "rt", encoding='utf-8') as f_p: + for line in f_p: + line = line.rstrip() + + if not line: + print(line) + subword_len_counter = 0 + continue + + token = line.split()[0] + + current_subwords_len = len(tokenizer.tokenize(token)) + + # Token contains strange control characters like \x96 or \x95 + # Just filter out the complete line + if current_subwords_len == 0: + continue + + if (subword_len_counter + current_subwords_len) > max_len: + print("") + print(line) + subword_len_counter = current_subwords_len + continue + + subword_len_counter += current_subwords_len + + print(line) diff --git a/Indic-BERT-v1-master/scripts/pretrain_albert.sh b/Indic-BERT-v1-master/scripts/pretrain_albert.sh new file mode 100644 index 0000000000000000000000000000000000000000..83064b86d3124b941f921c301496bd9b7125e4de --- /dev/null +++ b/Indic-BERT-v1-master/scripts/pretrain_albert.sh @@ -0,0 +1,18 @@ + +python3 -m albert.run_pretraining \ + --input_file=${STORAGE_BUCKET}/indicnlp-datasets/multilingual/orig/small_tfrecords/*\ + 
--output_dir=${STORAGE_BUCKET}/albert-base-orig \ + --albert_config_file=configs/albert_base_config.json \ + --do_train \ + --train_batch_size=4096 \ + --max_seq_length=128 \ + --max_predictions_per_seq=20 \ + --optimizer='lamb' \ + --learning_rate=.00176 \ + --num_train_steps=125000 \ + --num_warmup_steps=3125 \ + --save_checkpoints_steps=5000 \ + --use_tpu \ + --tpu_name=node-2 \ + --tpu_zone=europe-west4-a \ + --num_tpu_cores=8 diff --git a/Indic-BERT-v1-master/scripts/train_tokenizer.sh b/Indic-BERT-v1-master/scripts/train_tokenizer.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea82b05347a238616e5e788328ae7bf55a556f01 --- /dev/null +++ b/Indic-BERT-v1-master/scripts/train_tokenizer.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +if [ $# != 3 ]; then + echo "USAGE: ./train_tokenizer.sh "; + exit +fi + +VOCAB_SIZE="$1" +DATA_DIR="$2" +OUTPUT_DIR="$3" +TRAIN_FILE="$DATA_DIR/train_small.txt" + +spm_train \ + --input "$TRAIN_FILE"\ + --model_prefix="$OUTPUT_DIR/spm.unigram" --vocab_size="$VOCAB_SIZE" \ + --pad_id=0 --unk_id=1 --eos_id=-1 --bos_id=-1 \ + --control_symbols=[CLS],[SEP],[MASK] \ + --shuffle_input_sentence=true \ + --character_coverage=0.99995 --model_type=unigram diff --git a/Indic-BERT-v1-master/scripts/vocab_dist.py b/Indic-BERT-v1-master/scripts/vocab_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..c791a583a7e5b15673ab1492958d6a43140ddba2 --- /dev/null +++ b/Indic-BERT-v1-master/scripts/vocab_dist.py @@ -0,0 +1,30 @@ + +import sys +import unicodedata as ud +import collections + + +def get_lang(w): + try: + if w[0] == '▁': + lang = ud.name(w[1]).split()[0] + else: + lang = ud.name(w[0]).split()[0] + return lang + except: + return 'unk' + + +fname = sys.argv[1] +words = open(fname).read().split('\n') +words = map(lambda w: w.split()[0] if w != '' else '', words) +words = filter(lambda w: '[' not in w, words) +words = map(lambda w: w.replace('#', ''), words) + +langs = map(lambda w: get_lang(w), words) +counter = collections.Counter(langs) +counter = sorted(counter.items(), key=lambda k: -k[1]) +counter = list(filter(lambda item: item[1] > 10, counter)) + +for k, v in counter: + print(k, ": ", v)
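
For reference, a minimal usage sketch of the vocabulary-distribution script above: it takes a single argument, the SentencePiece `.vocab` file produced by `scripts/train_tokenizer.sh`, and prints how many vocabulary entries fall under each Unicode script name. The `output/` directory prefix below is only an assumed example of the tokenizer's OUTPUT_DIR.

```bash
# Count vocabulary entries per Unicode script
# (scripts with 10 or fewer entries are omitted from the output)
python3 scripts/vocab_dist.py output/spm.unigram.vocab
```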