Spaces:
Runtime error
Runtime error
harveen
commited on
Commit
·
4192287
1
Parent(s):
87dca7d
Changing structure
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- indicTrans/IndicTrans_training.ipynb → IndicTrans_training.ipynb +0 -0
- indicTrans/LICENSE → LICENSE +0 -0
- README.md +287 -28
- indicTrans/api.py → api.py +0 -0
- indicTrans/apply_bpe_traindevtest_notag.sh → apply_bpe_traindevtest_notag.sh +0 -0
- indicTrans/apply_single_bpe_traindevtest_notag.sh → apply_single_bpe_traindevtest_notag.sh +0 -0
- indicTrans/binarize_training_exp.sh → binarize_training_exp.sh +0 -0
- indicTrans/compute_bleu.sh → compute_bleu.sh +0 -0
- indicTrans/.gitignore +0 -143
- indicTrans/README.md +0 -296
- indicTrans/indicTrans_Finetuning.ipynb → indicTrans_Finetuning.ipynb +0 -0
- indicTrans/indicTrans_python_interface.ipynb → indicTrans_python_interface.ipynb +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/LICENSE +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/README.md +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/contrib/README.md +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/contrib/correct_moses_tokenizer.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/contrib/hindi_to_kannada_transliterator.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/contrib/indic_scraper_project_sample.ipynb +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/Makefile +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/cmd.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/code.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/conf.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/index.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.MD +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.cli.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.morph.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.normalize.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.pdf +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.script.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.syllable.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.tokenize.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.transliterate.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/make.bat +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/docs/modules.rst +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/__init__.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/cli/__init__.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/cli/cliparser.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/common.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/langinfo.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/loader.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/morph/__init__.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/morph/unsupervised_morph.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/normalize/__init__.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/normalize/indic_normalize.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/script/__init__.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/script/english_script.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/script/indic_scripts.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/script/phonetic_sim.py +0 -0
- {indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/syllable/__init__.py +0 -0
indicTrans/IndicTrans_training.ipynb → IndicTrans_training.ipynb
RENAMED
File without changes
|
indicTrans/LICENSE → LICENSE
RENAMED
File without changes
|
README.md
CHANGED
@@ -1,37 +1,296 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
app_file: app.py
|
8 |
-
pinned: false
|
9 |
-
---
|
10 |
|
11 |
-
|
12 |
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
15 |
|
16 |
-
`emoji`: _string_
|
17 |
-
Space emoji (emoji-only character allowed)
|
18 |
|
19 |
-
`colorFrom`: _string_
|
20 |
-
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
21 |
|
22 |
-
`colorTo`: _string_
|
23 |
-
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
24 |
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
-
`sdk_version` : _string_
|
29 |
-
Only applicable for `streamlit` SDK.
|
30 |
-
See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<div align="center">
|
2 |
+
<h1><b><i>IndicTrans</i></b></h1>
|
3 |
+
<a href="http://indicnlp.ai4bharat.org/samanantar">Website</a> |
|
4 |
+
<a href="https://arxiv.org/abs/2104.05596">Paper</a> |
|
5 |
+
<a href="https://youtu.be/QwYPOd1eBtQ?t=383">Video</a><br><br>
|
6 |
+
</div>
|
|
|
|
|
|
|
7 |
|
8 |
+
**IndicTrans** is a Transformer-4x ( ~434M ) multilingual NMT model trained on [Samanantar](https://indicnlp.ai4bharat.org/samanantar) dataset which is the largest publicly available parallel corpora collection for Indic languages at the time of writing ( 14 April 2021 ). It is a single script model i.e we convert all the Indic data to the Devanagari script which allows for ***better lexical sharing between languages for transfer learning, prevents fragmentation of the subword vocabulary between Indic languages and allows using a smaller subword vocabulary***. We currently release two models - Indic to English and English to Indic and support the following 11 indic languages:
|
9 |
|
10 |
+
| <!-- --> | <!-- --> | <!-- --> | <!-- --> |
|
11 |
+
| ------------- | -------------- | ------------ | ----------- |
|
12 |
+
| Assamese (as) | Hindi (hi) | Marathi (mr) | Tamil (ta) |
|
13 |
+
| Bengali (bn) | Kannada (kn) | Oriya (or) | Telugu (te) |
|
14 |
+
| Gujarati (gu) | Malayalam (ml) | Punjabi (pa) |
|
15 |
|
|
|
|
|
16 |
|
|
|
|
|
17 |
|
|
|
|
|
18 |
|
19 |
+
- [Updates](#updates)
|
20 |
+
- [Download IndicTrans models:](#download-indictrans-models)
|
21 |
+
- [Using the model for translating any input](#using-the-model-for-translating-any-input)
|
22 |
+
- [Finetuning the model on your input dataset](#finetuning-the-model-on-your-input-dataset)
|
23 |
+
- [Mining Indic to Indic pairs from english centric corpus](#mining-indic-to-indic-pairs-from-english-centric-corpus)
|
24 |
+
- [Installation](#installation)
|
25 |
+
- [How to train the indictrans model on your training data?](#how-to-train-the-indictrans-model-on-your-training-data)
|
26 |
+
- [Network & Training Details](#network--training-details)
|
27 |
+
- [Folder Structure](#folder-structure)
|
28 |
+
- [Citing](#citing)
|
29 |
+
- [License](#license)
|
30 |
+
- [Contributors](#contributors)
|
31 |
+
- [Contact](#contact)
|
32 |
|
|
|
|
|
|
|
33 |
|
34 |
+
## Updates
|
35 |
+
<details><summary>Click to expand </summary>
|
36 |
+
18 December 2021
|
37 |
|
38 |
+
```
|
39 |
+
Tutorials updated with latest model links
|
40 |
+
```
|
41 |
+
|
42 |
+
|
43 |
+
26 November 2021
|
44 |
+
```
|
45 |
+
- v0.3 models are now available for download
|
46 |
+
```
|
47 |
+
|
48 |
+
27 June 2021
|
49 |
+
```
|
50 |
+
- Updated links for indic to indic model
|
51 |
+
- Add more comments to training scripts
|
52 |
+
- Add link to [Samanantar Video](https://youtu.be/QwYPOd1eBtQ?t=383)
|
53 |
+
- Add folder structure in readme
|
54 |
+
- Add python wrapper for model inference
|
55 |
+
```
|
56 |
+
|
57 |
+
09 June 2021
|
58 |
+
```
|
59 |
+
- Updated links for models
|
60 |
+
- Added Indic to Indic model
|
61 |
+
```
|
62 |
+
|
63 |
+
09 May 2021
|
64 |
+
```
|
65 |
+
- Added fix for finetuning on datasets where some lang pairs are not present. Previously the script assumed the finetuning dataset will have data for all 11 indic lang pairs
|
66 |
+
- Added colab notebook for finetuning instructions
|
67 |
+
```
|
68 |
+
</details>
|
69 |
+
|
70 |
+
## Download IndicTrans models:
|
71 |
+
|
72 |
+
Indic to English: [v0.3](https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip)
|
73 |
+
|
74 |
+
English to Indic: [v0.3](https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip)
|
75 |
+
|
76 |
+
Indic to Indic: [v0.3](https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip)
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
## Using the model for translating any input
|
81 |
+
|
82 |
+
The model is trained on single sentences and hence, users need to split parapgraphs to sentences before running the translation when using our command line interface (The python interface has `translate_paragraph` method to handle multi sentence translations).
|
83 |
+
|
84 |
+
Note: IndicTrans is trained with a max sequence length of **200** tokens (subwords). If your sentence is too long (> 200 tokens), the sentence will be truncated to 200 tokens before translation.
|
85 |
+
|
86 |
+
Here is an example snippet to split paragraphs into sentences for English and Indic languages supported by our model:
|
87 |
+
```python
|
88 |
+
# install these libraries
|
89 |
+
# pip install mosestokenizer
|
90 |
+
# pip install indic-nlp-library
|
91 |
+
|
92 |
+
from mosestokenizer import *
|
93 |
+
from indicnlp.tokenize import sentence_tokenize
|
94 |
+
|
95 |
+
INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]
|
96 |
+
|
97 |
+
def split_sentences(paragraph, language):
|
98 |
+
if language == "en":
|
99 |
+
with MosesSentenceSplitter(language) as splitter:
|
100 |
+
return splitter([paragraph])
|
101 |
+
elif language in INDIC:
|
102 |
+
return sentence_tokenize.sentence_split(paragraph, lang=language)
|
103 |
+
|
104 |
+
split_sentences("""COVID-19 is caused by infection with the severe acute respiratory
|
105 |
+
syndrome coronavirus 2 (SARS-CoV-2) virus strain. The disease is mainly transmitted via the respiratory
|
106 |
+
route when people inhale droplets and particles that infected people release as they breathe, talk, cough, sneeze, or sing. """, language='en')
|
107 |
+
|
108 |
+
>> ['COVID-19 is caused by infection with the severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) virus strain.',
|
109 |
+
'The disease is mainly transmitted via the respiratory route when people inhale droplets and particles that infected people release as they breathe, talk, cough, sneeze, or sing.']
|
110 |
+
|
111 |
+
split_sentences("""இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது. இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.
|
112 |
+
அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.""",
|
113 |
+
language='ta')
|
114 |
+
|
115 |
+
>> ['இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.',
|
116 |
+
'இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது.',
|
117 |
+
'இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.',
|
118 |
+
'அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.']
|
119 |
+
|
120 |
+
|
121 |
+
```
|
122 |
+
|
123 |
+
Follow the colab notebook to setup the environment, download the trained _IndicTrans_ models and translating your own text.
|
124 |
+
|
125 |
+
Command line interface --> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/indictrans_fairseq_inference.ipynb)
|
126 |
+
|
127 |
+
|
128 |
+
Python interface --> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/indicTrans_python_interface.ipynb)
|
129 |
+
|
130 |
+
The python interface is useful in case you want to reuse the model for multiple translations and do not want to reinitialize the model each time
|
131 |
+
|
132 |
+
|
133 |
+
## Finetuning the model on your input dataset
|
134 |
+
|
135 |
+
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/indicTrans_Finetuning.ipynb)
|
136 |
+
|
137 |
+
The colab notebook can be used to setup the environment, download the trained _IndicTrans_ models and prepare your custom dataset for funetuning the indictrans model. There is also a section on mining indic to indic data from english centric corpus for finetuning indic to indic model.
|
138 |
+
|
139 |
+
**Note**: Since this is a big model (400M params), you might not be able to train with reasonable batch sizes in the free google Colab account. We are planning to release smaller models (after pruning / distallation) soon.
|
140 |
+
|
141 |
+
## Mining Indic to Indic pairs from english centric corpus
|
142 |
+
|
143 |
+
The `extract_non_english_pairs` in `scripts/extract_non_english_pairs.py` can be used to mine indic to indic pairs from english centric corpus.
|
144 |
+
|
145 |
+
As described in the [paper](https://arxiv.org/pdf/2104.05596.pdf) (section 2.5) , we use a very strict deduplication criterion to avoid the creation of very similar parallel sentences. For example, if an en sentence is aligned to *M* hi sentences and *N* ta sentences, then we would get *MN* hi-ta pairs. However, these pairs would be very similar and not contribute much to the training process. Hence, we retain only 1 randomly chosen pair out of these *MN* pairs.
|
146 |
+
|
147 |
+
```bash
|
148 |
+
extract_non_english_pairs(indir, outdir, LANGS):
|
149 |
+
"""
|
150 |
+
Extracts non-english pair parallel corpora
|
151 |
+
indir: contains english centric data in the following form:
|
152 |
+
- directory named en-xx for language xx
|
153 |
+
- each directory contains a train.en and train.xx
|
154 |
+
outdir: output directory to store mined data for each pair.
|
155 |
+
One directory is created for each pair.
|
156 |
+
LANGS: list of languages in the corpus (other than English).
|
157 |
+
The language codes must correspond to the ones used in the
|
158 |
+
files and directories in indir. Prefarably, sort the languages
|
159 |
+
in this list in alphabetic order. outdir will contain data for xx-yy,
|
160 |
+
but not for yy-xx, so it will be convenient to have this list in sorted order.
|
161 |
+
"""
|
162 |
+
```
|
163 |
+
|
164 |
+
## Installation
|
165 |
+
<details><summary>Click to expand </summary>
|
166 |
+
|
167 |
+
```bash
|
168 |
+
cd indicTrans
|
169 |
+
git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git
|
170 |
+
git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
|
171 |
+
git clone https://github.com/rsennrich/subword-nmt.git
|
172 |
+
# install required libraries
|
173 |
+
pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library
|
174 |
+
|
175 |
+
# Install fairseq from source
|
176 |
+
git clone https://github.com/pytorch/fairseq.git
|
177 |
+
cd fairseq
|
178 |
+
pip install --editable ./
|
179 |
+
|
180 |
+
```
|
181 |
+
</details>
|
182 |
+
|
183 |
+
## How to train the indictrans model on your training data?
|
184 |
+
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/IndicTrans_training.ipynb)
|
185 |
+
|
186 |
+
|
187 |
+
Follow the colab notebook to setup the environment, download the dataset and train the indicTrans model
|
188 |
+
|
189 |
+
## Network & Training Details
|
190 |
+
|
191 |
+
- Architechture: IndicTrans uses 6 encoder and decoder layers, input embeddings of size 1536 with 16 attention heads and
|
192 |
+
feedforward dimension of 4096 with total number of parameters of 434M
|
193 |
+
- Loss: Cross entropy loss
|
194 |
+
- Optimizer: Adam
|
195 |
+
- Label Smoothing: 0.1
|
196 |
+
- Gradient clipping: 1.0
|
197 |
+
- Learning rate: 5e-4
|
198 |
+
- Warmup_steps: 4000
|
199 |
+
|
200 |
+
Please refer to section 4, 5 of our [paper](https://arxiv.org/ftp/arxiv/papers/2104/2104.05596.pdf) for more details on training/experimental setup.
|
201 |
+
|
202 |
+
## Folder Structure
|
203 |
+
```
|
204 |
+
|
205 |
+
IndicTrans
|
206 |
+
│ .gitignore
|
207 |
+
│ apply_bpe_traindevtest_notag.sh # apply bpe for joint vocab (Train, dev and test)
|
208 |
+
│ apply_single_bpe_traindevtest_notag.sh # apply bpe for seperate vocab (Train, dev and test)
|
209 |
+
│ binarize_training_exp.sh # binarize the training data after preprocessing for fairseq-training
|
210 |
+
│ compute_bleu.sh # Compute blue scores with postprocessing after translating with `joint_translate.sh`
|
211 |
+
│ indictrans_fairseq_inference.ipynb # colab example to show how to use model for inference
|
212 |
+
│ indicTrans_Finetuning.ipynb # colab example to show how to use model for finetuning on custom domain data
|
213 |
+
│ joint_translate.sh # used for inference (see colab inference notebook for more details on usage)
|
214 |
+
│ learn_bpe.sh # learning joint bpe on preprocessed text
|
215 |
+
│ learn_single_bpe.sh # learning seperate bpe on preprocessed text
|
216 |
+
│ LICENSE
|
217 |
+
│ prepare_data.sh # prepare data given an experiment dir (this does preprocessing,
|
218 |
+
│ # building vocab, binarization ) for bilingual training
|
219 |
+
│ prepare_data_joint_training.sh # prepare data given an experiment dir (this does preprocessing,
|
220 |
+
│ # building vocab, binarization ) for joint training
|
221 |
+
│ README.md
|
222 |
+
│
|
223 |
+
├───legacy # old unused scripts
|
224 |
+
├───model_configs # custom model configrations are stored here
|
225 |
+
│ custom_transformer.py # contains custom 4x transformer models
|
226 |
+
│ __init__.py
|
227 |
+
├───inference
|
228 |
+
│ custom_interactive.py # for python wrapper around fairseq-interactive
|
229 |
+
│ engine.py # python interface for model inference
|
230 |
+
└───scripts # stores python scripts that are used by other bash scripts
|
231 |
+
│ add_joint_tags_translate.py # add lang tags to the processed training data for bilingual training
|
232 |
+
│ add_tags_translate.py # add lang tags to the processed training data for joint training
|
233 |
+
│ clean_vocab.py # clean vocabulary after building with subword_nmt
|
234 |
+
│ concat_joint_data.py # concatenates lang pair data and creates text files to keep track
|
235 |
+
│ # of number of lines in each lang pair.
|
236 |
+
│ extract_non_english_pairs.py # Mining Indic to Indic pairs from english centric corpus
|
237 |
+
│ postprocess_translate.py # Postprocesses translations
|
238 |
+
│ preprocess_translate.py # Preprocess translations and for script conversion (from indic to devnagiri)
|
239 |
+
│ remove_large_sentences.py # to remove large sentences from training data
|
240 |
+
└───remove_train_devtest_overlaps.py # Finds and removes overlaped data of train with dev and test sets
|
241 |
+
```
|
242 |
+
|
243 |
+
|
244 |
+
## Citing
|
245 |
+
|
246 |
+
If you are using any of the resources, please cite the following article:
|
247 |
+
```
|
248 |
+
@misc{ramesh2021samanantar,
|
249 |
+
title={Samanantar: The Largest Publicly Available Parallel Corpora Collection for 11 Indic Languages},
|
250 |
+
author={Gowtham Ramesh and Sumanth Doddapaneni and Aravinth Bheemaraj and Mayank Jobanputra and Raghavan AK and Ajitesh Sharma and Sujit Sahoo and Harshita Diddee and Mahalakshmi J and Divyanshu Kakwani and Navneet Kumar and Aswin Pradeep and Kumar Deepak and Vivek Raghavan and Anoop Kunchukuttan and Pratyush Kumar and Mitesh Shantadevi Khapra},
|
251 |
+
year={2021},
|
252 |
+
eprint={2104.05596},
|
253 |
+
archivePrefix={arXiv},
|
254 |
+
primaryClass={cs.CL}
|
255 |
+
}
|
256 |
+
```
|
257 |
+
|
258 |
+
We would like to hear from you if:
|
259 |
+
|
260 |
+
- You are using our resources. Please let us know how you are putting these resources to use.
|
261 |
+
- You have any feedback on these resources.
|
262 |
+
|
263 |
+
|
264 |
+
|
265 |
+
### License
|
266 |
+
|
267 |
+
The IndicTrans code (and models) are released under the MIT License.
|
268 |
+
|
269 |
+
|
270 |
+
### Contributors
|
271 |
+
|
272 |
+
- Gowtham Ramesh, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [IITM](https://www.iitm.ac.in))</sub>
|
273 |
+
- Sumanth Doddapaneni, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [IITM](https://www.iitm.ac.in))</sub>
|
274 |
+
- Aravinth Bheemaraj, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
275 |
+
- Mayank Jobanputra, <sub>([IITM](https://www.iitm.ac.in))</sub>
|
276 |
+
- Raghavan AK, <sub>([AI4Bharat](https://ai4bharat.org))</sub>
|
277 |
+
- Ajitesh Sharma, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
278 |
+
- Sujit Sahoo, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
279 |
+
- Harshita Diddee, <sub>([AI4Bharat](https://ai4bharat.org))</sub>
|
280 |
+
- Mahalakshmi J, <sub>([AI4Bharat](https://ai4bharat.org))</sub>
|
281 |
+
- Divyanshu Kakwani, <sub>([IITM](https://www.iitm.ac.in), [AI4Bharat](https://ai4bharat.org))</sub>
|
282 |
+
- Navneet Kumar, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
283 |
+
- Aswin Pradeep, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
284 |
+
- Kumar Deepak, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
285 |
+
- Vivek Raghavan, <sub>([EkStep](https://ekstep.in))</sub>
|
286 |
+
- Anoop Kunchukuttan, <sub>([Microsoft](https://www.microsoft.com/en-in/), [AI4Bharat](https://ai4bharat.org))</sub>
|
287 |
+
- Pratyush Kumar, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [AI4Bharat](https://ai4bharat.org), [IITM](https://www.iitm.ac.in))</sub>
|
288 |
+
- Mitesh Shantadevi Khapra, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [AI4Bharat](https://ai4bharat.org), [IITM](https://www.iitm.ac.in))</sub>
|
289 |
+
|
290 |
+
|
291 |
+
|
292 |
+
### Contact
|
293 |
+
|
294 |
+
- Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](mailto:anoop.kunchukuttan@gmail.com))
|
295 |
+
- Mitesh Khapra ([miteshk@cse.iitm.ac.in](mailto:miteshk@cse.iitm.ac.in))
|
296 |
+
- Pratyush Kumar ([pratyush@cse.iitm.ac.in](mailto:pratyush@cse.iitm.ac.in))
|
indicTrans/api.py → api.py
RENAMED
File without changes
|
indicTrans/apply_bpe_traindevtest_notag.sh → apply_bpe_traindevtest_notag.sh
RENAMED
File without changes
|
indicTrans/apply_single_bpe_traindevtest_notag.sh → apply_single_bpe_traindevtest_notag.sh
RENAMED
File without changes
|
indicTrans/binarize_training_exp.sh → binarize_training_exp.sh
RENAMED
File without changes
|
indicTrans/compute_bleu.sh → compute_bleu.sh
RENAMED
File without changes
|
indicTrans/.gitignore
DELETED
@@ -1,143 +0,0 @@
|
|
1 |
-
#ignore libs folder we use
|
2 |
-
indic_nlp_library
|
3 |
-
indic_nlp_resources
|
4 |
-
subword-nmt
|
5 |
-
|
6 |
-
# Byte-compiled / optimized / DLL files
|
7 |
-
__pycache__/
|
8 |
-
*.py[cod]
|
9 |
-
*$py.class
|
10 |
-
|
11 |
-
# C extensions
|
12 |
-
*.so
|
13 |
-
|
14 |
-
# Distribution / packaging
|
15 |
-
.Python
|
16 |
-
build/
|
17 |
-
develop-eggs/
|
18 |
-
dist/
|
19 |
-
downloads/
|
20 |
-
eggs/
|
21 |
-
.eggs/
|
22 |
-
lib/
|
23 |
-
lib64/
|
24 |
-
parts/
|
25 |
-
sdist/
|
26 |
-
var/
|
27 |
-
wheels/
|
28 |
-
share/python-wheels/
|
29 |
-
*.egg-info/
|
30 |
-
.installed.cfg
|
31 |
-
*.egg
|
32 |
-
MANIFEST
|
33 |
-
|
34 |
-
# PyInstaller
|
35 |
-
# Usually these files are written by a python script from a template
|
36 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
37 |
-
*.manifest
|
38 |
-
*.spec
|
39 |
-
|
40 |
-
# Installer logs
|
41 |
-
pip-log.txt
|
42 |
-
pip-delete-this-directory.txt
|
43 |
-
|
44 |
-
# Unit test / coverage reports
|
45 |
-
htmlcov/
|
46 |
-
.tox/
|
47 |
-
.nox/
|
48 |
-
.coverage
|
49 |
-
.coverage.*
|
50 |
-
.cache
|
51 |
-
nosetests.xml
|
52 |
-
coverage.xml
|
53 |
-
*.cover
|
54 |
-
*.py,cover
|
55 |
-
.hypothesis/
|
56 |
-
.pytest_cache/
|
57 |
-
cover/
|
58 |
-
|
59 |
-
# Translations
|
60 |
-
*.mo
|
61 |
-
*.pot
|
62 |
-
|
63 |
-
# Django stuff:
|
64 |
-
*.log
|
65 |
-
local_settings.py
|
66 |
-
db.sqlite3
|
67 |
-
db.sqlite3-journal
|
68 |
-
|
69 |
-
# Flask stuff:
|
70 |
-
instance/
|
71 |
-
.webassets-cache
|
72 |
-
|
73 |
-
# Scrapy stuff:
|
74 |
-
.scrapy
|
75 |
-
|
76 |
-
# Sphinx documentation
|
77 |
-
docs/_build/
|
78 |
-
|
79 |
-
# PyBuilder
|
80 |
-
.pybuilder/
|
81 |
-
target/
|
82 |
-
|
83 |
-
# Jupyter Notebook
|
84 |
-
.ipynb_checkpoints
|
85 |
-
|
86 |
-
# IPython
|
87 |
-
profile_default/
|
88 |
-
ipython_config.py
|
89 |
-
|
90 |
-
# pyenv
|
91 |
-
# For a library or package, you might want to ignore these files since the code is
|
92 |
-
# intended to run in multiple environments; otherwise, check them in:
|
93 |
-
# .python-version
|
94 |
-
|
95 |
-
# pipenv
|
96 |
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
97 |
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
98 |
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
99 |
-
# install all needed dependencies.
|
100 |
-
#Pipfile.lock
|
101 |
-
|
102 |
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
103 |
-
__pypackages__/
|
104 |
-
|
105 |
-
# Celery stuff
|
106 |
-
celerybeat-schedule
|
107 |
-
celerybeat.pid
|
108 |
-
|
109 |
-
# SageMath parsed files
|
110 |
-
*.sage.py
|
111 |
-
|
112 |
-
# Environments
|
113 |
-
.env
|
114 |
-
.venv
|
115 |
-
env/
|
116 |
-
venv/
|
117 |
-
ENV/
|
118 |
-
env.bak/
|
119 |
-
venv.bak/
|
120 |
-
|
121 |
-
# Spyder project settings
|
122 |
-
.spyderproject
|
123 |
-
.spyproject
|
124 |
-
|
125 |
-
# Rope project settings
|
126 |
-
.ropeproject
|
127 |
-
|
128 |
-
# mkdocs documentation
|
129 |
-
/site
|
130 |
-
|
131 |
-
# mypy
|
132 |
-
.mypy_cache/
|
133 |
-
.dmypy.json
|
134 |
-
dmypy.json
|
135 |
-
|
136 |
-
# Pyre type checker
|
137 |
-
.pyre/
|
138 |
-
|
139 |
-
# pytype static type analyzer
|
140 |
-
.pytype/
|
141 |
-
|
142 |
-
# Cython debug symbols
|
143 |
-
cython_debug/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
indicTrans/README.md
DELETED
@@ -1,296 +0,0 @@
|
|
1 |
-
<div align="center">
|
2 |
-
<h1><b><i>IndicTrans</i></b></h1>
|
3 |
-
<a href="http://indicnlp.ai4bharat.org/samanantar">Website</a> |
|
4 |
-
<a href="https://arxiv.org/abs/2104.05596">Paper</a> |
|
5 |
-
<a href="https://youtu.be/QwYPOd1eBtQ?t=383">Video</a><br><br>
|
6 |
-
</div>
|
7 |
-
|
8 |
-
**IndicTrans** is a Transformer-4x ( ~434M ) multilingual NMT model trained on [Samanantar](https://indicnlp.ai4bharat.org/samanantar) dataset which is the largest publicly available parallel corpora collection for Indic languages at the time of writing ( 14 April 2021 ). It is a single script model i.e we convert all the Indic data to the Devanagari script which allows for ***better lexical sharing between languages for transfer learning, prevents fragmentation of the subword vocabulary between Indic languages and allows using a smaller subword vocabulary***. We currently release two models - Indic to English and English to Indic and support the following 11 indic languages:
|
9 |
-
|
10 |
-
| <!-- --> | <!-- --> | <!-- --> | <!-- --> |
|
11 |
-
| ------------- | -------------- | ------------ | ----------- |
|
12 |
-
| Assamese (as) | Hindi (hi) | Marathi (mr) | Tamil (ta) |
|
13 |
-
| Bengali (bn) | Kannada (kn) | Oriya (or) | Telugu (te) |
|
14 |
-
| Gujarati (gu) | Malayalam (ml) | Punjabi (pa) |
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
- [Updates](#updates)
|
20 |
-
- [Download IndicTrans models:](#download-indictrans-models)
|
21 |
-
- [Using the model for translating any input](#using-the-model-for-translating-any-input)
|
22 |
-
- [Finetuning the model on your input dataset](#finetuning-the-model-on-your-input-dataset)
|
23 |
-
- [Mining Indic to Indic pairs from english centric corpus](#mining-indic-to-indic-pairs-from-english-centric-corpus)
|
24 |
-
- [Installation](#installation)
|
25 |
-
- [How to train the indictrans model on your training data?](#how-to-train-the-indictrans-model-on-your-training-data)
|
26 |
-
- [Network & Training Details](#network--training-details)
|
27 |
-
- [Folder Structure](#folder-structure)
|
28 |
-
- [Citing](#citing)
|
29 |
-
- [License](#license)
|
30 |
-
- [Contributors](#contributors)
|
31 |
-
- [Contact](#contact)
|
32 |
-
|
33 |
-
|
34 |
-
## Updates
|
35 |
-
<details><summary>Click to expand </summary>
|
36 |
-
18 December 2021
|
37 |
-
|
38 |
-
```
|
39 |
-
Tutorials updated with latest model links
|
40 |
-
```
|
41 |
-
|
42 |
-
|
43 |
-
26 November 2021
|
44 |
-
```
|
45 |
-
- v0.3 models are now available for download
|
46 |
-
```
|
47 |
-
|
48 |
-
27 June 2021
|
49 |
-
```
|
50 |
-
- Updated links for indic to indic model
|
51 |
-
- Add more comments to training scripts
|
52 |
-
- Add link to [Samanantar Video](https://youtu.be/QwYPOd1eBtQ?t=383)
|
53 |
-
- Add folder structure in readme
|
54 |
-
- Add python wrapper for model inference
|
55 |
-
```
|
56 |
-
|
57 |
-
09 June 2021
|
58 |
-
```
|
59 |
-
- Updated links for models
|
60 |
-
- Added Indic to Indic model
|
61 |
-
```
|
62 |
-
|
63 |
-
09 May 2021
|
64 |
-
```
|
65 |
-
- Added fix for finetuning on datasets where some lang pairs are not present. Previously the script assumed the finetuning dataset will have data for all 11 indic lang pairs
|
66 |
-
- Added colab notebook for finetuning instructions
|
67 |
-
```
|
68 |
-
</details>
|
69 |
-
|
70 |
-
## Download IndicTrans models:
|
71 |
-
|
72 |
-
Indic to English: [v0.3](https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip)
|
73 |
-
|
74 |
-
English to Indic: [v0.3](https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip)
|
75 |
-
|
76 |
-
Indic to Indic: [v0.3](https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip)
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
## Using the model for translating any input
|
81 |
-
|
82 |
-
The model is trained on single sentences and hence, users need to split parapgraphs to sentences before running the translation when using our command line interface (The python interface has `translate_paragraph` method to handle multi sentence translations).
|
83 |
-
|
84 |
-
Note: IndicTrans is trained with a max sequence length of **200** tokens (subwords). If your sentence is too long (> 200 tokens), the sentence will be truncated to 200 tokens before translation.
|
85 |
-
|
86 |
-
Here is an example snippet to split paragraphs into sentences for English and Indic languages supported by our model:
|
87 |
-
```python
|
88 |
-
# install these libraries
|
89 |
-
# pip install mosestokenizer
|
90 |
-
# pip install indic-nlp-library
|
91 |
-
|
92 |
-
from mosestokenizer import *
|
93 |
-
from indicnlp.tokenize import sentence_tokenize
|
94 |
-
|
95 |
-
INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]
|
96 |
-
|
97 |
-
def split_sentences(paragraph, language):
|
98 |
-
if language == "en":
|
99 |
-
with MosesSentenceSplitter(language) as splitter:
|
100 |
-
return splitter([paragraph])
|
101 |
-
elif language in INDIC:
|
102 |
-
return sentence_tokenize.sentence_split(paragraph, lang=language)
|
103 |
-
|
104 |
-
split_sentences("""COVID-19 is caused by infection with the severe acute respiratory
|
105 |
-
syndrome coronavirus 2 (SARS-CoV-2) virus strain. The disease is mainly transmitted via the respiratory
|
106 |
-
route when people inhale droplets and particles that infected people release as they breathe, talk, cough, sneeze, or sing. """, language='en')
|
107 |
-
|
108 |
-
>> ['COVID-19 is caused by infection with the severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) virus strain.',
|
109 |
-
'The disease is mainly transmitted via the respiratory route when people inhale droplets and particles that infected people release as they breathe, talk, cough, sneeze, or sing.']
|
110 |
-
|
111 |
-
split_sentences("""இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது. இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.
|
112 |
-
அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.""",
|
113 |
-
language='ta')
|
114 |
-
|
115 |
-
>> ['இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.',
|
116 |
-
'இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது.',
|
117 |
-
'இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.',
|
118 |
-
'அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.']
|
119 |
-
|
120 |
-
|
121 |
-
```
|
122 |
-
|
123 |
-
Follow the colab notebook to setup the environment, download the trained _IndicTrans_ models and translate your own text.
|
124 |
-
|
125 |
-
Command line interface --> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/indictrans_fairseq_inference.ipynb)
|
126 |
-
|
127 |
-
|
128 |
-
Python interface --> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/indicTrans_python_interface.ipynb)
|
129 |
-
|
130 |
-
The python interface is useful in case you want to reuse the model for multiple translations and do not want to reinitialize the model each time
|
131 |
-
|
132 |
-
|
133 |
-
## Finetuning the model on your input dataset
|
134 |
-
|
135 |
-
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/indicTrans_Finetuning.ipynb)
|
136 |
-
|
137 |
-
The colab notebook can be used to setup the environment, download the trained _IndicTrans_ models and prepare your custom dataset for finetuning the indictrans model. There is also a section on mining indic to indic data from english centric corpus for finetuning indic to indic model.
|
138 |
-
|
139 |
-
**Note**: Since this is a big model (400M params), you might not be able to train with reasonable batch sizes in the free Google Colab account. We are planning to release smaller models (after pruning / distillation) soon.
|
140 |
-
|
141 |
-
## Mining Indic to Indic pairs from english centric corpus
|
142 |
-
|
143 |
-
The `extract_non_english_pairs` in `scripts/extract_non_english_pairs.py` can be used to mine indic to indic pairs from english centric corpus.
|
144 |
-
|
145 |
-
As described in the [paper](https://arxiv.org/pdf/2104.05596.pdf) (section 2.5) , we use a very strict deduplication criterion to avoid the creation of very similar parallel sentences. For example, if an en sentence is aligned to *M* hi sentences and *N* ta sentences, then we would get *MN* hi-ta pairs. However, these pairs would be very similar and not contribute much to the training process. Hence, we retain only 1 randomly chosen pair out of these *MN* pairs.
|
146 |
-
|
147 |
-
```bash
|
148 |
-
extract_non_english_pairs(indir, outdir, LANGS):
|
149 |
-
"""
|
150 |
-
Extracts non-english pair parallel corpora
|
151 |
-
indir: contains english centric data in the following form:
|
152 |
-
- directory named en-xx for language xx
|
153 |
-
- each directory contains a train.en and train.xx
|
154 |
-
outdir: output directory to store mined data for each pair.
|
155 |
-
One directory is created for each pair.
|
156 |
-
LANGS: list of languages in the corpus (other than English).
|
157 |
-
The language codes must correspond to the ones used in the
|
158 |
-
files and directories in indir. Preferably, sort the languages
|
159 |
-
in this list in alphabetic order. outdir will contain data for xx-yy,
|
160 |
-
but not for yy-xx, so it will be convenient to have this list in sorted order.
|
161 |
-
"""
|
162 |
-
```
|
163 |
-
|
164 |
-
## Installation
|
165 |
-
<details><summary>Click to expand </summary>
|
166 |
-
|
167 |
-
```bash
|
168 |
-
cd indicTrans
|
169 |
-
git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git
|
170 |
-
git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
|
171 |
-
git clone https://github.com/rsennrich/subword-nmt.git
|
172 |
-
# install required libraries
|
173 |
-
pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library
|
174 |
-
|
175 |
-
# Install fairseq from source
|
176 |
-
git clone https://github.com/pytorch/fairseq.git
|
177 |
-
cd fairseq
|
178 |
-
pip install --editable ./
|
179 |
-
|
180 |
-
```
|
181 |
-
</details>
|
182 |
-
|
183 |
-
## How to train the indictrans model on your training data?
|
184 |
-
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/IndicTrans_training.ipynb)
|
185 |
-
|
186 |
-
|
187 |
-
Follow the colab notebook to setup the environment, download the dataset and train the indicTrans model
|
188 |
-
|
189 |
-
## Network & Training Details
|
190 |
-
|
191 |
-
- Architecture: IndicTrans uses 6 encoder and decoder layers, input embeddings of size 1536 with 16 attention heads and
|
192 |
-
feedforward dimension of 4096 with total number of parameters of 434M
|
193 |
-
- Loss: Cross entropy loss
|
194 |
-
- Optimizer: Adam
|
195 |
-
- Label Smoothing: 0.1
|
196 |
-
- Gradient clipping: 1.0
|
197 |
-
- Learning rate: 5e-4
|
198 |
-
- Warmup_steps: 4000
|
199 |
-
|
200 |
-
Please refer to section 4, 5 of our [paper](https://arxiv.org/ftp/arxiv/papers/2104/2104.05596.pdf) for more details on training/experimental setup.
|
201 |
-
|
202 |
-
## Folder Structure
|
203 |
-
```
|
204 |
-
|
205 |
-
IndicTrans
|
206 |
-
│ .gitignore
|
207 |
-
│ apply_bpe_traindevtest_notag.sh # apply bpe for joint vocab (Train, dev and test)
|
208 |
-
│ apply_single_bpe_traindevtest_notag.sh # apply bpe for separate vocab (Train, dev and test)
|
209 |
-
│ binarize_training_exp.sh # binarize the training data after preprocessing for fairseq-training
|
210 |
-
│ compute_bleu.sh # Compute BLEU scores with postprocessing after translating with `joint_translate.sh`
|
211 |
-
│ indictrans_fairseq_inference.ipynb # colab example to show how to use model for inference
|
212 |
-
│ indicTrans_Finetuning.ipynb # colab example to show how to use model for finetuning on custom domain data
|
213 |
-
│ joint_translate.sh # used for inference (see colab inference notebook for more details on usage)
|
214 |
-
│ learn_bpe.sh # learning joint bpe on preprocessed text
|
215 |
-
│ learn_single_bpe.sh # learning separate bpe on preprocessed text
|
216 |
-
│ LICENSE
|
217 |
-
│ prepare_data.sh # prepare data given an experiment dir (this does preprocessing,
|
218 |
-
│ # building vocab, binarization ) for bilingual training
|
219 |
-
│ prepare_data_joint_training.sh # prepare data given an experiment dir (this does preprocessing,
|
220 |
-
│ # building vocab, binarization ) for joint training
|
221 |
-
│ README.md
|
222 |
-
│
|
223 |
-
├───legacy # old unused scripts
|
224 |
-
├───model_configs # custom model configurations are stored here
|
225 |
-
│ custom_transformer.py # contains custom 4x transformer models
|
226 |
-
│ __init__.py
|
227 |
-
├───inference
|
228 |
-
│ custom_interactive.py # for python wrapper around fairseq-interactive
|
229 |
-
│ engine.py # python interface for model inference
|
230 |
-
└───scripts # stores python scripts that are used by other bash scripts
|
231 |
-
│ add_joint_tags_translate.py # add lang tags to the processed training data for bilingual training
|
232 |
-
│ add_tags_translate.py # add lang tags to the processed training data for joint training
|
233 |
-
│ clean_vocab.py # clean vocabulary after building with subword_nmt
|
234 |
-
│ concat_joint_data.py # concatenates lang pair data and creates text files to keep track
|
235 |
-
│ # of number of lines in each lang pair.
|
236 |
-
│ extract_non_english_pairs.py # Mining Indic to Indic pairs from english centric corpus
|
237 |
-
│ postprocess_translate.py # Postprocesses translations
|
238 |
-
│ preprocess_translate.py # Preprocess translations and for script conversion (from indic to Devanagari)
|
239 |
-
│ remove_large_sentences.py # to remove large sentences from training data
|
240 |
-
└───remove_train_devtest_overlaps.py # Finds and removes overlapping data of train with dev and test sets
|
241 |
-
```
|
242 |
-
|
243 |
-
|
244 |
-
## Citing
|
245 |
-
|
246 |
-
If you are using any of the resources, please cite the following article:
|
247 |
-
```
|
248 |
-
@misc{ramesh2021samanantar,
|
249 |
-
title={Samanantar: The Largest Publicly Available Parallel Corpora Collection for 11 Indic Languages},
|
250 |
-
author={Gowtham Ramesh and Sumanth Doddapaneni and Aravinth Bheemaraj and Mayank Jobanputra and Raghavan AK and Ajitesh Sharma and Sujit Sahoo and Harshita Diddee and Mahalakshmi J and Divyanshu Kakwani and Navneet Kumar and Aswin Pradeep and Kumar Deepak and Vivek Raghavan and Anoop Kunchukuttan and Pratyush Kumar and Mitesh Shantadevi Khapra},
|
251 |
-
year={2021},
|
252 |
-
eprint={2104.05596},
|
253 |
-
archivePrefix={arXiv},
|
254 |
-
primaryClass={cs.CL}
|
255 |
-
}
|
256 |
-
```
|
257 |
-
|
258 |
-
We would like to hear from you if:
|
259 |
-
|
260 |
-
- You are using our resources. Please let us know how you are putting these resources to use.
|
261 |
-
- You have any feedback on these resources.
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
### License
|
266 |
-
|
267 |
-
The IndicTrans code (and models) are released under the MIT License.
|
268 |
-
|
269 |
-
|
270 |
-
### Contributors
|
271 |
-
|
272 |
-
- Gowtham Ramesh, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [IITM](https://www.iitm.ac.in))</sub>
|
273 |
-
- Sumanth Doddapaneni, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [IITM](https://www.iitm.ac.in))</sub>
|
274 |
-
- Aravinth Bheemaraj, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
275 |
-
- Mayank Jobanputra, <sub>([IITM](https://www.iitm.ac.in))</sub>
|
276 |
-
- Raghavan AK, <sub>([AI4Bharat](https://ai4bharat.org))</sub>
|
277 |
-
- Ajitesh Sharma, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
278 |
-
- Sujit Sahoo, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
279 |
-
- Harshita Diddee, <sub>([AI4Bharat](https://ai4bharat.org))</sub>
|
280 |
-
- Mahalakshmi J, <sub>([AI4Bharat](https://ai4bharat.org))</sub>
|
281 |
-
- Divyanshu Kakwani, <sub>([IITM](https://www.iitm.ac.in), [AI4Bharat](https://ai4bharat.org))</sub>
|
282 |
-
- Navneet Kumar, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
283 |
-
- Aswin Pradeep, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
284 |
-
- Kumar Deepak, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
|
285 |
-
- Vivek Raghavan, <sub>([EkStep](https://ekstep.in))</sub>
|
286 |
-
- Anoop Kunchukuttan, <sub>([Microsoft](https://www.microsoft.com/en-in/), [AI4Bharat](https://ai4bharat.org))</sub>
|
287 |
-
- Pratyush Kumar, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [AI4Bharat](https://ai4bharat.org), [IITM](https://www.iitm.ac.in))</sub>
|
288 |
-
- Mitesh Shantadevi Khapra, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [AI4Bharat](https://ai4bharat.org), [IITM](https://www.iitm.ac.in))</sub>
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
### Contact
|
293 |
-
|
294 |
-
- Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](mailto:anoop.kunchukuttan@gmail.com))
|
295 |
-
- Mitesh Khapra ([miteshk@cse.iitm.ac.in](mailto:miteshk@cse.iitm.ac.in))
|
296 |
-
- Pratyush Kumar ([pratyush@cse.iitm.ac.in](mailto:pratyush@cse.iitm.ac.in))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
indicTrans/indicTrans_Finetuning.ipynb → indicTrans_Finetuning.ipynb
RENAMED
File without changes
|
indicTrans/indicTrans_python_interface.ipynb → indicTrans_python_interface.ipynb
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/LICENSE
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/README.md
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/contrib/README.md
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/contrib/correct_moses_tokenizer.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/contrib/hindi_to_kannada_transliterator.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/contrib/indic_scraper_project_sample.ipynb
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/Makefile
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/cmd.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/code.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/conf.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/index.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.MD
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.cli.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.morph.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.normalize.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.pdf
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.script.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.syllable.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.tokenize.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/indicnlp.transliterate.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/make.bat
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/docs/modules.rst
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/__init__.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/cli/__init__.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/cli/cliparser.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/common.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/langinfo.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/loader.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/morph/__init__.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/morph/unsupervised_morph.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/normalize/__init__.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/normalize/indic_normalize.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/script/__init__.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/script/english_script.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/script/indic_scripts.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/script/phonetic_sim.py
RENAMED
File without changes
|
{indicTrans/indic_nlp_library → indic_nlp_library}/indicnlp/syllable/__init__.py
RENAMED
File without changes
|