Update README.md
Browse files
README.md
CHANGED
@@ -3,12 +3,16 @@ This is the IndicBART model. For detailed documentation look here: https://indic
|
|
3 |
Usage:
|
4 |
|
5 |
```
|
6 |
-
from transformers import MBartForConditionalGeneration
|
7 |
-
from transformers import AlbertTokenizer
|
8 |
|
9 |
tokenizer = AlbertTokenizer.from_pretrained("prajdabre/IndicBARTTokenizer", do_lower_case=False, use_fast=False, keep_accents=True)
|
10 |
|
11 |
-
|
|
|
|
|
|
|
|
|
12 |
|
13 |
# First tokenize the input and outputs. The format below is how IndicBART was trained so the input should be "Sentence </s> <2xx>" where xx is the language code. Similarly, the output should be "<2yy> Sentence </s>".
|
14 |
inp = tokenizer("I am a boy </s> <2en>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids
|
@@ -25,6 +29,8 @@ model_outputs.logits
|
|
25 |
|
26 |
# For generation. Pardon the messiness. Note the decoder_start_token_id.
|
27 |
|
|
|
|
|
28 |
model_output=model.generate(inp, use_cache=True, num_beams=4, max_length=20, min_length=1, early_stopping=True, pad_token_id=tokenizer.pad_token_id, decoder_start_token_id=tokenizer(["<2en>"], add_special_tokens=False).input_ids[0][0])
|
29 |
|
30 |
|
|
|
3 |
Usage:
|
4 |
|
5 |
```
|
6 |
+
from transformers import MBartForConditionalGeneration, AutoModelForSeq2SeqLM
|
7 |
+
from transformers import AlbertTokenizer, AutoTokenizer
|
8 |
|
9 |
tokenizer = AlbertTokenizer.from_pretrained("prajdabre/IndicBARTTokenizer", do_lower_case=False, use_fast=False, keep_accents=True)
|
10 |
|
11 |
+
# Or use tokenizer = AutoTokenizer.from_pretrained("prajdabre/IndicBARTTokenizer", do_lower_case=False, use_fast=False, keep_accents=True)
|
12 |
+
|
13 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("prajdabre/IndicBART")
|
14 |
+
|
15 |
+
# Or use model = MBartForConditionalGeneration.from_pretrained("prajdabre/IndicBART")
|
16 |
|
17 |
# First tokenize the input and outputs. The format below is how IndicBART was trained so the input should be "Sentence </s> <2xx>" where xx is the language code. Similarly, the output should be "<2yy> Sentence </s>".
|
18 |
inp = tokenizer("I am a boy </s> <2en>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids
|
|
|
29 |
|
30 |
# For generation. Pardon the messiness. Note the decoder_start_token_id.
|
31 |
|
32 |
+
model.eval() # Set dropouts to zero
|
33 |
+
|
34 |
model_output=model.generate(inp, use_cache=True, num_beams=4, max_length=20, min_length=1, early_stopping=True, pad_token_id=tokenizer.pad_token_id, decoder_start_token_id=tokenizer(["<2en>"], add_special_tokens=False).input_ids[0][0])
|
35 |
|
36 |
|